masster 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

Files changed (55)
  1. masster/__init__.py +27 -27
  2. masster/_version.py +17 -17
  3. masster/chromatogram.py +497 -503
  4. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
  5. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
  6. masster/logger.py +318 -244
  7. masster/sample/__init__.py +9 -9
  8. masster/sample/defaults/__init__.py +15 -15
  9. masster/sample/defaults/find_adducts_def.py +325 -325
  10. masster/sample/defaults/find_features_def.py +366 -366
  11. masster/sample/defaults/find_ms2_def.py +285 -285
  12. masster/sample/defaults/get_spectrum_def.py +314 -318
  13. masster/sample/defaults/sample_def.py +374 -378
  14. masster/sample/h5.py +1321 -1297
  15. masster/sample/helpers.py +833 -364
  16. masster/sample/lib.py +762 -0
  17. masster/sample/load.py +1220 -1187
  18. masster/sample/parameters.py +131 -131
  19. masster/sample/plot.py +1685 -1622
  20. masster/sample/processing.py +1402 -1416
  21. masster/sample/quant.py +209 -0
  22. masster/sample/sample.py +393 -387
  23. masster/sample/sample5_schema.json +181 -181
  24. masster/sample/save.py +737 -736
  25. masster/sample/sciex.py +1213 -0
  26. masster/spectrum.py +1287 -1319
  27. masster/study/__init__.py +9 -9
  28. masster/study/defaults/__init__.py +21 -19
  29. masster/study/defaults/align_def.py +267 -267
  30. masster/study/defaults/export_def.py +41 -40
  31. masster/study/defaults/fill_chrom_def.py +264 -264
  32. masster/study/defaults/fill_def.py +260 -0
  33. masster/study/defaults/find_consensus_def.py +256 -256
  34. masster/study/defaults/find_ms2_def.py +163 -163
  35. masster/study/defaults/integrate_chrom_def.py +225 -225
  36. masster/study/defaults/integrate_def.py +221 -0
  37. masster/study/defaults/merge_def.py +256 -0
  38. masster/study/defaults/study_def.py +272 -269
  39. masster/study/export.py +674 -287
  40. masster/study/h5.py +1406 -886
  41. masster/study/helpers.py +1713 -433
  42. masster/study/helpers_optimized.py +317 -0
  43. masster/study/load.py +1231 -1078
  44. masster/study/parameters.py +99 -99
  45. masster/study/plot.py +632 -645
  46. masster/study/processing.py +1057 -1046
  47. masster/study/save.py +161 -134
  48. masster/study/study.py +612 -522
  49. masster/study/study5_schema.json +253 -241
  50. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/METADATA +15 -10
  51. masster-0.3.1.dist-info/RECORD +59 -0
  52. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/licenses/LICENSE +661 -661
  53. masster-0.2.5.dist-info/RECORD +0 -50
  54. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/WHEEL +0 -0
  55. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/entry_points.txt +0 -0
@@ -1,1416 +1,1402 @@
1
- from __future__ import annotations
2
-
3
- from datetime import datetime
4
-
5
- import numpy as np
6
- import polars as pl
7
- import pyopenms as oms
8
-
9
- from tqdm import tqdm
10
-
11
- from masster.chromatogram import Chromatogram
12
- # Parameters removed - using hardcoded defaults
13
- from masster.spectrum import Spectrum
14
- from .defaults.find_features_def import find_features_defaults
15
- from .defaults.find_adducts_def import find_adducts_defaults
16
- from .defaults.find_ms2_def import find_ms2_defaults
17
- from .defaults.get_spectrum_def import get_spectrum_defaults
18
-
19
-
20
def _centroid_by_ms_level(self, spect, centroid_algo):
    """Denoise *spect*, then centroid it with MS-level-specific tolerances.

    Uses the MS1 tolerance set (mz_tol_ms1_da / mz_tol_ms1_ppm /
    centroid_min_points_ms1) when spect.ms_level == 1 and the MS2 set when
    spect.ms_level == 2. Any other MS level is returned denoised but not
    centroided (same fall-through behavior as the original inline code).
    """
    spect = spect.denoise()
    if spect.ms_level == 1:
        return spect.centroid(
            algo=centroid_algo,
            tolerance=self.parameters.get("mz_tol_ms1_da"),
            ppm=self.parameters.get("mz_tol_ms1_ppm"),
            min_points=self.parameters.get("centroid_min_points_ms1"),
            smooth=self.parameters.get("centroid_smooth"),
            prominence=self.parameters.get("centroid_prominence"),
            refine=self.parameters.get("centroid_refine"),
        )
    if spect.ms_level == 2:
        return spect.centroid(
            algo=centroid_algo,
            tolerance=self.parameters.get("mz_tol_ms2_da"),
            ppm=self.parameters.get("mz_tol_ms2_ppm"),
            min_points=self.parameters.get("centroid_min_points_ms2"),
            smooth=self.parameters.get("centroid_smooth"),
            prominence=self.parameters.get("centroid_prominence"),
            refine=self.parameters.get("centroid_refine"),
        )
    return spect


def get_spectrum(self, scan, **kwargs):
    """
    Retrieve and process a spectrum from the data file based on the given scan identifier.

    Locates the scan in self.scans_df, extracts its metadata (energy, MS level,
    retention time), reads the raw peaks through the active file interface
    ('oms' or 'alpharaw'), and optionally denoises, centroids, deisotopes and
    trims the result.

    Parameters:
        scan (int): Unique identifier of the scan to retrieve (mandatory).
        **kwargs: Either a get_spectrum_defaults instance (applied wholesale)
            or individual parameter overrides. Key parameters:
            precursor_trim (float, optional): for MS2 spectra, m/z values above
                (precursor_mz - precursor_trim) are trimmed. Default 20.
            max_peaks (int, optional): maximum number of peaks to retain.
                Default 100.
            centroid (bool): denoise + centroid profile spectra. Default True.
            deisotope (bool): deisotope the spectrum. Default False.
            dia_stats: if truthy and the file type is 'ztscan'/'dia', compute
                additional DIA statistics. Default None.
            feature: optional feature uid used when computing DIA statistics.
            label (str, optional): label for the spectrum; autogenerated from
                MS level / retention time if None.
            centroid_algo (str, optional): centroiding algorithm; falls back to
                self.parameters, defaulting to "lmp" for legacy files.

    Returns:
        Spectrum: the processed spectrum; an empty Spectrum on read errors or
        unsupported file interfaces; None if the scan is not found (or, for
        'oms', if no file is loaded).
    """
    # parameters initialization
    params = get_spectrum_defaults(scan=scan)
    for key, value in kwargs.items():
        if isinstance(value, get_spectrum_defaults):
            params = value
            self.logger.debug("Using provided get_spectrum_defaults parameters")
        else:
            if hasattr(params, key):
                if params.set(key, value, validate=True):
                    self.logger.debug(f"Updated parameter {key} = {value}")
                else:
                    self.logger.warning(
                        f"Failed to set parameter {key} = {value} (validation failed)",
                    )
            else:
                self.logger.debug(f"Unknown parameter {key} ignored")
    # end of parameter initialization

    # Extract parameter values
    scan = params.get("scan")
    precursor_trim = params.get("precursor_trim")
    max_peaks = params.get("max_peaks")
    centroid = params.get("centroid")
    deisotope = params.get("deisotope")
    dia_stats = params.get("dia_stats")
    feature_uid = params.get("feature")
    label = params.get("label")
    centroid_algo = params.get("centroid_algo")

    # get energy, ms_level, rt from scans_df
    scan_uid = scan  # Preserve original scan ID
    scan_info = self.scans_df.filter(pl.col("scan_uid") == scan_uid)
    if len(scan_info) == 0:
        self.logger.warning(f"Scan {scan_uid} not found.")
        return None
    scan_info = scan_info[0]
    energy = scan_info["energy"][0]
    ms_level = scan_info["ms_level"][0]
    rt = scan_info["rt"][0]
    if label is None:
        if ms_level == 1:
            name = f"MS1, rt {rt:.2f} s, scan {scan_uid}"
        else:
            name = f"MS2 of mz {scan_info['prec_mz'][0]:0.1f}, rt {rt:.2f} s, scan {scan_uid}"
    else:
        name = label

    if centroid_algo is None:
        if "centroid_algo" in self.parameters:
            centroid_algo = self.parameters.get("centroid_algo")
        else:
            # this is for backward compatibility. This is the old default
            self.parameters.centroid_algo = "lmp"
            centroid_algo = self.parameters.get("centroid_algo")

    spec0 = Spectrum(mz=np.array([]), inty=np.array([]))
    if self.file_interface == "oms":
        # check that file_obj is not None
        if self.file_obj is None:
            self.logger.error("Please load a file first.")
            return
        try:
            # NOTE(review): scan_uid is used as the pyopenms spectrum index;
            # assumes scan uids map to 0-based file positions — confirm.
            spect = self.file_obj.getSpectrum(scan_uid).get_peaks()
        except Exception as e:
            self.logger.error(f"Error: {e}")
            return spec0
        if len(spect[0]) == 0:
            return spec0
        elif len(spect[0]) == 1:
            mz = np.array([spect[0][0]])
            inty = np.array([spect[1][0]])
        else:
            mz = np.array(spect[0])
            inty = np.array(spect[1])
        if ms_level == 1:
            spect = Spectrum(
                mz=mz,
                inty=inty,
                ms_level=ms_level,
                rt=rt,
                energy=None,
                precursor_mz=None,
                label=name,
            )
        else:
            spect = Spectrum(
                mz=mz,
                inty=inty,
                ms_level=ms_level,
                rt=rt,
                energy=energy,
                precursor_mz=scan_info["prec_mz"][0],
                label=name,
            )
        if centroid and not spect.centroided:
            # BUGFIX: this branch previously dispatched on `Spectrum.ms_level`
            # (a class attribute) instead of the instance's ms_level, so oms
            # MS2 spectra could be centroided with MS1 tolerances. The helper
            # dispatches on spect.ms_level, as the alpharaw branch already did.
            spect = _centroid_by_ms_level(self, spect, centroid_algo)

    elif self.file_interface == "alpharaw":
        spec_df = self.file_obj.spectrum_df
        spect = (
            spec_df.filter(pl.col("scan_id") == scan_uid).row(0, named=True)
            if isinstance(spec_df, pl.DataFrame)
            else spec_df.loc[scan_uid]
        )
        peak_stop_idx = spect["peak_stop_idx"]
        peak_start_idx = spect["peak_start_idx"]

        if isinstance(self.file_obj.peak_df, pl.DataFrame):
            peaks = self.file_obj.peak_df.slice(
                peak_start_idx,
                peak_stop_idx - peak_start_idx,
            )
            mz_values = peaks.select("mz").to_numpy().flatten()
            intensity_values = peaks.select("intensity").to_numpy().flatten()
        else:
            peaks = self.file_obj.peak_df.loc[peak_start_idx : peak_stop_idx - 1]
            mz_values = peaks.mz.values
            intensity_values = peaks.intensity.values

        if spect["ms_level"] > 1:
            spect = Spectrum(
                mz=np.asarray(mz_values, dtype=np.float64),
                inty=np.asarray(intensity_values, dtype=np.float64),
                ms_level=ms_level,
                centroided=False,
                precursor_mz=spect["precursor_mz"],
                energy=energy,
                rt=rt,
                label=name,
            )
        else:
            spect = Spectrum(
                mz=np.asarray(mz_values, dtype=np.float64),
                inty=np.asarray(intensity_values, dtype=np.float64),
                ms_level=ms_level,
                centroided=False,
                precursor_mz=None,
                energy=None,
                rt=rt,
                label=name,
            )

        if len(spect) and centroid and not spect.centroided:
            spect = _centroid_by_ms_level(self, spect, centroid_algo)

    else:
        self.logger.error(
            f"File interface {self.file_interface} not supported. Reload data.",
        )
        return spec0

    # MS2 post-processing: drop fragments at/above the precursor region
    if precursor_trim is not None and spect.ms_level > 1:
        spect = spect.trim(mz_min=None, mz_max=spect.precursor_mz - precursor_trim)  # type: ignore[attr-defined]
    if deisotope:
        spect = spect.deisotope()

    if max_peaks is not None:
        spect = spect.keep_top(max_peaks)

    if dia_stats:
        if self.file_type in ["ztscan", "dia"]:
            spect = self._get_ztscan_stats(
                spec=spect,
                scan_uid=scan_uid,
                feature_uid=scan_info["feature_uid"][0]
                if "feature_uid" in scan_info
                and scan_info["feature_uid"][0] is not None
                else feature_uid,
                q1_step=2,
                deisotope=deisotope,
                centroid=centroid,
            )
    return spect
-
286
-
287
def _get_ztscan_stats(
    self,
    spec,
    scan_uid=None,
    feature_uid=None,
    q1_step=2,
    mz_tol=0.005,
    # TODO check this
    # deisotope=SpectrumParameters().deisotope,
    deisotope=False,
    # TODO there is no `centroid_algo`?
    centroid=True,
):
    """
    Compute DIA ('ztscan') quality statistics for an MS2 spectrum.

    Attaches to *spec*:
      - size: number of peaks (spec.mz.size).
      - q1_ratio: log2 of the center-scan intensity relative to the two
        neighbouring Q1 scans (scan_uid +/- q1_step, same cycle); entries
        where both neighbours are ~0 are forced to -10.
      - eic_corr: per-fragment Pearson correlation between each fragment EIC
        and the precursor EIC across the feature's RT window (only when
        feature_uid resolves to a feature).

    Parameters:
        spec: Spectrum to annotate (returned, possibly unmodified).
        scan_uid (int, optional): uid of the center MS2 scan.
        feature_uid (optional): feature whose RT window bounds the EICs.
        q1_step (int): uid offset to the neighbouring Q1 scans. Default 2.
        mz_tol (float): m/z tolerance for EIC alignment. Default 0.005.
        deisotope (bool): forwarded to _spec_to_mat.
        centroid (bool): forwarded to _spec_to_mat.

    Returns:
        spec, annotated when all prerequisites are available; any missing
        prerequisite logs a warning and returns spec early.
    """
    spec.size = spec.mz.size
    # spec.ms_entropy = spec.entropy()

    if self.scans_df is None:
        self.logger.warning("No scans found.")
        return spec
    scan = self.scans_df.filter(pl.col("scan_uid") == scan_uid)
    if len(scan) == 0:
        self.logger.warning(f"Scan {scan_uid} not found.")
        return spec
    scan = scan[0]
    if scan["ms_level"][0] != 2:
        self.logger.warning(f"Scan {scan_uid} is not a MS2 scan.")
    # Q1: left neighbour must be an MS2 scan in the same cycle
    lscan = self.scans_df.filter(pl.col("scan_uid") == scan_uid - q1_step)
    if len(lscan) == 0:
        self.logger.warning(f"Scan {scan_uid - q1_step} not found.")
        return spec
    lscan = lscan[0]
    # check that lscan['ms_level'] == 2 and lscan['cycle'] == scan['cycle']
    if lscan["ms_level"][0] != 2:
        self.logger.warning(f"Scan {scan_uid - q1_step} is not a MS2 scan.")
        return spec
    if lscan["cycle"][0] != scan["cycle"][0]:
        self.logger.warning(
            f"Scan {scan_uid - q1_step} is not in the same cycle as scan {scan_uid}.",
        )
        return spec
    # right neighbour, same constraints
    rscan = self.scans_df.filter(pl.col("scan_uid") == scan_uid + q1_step)
    if len(rscan) == 0:
        self.logger.warning(f"Scan {scan_uid + q1_step} not found.")
        return spec
    rscan = rscan[0]
    # check that rscan['ms_level'] == 2 and rscan['cycle'] == scan['cycle']
    if rscan["ms_level"][0] != 2:
        self.logger.warning(f"Scan {scan_uid + q1_step} is not a MS2 scan.")
        return spec
    if rscan["cycle"][0] != scan["cycle"][0]:
        self.logger.warning(
            f"Scan {scan_uid + q1_step} is not in the same cycle as scan {scan_uid}.",
        )
        return spec
    intymat = self._spec_to_mat(
        scan_uids=[scan_uid - q1_step, scan_uid, scan_uid + q1_step],
        mz_ref=spec.mz,
        mz_tol=mz_tol,
        deisotope=deisotope,
        centroid=centroid,
    )
    # pick only mzs that are close to spec.mz
    if intymat is None:
        return spec
    if intymat.shape[1] < 3:
        self.logger.warning(f"Not enough data points for scan {scan_uid}.")
        return spec
    # +0.01 avoids division by zero / log of zero on empty channels
    q1_ratio = (2 * intymat[:, 1] + 0.01) / (intymat[:, 0] + intymat[:, 2] + 0.01)
    spec.q1_ratio = np.log2(q1_ratio)
    # where intymat[:, 0] + intymat[:, 2]==0, set q1_ratio to -10
    spec.q1_ratio[np.isclose(intymat[:, 0] + intymat[:, 2], 0)] = -10

    # EIC correlation
    # find rt_start and rt_end of the feature_uid
    if self.features_df is None:
        self.logger.warning("No features found.")
        return spec
    if feature_uid is None:
        return spec
    feature = self.features_df.filter(pl.col("feature_uid") == feature_uid)
    if len(feature) == 0:
        self.logger.warning(f"Feature {feature_uid} not found.")
        return spec
    feature = feature.row(0, named=True)
    rt_start = feature["rt_start"]
    rt_end = feature["rt_end"]
    # get the cycles spanning the feature from the MS1 scans inside its RT window
    scans = self.scans_df.filter(pl.col("ms_level") == 1)
    scans = scans.filter(pl.col("rt") > rt_start)
    scans = scans.filter(pl.col("rt") < rt_end)
    if len(scans) == 0:
        self.logger.warning(f"No scans found between {rt_start} and {rt_end}.")
        return spec
    scan_uids = scans["scan_uid"].to_list()
    eic_prec = self._spec_to_mat(
        scan_uids=scan_uids,
        mz_ref=feature["mz"],
        mz_tol=mz_tol,
        deisotope=deisotope,
        centroid=centroid,
    )
    # ROBUSTNESS: _spec_to_mat returns None when nothing aligned; the old code
    # only checked intymat for None and would crash here.
    if eic_prec is None:
        self.logger.warning(f"No precursor EIC for feature {feature_uid}.")
        return spec

    # get all unique cycles from scans
    cycles = scans["cycle"].unique()
    scandids = []
    # for each cycle, pick the MS2 scan whose precursor m/z is closest to the feature m/z
    for cycle in cycles:
        scans = self.scans_df.filter(pl.col("cycle") == cycle)
        scans = scans.filter(pl.col("ms_level") == 2)
        scans = scans.filter(pl.col("prec_mz") > feature["mz"] - 4)
        scans = scans.filter(pl.col("prec_mz") < feature["mz"] + 4)
        if len(scans) == 0:
            self.logger.warning(f"No scans found for cycle {cycle}.")
            continue
        scan = scans[(scans["prec_mz"] - feature["mz"]).abs().arg_sort()[:1]]
        scandids.append(scan["scan_uid"][0])

    eic_prod = self._spec_to_mat(
        scandids,
        mz_ref=spec.mz,
        mz_tol=mz_tol,
        deisotope=deisotope,
        centroid=centroid,
    )
    # ROBUSTNESS: same None guard as for eic_prec.
    if eic_prod is None:
        self.logger.warning(f"No fragment EICs for feature {feature_uid}.")
        return spec
    # correlate each fragment EIC row against the precursor EIC
    eic_corr = np.zeros(eic_prod.shape[0])
    for i in range(eic_prod.shape[0]):
        try:
            with np.errstate(divide="ignore", invalid="ignore"):
                eic_corr[i] = np.corrcoef(eic_prod[i, :], eic_prec[0, :])[0, 1]
        except Exception:
            # best effort: leave this correlation at 0 (was a bare `except:`)
            pass

    spec.eic_corr = eic_corr
    return spec
434
-
435
-
436
def _spec_to_mat(
    self,
    scan_uids,
    mz_ref=None,
    mz_tol=0.01,
    # TODO check this
    # deisotope=SpectrumParameters().deisotope,
    deisotope=False,
    # TODO there is no `centroid_algo`?
    # TODO there is no `dia_stats`?
    centroid=True,
    # TODO check this
    # precursor_trim=SpectrumParameters().precursor_trim,
    precursor_trim=None,
):
    """
    Build an intensity matrix of the spectra in *scan_uids* aligned to *mz_ref*.

    Each spectrum is fetched via get_spectrum (centroided, DIA stats off,
    precursor trimmed at 5), optionally deisotoped, and its peaks matched to
    the reference m/z grid within *mz_tol*. Matched intensities form one
    column per usable scan; when several peaks hit the same reference bin the
    maximum intensity wins.

    Parameters:
        scan_uids (iterable[int]): scans to extract and align.
        mz_ref (ndarray | list | scalar | None): reference m/z grid.
        mz_tol (float): maximum m/z distance for a match. Default 0.01.
        deisotope (bool): deisotope each spectrum before alignment.
        centroid (bool): currently NOT forwarded to get_spectrum (kept for
            interface compatibility; get_spectrum is always called with
            centroid=True).
        precursor_trim: currently NOT forwarded (get_spectrum is always
            called with precursor_trim=5).

    Returns:
        np.ndarray of shape (len(mz_ref), n_usable_scans), or None when
        mz_ref is missing/empty or no scan produced aligned peaks.
    """
    if mz_ref is None:
        return None

    # normalize mz_ref to a 1-D numpy array
    if not isinstance(mz_ref, np.ndarray):
        if isinstance(mz_ref, list):
            mz_ref = np.array(mz_ref)
        else:
            mz_ref = np.array([mz_ref])

    # hoisted out of the loop: an empty reference grid can never align anything
    if mz_ref.size == 0:
        return None

    def align_mzs(ar1, ar2, tol):
        """Pair each ar1 value with its nearest ar2 value within tol; return (i, j) index pairs sorted by i."""
        closest_indices = [
            (i, int(np.argmin(np.abs(ar2 - val1)))) for i, val1 in enumerate(ar1)
        ]
        # filter out pairs that are not within the specified tolerance
        closest_indices = [
            (i, j) for i, j in closest_indices if np.abs(ar1[i] - ar2[j]) <= tol
        ]
        # dedupe and sort by the ar1 index for deterministic iteration
        closest_indices = sorted(set(closest_indices), key=lambda x: x[0])
        return np.array(closest_indices)

    specs = []
    for scan_uid in scan_uids:
        # BUGFIX: get_spectrum's first parameter is the positional `scan`;
        # the old call passed an unknown keyword `scan_uid=` and therefore
        # raised TypeError (missing required positional argument 'scan').
        spec = self.get_spectrum(
            scan_uid,
            centroid=True,
            dia_stats=False,
            precursor_trim=5,
        )
        # ROBUSTNESS: get_spectrum returns None when the scan is not found
        if spec is None:
            continue
        if deisotope:
            spec = spec.deisotope()
        # align to reference spectrum; skip empty spectra
        if spec.mz.size == 0:
            continue
        closest_indices = align_mzs(spec.mz, mz_ref, mz_tol)
        # accumulate the aligned intensities (max wins on bin collision)
        aligned_inty = np.zeros(len(mz_ref))
        for i, j in closest_indices:
            if abs(spec.mz[i] - mz_ref[j]) <= mz_tol:
                if aligned_inty[j] < spec.inty[i]:
                    aligned_inty[j] = spec.inty[i]
        specs.append(aligned_inty)

    if len(specs) == 0:
        return None
    # create a matrix with the aligned spectra. Each spec goes into a column
    mat = np.column_stack(specs)

    return mat
512
-
513
-
514
def find_features(self, **kwargs):
    """
    Detect features in mass spectrometry data by processing MS1 spectra, performing mass trace detection,
    elution peak detection, and feature detection. Optionally, deisotope features and remove low-quality peaks.

    This method leverages an MSExperiment constructed from the object's ms1_df, where each cycle in the data
    corresponds to an MSSpectrum. It then runs mass trace detection using set parameters, deconvolutes the mass
    traces to detect chromatographic peaks, and finally identifies features with a feature finding algorithm. The
    resulting feature map is cleaned, deisotoped (if enabled), and assigned unique IDs before being stored.

    Parameters:
        **kwargs: Keyword arguments for feature detection parameters. Can include:
            - A find_features_defaults instance to set all parameters at once
            - Individual parameter names and values (see find_features_defaults for details)

    Key Parameters:
        tol_ppm (float): Mass error tolerance in parts-per-million for mass trace detection (default: 30.0).
        noise (float): Noise threshold intensity to filter out low-intensity signals (default: 200.0).
        chrom_fwhm (float): Full width at half maximum for chromatographic peak shape (default: 1.0).
        chrom_fwhm_min (float): Minimum FWHM for chromatographic peak detection (default: 0.5).
        chrom_peak_snr (float): Signal-to-noise ratio required for chromatographic peaks (default: 10.0).
        mz_scoring_13C (bool): Whether to enable scoring of 13C isotopic patterns (default: False).
        masstrace_snr_filtering (bool): Whether to apply SNR filtering to mass traces (default: False).
        deisotope (bool): Whether to perform deisotoping of detected features (default: True).

    Attributes set:
        self.features: An updated feature map with unique IDs after feature detection and deisotoping.
        self.features_df: A cleaned DataFrame of features, with peaks of zero quality removed, representing the final
            detected features.

    Notes:
        - The method processes the ms1_df by iterating over cycles to build an MSExperiment.
        - External OMS modules (e.g., MSExperiment, MSSpectrum, MassTraceDetection, ElutionPeakDetection,
          FeatureFindingMetabo) are used throughout the processing.
        - After feature detection, additional cleaning is performed via internal helper methods.
    """
    # Guard clauses: MS1 data must be loaded and non-empty before detection.
    if self.ms1_df is None:
        self.logger.error("No MS1 data found. Please load a file first.")
        return
    if len(self.ms1_df) == 0:
        self.logger.error("MS1 data is empty. Please load a file first.")
        return
    # parameters initialization: a find_features_defaults value replaces all
    # params at once; any other kwarg overrides a single validated parameter.
    params = find_features_defaults()
    for key, value in kwargs.items():
        if isinstance(value, find_features_defaults):
            # set
            params = value
            self.logger.debug("Using provided find_features_defaults parameters")
        else:
            if hasattr(params, key):
                if params.set(key, value, validate=True):
                    self.logger.debug(f"Updated parameter {key} = {value}")
                else:
                    self.logger.warning(
                        f"Failed to set parameter {key} = {value} (validation failed)",
                    )
            else:
                self.logger.warning(f"Unknown parameter {key} ignored")

    self.logger.debug("Starting feature detection...")
    self.logger.debug(
        f"Parameters: chrom_fwhm={params.get('chrom_fwhm')}, noise={params.get('noise')}, tol_ppm={params.get('tol_ppm')}",
    )

    # Build an in-memory MSExperiment: one MSSpectrum per acquisition cycle.
    exp = oms.MSExperiment()
    # find max number of cycles in self.ms1_df
    max_cycle = self.ms1_df["cycle"].max()
    # iterate over all cycles, find rows with 1 cycle and append to exp2
    for cycle in range(1, max_cycle + 1):
        cycle_df = self.ms1_df.filter(pl.col("cycle") == cycle)
        # check if len(cycle_df) > 0
        if len(cycle_df) > 0:
            spectrum = oms.MSSpectrum()
            # RT of the cycle is taken from its first row.
            spectrum.setRT(cycle_df[0]["rt"].item())
            spectrum.setMSLevel(1)  # MS1
            mz = cycle_df["mz"]
            inty = cycle_df["inty"]
            # NOTE(review): mz/inty are polars Series; assumes pyopenms
            # set_peaks accepts them as sequences — confirm.
            spectrum.set_peaks([mz, inty])  # type: ignore[attr-defined]
            spectrum.sortByPosition()
            exp.addSpectrum(spectrum)

    # exp.sortSpectra(True)
    # mass trace detection
    mass_traces: list = []
    mtd = oms.MassTraceDetection()
    mtd_par = mtd.getDefaults()

    # Apply MTD parameters
    mtd_par.setValue("mass_error_ppm", float(params.get("tol_ppm")))
    mtd_par.setValue("noise_threshold_int", float(params.get("noise")))
    mtd_par.setValue(
        "min_trace_length",
        float(params.get("min_trace_length_multiplier"))
        * float(params.get("chrom_fwhm_min")),
    )
    mtd_par.setValue(
        "trace_termination_outliers",
        int(params.get("trace_termination_outliers")),
    )
    mtd_par.setValue("chrom_peak_snr", float(params.get("chrom_peak_snr")))

    mtd.setParameters(mtd_par)  # set the new parameters
    mtd.run(exp, mass_traces, 0)  # run mass trace detection

    # elution peak detection
    mass_traces_deconvol: list = []
    epd = oms.ElutionPeakDetection()
    epd_par = epd.getDefaults()

    # Apply EPD parameters using our parameter class
    epd_par.setValue("width_filtering", params.get("width_filtering"))
    epd_par.setValue("min_fwhm", float(params.get("chrom_fwhm_min")))
    epd_par.setValue("chrom_fwhm", float(params.get("chrom_fwhm")))
    epd_par.setValue("chrom_peak_snr", float(params.get("chrom_peak_snr")))
    # Boolean OpenMS flags are only set when enabled (string "true").
    if params.get("masstrace_snr_filtering"):
        epd_par.setValue("masstrace_snr_filtering", "true")
    if params.get("mz_scoring_13C"):
        epd_par.setValue("mz_scoring_13C", "true")

    epd.setParameters(epd_par)
    epd.detectPeaks(mass_traces, mass_traces_deconvol)

    # feature detection
    feature_map = oms.FeatureMap()  # output features
    chrom_out: list = []  # output chromatograms
    ffm = oms.FeatureFindingMetabo()
    ffm_par = ffm.getDefaults()

    # Apply FFM parameters using our parameter class
    ffm_par.setValue(
        "remove_single_traces",
        "true" if params.get("remove_single_traces") else "false",
    )
    ffm_par.setValue(
        "report_convex_hulls",
        "true" if params.get("report_convex_hulls") else "false",
    )
    ffm_par.setValue(
        "report_summed_ints",
        "true" if params.get("report_summed_ints") else "false",
    )
    ffm_par.setValue(
        "report_chromatograms",
        "true" if params.get("report_chromatograms") else "false",
    )

    ffm.setParameters(ffm_par)
    self.logger.debug("Running feature finding with parameters:")
    self.logger.debug(ffm_par)
    ffm.run(mass_traces_deconvol, feature_map, chrom_out)
    # Assigns a new, valid unique id per feature
    feature_map.ensureUniqueId()
    df = feature_map.get_df(export_peptide_identifications=False)  # type: ignore[attr-defined]
    # Sets the file path to the primary MS run (usually the mzML file)
    feature_map.setPrimaryMSRunPath([self.file_path.encode()])
    self.features = feature_map
    # remove peaks with quality == 0
    df = self._clean_features_df(df)

    # desotope features
    df = self._features_deisotope(
        df,
        mz_tol=params.get("deisotope_mz_tol"),
        rt_tol=params.get("chrom_fwhm_min") / 4 * params.get("deisotope_rt_tol_factor"),
    )
    if params.get("deisotope"):
        # record size before deisotoping
        size_before_deisotope = len(df)
        # keep only monoisotopic features (iso == 0)
        df = df.filter(pl.col("iso") == 0)
        self.logger.debug(
            f"Deisotoping features: {size_before_deisotope - len(df)} features removed.",
        )

    # update eic - create lists to collect results
    chroms: list[Chromatogram] = []
    coherences: list[float] = []
    prominences: list[float] = []
    prominence_scaleds: list[float] = []
    height_scaleds: list[float] = []

    mz_tol = params.get("eic_mz_tol")
    rt_tol = params.get("eic_rt_tol")

    # iterate over all rows in df using polars iteration
    self.logger.debug("Extracting EICs...")
    for row in df.iter_rows(named=True):
        # select data in ms1_df with mz in range [mz_start - mz_tol, mz_end + mz_tol] and rt in range [rt_start - rt_tol, rt_end + rt_tol]
        d = self.ms1_df.filter(
            (pl.col("rt") >= row["rt_start"] - rt_tol)
            & (pl.col("rt") <= row["rt_end"] + rt_tol)
            & (pl.col("mz") >= row["mz"] - mz_tol)
            & (pl.col("mz") <= row["mz"] + mz_tol),
        )
        # for all unique rt values, find the maximum inty
        eic_rt = d.group_by("rt").agg(pl.col("inty").max())
        # fewer than 4 points: no usable chromatogram; pad all columns with None
        if len(eic_rt) < 4:
            chroms.append(None)
            coherences.append(None)
            prominences.append(None)
            prominence_scaleds.append(None)
            height_scaleds.append(None)
            continue

        eic = Chromatogram(
            eic_rt["rt"].to_numpy(),
            eic_rt["inty"].to_numpy(),
            label=f"EIC mz={row['mz']:.4f}",
            file=self.file_path,
            mz=row["mz"],
            mz_tol=mz_tol,
            feature_start=row["rt_start"],
            feature_end=row["rt_end"],
            feature_apex=row["rt"],
        ).find_peaks()

        # collect results
        chroms.append(eic)
        if len(eic.peak_widths) > 0:
            coherences.append(round(eic.feature_coherence, 3))
            prominences.append(round(eic.peak_prominences[0], 3))
            # 1e-10 guards against division by zero on all-zero EICs
            prominence_scaleds.append(
                round(eic.peak_prominences[0] / (np.mean(eic.inty) + 1e-10), 3),
            )
            height_scaleds.append(
                round(eic.peak_heights[0] / (np.mean(eic.inty) + 1e-10), 3),
            )
        else:
            coherences.append(None)
            prominences.append(None)
            prominence_scaleds.append(None)
            height_scaleds.append(None)

    # Add the computed columns to the dataframe
    df = df.with_columns([
        pl.Series("chrom", chroms, dtype=pl.Object),
        pl.Series("chrom_coherence", coherences, dtype=pl.Float64),
        pl.Series("chrom_prominence", prominences, dtype=pl.Float64),
        pl.Series("chrom_prominence_scaled", prominence_scaleds, dtype=pl.Float64),
        pl.Series("chrom_height_scaled", height_scaleds, dtype=pl.Float64),
    ])

    self.features_df = df
    self.logger.info(f"Feature detection completed. Total features: {len(df)}")

    # store params
    self.store_history(["find_features"], params.to_dict())
    self.logger.debug(
        "Parameters stored to find_features",
    )
    # downstream results depend on the feature set; invalidate their history
    keys_to_remove = ["find_adducts", "find_ms2"]
    for key in keys_to_remove:
        if key in self.history:
            del self.history[key]
            self.logger.debug(f"Removed {key} from history")
769
-
770
-
771
- def find_adducts(self, **kwargs):
772
- """
773
- Detect adducts in mass spectrometry features using OpenMS MetaboliteFeatureDeconvolution.
774
-
775
- This method analyzes detected features to identify adduct relationships based on mass differences,
776
- charge states, and retention time proximity. It groups features that likely represent the same
777
- metabolite in different ionization states.
778
-
779
- Parameters:
780
- **kwargs: Keyword arguments for adduct detection parameters. Can include:
781
- - A find_adducts_defaults instance to set all parameters at once
782
- - Individual parameter names and values (see find_adducts_defaults for details)
783
-
784
- Key Parameters:
785
- adducts (Union[List[str], str, None]): List of potential adducts or ionization mode string.
786
- charge_min (int): Minimal possible charge state (default: 1).
787
- charge_max (int): Maximal possible charge state (default: 2).
788
- retention_max_diff (float): Maximum retention time difference for grouping (default: 1.0).
789
-
790
- Attributes set:
791
- self.features_df: Updated with adduct information including 'adduct', 'adduct_mass',
792
- and 'adduct_group' columns.
793
- """
794
- params = find_adducts_defaults()
795
- for key, value in kwargs.items():
796
- if isinstance(value, find_adducts_defaults):
797
- # set
798
- params = value
799
- self.logger.debug("Using provided find_adducts_defaults parameters")
800
- else:
801
- if hasattr(params, key):
802
- if params.set(key, value, validate=True):
803
- self.logger.debug(f"Updated parameter {key} = {value}")
804
- else:
805
- self.logger.warning(
806
- f"Failed to set parameter {key} = {value} (validation failed)",
807
- )
808
- else:
809
- self.logger.warning(f"Unknown parameter {key} ignored")
810
-
811
- self.logger.debug("Starting adduct detection...")
812
- self.logger.debug(
813
- f"Parameters: adducts={params.get('adducts')}, charge_min={params.get('charge_min')}, charge_max={params.get('charge_max')}",
814
- )
815
-
816
- mfd = oms.MetaboliteFeatureDeconvolution()
817
-
818
- openms_params = mfd.getDefaults()
819
-
820
- # Set adducts using the helper method
821
- adducts_list = params.get_openms_adducts()
822
- openms_params.setValue("potential_adducts", [a.encode() for a in adducts_list])
823
-
824
- # Apply other parameters
825
- openms_params.setValue("charge_min", params.get("charge_min"))
826
- openms_params.setValue("charge_max", params.get("charge_max"))
827
- openms_params.setValue("charge_span_max", params.get("charge_span_max"))
828
- openms_params.setValue("retention_max_diff", params.get("retention_max_diff"))
829
- openms_params.setValue(
830
- "retention_max_diff_local",
831
- params.get("retention_max_diff_local"),
832
- )
833
-
834
- # set updated parameters object
835
- mfd.setParameters(openms_params)
836
- self.logger.debug("Running adduct detection with parameters:")
837
- self.logger.debug(openms_params)
838
- # result feature map: will store features with adduct information
839
- feature_map_MFD = oms.FeatureMap()
840
- # result consensus map: will store grouped features belonging to a charge group
841
- groups = oms.ConsensusMap()
842
- # result consensus map: will store paired features connected by an edge
843
- edges = oms.ConsensusMap()
844
-
845
- # compute adducts
846
- mfd.compute(self.features, feature_map_MFD, groups, edges)
847
- self.logger.debug("Extracting information.")
848
-
849
- # export feature map as pandas DataFrame and append adduct information
850
- adducts_map = feature_map_MFD.get_df(export_peptide_identifications=False) # type: ignore[attr-defined]
851
- adducts_map["adduct"] = [
852
- f.getMetaValue("dc_charge_adducts") for f in feature_map_MFD
853
- ]
854
- adducts_map["adduct_group_id"] = [f.getMetaValue("Group") for f in feature_map_MFD]
855
- adducts_map["adduct_mass"] = [
856
- f.getMetaValue("dc_charge_adduct_mass") for f in feature_map_MFD
857
- ]
858
- # clean up the DataFrame
859
-
860
- # Clean up 'None' strings that should be actual None values from OpenMS getMetaValue
861
- for col in ["adduct", "adduct_group_id", "adduct_mass"]:
862
- if col in adducts_map.columns:
863
- adducts_map[col] = adducts_map[col].replace("None", None)
864
-
865
- # Convert adducts_map to polars and merge
866
- adducts_df = pl.DataFrame({
867
- "index": range(len(adducts_map)),
868
- "adduct": adducts_map["adduct"],
869
- "adduct_mass": adducts_map["adduct_mass"],
870
- "adduct_group_id": adducts_map["adduct_group_id"],
871
- })
872
- features_pl = (
873
- self.features_df
874
- if isinstance(self.features_df, pl.DataFrame)
875
- else pl.from_pandas(self.features_df)
876
- )
877
-
878
- # Remove existing adduct columns if they exist (likely all null)
879
- if "adduct" in features_pl.columns:
880
- features_pl = features_pl.drop("adduct")
881
- if "adduct_mass" in features_pl.columns:
882
- features_pl = features_pl.drop("adduct_mass")
883
- if "adduct_group" in features_pl.columns:
884
- features_pl = features_pl.drop("adduct_group")
885
-
886
- df = features_pl.join(
887
- adducts_df,
888
- left_on="feature_uid",
889
- right_on="index",
890
- how="left",
891
- )
892
-
893
- # Create adduct_group from adduct_group_id column
894
- unique_groups = df["adduct_group_id"].unique().to_list()
895
- group_mapping = {group: idx for idx, group in enumerate(unique_groups)}
896
- df = df.with_columns(
897
- pl.col("adduct_group_id")
898
- .map_elements(lambda x: group_mapping.get(x, 0), return_dtype=pl.Int64)
899
- .alias("adduct_group"),
900
- )
901
-
902
- # remove adduct_group_id
903
- df = df.drop("adduct_group_id")
904
- # move adduct, adduct_mass, and adduct_group after column iso_of
905
- if "iso_of" in df.columns:
906
- adduct_cols = ["adduct", "adduct_mass", "adduct_group"]
907
- # Get all column names and reorder them
908
- all_cols = df.columns
909
- iso_of_idx = all_cols.index("iso_of")
910
-
911
- # Create new column order: everything before iso_of, then iso_of, then adduct columns, then the rest
912
- new_order = []
913
- # columns up to and including iso_of
914
- new_order.extend(all_cols[: iso_of_idx + 1])
915
- # adduct columns that exist
916
- new_order.extend([col for col in adduct_cols if col in all_cols])
917
- new_order.extend([
918
- col for col in all_cols[iso_of_idx + 1 :] if col not in adduct_cols
919
- ]) # remaining columns
920
-
921
- df = df.select(new_order)
922
- # Update the features_df attribute with the new DataFrame
923
-
924
- self.features_df = df
925
- total_adducts = df.filter(pl.col("adduct").is_not_null()).shape[0]
926
- self.logger.info(f"Adduct detection completed. Total adducts: {total_adducts}")
927
-
928
- # store params
929
- self.store_history(["find_adducts"], params.to_dict())
930
- self.logger.debug(
931
- "Parameters stored to find_adducts",
932
- )
933
-
934
-
935
- def _clean_features_df(self, df):
936
- # Convert pandas DataFrame to polars if needed
937
- df["feature_id"] = df.index
938
- if hasattr(df, "columns") and not isinstance(df, pl.DataFrame):
939
- df_pl = pl.from_pandas(df)
940
- else:
941
- df_pl = df
942
-
943
- # Filter out rows with quality == 0
944
- df2 = df_pl.filter(pl.col("quality") != 0)
945
-
946
- # Create new dataframe with required columns and transformations using select
947
- df_result = df2.select([
948
- pl.int_range(pl.len()).alias("feature_uid"),
949
- pl.col("feature_id").cast(pl.String).alias("feature_id"),
950
- pl.col("mz").round(5),
951
- pl.col("RT").round(3).alias("rt"),
952
- pl.col("RT").round(3).alias("rt_original"), # keep original RT
953
- pl.col("RTstart").round(3).alias("rt_start"),
954
- pl.col("RTend").round(3).alias("rt_end"),
955
- (pl.col("RTend") - pl.col("RTstart")).round(3).alias("rt_delta"),
956
- pl.col("MZstart").round(5).alias("mz_start"),
957
- pl.col("MZend").round(5).alias("mz_end"),
958
- pl.col("intensity").alias("inty"),
959
- pl.col("quality"),
960
- pl.col("charge"),
961
- pl.lit(0).alias("iso"),
962
- pl.lit(None, dtype=pl.Int64).alias("iso_of"),
963
- pl.lit(None, dtype=pl.Int64).alias("adduct_group"),
964
- pl.lit(None, dtype=pl.Utf8).alias("adduct"),
965
- pl.lit(None, dtype=pl.Float64).alias("adduct_mass"),
966
- pl.lit(None, dtype=pl.Object).alias("chrom"),
967
- pl.lit(None, dtype=pl.Float64).alias("chrom_coherence"),
968
- pl.lit(None, dtype=pl.Float64).alias("chrom_prominence"),
969
- pl.lit(None, dtype=pl.Float64).alias("chrom_prominence_scaled"),
970
- pl.lit(None, dtype=pl.Float64).alias("chrom_height_scaled"),
971
- pl.lit(None, dtype=pl.Object).alias("ms2_scans"),
972
- pl.lit(None, dtype=pl.Object).alias("ms2_specs"),
973
- ])
974
-
975
- return df_result
976
-
977
-
978
- def _features_deisotope(
979
- self,
980
- df,
981
- mz_tol=None,
982
- rt_tol=None,
983
- ):
984
- if mz_tol is None:
985
- mz_tol = 0.02
986
- if rt_tol is None:
987
- rt_tol = 0.2
988
-
989
- # Convert to polars if needed
990
- if not isinstance(df, pl.DataFrame):
991
- df = pl.from_pandas(df)
992
-
993
- # Initialize new columns
994
- df = df.with_columns([
995
- pl.lit(0).alias("iso"),
996
- pl.col("feature_uid").alias("iso_of"),
997
- ])
998
-
999
- # Sort by 'mz'
1000
- df = df.sort("mz")
1001
-
1002
- # Get arrays for efficient processing
1003
- rt_arr = df["rt"].to_numpy()
1004
- mz_arr = df["mz"].to_numpy()
1005
- intensity_arr = df["inty"].to_numpy()
1006
- feature_uid_arr = df["feature_uid"].to_numpy()
1007
- n = len(df)
1008
- mz_diff = 1.003355
1009
-
1010
- # Create arrays to track isotope assignments
1011
- iso_arr = np.zeros(n, dtype=int)
1012
- iso_of_arr = feature_uid_arr.copy()
1013
-
1014
- for i in range(n):
1015
- base_rt = rt_arr[i]
1016
- base_mz = mz_arr[i]
1017
- base_int = intensity_arr[i]
1018
- base_feature_uid = feature_uid_arr[i]
1019
-
1020
- # Search for first isotope candidate (offset = mz_diff)
1021
- t1_lower = base_mz + mz_diff - mz_tol
1022
- t1_upper = base_mz + mz_diff + mz_tol
1023
- li = np.searchsorted(mz_arr, t1_lower, side="left")
1024
- ri = np.searchsorted(mz_arr, t1_upper, side="right")
1025
- if li < ri:
1026
- cand_idx = np.arange(li, ri)
1027
- mask = (
1028
- (rt_arr[cand_idx] > base_rt - rt_tol)
1029
- & (rt_arr[cand_idx] < base_rt + rt_tol)
1030
- & (intensity_arr[cand_idx] < 2 * base_int)
1031
- )
1032
- valid_cand = cand_idx[mask]
1033
- for cand in valid_cand:
1034
- if cand != i and iso_of_arr[cand] == feature_uid_arr[cand]:
1035
- iso_arr[cand] = iso_arr[i] + 1 # first isotope
1036
- iso_of_arr[cand] = base_feature_uid
1037
-
1038
- # Search for second isotope candidate (offset = 2*mz_diff)
1039
- t2_lower = base_mz + 2 * mz_diff - 1.5 * mz_tol
1040
- t2_upper = base_mz + 2 * mz_diff + 1.5 * mz_tol
1041
- li = np.searchsorted(mz_arr, t2_lower, side="left")
1042
- ri = np.searchsorted(mz_arr, t2_upper, side="right")
1043
- if li < ri:
1044
- cand_idx = np.arange(li, ri)
1045
- mask = (
1046
- (rt_arr[cand_idx] > base_rt - rt_tol)
1047
- & (rt_arr[cand_idx] < base_rt + rt_tol)
1048
- & (intensity_arr[cand_idx] < 2 * base_int)
1049
- )
1050
- valid_cand = cand_idx[mask]
1051
- for cand in valid_cand:
1052
- if cand != i and iso_of_arr[cand] == feature_uid_arr[cand]:
1053
- iso_arr[cand] = iso_arr[i] + 2 # second isotope
1054
- iso_of_arr[cand] = base_feature_uid
1055
-
1056
- # Search for third isotope candidate (offset = 3*mz_diff)
1057
- t3_lower = base_mz + 3 * mz_diff - 1.5 * mz_tol
1058
- t3_upper = base_mz + 3 * mz_diff + 1.5 * mz_tol
1059
- li = np.searchsorted(mz_arr, t3_lower, side="left")
1060
- ri = np.searchsorted(mz_arr, t3_upper, side="right")
1061
- if li < ri:
1062
- cand_idx = np.arange(li, ri)
1063
- mask = (
1064
- (rt_arr[cand_idx] > base_rt - rt_tol)
1065
- & (rt_arr[cand_idx] < base_rt + rt_tol)
1066
- & (intensity_arr[cand_idx] < 2 * base_int)
1067
- )
1068
- valid_cand = cand_idx[mask]
1069
- for cand in valid_cand:
1070
- if cand != i and iso_of_arr[cand] == feature_uid_arr[cand]:
1071
- iso_arr[cand] = iso_arr[i] + 3 # third isotope
1072
- iso_of_arr[cand] = base_feature_uid
1073
-
1074
- # Update the dataframe with isotope assignments
1075
- df = df.with_columns([
1076
- pl.Series("iso", iso_arr),
1077
- pl.Series("iso_of", iso_of_arr),
1078
- ])
1079
-
1080
- return df
1081
-
1082
-
1083
- def analyze_dda(self):
1084
- # Preallocate variables
1085
- cycle_records = []
1086
- previous_rt = 0
1087
- previous_level = 0
1088
- ms1_index = None
1089
- cyclestart = None
1090
- ms2_n = 0
1091
- ms1_duration = 0
1092
- ms2_duration: list[float] = []
1093
-
1094
- for row in self.scans_df.iter_rows(named=True):
1095
- if row["ms_level"] == 1:
1096
- if previous_level == 2:
1097
- ms2_to_ms2 = float(np.mean(ms2_duration)) if ms2_duration else -1.0
1098
- d = {
1099
- "scan_uid": ms1_index,
1100
- "ms2_n": ms2_n,
1101
- "time_cycle": row["rt"] - cyclestart,
1102
- "time_ms1_to_ms1": -1.0,
1103
- "time_ms1_to_ms2": ms1_duration,
1104
- "time_ms2_to_ms2": ms2_to_ms2,
1105
- "time_ms2_to_ms1": row["rt"] - previous_rt,
1106
- }
1107
- cycle_records.append(d)
1108
- elif previous_level == 1:
1109
- d = {
1110
- "scan_uid": ms1_index,
1111
- "ms2_n": 0,
1112
- "time_cycle": row["rt"] - cyclestart,
1113
- "time_ms1_to_ms1": row["rt"] - cyclestart,
1114
- "time_ms1_to_ms2": -1.0,
1115
- "time_ms2_to_ms2": -1.0,
1116
- "time_ms2_to_ms1": -1.0,
1117
- }
1118
- cycle_records.append(d)
1119
-
1120
- ms1_index = row["scan_uid"]
1121
- cyclestart = row["rt"]
1122
- ms2_n = 0
1123
- ms1_duration = 0
1124
- ms2_duration = []
1125
- elif previous_level == 2:
1126
- ms2_n += 1
1127
- ms2_duration.append(row["rt"] - previous_rt)
1128
- elif previous_level == 1:
1129
- ms1_duration = row["rt"] - cyclestart
1130
- ms2_n += 1
1131
- previous_level = row["ms_level"]
1132
- previous_rt = row["rt"]
1133
-
1134
- # Create DataFrame once at the end
1135
- if cycle_records:
1136
- cycle_data = pl.DataFrame(cycle_records)
1137
- self.scans_df = self.scans_df.join(cycle_data, on="scan_uid", how="left")
1138
- else:
1139
- self.scans_df = self.scans_df.with_columns(
1140
- [
1141
- pl.lit(None).alias("ms2_n"),
1142
- pl.lit(None).alias("time_cycle"),
1143
- pl.lit(None).alias("time_ms1_to_ms1"),
1144
- pl.lit(None).alias("time_ms1_to_ms2"),
1145
- pl.lit(None).alias("time_ms2_to_ms2"),
1146
- pl.lit(None).alias("time_ms2_to_ms1"),
1147
- ],
1148
- )
1149
-
1150
-
1151
- def find_ms2(self, **kwargs):
1152
- """
1153
- Link MS2 spectra to features in the dataset.
1154
- This method matches MS2 spectra from the scans dataframe with features in the features dataframe
1155
- based on retention time (RT) and precursor m/z tolerance criteria. For each feature in the provided
1156
- or inferred list of feature ids (feature_uid), it computes the RT difference between the feature and available
1157
- MS2 spectra. It then selects MS2 spectra that fall within a computed RT radius (based on the feature's
1158
- start and end times) and a specified m/z tolerance. For each feature, it chooses one MS2 spectrum per
1159
- unique cycle based on the closest RT difference, and it updates the feature with the list of matched
1160
- scan ids and the spectrum corresponding to the first matching scan id. Additionally, the scan dataframe
1161
- is updated to associate matched scan ids with the corresponding feature id.
1162
-
1163
- Parameters:
1164
- **kwargs: Keyword arguments for MS2 linking parameters. Can include:
1165
- - A find_ms2_defaults instance to set all parameters at once
1166
- - Individual parameter names and values (see find_ms2_defaults for details)
1167
-
1168
- Key Parameters:
1169
- features (int or list of int, optional): A specific feature id or a list of feature ids to process.
1170
- If an individual feature_uid is provided and equals -1, all features with no associated MS2 data will be processed.
1171
- If None, all features in the features dataframe are processed.
1172
- mz_tol (float, optional): The precursor m/z tolerance to consider when matching MS2 spectra. If not provided,
1173
- it defaults to 0.5, except for certain file types ('ztscan' or 'dia') which set it to 4.
1174
- centroid (bool, optional): If True, the returned spectrum will be centroided. Default is True.
1175
- deisotope (bool, optional): Flag indicating whether deisotoping should be performed. Default is False.
1176
- dia_stats (bool, optional): A flag to collect additional DIA-related statistics when retrieving a spectrum.
1177
- Default is False.
1178
-
1179
- Returns:
1180
- None
1181
-
1182
- Side Effects:
1183
- Updates self.features_df with new columns 'ms2_scans' (a list of scan ids) and 'ms2_specs' (containing
1184
- the retrieved spectrum for the first matched scan id). Also, self.scans_df is updated by setting the 'feature_uid'
1185
- column for matched MS2 spectra.
1186
-
1187
- Notes:
1188
- - The function uses vectorized operations to quickly filter MS2 spectra with ms_level equal to 2.
1189
- - If no MS2 spectra are available or if features_df is not loaded, appropriate messages are printed and the
1190
- method exits early.
1191
- - The function assumes that self.features_df and self.scans_df are already set up and contain the expected
1192
- columns ('feature_uid', 'rt', 'rt_start', 'rt_end', 'mz' for features and 'scan_uid', 'rt', 'prec_mz', 'cycle', 'ms_level'
1193
- for scans).
1194
-
1195
- Examples:
1196
- Assume the current instance has features and scans data loaded, then to link MS2 spectra for all features:
1197
- instance.find_ms2()
1198
- To link MS2 spectra for a specific list of feature ids:
1199
- instance.find_ms2(feature_uid=[1, 3, 5])
1200
- """
1201
-
1202
- # parameters initialization
1203
- params = find_ms2_defaults()
1204
- for key, value in kwargs.items():
1205
- if isinstance(value, find_ms2_defaults):
1206
- params = value
1207
- self.logger.debug("Using provided find_ms2_defaults parameters")
1208
- else:
1209
- if hasattr(params, key):
1210
- if params.set(key, value, validate=True):
1211
- self.logger.debug(f"Updated parameter {key} = {value}")
1212
- else:
1213
- self.logger.warning(
1214
- f"Failed to set parameter {key} = {value} (validation failed)",
1215
- )
1216
- else:
1217
- self.logger.debug(f"Unknown parameter {key} ignored")
1218
- # end of parameter initialization
1219
-
1220
- # Extract parameter values
1221
- features = params.get("features")
1222
- mz_tol = params.get_mz_tolerance(self.file_type)
1223
- centroid = params.get("centroid")
1224
- deisotope = params.get("deisotope")
1225
- dia_stats = params.get("dia_stats")
1226
-
1227
- self.logger.debug("Starting MS2 spectra linking...")
1228
- self.logger.debug(
1229
- f"Parameters: mz_tol={mz_tol}, centroid={centroid}, deisotope={deisotope}",
1230
- )
1231
-
1232
- # Ensure features_df is loaded and has the MS2 columns
1233
- if self.features_df is None:
1234
- self.logger.error("Please find features first.")
1235
- return
1236
- if "ms2_scans" not in self.features_df.columns:
1237
- self.features_df["ms2_scans"] = None
1238
- if "ms2_specs" not in self.features_df.columns:
1239
- self.features_df["ms2_specs"] = None
1240
-
1241
- feature_uid_list = []
1242
- self.logger.debug("Building lookup lists")
1243
- if features == []:
1244
- features = None # If empty list, treat as None
1245
- feature_uid_list = self._get_feature_uids(features)
1246
-
1247
- if len(feature_uid_list) == 0:
1248
- self.logger.warning("No features to process.")
1249
- return
1250
-
1251
- ms2_df = self.scans_df.filter(pl.col("ms_level") == 2)
1252
- if len(ms2_df) == 0:
1253
- self.logger.warning("No MS2 spectra found in file.")
1254
- return
1255
-
1256
- ms2_index_arr = ms2_df["scan_uid"].to_numpy()
1257
- ms2_rt = ms2_df["rt"].to_numpy()
1258
- ms2_precursor = ms2_df["prec_mz"].to_numpy()
1259
- ms2_cycle = ms2_df["cycle"].to_numpy()
1260
-
1261
- features_df = self.features_df
1262
- c = 0
1263
-
1264
- if self.file_interface is None:
1265
- self.index_file()
1266
-
1267
- # Vectorize the entire operation for better performance
1268
- features_subset = features_df.filter(pl.col("feature_uid").is_in(feature_uid_list))
1269
-
1270
- if len(features_subset) == 0:
1271
- return
1272
-
1273
- # Convert to numpy arrays for vectorized operations
1274
- feature_rt = features_subset.select("rt").to_numpy().flatten()
1275
- feature_mz = features_subset.select("mz").to_numpy().flatten()
1276
- feature_rt_start = features_subset.select("rt_start").to_numpy().flatten()
1277
- feature_rt_end = features_subset.select("rt_end").to_numpy().flatten()
1278
- feature_uids = features_subset.select("feature_uid").to_numpy().flatten()
1279
- feature_indices = (
1280
- features_subset.with_row_index().select("index").to_numpy().flatten()
1281
- )
1282
-
1283
- # Pre-compute RT radius for all features
1284
- rt_radius = np.minimum(feature_rt - feature_rt_start, feature_rt_end - feature_rt)
1285
-
1286
- # Batch process all features
1287
- scan_uid_lists: list[list[int]] = []
1288
- spec_lists: list[list[Spectrum]] = []
1289
- updated_feature_uids = []
1290
- updated_scan_uids = []
1291
-
1292
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
1293
-
1294
- for i, (rt_center, mz_center, radius, feature_uid, idx) in enumerate(
1295
- tqdm(
1296
- zip(
1297
- feature_rt,
1298
- feature_mz,
1299
- rt_radius,
1300
- feature_uids,
1301
- feature_indices,
1302
- strict=False,
1303
- ),
1304
- total=len(features_subset),
1305
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Link MS2 spectra",
1306
- disable=tdqm_disable,
1307
- ),
1308
- ):
1309
- # Vectorized filtering
1310
- rt_mask = np.abs(ms2_rt - rt_center) <= radius
1311
- mz_mask = np.abs(ms2_precursor - mz_center) <= mz_tol
1312
- valid_mask = rt_mask & mz_mask
1313
-
1314
- if not np.any(valid_mask):
1315
- scan_uid_lists.append(None)
1316
- spec_lists.append(None)
1317
- continue
1318
-
1319
- valid_indices = np.nonzero(valid_mask)[0]
1320
- rt_diffs = np.abs(ms2_rt[valid_indices] - rt_center)
1321
- sorted_indices = valid_indices[np.argsort(rt_diffs)]
1322
-
1323
- # Get unique cycles and their first occurrences
1324
- cycles = ms2_cycle[sorted_indices]
1325
- _, first_idx = np.unique(cycles, return_index=True)
1326
- final_indices = sorted_indices[first_idx]
1327
-
1328
- # Sort by RT difference again
1329
- final_rt_diffs = np.abs(ms2_rt[final_indices] - rt_center)
1330
- final_indices = final_indices[np.argsort(final_rt_diffs)]
1331
-
1332
- scan_uids = ms2_index_arr[final_indices].tolist()
1333
- scan_uid_lists.append(scan_uids)
1334
- spec_lists.append([
1335
- self.get_spectrum(
1336
- scan_uids[0],
1337
- centroid=centroid,
1338
- deisotope=deisotope,
1339
- dia_stats=dia_stats,
1340
- feature_uid=feature_uid,
1341
- ),
1342
- ])
1343
-
1344
- # Collect updates for batch processing
1345
- updated_feature_uids.extend([feature_uid] * len(final_indices))
1346
- updated_scan_uids.extend(ms2_index_arr[final_indices])
1347
- c += 1
1348
-
1349
- self.logger.debug("Update features.")
1350
- # Convert to polars if needed and batch update features_df
1351
- if not isinstance(features_df, pl.DataFrame):
1352
- features_df = pl.from_pandas(features_df)
1353
-
1354
- # Update the features_df
1355
- update_df = pl.DataFrame({
1356
- "temp_idx": feature_indices,
1357
- "ms2_scans": pl.Series("ms2_scans", scan_uid_lists, dtype=pl.Object),
1358
- "ms2_specs": pl.Series("ms2_specs", spec_lists, dtype=pl.Object),
1359
- })
1360
-
1361
- # Join and update
1362
- features_df = (
1363
- features_df.with_row_index("temp_idx")
1364
- .join(
1365
- update_df,
1366
- on="temp_idx",
1367
- how="left",
1368
- suffix="_new",
1369
- )
1370
- .with_columns([
1371
- pl.when(pl.col("ms2_scans_new").is_not_null())
1372
- .then(pl.col("ms2_scans_new"))
1373
- .otherwise(pl.col("ms2_scans"))
1374
- .alias("ms2_scans"),
1375
- pl.when(pl.col("ms2_specs_new").is_not_null())
1376
- .then(pl.col("ms2_specs_new"))
1377
- .otherwise(pl.col("ms2_specs"))
1378
- .alias("ms2_specs"),
1379
- ])
1380
- .drop(["temp_idx", "ms2_scans_new", "ms2_specs_new"])
1381
- )
1382
-
1383
- # Batch update scans_df
1384
- if updated_scan_uids:
1385
- scan_feature_uid_updates = dict(
1386
- zip(updated_scan_uids, updated_feature_uids, strict=True),
1387
- )
1388
- self.scans_df = (
1389
- self.scans_df.with_columns(
1390
- pl.col("scan_uid")
1391
- .map_elements(
1392
- lambda x: scan_feature_uid_updates.get(x),
1393
- return_dtype=pl.Int64,
1394
- )
1395
- .alias("feature_uid_update"),
1396
- )
1397
- .with_columns(
1398
- pl.when(pl.col("feature_uid_update").is_not_null())
1399
- .then(pl.col("feature_uid_update"))
1400
- .otherwise(pl.col("feature_uid"))
1401
- .alias("feature_uid"),
1402
- )
1403
- .drop("feature_uid_update")
1404
- )
1405
-
1406
- # Log completion
1407
- self.logger.info(
1408
- f"MS2 linking completed. Total features with MS2 data: {c}",
1409
- )
1410
- self.features_df = features_df
1411
-
1412
- # store params
1413
- self.store_history(["find_ms2"], params.to_dict())
1414
- self.logger.debug(
1415
- "Parameters stored to find_ms2",
1416
- )
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+ import pyopenms as oms
8
+
9
+ from tqdm import tqdm
10
+
11
+ from masster.chromatogram import Chromatogram
12
+
13
+ # Parameters removed - using hardcoded defaults
14
+ from masster.spectrum import Spectrum
15
+ from .defaults.find_features_def import find_features_defaults
16
+ from .defaults.find_adducts_def import find_adducts_defaults
17
+ from .defaults.find_ms2_def import find_ms2_defaults
18
+ from .defaults.get_spectrum_def import get_spectrum_defaults
19
+
20
+
21
+ def get_spectrum(self, scan, **kwargs):
22
+ """
23
+ Retrieve and process a spectrum from the data file based on the given scan identifier.
24
+
25
+ This method locates the scan in the internal DataFrame, extracts the metadata (such as energy,
26
+ MS level, and retention time), and then retrieves the corresponding spectrum data from the file.
27
+ Depending on the file interface (either 'oms' or 'alpharaw'), the spectrum data is obtained
28
+ and processed (including optional denoising, centroiding, deisotoping, and precursor m/z trimming).
29
+
30
+ Parameters:
31
+ scan (int): Unique identifier of the scan to retrieve. This is a mandatory parameter.
32
+ **kwargs: Keyword arguments for spectrum retrieval parameters. Can include:
33
+ - A get_spectrum_defaults instance to set all parameters at once
34
+ - Individual parameter names and values (see get_spectrum_defaults for details)
35
+
36
+ Key Parameters:
37
+ precursor_trim (int, optional): Value used to trim the precursor m/z for MS2 spectra.
38
+ If provided and the spectrum's MS level is greater than 1,
39
+ m/z values above (precursor_mz - precursor_trim) will be trimmed.
40
+ Default is 20.
41
+ max_peaks (int, optional): Maximum number of peaks to retain in the spectrum. Default is 100.
42
+ centroid (bool, optional): Flag indicating whether the spectrum should be centroided.
43
+ If True and the spectrum is not already centroided, the method
44
+ applies denoising followed by centroiding using parameters from self.parameters.
45
+ Default is True.
46
+ deisotope (bool, optional): Flag indicating whether deisotoping should be performed. Default is False.
47
+ dia_stats (optional): Flag or parameter for processing DIA (data-independent acquisition)
48
+ statistics. If provided (and if applicable to the file type), additional
49
+ statistics will be computed for 'ztscan' files. Default is None.
50
+ feature (optional): An optional identifier used when computing DIA statistics. Default is None.
51
+ label (str, optional): Optional label to assign to the spectrum. If not provided,
52
+ a default name is generated based on the MS level and retention time.
53
+ Default is None.
54
+ centroid_algo (str, optional): Algorithm to use for centroiding. Default is None.
55
+
56
+ Returns:
57
+ spectrum: A processed spectrum object containing:
58
+ - m/z and intensity arrays
59
+ - metadata such as MS level, retention time, energy, and an assigned label
60
+ Depending on the processing steps (centroiding, trimming, deisotoping, etc.), the
61
+ returned spectrum is modified accordingly.
62
+ Returns None or an empty spectrum if the scan is not found or if an error occurs.
63
+
64
+ Notes:
65
+ - For the 'oms' file interface, the spectrum is retrieved via self.file_obj.getSpectrum
66
+ and handled accordingly.
67
+ - For the 'alpharaw' file interface, the method uses internal DataFrame attributes to locate the
68
+ scan and its associated peaks.
69
+ - The method applies additional processing (denoising, centroiding, deisotoping, trimming) based on
70
+ the input flags and the MS level of the spectrum.
71
+ """
72
+
73
+ # parameters initialization
74
+ params = get_spectrum_defaults(scan=scan)
75
+ for key, value in kwargs.items():
76
+ if isinstance(value, get_spectrum_defaults):
77
+ params = value
78
+ self.logger.debug("Using provided get_spectrum_defaults parameters")
79
+ else:
80
+ if hasattr(params, key):
81
+ if params.set(key, value, validate=True):
82
+ self.logger.debug(f"Updated parameter {key} = {value}")
83
+ else:
84
+ self.logger.warning(
85
+ f"Failed to set parameter {key} = {value} (validation failed)",
86
+ )
87
+ else:
88
+ self.logger.debug(f"Unknown parameter {key} ignored")
89
+ # end of parameter initialization
90
+
91
+ # Extract parameter values
92
+ scan = params.get("scan")
93
+ precursor_trim = params.get("precursor_trim")
94
+ max_peaks = params.get("max_peaks")
95
+ centroid = params.get("centroid")
96
+ deisotope = params.get("deisotope")
97
+ dia_stats = params.get("dia_stats")
98
+ feature_uid = params.get("feature")
99
+ label = params.get("label")
100
+ centroid_algo = params.get("centroid_algo")
101
+
102
+ # get energy, ms_level, rt from scans_df
103
+ scan_uid = scan # Preserve original scan ID
104
+ scan_info = self.scans_df.filter(pl.col("scan_uid") == scan_uid)
105
+ if len(scan_info) == 0:
106
+ self.logger.warning(f"Scan {scan_uid} not found.")
107
+ return None
108
+ scan_info = scan_info[0]
109
+ energy = scan_info["energy"][0]
110
+ ms_level = scan_info["ms_level"][0]
111
+ rt = scan_info["rt"][0]
112
+ if label is None:
113
+ if ms_level == 1:
114
+ name = f"MS1, rt {rt:.2f} s, scan {scan_uid}"
115
+ else:
116
+ name = f"MS2 of mz {scan_info['prec_mz'][0]:0.1f}, rt {rt:.2f} s, scan {scan_uid}"
117
+ else:
118
+ name = label
119
+
120
+ if centroid_algo is None:
121
+ if "centroid_algo" in self.parameters:
122
+ centroid_algo = self.parameters.get("centroid_algo")
123
+ else:
124
+ # this is for backward compatibility. This is the old default
125
+ self.parameters.centroid_algo = "lmp"
126
+ centroid_algo = self.parameters.get("centroid_algo")
127
+
128
+ spec0 = Spectrum(mz=np.array([]), inty=np.array([]))
129
+ if self.file_interface == "oms":
130
+ # if check that file_obj is not None
131
+ if self.file_obj is None:
132
+ self.logger.error("Please load a file first.")
133
+ return
134
+ try:
135
+ spect = self.file_obj.getSpectrum(scan_uid).get_peaks()
136
+ except Exception as e:
137
+ self.logger.error(f"Error: {e}")
138
+ return spec0
139
+ if len(spect[0]) == 0:
140
+ return spec0
141
+ elif len(spect[0]) == 1:
142
+ mz = np.array([spect[0][0]])
143
+ inty = np.array([spect[1][0]])
144
+ else:
145
+ mz = np.array(spect[0])
146
+ inty = np.array(spect[1])
147
+ if ms_level == 1:
148
+ spect = Spectrum(
149
+ mz=mz,
150
+ inty=inty,
151
+ ms_level=ms_level,
152
+ rt=rt,
153
+ energy=None,
154
+ precursor_mz=None,
155
+ label=name,
156
+ )
157
+ else:
158
+ spect = Spectrum(
159
+ mz=mz,
160
+ inty=inty,
161
+ ms_level=ms_level,
162
+ rt=rt,
163
+ energy=energy,
164
+ precursor_mz=scan_info["prec_mz"][0],
165
+ label=name,
166
+ )
167
+ if centroid and not spect.centroided:
168
+ spect = spect.denoise()
169
+ if spect.ms_level == 1:
170
+ spect = spect.centroid(
171
+ algo=centroid_algo,
172
+ tolerance=self.parameters.get("mz_tol_ms1_da"),
173
+ ppm=self.parameters.get("mz_tol_ms1_ppm"),
174
+ min_points=self.parameters.get("centroid_min_points_ms1"),
175
+ smooth=self.parameters.get("centroid_smooth"),
176
+ prominence=self.parameters.get("centroid_prominence"),
177
+ refine=self.parameters.get("centroid_refine"),
178
+ )
179
+ elif spect.ms_level == 2:
180
+ spect = spect.centroid(
181
+ algo=centroid_algo,
182
+ tolerance=self.parameters.get("mz_tol_ms2_da"),
183
+ ppm=self.parameters.get("mz_tol_ms2_ppm"),
184
+ min_points=self.parameters.get("centroid_min_points_ms2"),
185
+ smooth=self.parameters.get("centroid_smooth"),
186
+ prominence=self.parameters.get("centroid_prominence"),
187
+ refine=self.parameters.get("centroid_refine"),
188
+ )
189
+
190
+ elif self.file_interface == "alpharaw":
191
+ spec_df = self.file_obj.spectrum_df
192
+ spect = (
193
+ spec_df.filter(pl.col("scan_id") == scan_uid).row(0, named=True)
194
+ if isinstance(spec_df, pl.DataFrame)
195
+ else spec_df.loc[scan_uid]
196
+ )
197
+ peak_stop_idx = spect["peak_stop_idx"]
198
+ peak_start_idx = spect["peak_start_idx"]
199
+
200
+ if isinstance(self.file_obj.peak_df, pl.DataFrame):
201
+ peaks = self.file_obj.peak_df.slice(
202
+ peak_start_idx,
203
+ peak_stop_idx - peak_start_idx,
204
+ )
205
+ mz_values = peaks.select("mz").to_numpy().flatten()
206
+ intensity_values = peaks.select("intensity").to_numpy().flatten()
207
+ else:
208
+ peaks = self.file_obj.peak_df.loc[peak_start_idx : peak_stop_idx - 1]
209
+ mz_values = peaks.mz.values
210
+ intensity_values = peaks.intensity.values
211
+
212
+ if spect["ms_level"] > 1:
213
+ spect = Spectrum(
214
+ mz=np.asarray(mz_values, dtype=np.float64),
215
+ inty=np.asarray(intensity_values, dtype=np.float64),
216
+ ms_level=ms_level,
217
+ centroided=False,
218
+ precursor_mz=spect["precursor_mz"],
219
+ energy=energy,
220
+ rt=rt,
221
+ label=name,
222
+ )
223
+ else:
224
+ spect = Spectrum(
225
+ mz=np.asarray(mz_values, dtype=np.float64),
226
+ inty=np.asarray(intensity_values, dtype=np.float64),
227
+ ms_level=ms_level,
228
+ centroided=False,
229
+ precursor_mz=None,
230
+ energy=None,
231
+ rt=rt,
232
+ label=name,
233
+ )
234
+
235
+ if len(spect) and centroid and not spect.centroided:
236
+ spect = spect.denoise()
237
+ if spect.ms_level == 1:
238
+ spect = spect.centroid(
239
+ algo=centroid_algo,
240
+ tolerance=self.parameters.get("mz_tol_ms1_da"),
241
+ ppm=self.parameters.get("mz_tol_ms1_ppm"),
242
+ min_points=self.parameters.get("centroid_min_points_ms1"),
243
+ smooth=self.parameters.get("centroid_smooth"),
244
+ prominence=self.parameters.get("centroid_prominence"),
245
+ refine=self.parameters.get("centroid_refine"),
246
+ )
247
+ elif spect.ms_level == 2:
248
+ spect = spect.centroid(
249
+ algo=centroid_algo,
250
+ tolerance=self.parameters.get("mz_tol_ms2_da"),
251
+ ppm=self.parameters.get("mz_tol_ms2_ppm"),
252
+ min_points=self.parameters.get("centroid_min_points_ms2"),
253
+ smooth=self.parameters.get("centroid_smooth"),
254
+ prominence=self.parameters.get("centroid_prominence"),
255
+ refine=self.parameters.get("centroid_refine"),
256
+ )
257
+
258
+ else:
259
+ self.logger.error(
260
+ f"File interface {self.file_interface} not supported. Reload data.",
261
+ )
262
+ return spec0
263
+
264
+ if precursor_trim is not None and spect.ms_level > 1:
265
+ spect = spect.trim(mz_min=None, mz_max=spect.precursor_mz - precursor_trim) # type: ignore[attr-defined]
266
+ if deisotope:
267
+ spect = spect.deisotope()
268
+
269
+ if max_peaks is not None:
270
+ spect = spect.keep_top(max_peaks)
271
+
272
+ if dia_stats:
273
+ if self.file_type in ["ztscan", "dia"]:
274
+ spect = self._get_ztscan_stats(
275
+ spec=spect,
276
+ scan_uid=scan_uid,
277
+ feature_uid=scan_info["feature_uid"][0]
278
+ if "feature_uid" in scan_info and scan_info["feature_uid"][0] is not None
279
+ else feature_uid,
280
+ q1_step=2,
281
+ deisotope=deisotope,
282
+ centroid=centroid,
283
+ )
284
+ return spect
285
+
286
+
287
+ def _get_ztscan_stats(
288
+ self,
289
+ spec,
290
+ scan_uid=None,
291
+ feature_uid=None,
292
+ q1_step=2,
293
+ mz_tol=0.005,
294
+ # TODO check this
295
+ # deisotope=SpectrumParameters().deisotope,
296
+ deisotope=False,
297
+ # TODO there is no `centroid_algo`?
298
+ centroid=True,
299
+ ):
300
+ spec.size = spec.mz.size
301
+ # spec.ms_entropy = spec.entropy()
302
+
303
+ if self.scans_df is None:
304
+ self.logger.warning("No scans found.")
305
+ return spec
306
+ scan = self.scans_df.filter(pl.col("scan_uid") == scan_uid)
307
+ if len(scan) == 0:
308
+ self.logger.warning(f"Scan {scan_uid} not found.")
309
+ return spec
310
+ scan = scan[0]
311
+ if scan["ms_level"][0] != 2:
312
+ self.logger.warning(f"Scan {scan_uid} is not a MS2 scan.")
313
+ # Q1
314
+ lscan = self.scans_df.filter(pl.col("scan_uid") == scan_uid - q1_step)
315
+ if len(lscan) == 0:
316
+ self.logger.warning(f"Scan {scan_uid - q1_step} not found.")
317
+ return spec
318
+ lscan = lscan[0]
319
+ # check that lscan['ms_level'] == 2 and lscan['cycle'] == scan['cycle']
320
+ if lscan["ms_level"][0] != 2:
321
+ self.logger.warning(f"Scan {scan_uid - q1_step} is not a MS2 scan.")
322
+ return spec
323
+ if lscan["cycle"][0] != scan["cycle"][0]:
324
+ self.logger.warning(
325
+ f"Scan {scan_uid - q1_step} is not in the same cycle as scan {scan_uid}.",
326
+ )
327
+ return spec
328
+ rscan = self.scans_df.filter(pl.col("scan_uid") == scan_uid + q1_step)
329
+ if len(rscan) == 0:
330
+ self.logger.warning(f"Scan {scan_uid + q1_step} not found.")
331
+ return spec
332
+ rscan = rscan[0]
333
+ # check that rscan['ms_level'] == 2 and rscan['cycle'] == scan['cycle']
334
+ if rscan["ms_level"][0] != 2:
335
+ self.logger.warning(f"Scan {scan_uid + q1_step} is not a MS2 scan.")
336
+ return spec
337
+ if rscan["cycle"][0] != scan["cycle"][0]:
338
+ self.logger.warning(
339
+ f"Scan {scan_uid + q1_step} is not in the same cycle as scan {scan_uid}.",
340
+ )
341
+ return spec
342
+ intymat = self._spec_to_mat(
343
+ scan_uids=[scan_uid - q1_step, scan_uid, scan_uid + q1_step],
344
+ mz_ref=spec.mz,
345
+ mz_tol=mz_tol,
346
+ deisotope=deisotope,
347
+ centroid=centroid,
348
+ )
349
+ # pick only mzs that are close to spec.mz
350
+ if intymat is None:
351
+ return spec
352
+ if intymat.shape[1] < 3:
353
+ self.logger.warning(f"Not enough data points for scan {scan_uid}.")
354
+ return spec
355
+ q1_ratio = (2 * intymat[:, 1] + 0.01) / (intymat[:, 0] + intymat[:, 2] + 0.01)
356
+ spec.q1_ratio = np.log2(q1_ratio)
357
+ # where intymat[:, 0] + intymat[:, 2]==0, set q1_ratio to -1
358
+ spec.q1_ratio[np.isclose(intymat[:, 0] + intymat[:, 2], 0)] = -10
359
+
360
+ # EIC correlation
361
+ # find rt_start and rt_end of the feature_uid
362
+ if self.features_df is None:
363
+ self.logger.warning("No features found.")
364
+ return spec
365
+ if feature_uid is None:
366
+ return spec
367
+ # spec.precursor_mz = feature['mz']
368
+ feature = self.features_df.filter(pl.col("feature_uid") == feature_uid)
369
+ if len(feature) == 0:
370
+ self.logger.warning(f"Feature {feature_uid} not found.")
371
+ return spec
372
+ feature = feature.row(0, named=True)
373
+ rt_start = feature["rt_start"]
374
+ rt_end = feature["rt_end"]
375
+ # get the cycle at rt_start and the cycle at rt_end from the closest scan with ms_level == 1
376
+ scans = self.scans_df.filter(pl.col("ms_level") == 1)
377
+ scans = scans.filter(pl.col("rt") > rt_start)
378
+ scans = scans.filter(pl.col("rt") < rt_end)
379
+ if len(scans) == 0:
380
+ self.logger.warning(f"No scans found between {rt_start} and {rt_end}.")
381
+ return spec
382
+ scan_uids = scans["scan_uid"].to_list()
383
+ eic_prec = self._spec_to_mat(
384
+ scan_uids=scan_uids,
385
+ mz_ref=feature["mz"],
386
+ mz_tol=mz_tol,
387
+ deisotope=deisotope,
388
+ centroid=centroid,
389
+ )
390
+ # find width at half maximum of the eic_prec
391
+ # hm = np.max(eic_prec[0, :]) / 3
392
+ # find index of maximum
393
+ # eic_prec_max_idx = np.argmax(eic_prec[0, :])
394
+ # find index of the closest point to half maximum
395
+ # idx = np.argmin(np.abs(eic_prec[0, :] - hm))
396
+ # eic_fwhm_prec = abs(eic_prec_max_idx - idx)
397
+
398
+ # get all unique cycles from scans
399
+ cycles = scans["cycle"].unique()
400
+ scandids = []
401
+ # iterate over all cycles and get the scan_uid of scan with ms_level == 2 and closest precursor_mz to spec.precursor_mz
402
+ for cycle in cycles:
403
+ scans = self.scans_df.filter(pl.col("cycle") == cycle)
404
+ scans = scans.filter(pl.col("ms_level") == 2)
405
+ scans = scans.filter(pl.col("prec_mz") > feature["mz"] - 4)
406
+ scans = scans.filter(pl.col("prec_mz") < feature["mz"] + 4)
407
+ if len(scans) == 0:
408
+ self.logger.warning(f"No scans found for cycle {cycle}.")
409
+ continue
410
+ scan = scans[(scans["prec_mz"] - feature["mz"]).abs().arg_sort()[:1]]
411
+ scandids.append(scan["scan_uid"][0])
412
+
413
+ eic_prod = self._spec_to_mat(
414
+ scandids,
415
+ mz_ref=spec.mz,
416
+ mz_tol=mz_tol,
417
+ deisotope=deisotope,
418
+ centroid=centroid,
419
+ )
420
+ # eic_prod = eic_prod.T
421
+ # eic_prec = eic_prec.T
422
+ # calculate correlation between eic_prec and all columns of eic_prod, column by column
423
+ eic_corr = np.zeros(eic_prod.shape[0])
424
+ # eic_width_ratio = np.zeros(eic_prod.shape[0])
425
+ for i in range(eic_prod.shape[0]):
426
+ try:
427
+ with np.errstate(divide="ignore", invalid="ignore"):
428
+ eic_corr[i] = np.corrcoef(eic_prod[i, :], eic_prec[0, :])[0, 1]
429
+ except:
430
+ pass
431
+
432
+ spec.eic_corr = eic_corr
433
+ return spec
434
+
435
+
436
+ def _spec_to_mat(
437
+ self,
438
+ scan_uids,
439
+ mz_ref=None,
440
+ mz_tol=0.01,
441
+ # TODO check this
442
+ # deisotope=SpectrumParameters().deisotope,
443
+ deisotope=False,
444
+ # TODO there is no `centroid_algo`?
445
+ # TODO there is no `dia_stats`?
446
+ # TODO unused (see below)
447
+ centroid=True,
448
+ # TODO check this
449
+ # precursor_trim=SpectrumParameters().precursor_trim,
450
+ # TODO unused (see below)
451
+ precursor_trim=None,
452
+ ):
453
+ # get all spectra in scan_uids
454
+
455
+ if mz_ref is None:
456
+ return None
457
+
458
+ if not isinstance(mz_ref, np.ndarray):
459
+ if isinstance(mz_ref, list):
460
+ mz_ref = np.array(mz_ref)
461
+ else:
462
+ mz_ref = np.array([mz_ref])
463
+
464
+ def align_mzs(ar1, ar2, tol):
465
+ closest_indices = []
466
+ # find the closest pair between each element in ar1 and ar2, within a maximum tolerance of tol
467
+ for i, val1 in enumerate(ar1):
468
+ closest_index = np.argmin(np.abs(ar2 - val1))
469
+ closest_indices.append((i, closest_index))
470
+ # filter out pairs that are not within the specified tolerance
471
+ closest_indices = [(i, j) for i, j in closest_indices if np.abs(ar1[i] - ar2[j]) <= tol]
472
+ # remove duplicates from the list of indices
473
+ closest_indices = list(set(closest_indices))
474
+ # sort the list of indices by the first element (i) in ascending order
475
+ closest_indices = sorted(closest_indices, key=lambda x: x[0])
476
+
477
+ # Convert the list of indices into an array for easier indexing in subsequent operations
478
+ return np.array(closest_indices)
479
+
480
+ specs = []
481
+ for scan_uid in scan_uids:
482
+ spec = self.get_spectrum(
483
+ scan_uid=scan_uid,
484
+ centroid=True,
485
+ dia_stats=False,
486
+ precursor_trim=5,
487
+ )
488
+ if deisotope:
489
+ spec = spec.deisotope()
490
+ # align to reference spectrum
491
+ if spec.mz.size == 0:
492
+ continue
493
+ if mz_ref.size == 0:
494
+ continue
495
+ closest_indices = align_mzs(spec.mz, mz_ref, mz_tol)
496
+ # store the aligned spectrum in the list
497
+ aligned_inty = np.zeros(len(mz_ref))
498
+ for i, j in closest_indices:
499
+ if abs(spec.mz[i] - mz_ref[j]) <= mz_tol:
500
+ if aligned_inty[j] < spec.inty[i]:
501
+ aligned_inty[j] = spec.inty[i]
502
+ specs.append(aligned_inty)
503
+
504
+ if len(specs) == 0:
505
+ return None
506
+ # create a matrix with the aligned spectra. Each spec goes into a column
507
+ mat = np.column_stack(specs)
508
+
509
+ return mat
510
+
511
+
512
    def find_features(self, **kwargs):
        """
        Detect features in mass spectrometry data by processing MS1 spectra, performing mass trace detection,
        elution peak detection, and feature detection. Optionally, deisotope features and remove low-quality peaks.

        This method leverages an MSExperiment constructed from the object's ms1_df, where each cycle in the data
        corresponds to an MSSpectrum. It then runs mass trace detection using set parameters, deconvolutes the mass
        traces to detect chromatographic peaks, and finally identifies features with a feature finding algorithm. The
        resulting feature map is cleaned, deisotoped (if enabled), and assigned unique IDs before being stored.

        Parameters:
            **kwargs: Keyword arguments for feature detection parameters. Can include:
                - A find_features_defaults instance to set all parameters at once
                - Individual parameter names and values (see find_features_defaults for details)

        Key Parameters:
            tol_ppm (float): Mass error tolerance in parts-per-million for mass trace detection (default: 30.0).
            noise (float): Noise threshold intensity to filter out low-intensity signals (default: 200.0).
            chrom_fwhm (float): Full width at half maximum for chromatographic peak shape (default: 1.0).
            chrom_fwhm_min (float): Minimum FWHM for chromatographic peak detection (default: 0.5).
            chrom_peak_snr (float): Signal-to-noise ratio required for chromatographic peaks (default: 10.0).
            mz_scoring_13C (bool): Whether to enable scoring of 13C isotopic patterns (default: False).
            masstrace_snr_filtering (bool): Whether to apply SNR filtering to mass traces (default: False).
            deisotope (bool): Whether to perform deisotoping of detected features (default: True).

        Attributes set:
            self.features: An updated feature map with unique IDs after feature detection and deisotoping.
            self.features_df: A cleaned DataFrame of features, with peaks of zero quality removed, representing the final
                detected features.

        Notes:
            - The method processes the ms1_df by iterating over cycles to build an MSExperiment.
            - External OMS modules (e.g., MSExperiment, MSSpectrum, MassTraceDetection, ElutionPeakDetection,
              FeatureFindingMetabo) are used throughout the processing.
            - After feature detection, additional cleaning is performed via internal helper methods.
            - On success, per-feature EICs are extracted from ms1_df and stored alongside coherence,
              prominence and scaled-height metrics; features with fewer than 4 EIC points get nulls.
        """
        # Guard: MS1 data must be loaded and non-empty before anything else.
        if self.ms1_df is None:
            self.logger.error("No MS1 data found. Please load a file first.")
            return
        if len(self.ms1_df) == 0:
            self.logger.error("MS1 data is empty. Please load a file first.")
            return
        # Parameter initialization: a full defaults object replaces params
        # wholesale; individual kwargs are validated one by one.
        params = find_features_defaults()
        for key, value in kwargs.items():
            if isinstance(value, find_features_defaults):
                # whole parameter object provided — adopt it as-is
                params = value
                self.logger.debug("Using provided find_features_defaults parameters")
            else:
                if hasattr(params, key):
                    if params.set(key, value, validate=True):
                        self.logger.debug(f"Updated parameter {key} = {value}")
                    else:
                        self.logger.warning(
                            f"Failed to set parameter {key} = {value} (validation failed)",
                        )
                else:
                    self.logger.warning(f"Unknown parameter {key} ignored")

        self.logger.info("Starting feature detection...")
        self.logger.debug(
            f"Parameters: chrom_fwhm={params.get('chrom_fwhm')}, noise={params.get('noise')}, tol_ppm={params.get('tol_ppm')}",
        )

        # Rebuild an OpenMS MSExperiment from ms1_df: one MSSpectrum per cycle.
        exp = oms.MSExperiment()
        # find max number of cycles in self.ms1_df
        max_cycle = self.ms1_df["cycle"].max()
        # iterate over all cycles; each non-empty cycle becomes one MS1 spectrum
        for cycle in range(1, max_cycle + 1):
            cycle_df = self.ms1_df.filter(pl.col("cycle") == cycle)
            # skip cycles with no rows
            if len(cycle_df) > 0:
                spectrum = oms.MSSpectrum()
                # RT of the cycle's first row represents the whole spectrum
                spectrum.setRT(cycle_df[0]["rt"].item())
                spectrum.setMSLevel(1)  # MS1
                mz = cycle_df["mz"]
                inty = cycle_df["inty"]
                spectrum.set_peaks([mz, inty])  # type: ignore[attr-defined]
                spectrum.sortByPosition()
                exp.addSpectrum(spectrum)

        # exp.sortSpectra(True)
        # Stage 1: mass trace detection
        mass_traces: list = []
        mtd = oms.MassTraceDetection()
        mtd_par = mtd.getDefaults()

        # Apply MTD parameters (OpenMS expects exact native types, hence casts)
        mtd_par.setValue("mass_error_ppm", float(params.get("tol_ppm")))
        mtd_par.setValue("noise_threshold_int", float(params.get("noise")))
        mtd_par.setValue(
            "min_trace_length",
            float(params.get("min_trace_length_multiplier")) * float(params.get("chrom_fwhm_min")),
        )
        mtd_par.setValue(
            "trace_termination_outliers",
            int(params.get("trace_termination_outliers")),
        )
        mtd_par.setValue("chrom_peak_snr", float(params.get("chrom_peak_snr")))

        mtd.setParameters(mtd_par)  # set the new parameters
        mtd.run(exp, mass_traces, 0)  # run mass trace detection

        # Stage 2: elution peak detection (deconvolutes the mass traces)
        mass_traces_deconvol: list = []
        epd = oms.ElutionPeakDetection()
        epd_par = epd.getDefaults()

        # Apply EPD parameters using our parameter class
        epd_par.setValue("width_filtering", params.get("width_filtering"))
        epd_par.setValue("min_fwhm", float(params.get("chrom_fwhm_min")))
        epd_par.setValue("chrom_fwhm", float(params.get("chrom_fwhm")))
        epd_par.setValue("chrom_peak_snr", float(params.get("chrom_peak_snr")))
        # boolean flags are encoded as "true"/"false" strings by OpenMS
        if params.get("masstrace_snr_filtering"):
            epd_par.setValue("masstrace_snr_filtering", "true")
        if params.get("mz_scoring_13C"):
            epd_par.setValue("mz_scoring_13C", "true")

        epd.setParameters(epd_par)
        epd.detectPeaks(mass_traces, mass_traces_deconvol)

        # Stage 3: feature detection (FeatureFindingMetabo)
        feature_map = oms.FeatureMap()  # output features
        chrom_out: list = []  # output chromatograms
        ffm = oms.FeatureFindingMetabo()
        ffm_par = ffm.getDefaults()

        # Apply FFM parameters using our parameter class
        ffm_par.setValue(
            "remove_single_traces",
            "true" if params.get("remove_single_traces") else "false",
        )
        ffm_par.setValue(
            "report_convex_hulls",
            "true" if params.get("report_convex_hulls") else "false",
        )
        ffm_par.setValue(
            "report_summed_ints",
            "true" if params.get("report_summed_ints") else "false",
        )
        ffm_par.setValue(
            "report_chromatograms",
            "true" if params.get("report_chromatograms") else "false",
        )

        ffm.setParameters(ffm_par)
        self.logger.debug("Running feature finding with parameters:")
        self.logger.debug(ffm_par)
        ffm.run(mass_traces_deconvol, feature_map, chrom_out)
        # Assigns a new, valid unique id per feature
        feature_map.ensureUniqueId()
        df = feature_map.get_df(export_peptide_identifications=False)  # type: ignore[attr-defined]
        # Sets the file path to the primary MS run (usually the mzML file)
        feature_map.setPrimaryMSRunPath([self.file_path.encode()])
        self.features = feature_map
        # normalize schema and remove peaks with quality == 0
        df = self._clean_features_df(df)

        # deisotope features (always annotates iso/iso_of; filtering is optional)
        df = self._features_deisotope(
            df,
            mz_tol=params.get("deisotope_mz_tol"),
            rt_tol=params.get("chrom_fwhm_min") / 4 * params.get("deisotope_rt_tol_factor"),
        )
        if params.get("deisotope"):
            # record size before deisotoping so the removal count can be logged
            size_before_deisotope = len(df)
            df = df.filter(pl.col("iso") == 0)
            self.logger.debug(
                f"Deisotoping features: {size_before_deisotope - len(df)} features removed.",
            )

        # EIC extraction — parallel lists, one entry per feature row
        chroms: list[Chromatogram] = []
        coherences: list[float] = []
        prominences: list[float] = []
        prominence_scaleds: list[float] = []
        height_scaleds: list[float] = []

        mz_tol = params.get("eic_mz_tol")
        rt_tol = params.get("eic_rt_tol")

        # iterate over all rows in df using polars iteration
        self.logger.debug("Extracting EICs...")
        for row in df.iter_rows(named=True):
            # select ms1_df points inside the feature's RT/mz window, padded by the tolerances
            d = self.ms1_df.filter(
                (pl.col("rt") >= row["rt_start"] - rt_tol)
                & (pl.col("rt") <= row["rt_end"] + rt_tol)
                & (pl.col("mz") >= row["mz"] - mz_tol)
                & (pl.col("mz") <= row["mz"] + mz_tol),
            )
            # for all unique rt values, keep the maximum inty (collapse to one trace)
            eic_rt = d.group_by("rt").agg(pl.col("inty").max())
            if len(eic_rt) < 4:
                # too few points to characterize a peak — fill with nulls
                chroms.append(None)
                coherences.append(None)
                prominences.append(None)
                prominence_scaleds.append(None)
                height_scaleds.append(None)
                continue

            eic = Chromatogram(
                eic_rt["rt"].to_numpy(),
                eic_rt["inty"].to_numpy(),
                label=f"EIC mz={row['mz']:.4f}",
                file=self.file_path,
                mz=row["mz"],
                mz_tol=mz_tol,
                feature_start=row["rt_start"],
                feature_end=row["rt_end"],
                feature_apex=row["rt"],
            ).find_peaks()

            # collect results; metrics only exist when find_peaks found a peak
            chroms.append(eic)
            if len(eic.peak_widths) > 0:
                coherences.append(round(eic.feature_coherence, 3))
                prominences.append(round(eic.peak_prominences[0], 3))
                # 1e-10 avoids division by zero on all-zero traces
                prominence_scaleds.append(
                    round(eic.peak_prominences[0] / (np.mean(eic.inty) + 1e-10), 3),
                )
                height_scaleds.append(
                    round(eic.peak_heights[0] / (np.mean(eic.inty) + 1e-10), 3),
                )
            else:
                coherences.append(None)
                prominences.append(None)
                prominence_scaleds.append(None)
                height_scaleds.append(None)

        # Add the computed columns to the dataframe
        df = df.with_columns([
            pl.Series("chrom", chroms, dtype=pl.Object),
            pl.Series("chrom_coherence", coherences, dtype=pl.Float64),
            pl.Series("chrom_prominence", prominences, dtype=pl.Float64),
            pl.Series("chrom_prominence_scaled", prominence_scaleds, dtype=pl.Float64),
            pl.Series("chrom_height_scaled", height_scaleds, dtype=pl.Float64),
        ])

        self.features_df = df
        self._features_sync()
        self.logger.info(f"Feature detection completed. Total features: {len(df)}")

        # store params
        self.store_history(["find_features"], params.to_dict())
        self.logger.debug(
            "Parameters stored to find_features",
        )
        # Downstream results are invalidated by re-running feature detection.
        keys_to_remove = ["find_adducts", "find_ms2"]
        for key in keys_to_remove:
            if key in self.history:
                del self.history[key]
                self.logger.debug(f"Removed {key} from history")
767
+
768
+
769
    def find_adducts(self, **kwargs):
        """
        Detect adducts in mass spectrometry features using OpenMS MetaboliteFeatureDeconvolution.

        This method analyzes detected features to identify adduct relationships based on mass differences,
        charge states, and retention time proximity. It groups features that likely represent the same
        metabolite in different ionization states.

        Parameters:
            **kwargs: Keyword arguments for adduct detection parameters. Can include:
                - A find_adducts_defaults instance to set all parameters at once
                - Individual parameter names and values (see find_adducts_defaults for details)

        Key Parameters:
            adducts (Union[List[str], str, None]): List of potential adducts or ionization mode string.
            charge_min (int): Minimal possible charge state (default: 1).
            charge_max (int): Maximal possible charge state (default: 2).
            retention_max_diff (float): Maximum retention time difference for grouping (default: 1.0).

        Attributes set:
            self.features_df: Updated with adduct information including 'adduct', 'adduct_mass',
                and 'adduct_group' columns.
        """
        # Parameter initialization: a full defaults object replaces params
        # wholesale; individual kwargs are validated one by one.
        params = find_adducts_defaults()
        for key, value in kwargs.items():
            if isinstance(value, find_adducts_defaults):
                # whole parameter object provided — adopt it as-is
                params = value
                self.logger.debug("Using provided find_adducts_defaults parameters")
            else:
                if hasattr(params, key):
                    if params.set(key, value, validate=True):
                        self.logger.debug(f"Updated parameter {key} = {value}")
                    else:
                        self.logger.warning(
                            f"Failed to set parameter {key} = {value} (validation failed)",
                        )
                else:
                    self.logger.warning(f"Unknown parameter {key} ignored")

        self.logger.debug("Starting adduct detection...")
        self.logger.debug(
            f"Parameters: adducts={params.get('adducts')}, charge_min={params.get('charge_min')}, charge_max={params.get('charge_max')}",
        )

        mfd = oms.MetaboliteFeatureDeconvolution()

        openms_params = mfd.getDefaults()

        # Set adducts using the helper method (OpenMS expects bytes)
        adducts_list = params.get_openms_adducts()
        openms_params.setValue("potential_adducts", [a.encode() for a in adducts_list])

        # Apply other parameters
        openms_params.setValue("charge_min", params.get("charge_min"))
        openms_params.setValue("charge_max", params.get("charge_max"))
        openms_params.setValue("charge_span_max", params.get("charge_span_max"))
        openms_params.setValue("retention_max_diff", params.get("retention_max_diff"))
        openms_params.setValue(
            "retention_max_diff_local",
            params.get("retention_max_diff_local"),
        )

        # set updated parameters object
        mfd.setParameters(openms_params)
        self.logger.debug("Running adduct detection with parameters:")
        self.logger.debug(openms_params)
        # result feature map: will store features with adduct information
        feature_map_MFD = oms.FeatureMap()
        # result consensus map: will store grouped features belonging to a charge group
        groups = oms.ConsensusMap()
        # result consensus map: will store paired features connected by an edge
        edges = oms.ConsensusMap()

        # compute adducts
        mfd.compute(self.features, feature_map_MFD, groups, edges)
        self.logger.debug("Extracting information.")

        # export feature map as pandas DataFrame and append adduct information
        adducts_map = feature_map_MFD.get_df(export_peptide_identifications=False)  # type: ignore[attr-defined]
        adducts_map["adduct"] = [f.getMetaValue("dc_charge_adducts") for f in feature_map_MFD]
        adducts_map["adduct_group_id"] = [f.getMetaValue("Group") for f in feature_map_MFD]
        adducts_map["adduct_mass"] = [f.getMetaValue("dc_charge_adduct_mass") for f in feature_map_MFD]
        # clean up the DataFrame

        # Clean up 'None' strings that should be actual None values from OpenMS getMetaValue
        for col in ["adduct", "adduct_group_id", "adduct_mass"]:
            if col in adducts_map.columns:
                adducts_map[col] = adducts_map[col].replace("None", None)

        # Convert adducts_map to polars and merge; "index" is the row position
        # within the MFD feature map
        adducts_df = pl.DataFrame({
            "index": range(len(adducts_map)),
            "adduct": adducts_map["adduct"],
            "adduct_mass": adducts_map["adduct_mass"],
            "adduct_group_id": adducts_map["adduct_group_id"],
        })
        features_pl = self.features_df if isinstance(self.features_df, pl.DataFrame) else pl.from_pandas(self.features_df)

        # Remove existing adduct columns if they exist (likely all null)
        if "adduct" in features_pl.columns:
            features_pl = features_pl.drop("adduct")
        if "adduct_mass" in features_pl.columns:
            features_pl = features_pl.drop("adduct_mass")
        if "adduct_group" in features_pl.columns:
            features_pl = features_pl.drop("adduct_group")

        # NOTE(review): this join assumes feature_uid equals the row position in
        # the MFD feature map; if features were filtered between detection and
        # here (e.g. quality==0 removal), positions could be offset — verify.
        df = features_pl.join(
            adducts_df,
            left_on="feature_uid",
            right_on="index",
            how="left",
        )

        # Create a compact integer adduct_group from the raw adduct_group_id column
        unique_groups = df["adduct_group_id"].unique().to_list()
        group_mapping = {group: idx for idx, group in enumerate(unique_groups)}
        df = df.with_columns(
            pl.col("adduct_group_id")
            .map_elements(lambda x: group_mapping.get(x, 0), return_dtype=pl.Int64)
            .alias("adduct_group"),
        )

        # remove adduct_group_id
        df = df.drop("adduct_group_id")
        # move adduct, adduct_mass, and adduct_group after column iso_of
        if "iso_of" in df.columns:
            adduct_cols = ["adduct", "adduct_mass", "adduct_group"]
            # Get all column names and reorder them
            all_cols = df.columns
            iso_of_idx = all_cols.index("iso_of")

            # Create new column order: everything before iso_of, then iso_of, then adduct columns, then the rest
            new_order = []
            # columns up to and including iso_of
            new_order.extend(all_cols[: iso_of_idx + 1])
            # adduct columns that exist
            new_order.extend([col for col in adduct_cols if col in all_cols])
            new_order.extend([col for col in all_cols[iso_of_idx + 1 :] if col not in adduct_cols])  # remaining columns

            df = df.select(new_order)
        # Update the features_df attribute with the new DataFrame

        self.features_df = df
        total_adducts = df.filter(pl.col("adduct").is_not_null()).shape[0]
        self.logger.info(f"Adduct detection completed. Total adducts: {total_adducts}")

        # store params
        self.store_history(["find_adducts"], params.to_dict())
        self.logger.debug(
            "Parameters stored to find_adducts",
        )
921
+
922
+
923
def _clean_features_df(self, df):
    """Normalize a raw feature table into the canonical features_df schema.

    Accepts either a pandas or a polars DataFrame. For pandas input the
    index is captured as ``feature_id`` before conversion to polars; polars
    input is expected to already carry a ``feature_id`` column. Rows with
    ``quality == 0`` are dropped, numeric columns are rounded, and the
    placeholder columns used by later processing steps (isotope, adduct,
    chromatogram and MS2 columns) are initialized to nulls.

    Parameters:
        df: pandas or polars DataFrame with at least the columns
            'mz', 'RT', 'RTstart', 'RTend', 'MZstart', 'MZend',
            'intensity', 'quality', 'charge'.

    Returns:
        pl.DataFrame: cleaned feature table with one row per kept feature
        and a fresh 0-based 'feature_uid' column.
    """
    # BUGFIX: capture the index only on the pandas path. The previous code
    # ran `df["feature_id"] = df.index` unconditionally, which raises on
    # polars DataFrames (no `.index`, no item assignment) before the type
    # check below could route them correctly.
    if hasattr(df, "columns") and not isinstance(df, pl.DataFrame):
        df["feature_id"] = df.index  # preserve original feature identifiers
        df_pl = pl.from_pandas(df)
    else:
        # Polars input: assumed to already contain a 'feature_id' column.
        df_pl = df

    # Filter out rows with quality == 0 (features flagged as unusable)
    df2 = df_pl.filter(pl.col("quality") != 0)

    # Build the canonical column set in one select; rounding keeps the
    # stored precision consistent (5 decimals for m/z, 3 for RT).
    df_result = df2.select([
        pl.int_range(pl.len()).alias("feature_uid"),
        pl.col("feature_id").cast(pl.String).alias("feature_id"),
        pl.col("mz").round(5),
        pl.col("RT").round(3).alias("rt"),
        pl.col("RT").round(3).alias("rt_original"),  # keep original RT
        pl.col("RTstart").round(3).alias("rt_start"),
        pl.col("RTend").round(3).alias("rt_end"),
        (pl.col("RTend") - pl.col("RTstart")).round(3).alias("rt_delta"),
        pl.col("MZstart").round(5).alias("mz_start"),
        pl.col("MZend").round(5).alias("mz_end"),
        pl.col("intensity").alias("inty"),
        pl.col("quality"),
        pl.col("charge"),
        # Placeholders filled in by later steps (deisotoping, adduct
        # detection, chromatogram extraction, MS2 linking).
        pl.lit(0).alias("iso"),
        pl.lit(None, dtype=pl.Int64).alias("iso_of"),
        pl.lit(None, dtype=pl.Int64).alias("adduct_group"),
        pl.lit(None, dtype=pl.Utf8).alias("adduct"),
        pl.lit(None, dtype=pl.Float64).alias("adduct_mass"),
        pl.lit(None, dtype=pl.Object).alias("chrom"),
        pl.lit(None, dtype=pl.Float64).alias("chrom_coherence"),
        pl.lit(None, dtype=pl.Float64).alias("chrom_prominence"),
        pl.lit(None, dtype=pl.Float64).alias("chrom_prominence_scaled"),
        pl.lit(None, dtype=pl.Float64).alias("chrom_height_scaled"),
        pl.lit(None, dtype=pl.Object).alias("ms2_scans"),
        pl.lit(None, dtype=pl.Object).alias("ms2_specs"),
    ])

    return df_result
964
+
965
+
966
def _features_deisotope(
    self,
    df,
    mz_tol=None,
    rt_tol=None,
):
    """Flag isotope peaks among the detected features.

    For every feature (sorted by m/z) the method searches for candidate
    +1/+2/+3 isotope peaks at m/z offsets of k * 1.003355 (the C13-C12
    mass difference), within an RT window of +/- rt_tol and with an
    intensity below twice the base peak. Matched candidates get their
    'iso' column set to the isotope order and 'iso_of' set to the base
    feature's uid; unmatched features keep iso == 0 and iso_of == their
    own feature_uid.

    Parameters:
        df: pandas or polars DataFrame with columns 'feature_uid', 'mz',
            'rt' and 'inty'.
        mz_tol (float, optional): m/z tolerance; defaults to 0.02.
            Higher isotopes use a 1.5x wider window.
        rt_tol (float, optional): RT tolerance; defaults to 0.2.

    Returns:
        pl.DataFrame: the input sorted by 'mz' with updated 'iso' and
        'iso_of' columns.
    """
    if mz_tol is None:
        mz_tol = 0.02
    if rt_tol is None:
        rt_tol = 0.2

    # Convert to polars if needed
    if not isinstance(df, pl.DataFrame):
        df = pl.from_pandas(df)

    # Initialize new columns (every feature starts as its own parent)
    df = df.with_columns([
        pl.lit(0).alias("iso"),
        pl.col("feature_uid").alias("iso_of"),
    ])

    # Sort by 'mz' so that searchsorted can be used for window lookups
    df = df.sort("mz")

    # Pull columns into numpy arrays for fast index-based processing
    rt_arr = df["rt"].to_numpy()
    mz_arr = df["mz"].to_numpy()
    intensity_arr = df["inty"].to_numpy()
    feature_uid_arr = df["feature_uid"].to_numpy()
    n = len(df)
    mz_diff = 1.003355  # C13 - C12 mass difference

    # Working arrays that accumulate the isotope assignments
    iso_arr = np.zeros(n, dtype=int)
    iso_of_arr = feature_uid_arr.copy()

    # (isotope order k, tolerance multiplier): the first isotope uses the
    # plain mz_tol; the second and third use a 1.5x wider m/z window.
    # This replaces three copy-pasted search sections with one loop.
    isotope_windows = ((1, 1.0), (2, 1.5), (3, 1.5))

    for i in range(n):
        base_rt = rt_arr[i]
        base_mz = mz_arr[i]
        base_int = intensity_arr[i]
        base_feature_uid = feature_uid_arr[i]

        for k, tol_mult in isotope_windows:
            # m/z window for the k-th isotope candidate
            lower = base_mz + k * mz_diff - tol_mult * mz_tol
            upper = base_mz + k * mz_diff + tol_mult * mz_tol
            li = np.searchsorted(mz_arr, lower, side="left")
            ri = np.searchsorted(mz_arr, upper, side="right")
            if li >= ri:
                continue

            cand_idx = np.arange(li, ri)
            # Candidates must co-elute and be less than twice as intense
            # as the presumed monoisotopic peak.
            mask = (
                (rt_arr[cand_idx] > base_rt - rt_tol)
                & (rt_arr[cand_idx] < base_rt + rt_tol)
                & (intensity_arr[cand_idx] < 2 * base_int)
            )
            for cand in cand_idx[mask]:
                # Only claim candidates not already assigned to another
                # parent (iso_of still points at themselves).
                if cand != i and iso_of_arr[cand] == feature_uid_arr[cand]:
                    iso_arr[cand] = iso_arr[i] + k
                    iso_of_arr[cand] = base_feature_uid

    # Write the isotope assignments back into the dataframe
    df = df.with_columns([
        pl.Series("iso", iso_arr),
        pl.Series("iso_of", iso_of_arr),
    ])

    return df
1069
+
1070
+
1071
def analyze_dda(self):
    """Derive per-cycle DDA timing statistics from the scan table.

    Walks ``self.scans_df`` in row order as a small state machine keyed on
    ``ms_level`` transitions. Each MS1 scan starts a new cycle; when the
    next MS1 scan arrives, a record is emitted for the cycle that just
    ended, containing the number of MS2 scans and the transition timings
    (MS1->MS1, MS1->MS2, mean MS2->MS2, last MS2->MS1). Timings that do
    not apply to a cycle are encoded as -1.0.

    Side Effects:
        Joins the per-cycle records onto ``self.scans_df`` by 'scan_uid'
        (columns 'ms2_n', 'time_cycle', 'time_ms1_to_ms1',
        'time_ms1_to_ms2', 'time_ms2_to_ms2', 'time_ms2_to_ms1'); if no
        records were produced, the columns are added as all-null instead.

    Returns:
        None

    Notes:
        - NOTE(review): the final cycle is never flushed — records are only
          emitted when a subsequent MS1 scan is seen; confirm this is
          intentional.
        - Assumes scans_df rows are ordered by acquisition time (rt).
    """
    # Preallocate variables
    cycle_records = []
    previous_rt = 0          # rt of the previously visited scan
    previous_level = 0       # ms_level of the previously visited scan (0 = none yet)
    ms1_index = None         # scan_uid of the MS1 scan opening the current cycle
    cyclestart = None        # rt of the MS1 scan opening the current cycle
    ms2_n = 0                # MS2 scans counted in the current cycle
    ms1_duration = 0         # time from cycle-opening MS1 to its first MS2
    ms2_duration: list[float] = []  # consecutive MS2->MS2 gaps in the current cycle

    for row in self.scans_df.iter_rows(named=True):
        if row["ms_level"] == 1:
            # A new MS1 scan closes the previous cycle (if any) and emits
            # its record; the record's shape depends on whether the cycle
            # contained MS2 scans.
            if previous_level == 2:
                # Cycle had MS2 scans: mean MS2->MS2 gap, or -1.0 if only one MS2
                ms2_to_ms2 = float(np.mean(ms2_duration)) if ms2_duration else -1.0
                d = {
                    "scan_uid": ms1_index,
                    "ms2_n": ms2_n,
                    "time_cycle": row["rt"] - cyclestart,
                    "time_ms1_to_ms1": -1.0,
                    "time_ms1_to_ms2": ms1_duration,
                    "time_ms2_to_ms2": ms2_to_ms2,
                    "time_ms2_to_ms1": row["rt"] - previous_rt,
                }
                cycle_records.append(d)
            elif previous_level == 1:
                # Two MS1 scans back to back: an MS2-free cycle
                d = {
                    "scan_uid": ms1_index,
                    "ms2_n": 0,
                    "time_cycle": row["rt"] - cyclestart,
                    "time_ms1_to_ms1": row["rt"] - cyclestart,
                    "time_ms1_to_ms2": -1.0,
                    "time_ms2_to_ms2": -1.0,
                    "time_ms2_to_ms1": -1.0,
                }
                cycle_records.append(d)

            # Open a new cycle anchored at this MS1 scan
            ms1_index = row["scan_uid"]
            cyclestart = row["rt"]
            ms2_n = 0
            ms1_duration = 0
            ms2_duration = []
        elif previous_level == 2:
            # MS2 scan following another MS2: record the MS2->MS2 gap
            ms2_n += 1
            ms2_duration.append(row["rt"] - previous_rt)
        elif previous_level == 1:
            # First MS2 scan of the cycle: record the MS1->MS2 gap
            ms1_duration = row["rt"] - cyclestart
            ms2_n += 1
        previous_level = row["ms_level"]
        previous_rt = row["rt"]

    # Create DataFrame once at the end
    if cycle_records:
        cycle_data = pl.DataFrame(cycle_records)
        self.scans_df = self.scans_df.join(cycle_data, on="scan_uid", how="left")
    else:
        # No cycles detected: still add the columns so downstream code
        # can rely on their presence.
        self.scans_df = self.scans_df.with_columns(
            [
                pl.lit(None).alias("ms2_n"),
                pl.lit(None).alias("time_cycle"),
                pl.lit(None).alias("time_ms1_to_ms1"),
                pl.lit(None).alias("time_ms1_to_ms2"),
                pl.lit(None).alias("time_ms2_to_ms2"),
                pl.lit(None).alias("time_ms2_to_ms1"),
            ],
        )
1137
+
1138
+
1139
def find_ms2(self, **kwargs):
    """
    Link MS2 spectra to features in the dataset.
    This method matches MS2 spectra from the scans dataframe with features in the features dataframe
    based on retention time (RT) and precursor m/z tolerance criteria. For each feature in the provided
    or inferred list of feature ids (feature_uid), it computes the RT difference between the feature and available
    MS2 spectra. It then selects MS2 spectra that fall within a computed RT radius (based on the feature's
    start and end times) and a specified m/z tolerance. For each feature, it chooses one MS2 spectrum per
    unique cycle based on the closest RT difference, and it updates the feature with the list of matched
    scan ids and the spectrum corresponding to the first matching scan id. Additionally, the scan dataframe
    is updated to associate matched scan ids with the corresponding feature id.

    Parameters:
        **kwargs: Keyword arguments for MS2 linking parameters. Can include:
            - A find_ms2_defaults instance to set all parameters at once
            - Individual parameter names and values (see find_ms2_defaults for details)

    Key Parameters:
        features (int or list of int, optional): A specific feature id or a list of feature ids to process.
            If an individual feature_uid is provided and equals -1, all features with no associated MS2 data will be processed.
            If None, all features in the features dataframe are processed.
        mz_tol (float, optional): The precursor m/z tolerance to consider when matching MS2 spectra. If not provided,
            it defaults to 0.5, except for certain file types ('ztscan' or 'dia') which set it to 4.
        centroid (bool, optional): If True, the returned spectrum will be centroided. Default is True.
        deisotope (bool, optional): Flag indicating whether deisotoping should be performed. Default is False.
        dia_stats (bool, optional): A flag to collect additional DIA-related statistics when retrieving a spectrum.
            Default is False.

    Returns:
        None

    Side Effects:
        Updates self.features_df with new columns 'ms2_scans' (a list of scan ids) and 'ms2_specs' (containing
        the retrieved spectrum for the first matched scan id). Also, self.scans_df is updated by setting the 'feature_uid'
        column for matched MS2 spectra.

    Notes:
        - The function uses vectorized operations to quickly filter MS2 spectra with ms_level equal to 2.
        - If no MS2 spectra are available or if features_df is not loaded, appropriate messages are printed and the
          method exits early.
        - The function assumes that self.features_df and self.scans_df are already set up and contain the expected
          columns ('feature_uid', 'rt', 'rt_start', 'rt_end', 'mz' for features and 'scan_uid', 'rt', 'prec_mz', 'cycle', 'ms_level'
          for scans).

    Examples:
        Assume the current instance has features and scans data loaded, then to link MS2 spectra for all features:
            instance.find_ms2()
        To link MS2 spectra for a specific list of feature ids:
            instance.find_ms2(feature_uid=[1, 3, 5])
    """

    # parameters initialization: kwargs may carry either a complete
    # find_ms2_defaults object or individual validated key/value pairs
    params = find_ms2_defaults()
    for key, value in kwargs.items():
        if isinstance(value, find_ms2_defaults):
            params = value
            self.logger.debug("Using provided find_ms2_defaults parameters")
        else:
            if hasattr(params, key):
                if params.set(key, value, validate=True):
                    self.logger.debug(f"Updated parameter {key} = {value}")
                else:
                    self.logger.warning(
                        f"Failed to set parameter {key} = {value} (validation failed)",
                    )
            else:
                self.logger.debug(f"Unknown parameter {key} ignored")
    # end of parameter initialization

    # Extract parameter values (mz_tol depends on the file type)
    features = params.get("features")
    mz_tol = params.get_mz_tolerance(self.file_type)
    centroid = params.get("centroid")
    deisotope = params.get("deisotope")
    dia_stats = params.get("dia_stats")

    self.logger.debug("Starting MS2 spectra linking...")
    self.logger.debug(
        f"Parameters: mz_tol={mz_tol}, centroid={centroid}, deisotope={deisotope}",
    )

    # Ensure features_df is loaded and has the MS2 columns
    # NOTE(review): the item assignments below assume a pandas-style
    # features_df at this point; confirm they are reachable with polars.
    if self.features_df is None:
        self.logger.error("Please find features first.")
        return
    if "ms2_scans" not in self.features_df.columns:
        self.features_df["ms2_scans"] = None
    if "ms2_specs" not in self.features_df.columns:
        self.features_df["ms2_specs"] = None

    feature_uid_list = []
    self.logger.debug("Building lookup lists")
    if features == []:
        features = None  # If empty list, treat as None
    feature_uid_list = self._get_feature_uids(features)

    if len(feature_uid_list) == 0:
        self.logger.warning("No features to process.")
        return

    # Only MS2 scans participate in the matching
    ms2_df = self.scans_df.filter(pl.col("ms_level") == 2)
    if len(ms2_df) == 0:
        self.logger.warning("No MS2 spectra found in file.")
        return

    # Numpy views of the MS2 scan attributes used for vectorized filtering
    ms2_index_arr = ms2_df["scan_uid"].to_numpy()
    ms2_rt = ms2_df["rt"].to_numpy()
    ms2_precursor = ms2_df["prec_mz"].to_numpy()
    ms2_cycle = ms2_df["cycle"].to_numpy()

    features_df = self.features_df
    c = 0  # counter of features that received MS2 data

    if self.file_interface is None:
        self.index_file()

    # Vectorize the entire operation for better performance
    features_subset = features_df.filter(pl.col("feature_uid").is_in(feature_uid_list))

    if len(features_subset) == 0:
        return

    # Convert to numpy arrays for vectorized operations
    feature_rt = features_subset.select("rt").to_numpy().flatten()
    feature_mz = features_subset.select("mz").to_numpy().flatten()
    feature_rt_start = features_subset.select("rt_start").to_numpy().flatten()
    feature_rt_end = features_subset.select("rt_end").to_numpy().flatten()
    feature_uids = features_subset.select("feature_uid").to_numpy().flatten()
    feature_indices = features_subset.with_row_index().select("index").to_numpy().flatten()

    # Pre-compute RT radius for all features: the smaller of the distances
    # from the apex to the feature's start/end boundaries
    rt_radius = np.minimum(feature_rt - feature_rt_start, feature_rt_end - feature_rt)

    # Batch process all features; per-feature results are accumulated and
    # written back in one join after the loop
    scan_uid_lists: list[list[int]] = []
    spec_lists: list[list[Spectrum]] = []
    updated_feature_uids = []
    updated_scan_uids = []

    # Hide the progress bar unless logging is verbose
    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

    for i, (rt_center, mz_center, radius, feature_uid, idx) in enumerate(
        tqdm(
            zip(
                feature_rt,
                feature_mz,
                rt_radius,
                feature_uids,
                feature_indices,
                strict=False,
            ),
            total=len(features_subset),
            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Link MS2 spectra",
            disable=tdqm_disable,
        ),
    ):
        # Vectorized filtering: MS2 scans inside the RT radius and
        # precursor m/z tolerance of this feature
        rt_mask = np.abs(ms2_rt - rt_center) <= radius
        mz_mask = np.abs(ms2_precursor - mz_center) <= mz_tol
        valid_mask = rt_mask & mz_mask

        if not np.any(valid_mask):
            # Keep list positions aligned with feature_indices
            scan_uid_lists.append(None)
            spec_lists.append(None)
            continue

        valid_indices = np.nonzero(valid_mask)[0]
        rt_diffs = np.abs(ms2_rt[valid_indices] - rt_center)
        sorted_indices = valid_indices[np.argsort(rt_diffs)]

        # Get unique cycles and their first occurrences, i.e. keep only
        # the RT-closest MS2 scan per acquisition cycle
        cycles = ms2_cycle[sorted_indices]
        _, first_idx = np.unique(cycles, return_index=True)
        final_indices = sorted_indices[first_idx]

        # Sort by RT difference again (np.unique returns cycle order)
        final_rt_diffs = np.abs(ms2_rt[final_indices] - rt_center)
        final_indices = final_indices[np.argsort(final_rt_diffs)]

        scan_uids = ms2_index_arr[final_indices].tolist()
        scan_uid_lists.append(scan_uids)
        # Only the closest scan's spectrum is materialized here
        spec_lists.append([
            self.get_spectrum(
                scan_uids[0],
                centroid=centroid,
                deisotope=deisotope,
                dia_stats=dia_stats,
                feature_uid=feature_uid,
            ),
        ])

        # Collect updates for batch processing
        updated_feature_uids.extend([feature_uid] * len(final_indices))
        updated_scan_uids.extend(ms2_index_arr[final_indices])
        c += 1

    self.logger.debug("Update features.")
    # Convert to polars if needed and batch update features_df
    if not isinstance(features_df, pl.DataFrame):
        features_df = pl.from_pandas(features_df)

    # Update the features_df: Object dtype keeps the Python lists/Spectrum
    # objects intact inside polars columns
    update_df = pl.DataFrame({
        "temp_idx": feature_indices,
        "ms2_scans": pl.Series("ms2_scans", scan_uid_lists, dtype=pl.Object),
        "ms2_specs": pl.Series("ms2_specs", spec_lists, dtype=pl.Object),
    })

    # Join and update: new values win only where a match was found
    features_df = (
        features_df.with_row_index("temp_idx")
        .join(
            update_df,
            on="temp_idx",
            how="left",
            suffix="_new",
        )
        .with_columns([
            pl.when(pl.col("ms2_scans_new").is_not_null())
            .then(pl.col("ms2_scans_new"))
            .otherwise(pl.col("ms2_scans"))
            .alias("ms2_scans"),
            pl.when(pl.col("ms2_specs_new").is_not_null())
            .then(pl.col("ms2_specs_new"))
            .otherwise(pl.col("ms2_specs"))
            .alias("ms2_specs"),
        ])
        .drop(["temp_idx", "ms2_scans_new", "ms2_specs_new"])
    )

    # Batch update scans_df: tag each matched MS2 scan with its feature_uid
    if updated_scan_uids:
        scan_feature_uid_updates = dict(
            zip(updated_scan_uids, updated_feature_uids, strict=True),
        )
        self.scans_df = (
            self.scans_df.with_columns(
                pl.col("scan_uid")
                .map_elements(
                    lambda x: scan_feature_uid_updates.get(x),
                    return_dtype=pl.Int64,
                )
                .alias("feature_uid_update"),
            )
            .with_columns(
                pl.when(pl.col("feature_uid_update").is_not_null())
                .then(pl.col("feature_uid_update"))
                .otherwise(pl.col("feature_uid"))
                .alias("feature_uid"),
            )
            .drop("feature_uid_update")
        )

    # Log completion
    self.logger.info(
        f"MS2 linking completed. Total features with MS2 data: {c}",
    )
    self.features_df = features_df

    # store params
    self.store_history(["find_ms2"], params.to_dict())
    self.logger.debug(
        "Parameters stored to find_ms2",
    )