masster 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/__init__.py +27 -27
- masster/_version.py +17 -17
- masster/chromatogram.py +497 -503
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
- masster/logger.py +318 -244
- masster/sample/__init__.py +9 -9
- masster/sample/defaults/__init__.py +15 -15
- masster/sample/defaults/find_adducts_def.py +325 -325
- masster/sample/defaults/find_features_def.py +366 -366
- masster/sample/defaults/find_ms2_def.py +285 -285
- masster/sample/defaults/get_spectrum_def.py +314 -318
- masster/sample/defaults/sample_def.py +374 -378
- masster/sample/h5.py +1321 -1297
- masster/sample/helpers.py +833 -364
- masster/sample/lib.py +762 -0
- masster/sample/load.py +1220 -1187
- masster/sample/parameters.py +131 -131
- masster/sample/plot.py +1610 -1622
- masster/sample/processing.py +1402 -1416
- masster/sample/quant.py +209 -0
- masster/sample/sample.py +391 -387
- masster/sample/sample5_schema.json +181 -181
- masster/sample/save.py +737 -736
- masster/sample/sciex.py +1213 -0
- masster/spectrum.py +1287 -1319
- masster/study/__init__.py +9 -9
- masster/study/defaults/__init__.py +21 -19
- masster/study/defaults/align_def.py +267 -267
- masster/study/defaults/export_def.py +41 -40
- masster/study/defaults/fill_chrom_def.py +264 -264
- masster/study/defaults/fill_def.py +260 -0
- masster/study/defaults/find_consensus_def.py +256 -256
- masster/study/defaults/find_ms2_def.py +163 -163
- masster/study/defaults/integrate_chrom_def.py +225 -225
- masster/study/defaults/integrate_def.py +221 -0
- masster/study/defaults/merge_def.py +256 -0
- masster/study/defaults/study_def.py +272 -269
- masster/study/export.py +674 -287
- masster/study/h5.py +1398 -886
- masster/study/helpers.py +1650 -433
- masster/study/helpers_optimized.py +317 -0
- masster/study/load.py +1201 -1078
- masster/study/parameters.py +99 -99
- masster/study/plot.py +632 -645
- masster/study/processing.py +1057 -1046
- masster/study/save.py +149 -134
- masster/study/study.py +606 -522
- masster/study/study5_schema.json +247 -241
- {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
- masster-0.3.0.dist-info/RECORD +59 -0
- {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
- masster-0.2.5.dist-info/RECORD +0 -50
- {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
- {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
masster/sample/processing.py
CHANGED
@@ -1,1416 +1,1402 @@
- from __future__ import annotations
-
- from datetime import datetime
-
- import numpy as np
- import polars as pl
- import pyopenms as oms
-
- from tqdm import tqdm
-
- from masster.chromatogram import Chromatogram
-
-
- from .
- from .defaults.
- from .defaults.
- from .defaults.
[removed lines 18-1402 of the 0.2.5 processing.py were not captured in this rendering of the diff; only scattered fragments of the removed code survive, including the old "def _get_ztscan_stats(" and "def _spec_to_mat(" definitions]
- .drop("feature_uid_update")
- )
-
- # Log completion
- self.logger.info(
- f"MS2 linking completed. Total features with MS2 data: {c}",
- )
- self.features_df = features_df
-
- # store params
- self.store_history(["find_ms2"], params.to_dict())
- self.logger.debug(
- "Parameters stored to find_ms2",
- )
+ from __future__ import annotations
+
+ from datetime import datetime
+
+ import numpy as np
+ import polars as pl
+ import pyopenms as oms
+
+ from tqdm import tqdm
+
+ from masster.chromatogram import Chromatogram
+
+ # Parameters removed - using hardcoded defaults
+ from masster.spectrum import Spectrum
+ from .defaults.find_features_def import find_features_defaults
+ from .defaults.find_adducts_def import find_adducts_defaults
+ from .defaults.find_ms2_def import find_ms2_defaults
+ from .defaults.get_spectrum_def import get_spectrum_defaults
+
+
+ def get_spectrum(self, scan, **kwargs):
+     """
+     Retrieve and process a spectrum from the data file based on the given scan identifier.
+
+     This method locates the scan in the internal DataFrame, extracts the metadata (such as energy,
+     MS level, and retention time), and then retrieves the corresponding spectrum data from the file.
+     Depending on the file interface (either 'oms' or 'alpharaw'), the spectrum data is obtained
+     and processed (including optional denoising, centroiding, deisotoping, and precursor m/z trimming).
+
+     Parameters:
+         scan (int): Unique identifier of the scan to retrieve. This is a mandatory parameter.
+         **kwargs: Keyword arguments for spectrum retrieval parameters. Can include:
+             - A get_spectrum_defaults instance to set all parameters at once
+             - Individual parameter names and values (see get_spectrum_defaults for details)
+
+     Key Parameters:
+         precursor_trim (int, optional): Value used to trim the precursor m/z for MS2 spectra.
+             If provided and the spectrum's MS level is greater than 1,
+             m/z values above (precursor_mz - precursor_trim) will be trimmed.
+             Default is 20.
+         max_peaks (int, optional): Maximum number of peaks to retain in the spectrum. Default is 100.
+         centroid (bool, optional): Flag indicating whether the spectrum should be centroided.
+             If True and the spectrum is not already centroided, the method
+             applies denoising followed by centroiding using parameters from self.parameters.
+             Default is True.
+         deisotope (bool, optional): Flag indicating whether deisotoping should be performed. Default is False.
+         dia_stats (optional): Flag or parameter for processing DIA (data-independent acquisition)
+             statistics. If provided (and if applicable to the file type), additional
+             statistics will be computed for 'ztscan' files. Default is None.
+         feature (optional): An optional identifier used when computing DIA statistics. Default is None.
+         label (str, optional): Optional label to assign to the spectrum. If not provided,
+             a default name is generated based on the MS level and retention time.
+             Default is None.
+         centroid_algo (str, optional): Algorithm to use for centroiding. Default is None.
+
+     Returns:
+         spectrum: A processed spectrum object containing:
+             - m/z and intensity arrays
+             - metadata such as MS level, retention time, energy, and an assigned label
+         Depending on the processing steps (centroiding, trimming, deisotoping, etc.), the
+         returned spectrum is modified accordingly.
+         Returns None or an empty spectrum if the scan is not found or if an error occurs.
+
+     Notes:
+         - For the 'oms' file interface, the spectrum is retrieved via self.file_obj.getSpectrum
+           and handled accordingly.
+         - For the 'alpharaw' file interface, the method uses internal DataFrame attributes to locate the
+           scan and its associated peaks.
+         - The method applies additional processing (denoising, centroiding, deisotoping, trimming) based on
+           the input flags and the MS level of the spectrum.
+     """
+
+     # parameters initialization
+     params = get_spectrum_defaults(scan=scan)
+     for key, value in kwargs.items():
+         if isinstance(value, get_spectrum_defaults):
+             params = value
+             self.logger.debug("Using provided get_spectrum_defaults parameters")
+         else:
+             if hasattr(params, key):
+                 if params.set(key, value, validate=True):
+                     self.logger.debug(f"Updated parameter {key} = {value}")
+                 else:
+                     self.logger.warning(
+                         f"Failed to set parameter {key} = {value} (validation failed)",
+                     )
+             else:
+                 self.logger.debug(f"Unknown parameter {key} ignored")
+     # end of parameter initialization
+
+     # Extract parameter values
+     scan = params.get("scan")
+     precursor_trim = params.get("precursor_trim")
+     max_peaks = params.get("max_peaks")
+     centroid = params.get("centroid")
+     deisotope = params.get("deisotope")
+     dia_stats = params.get("dia_stats")
+     feature_uid = params.get("feature")
+     label = params.get("label")
+     centroid_algo = params.get("centroid_algo")
+
+     # get energy, ms_level, rt from scans_df
+     scan_uid = scan  # Preserve original scan ID
+     scan_info = self.scans_df.filter(pl.col("scan_uid") == scan_uid)
+     if len(scan_info) == 0:
+         self.logger.warning(f"Scan {scan_uid} not found.")
+         return None
+     scan_info = scan_info[0]
+     energy = scan_info["energy"][0]
+     ms_level = scan_info["ms_level"][0]
+     rt = scan_info["rt"][0]
+     if label is None:
+         if ms_level == 1:
+             name = f"MS1, rt {rt:.2f} s, scan {scan_uid}"
+         else:
+             name = f"MS2 of mz {scan_info['prec_mz'][0]:0.1f}, rt {rt:.2f} s, scan {scan_uid}"
+     else:
+         name = label
+
+     if centroid_algo is None:
+         if "centroid_algo" in self.parameters:
+             centroid_algo = self.parameters.get("centroid_algo")
+         else:
+             # this is for backward compatibility. This is the old default
+             self.parameters.centroid_algo = "lmp"
+             centroid_algo = self.parameters.get("centroid_algo")
+
+     spec0 = Spectrum(mz=np.array([]), inty=np.array([]))
+     if self.file_interface == "oms":
+         # if check that file_obj is not None
+         if self.file_obj is None:
+             self.logger.error("Please load a file first.")
+             return
+         try:
+             spect = self.file_obj.getSpectrum(scan_uid).get_peaks()
+         except Exception as e:
+             self.logger.error(f"Error: {e}")
+             return spec0
+         if len(spect[0]) == 0:
+             return spec0
+         elif len(spect[0]) == 1:
+             mz = np.array([spect[0][0]])
+             inty = np.array([spect[1][0]])
+         else:
+             mz = np.array(spect[0])
+             inty = np.array(spect[1])
+         if ms_level == 1:
+             spect = Spectrum(
+                 mz=mz,
+                 inty=inty,
+                 ms_level=ms_level,
+                 rt=rt,
+                 energy=None,
+                 precursor_mz=None,
+                 label=name,
+             )
+         else:
+             spect = Spectrum(
+                 mz=mz,
+                 inty=inty,
+                 ms_level=ms_level,
+                 rt=rt,
+                 energy=energy,
+                 precursor_mz=scan_info["prec_mz"][0],
+                 label=name,
+             )
+         if centroid and not spect.centroided:
+             spect = spect.denoise()
+             if spect.ms_level == 1:
+                 spect = spect.centroid(
+                     algo=centroid_algo,
+                     tolerance=self.parameters.get("mz_tol_ms1_da"),
+                     ppm=self.parameters.get("mz_tol_ms1_ppm"),
+                     min_points=self.parameters.get("centroid_min_points_ms1"),
+                     smooth=self.parameters.get("centroid_smooth"),
+                     prominence=self.parameters.get("centroid_prominence"),
+                     refine=self.parameters.get("centroid_refine"),
+                 )
+             elif spect.ms_level == 2:
+                 spect = spect.centroid(
+                     algo=centroid_algo,
+                     tolerance=self.parameters.get("mz_tol_ms2_da"),
+                     ppm=self.parameters.get("mz_tol_ms2_ppm"),
+                     min_points=self.parameters.get("centroid_min_points_ms2"),
+                     smooth=self.parameters.get("centroid_smooth"),
+                     prominence=self.parameters.get("centroid_prominence"),
+                     refine=self.parameters.get("centroid_refine"),
+                 )
+
+     elif self.file_interface == "alpharaw":
+         spec_df = self.file_obj.spectrum_df
+         spect = (
+             spec_df.filter(pl.col("scan_id") == scan_uid).row(0, named=True)
+             if isinstance(spec_df, pl.DataFrame)
+             else spec_df.loc[scan_uid]
+         )
+         peak_stop_idx = spect["peak_stop_idx"]
+         peak_start_idx = spect["peak_start_idx"]
+
+         if isinstance(self.file_obj.peak_df, pl.DataFrame):
+             peaks = self.file_obj.peak_df.slice(
+                 peak_start_idx,
+                 peak_stop_idx - peak_start_idx,
+             )
+             mz_values = peaks.select("mz").to_numpy().flatten()
+             intensity_values = peaks.select("intensity").to_numpy().flatten()
+         else:
+             peaks = self.file_obj.peak_df.loc[peak_start_idx : peak_stop_idx - 1]
+             mz_values = peaks.mz.values
+             intensity_values = peaks.intensity.values
+
+         if spect["ms_level"] > 1:
+             spect = Spectrum(
+                 mz=np.asarray(mz_values, dtype=np.float64),
+                 inty=np.asarray(intensity_values, dtype=np.float64),
+                 ms_level=ms_level,
+                 centroided=False,
+                 precursor_mz=spect["precursor_mz"],
+                 energy=energy,
+                 rt=rt,
+                 label=name,
+             )
+         else:
+             spect = Spectrum(
+                 mz=np.asarray(mz_values, dtype=np.float64),
+                 inty=np.asarray(intensity_values, dtype=np.float64),
+                 ms_level=ms_level,
+                 centroided=False,
+                 precursor_mz=None,
+                 energy=None,
+                 rt=rt,
+                 label=name,
+             )
+
+         if len(spect) and centroid and not spect.centroided:
+             spect = spect.denoise()
+             if spect.ms_level == 1:
+                 spect = spect.centroid(
+                     algo=centroid_algo,
+                     tolerance=self.parameters.get("mz_tol_ms1_da"),
+                     ppm=self.parameters.get("mz_tol_ms1_ppm"),
+                     min_points=self.parameters.get("centroid_min_points_ms1"),
+                     smooth=self.parameters.get("centroid_smooth"),
+                     prominence=self.parameters.get("centroid_prominence"),
+                     refine=self.parameters.get("centroid_refine"),
+                 )
+             elif spect.ms_level == 2:
+                 spect = spect.centroid(
+                     algo=centroid_algo,
+                     tolerance=self.parameters.get("mz_tol_ms2_da"),
+                     ppm=self.parameters.get("mz_tol_ms2_ppm"),
+                     min_points=self.parameters.get("centroid_min_points_ms2"),
+                     smooth=self.parameters.get("centroid_smooth"),
+                     prominence=self.parameters.get("centroid_prominence"),
+                     refine=self.parameters.get("centroid_refine"),
+                 )
+
+     else:
+         self.logger.error(
+             f"File interface {self.file_interface} not supported. Reload data.",
+         )
+         return spec0
+
+     if precursor_trim is not None and spect.ms_level > 1:
+         spect = spect.trim(mz_min=None, mz_max=spect.precursor_mz - precursor_trim)  # type: ignore[attr-defined]
+     if deisotope:
+         spect = spect.deisotope()
+
+     if max_peaks is not None:
+         spect = spect.keep_top(max_peaks)
+
+     if dia_stats:
+         if self.file_type in ["ztscan", "dia"]:
+             spect = self._get_ztscan_stats(
+                 spec=spect,
+                 scan_uid=scan_uid,
+                 feature_uid=scan_info["feature_uid"][0]
+                 if "feature_uid" in scan_info and scan_info["feature_uid"][0] is not None
+                 else feature_uid,
+                 q1_step=2,
+                 deisotope=deisotope,
+                 centroid=centroid,
+             )
+     return spect
+
+
+ def _get_ztscan_stats(
+     self,
+     spec,
+     scan_uid=None,
+     feature_uid=None,
+     q1_step=2,
+     mz_tol=0.005,
+     # TODO check this
+     # deisotope=SpectrumParameters().deisotope,
+     deisotope=False,
+     # TODO there is no `centroid_algo`?
+     centroid=True,
+ ):
+     spec.size = spec.mz.size
+     # spec.ms_entropy = spec.entropy()
+
+     if self.scans_df is None:
+         self.logger.warning("No scans found.")
+         return spec
+     scan = self.scans_df.filter(pl.col("scan_uid") == scan_uid)
+     if len(scan) == 0:
+         self.logger.warning(f"Scan {scan_uid} not found.")
+         return spec
+     scan = scan[0]
+     if scan["ms_level"][0] != 2:
+         self.logger.warning(f"Scan {scan_uid} is not a MS2 scan.")
+     # Q1
+     lscan = self.scans_df.filter(pl.col("scan_uid") == scan_uid - q1_step)
+     if len(lscan) == 0:
+         self.logger.warning(f"Scan {scan_uid - q1_step} not found.")
+         return spec
+     lscan = lscan[0]
+     # check that lscan['ms_level'] == 2 and lscan['cycle'] == scan['cycle']
+     if lscan["ms_level"][0] != 2:
+         self.logger.warning(f"Scan {scan_uid - q1_step} is not a MS2 scan.")
+         return spec
+     if lscan["cycle"][0] != scan["cycle"][0]:
+         self.logger.warning(
+             f"Scan {scan_uid - q1_step} is not in the same cycle as scan {scan_uid}.",
+         )
+         return spec
+     rscan = self.scans_df.filter(pl.col("scan_uid") == scan_uid + q1_step)
+     if len(rscan) == 0:
+         self.logger.warning(f"Scan {scan_uid + q1_step} not found.")
+         return spec
+     rscan = rscan[0]
+     # check that rscan['ms_level'] == 2 and rscan['cycle'] == scan['cycle']
+     if rscan["ms_level"][0] != 2:
+         self.logger.warning(f"Scan {scan_uid + q1_step} is not a MS2 scan.")
+         return spec
+     if rscan["cycle"][0] != scan["cycle"][0]:
+         self.logger.warning(
+             f"Scan {scan_uid + q1_step} is not in the same cycle as scan {scan_uid}.",
+         )
+         return spec
+     intymat = self._spec_to_mat(
+         scan_uids=[scan_uid - q1_step, scan_uid, scan_uid + q1_step],
+         mz_ref=spec.mz,
+         mz_tol=mz_tol,
+         deisotope=deisotope,
+         centroid=centroid,
+     )
+     # pick only mzs that are close to spec.mz
+     if intymat is None:
+         return spec
+     if intymat.shape[1] < 3:
+         self.logger.warning(f"Not enough data points for scan {scan_uid}.")
+         return spec
+     q1_ratio = (2 * intymat[:, 1] + 0.01) / (intymat[:, 0] + intymat[:, 2] + 0.01)
+     spec.q1_ratio = np.log2(q1_ratio)
+     # where intymat[:, 0] + intymat[:, 2]==0, set q1_ratio to -1
+     spec.q1_ratio[np.isclose(intymat[:, 0] + intymat[:, 2], 0)] = -10
+
+     # EIC correlation
+     # find rt_start and rt_end of the feature_uid
+     if self.features_df is None:
+         self.logger.warning("No features found.")
+         return spec
+     if feature_uid is None:
+         return spec
+     # spec.precursor_mz = feature['mz']
+     feature = self.features_df.filter(pl.col("feature_uid") == feature_uid)
+     if len(feature) == 0:
+         self.logger.warning(f"Feature {feature_uid} not found.")
+         return spec
+     feature = feature.row(0, named=True)
+     rt_start = feature["rt_start"]
+     rt_end = feature["rt_end"]
+     # get the cycle at rt_start and the cycle at rt_end from the closest scan with ms_level == 1
+     scans = self.scans_df.filter(pl.col("ms_level") == 1)
+     scans = scans.filter(pl.col("rt") > rt_start)
+     scans = scans.filter(pl.col("rt") < rt_end)
+     if len(scans) == 0:
+         self.logger.warning(f"No scans found between {rt_start} and {rt_end}.")
+         return spec
+     scan_uids = scans["scan_uid"].to_list()
+     eic_prec = self._spec_to_mat(
+         scan_uids=scan_uids,
+         mz_ref=feature["mz"],
+         mz_tol=mz_tol,
+         deisotope=deisotope,
+         centroid=centroid,
+     )
+     # find width at half maximum of the eic_prec
+     # hm = np.max(eic_prec[0, :]) / 3
+     # find index of maximum
+     # eic_prec_max_idx = np.argmax(eic_prec[0, :])
+     # find index of the closest point to half maximum
+     # idx = np.argmin(np.abs(eic_prec[0, :] - hm))
+     # eic_fwhm_prec = abs(eic_prec_max_idx - idx)
+
+     # get all unique cycles from scans
+     cycles = scans["cycle"].unique()
+     scandids = []
+     # iterate over all cycles and get the scan_uid of scan with ms_level == 2 and closest precursor_mz to spec.precursor_mz
+     for cycle in cycles:
+         scans = self.scans_df.filter(pl.col("cycle") == cycle)
+         scans = scans.filter(pl.col("ms_level") == 2)
+         scans = scans.filter(pl.col("prec_mz") > feature["mz"] - 4)
+         scans = scans.filter(pl.col("prec_mz") < feature["mz"] + 4)
+         if len(scans) == 0:
+             self.logger.warning(f"No scans found for cycle {cycle}.")
+             continue
+         scan = scans[(scans["prec_mz"] - feature["mz"]).abs().arg_sort()[:1]]
+         scandids.append(scan["scan_uid"][0])
+
+     eic_prod = self._spec_to_mat(
+         scandids,
+         mz_ref=spec.mz,
+         mz_tol=mz_tol,
+         deisotope=deisotope,
+         centroid=centroid,
+     )
+     # eic_prod = eic_prod.T
+     # eic_prec = eic_prec.T
+     # calculate correlation between eic_prec and all columns of eic_prod, column by column
+     eic_corr = np.zeros(eic_prod.shape[0])
+     # eic_width_ratio = np.zeros(eic_prod.shape[0])
+     for i in range(eic_prod.shape[0]):
+         try:
+             with np.errstate(divide="ignore", invalid="ignore"):
+                 eic_corr[i] = np.corrcoef(eic_prod[i, :], eic_prec[0, :])[0, 1]
+         except:
+             pass
+
+     spec.eic_corr = eic_corr
+     return spec
+
+
+ def _spec_to_mat(
+     self,
+     scan_uids,
+     mz_ref=None,
+     mz_tol=0.01,
+     # TODO check this
+     # deisotope=SpectrumParameters().deisotope,
+     deisotope=False,
+     # TODO there is no `centroid_algo`?
+     # TODO there is no `dia_stats`?
+     # TODO unused (see below)
+     centroid=True,
+     # TODO check this
+     # precursor_trim=SpectrumParameters().precursor_trim,
+     # TODO unused (see below)
+     precursor_trim=None,
+ ):
+     # get all spectra in scan_uids
+
+     if mz_ref is None:
+         return None
+
+     if not isinstance(mz_ref, np.ndarray):
+         if isinstance(mz_ref, list):
+             mz_ref = np.array(mz_ref)
+         else:
+             mz_ref = np.array([mz_ref])
+
+     def align_mzs(ar1, ar2, tol):
+         closest_indices = []
+         # find the closest pair between each element in ar1 and ar2, within a maximum tolerance of tol
+         for i, val1 in enumerate(ar1):
+             closest_index = np.argmin(np.abs(ar2 - val1))
+             closest_indices.append((i, closest_index))
+         # filter out pairs that are not within the specified tolerance
+         closest_indices = [(i, j) for i, j in closest_indices if np.abs(ar1[i] - ar2[j]) <= tol]
+         # remove duplicates from the list of indices
+         closest_indices = list(set(closest_indices))
+         # sort the list of indices by the first element (i) in ascending order
+         closest_indices = sorted(closest_indices, key=lambda x: x[0])
+
+         # Convert the list of indices into an array for easier indexing in subsequent operations
+         return np.array(closest_indices)
+
+     specs = []
+     for scan_uid in scan_uids:
+         spec = self.get_spectrum(
+             scan_uid=scan_uid,
+             centroid=True,
+             dia_stats=False,
+             precursor_trim=5,
+         )
+         if deisotope:
+             spec = spec.deisotope()
+         # align to reference spectrum
+         if spec.mz.size == 0:
+             continue
+         if mz_ref.size == 0:
+             continue
+         closest_indices = align_mzs(spec.mz, mz_ref, mz_tol)
+         # store the aligned spectrum in the list
+         aligned_inty = np.zeros(len(mz_ref))
+         for i, j in closest_indices:
+             if abs(spec.mz[i] - mz_ref[j]) <= mz_tol:
+                 if aligned_inty[j] < spec.inty[i]:
+                     aligned_inty[j] = spec.inty[i]
+         specs.append(aligned_inty)
+
+     if len(specs) == 0:
+         return None
+     # create a matrix with the aligned spectra. Each spec goes into a column
+     mat = np.column_stack(specs)
+
+     return mat
+
+
+ def find_features(self, **kwargs):
+     """
+     Detect features in mass spectrometry data by processing MS1 spectra, performing mass trace detection,
+     elution peak detection, and feature detection. Optionally, deisotope features and remove low-quality peaks.
+
+     This method leverages an MSExperiment constructed from the object's ms1_df, where each cycle in the data
+     corresponds to an MSSpectrum. It then runs mass trace detection using set parameters, deconvolutes the mass
+     traces to detect chromatographic peaks, and finally identifies features with a feature finding algorithm. The
+     resulting feature map is cleaned, deisotoped (if enabled), and assigned unique IDs before being stored.
+
+     Parameters:
+         **kwargs: Keyword arguments for feature detection parameters. Can include:
+             - A find_features_defaults instance to set all parameters at once
+             - Individual parameter names and values (see find_features_defaults for details)
+
+     Key Parameters:
+         tol_ppm (float): Mass error tolerance in parts-per-million for mass trace detection (default: 30.0).
+         noise (float): Noise threshold intensity to filter out low-intensity signals (default: 200.0).
+         chrom_fwhm (float): Full width at half maximum for chromatographic peak shape (default: 1.0).
+         chrom_fwhm_min (float): Minimum FWHM for chromatographic peak detection (default: 0.5).
+         chrom_peak_snr (float): Signal-to-noise ratio required for chromatographic peaks (default: 10.0).
+         mz_scoring_13C (bool): Whether to enable scoring of 13C isotopic patterns (default: False).
+         masstrace_snr_filtering (bool): Whether to apply SNR filtering to mass traces (default: False).
+         deisotope (bool): Whether to perform deisotoping of detected features (default: True).
+
+     Attributes set:
+         self.features: An updated feature map with unique IDs after feature detection and deisotoping.
+         self.features_df: A cleaned DataFrame of features, with peaks of zero quality removed, representing the final
+             detected features.
+
+     Notes:
+         - The method processes the ms1_df by iterating over cycles to build an MSExperiment.
+         - External OMS modules (e.g., MSExperiment, MSSpectrum, MassTraceDetection, ElutionPeakDetection,
+           FeatureFindingMetabo) are used throughout the processing.
+         - After feature detection, additional cleaning is performed via internal helper methods.
+     """
+     if self.ms1_df is None:
+         self.logger.error("No MS1 data found. Please load a file first.")
+         return
+     if len(self.ms1_df) == 0:
+         self.logger.error("MS1 data is empty. Please load a file first.")
+         return
+     # parameters initialization
+     params = find_features_defaults()
+     for key, value in kwargs.items():
+         if isinstance(value, find_features_defaults):
+             # set
+             params = value
+             self.logger.debug("Using provided find_features_defaults parameters")
+         else:
+             if hasattr(params, key):
+                 if params.set(key, value, validate=True):
+                     self.logger.debug(f"Updated parameter {key} = {value}")
+                 else:
+                     self.logger.warning(
+                         f"Failed to set parameter {key} = {value} (validation failed)",
+                     )
+             else:
+                 self.logger.warning(f"Unknown parameter {key} ignored")
+
+     self.logger.info("Starting feature detection...")
+     self.logger.debug(
+         f"Parameters: chrom_fwhm={params.get('chrom_fwhm')}, noise={params.get('noise')}, tol_ppm={params.get('tol_ppm')}",
+     )
+
+     exp = oms.MSExperiment()
+     # find max number of cycles in self.ms1_df
+     max_cycle = self.ms1_df["cycle"].max()
+     # iterate over all cycles, find rows with 1 cycle and append to exp2
+     for cycle in range(1, max_cycle + 1):
+         cycle_df = self.ms1_df.filter(pl.col("cycle") == cycle)
+         # check if len(cycle_df) > 0
+         if len(cycle_df) > 0:
+             spectrum = oms.MSSpectrum()
+             spectrum.setRT(cycle_df[0]["rt"].item())
+             spectrum.setMSLevel(1)  # MS1
+             mz = cycle_df["mz"]
+             inty = cycle_df["inty"]
+             spectrum.set_peaks([mz, inty])  # type: ignore[attr-defined]
+             spectrum.sortByPosition()
+             exp.addSpectrum(spectrum)
+
+     # exp.sortSpectra(True)
+     # mass trace detection
+     mass_traces: list = []
+     mtd = oms.MassTraceDetection()
+     mtd_par = mtd.getDefaults()
+
+     # Apply MTD parameters
+     mtd_par.setValue("mass_error_ppm", float(params.get("tol_ppm")))
+     mtd_par.setValue("noise_threshold_int", float(params.get("noise")))
+     mtd_par.setValue(
+         "min_trace_length",
+         float(params.get("min_trace_length_multiplier")) * float(params.get("chrom_fwhm_min")),
+     )
+     mtd_par.setValue(
+         "trace_termination_outliers",
+         int(params.get("trace_termination_outliers")),
+     )
+     mtd_par.setValue("chrom_peak_snr", float(params.get("chrom_peak_snr")))
+
+     mtd.setParameters(mtd_par)  # set the new parameters
+     mtd.run(exp, mass_traces, 0)  # run mass trace detection
+
+     # elution peak detection
+     mass_traces_deconvol: list = []
+     epd = oms.ElutionPeakDetection()
+     epd_par = epd.getDefaults()
+
+     # Apply EPD parameters using our parameter class
+     epd_par.setValue("width_filtering", params.get("width_filtering"))
+     epd_par.setValue("min_fwhm", float(params.get("chrom_fwhm_min")))
+     epd_par.setValue("chrom_fwhm", float(params.get("chrom_fwhm")))
+     epd_par.setValue("chrom_peak_snr", float(params.get("chrom_peak_snr")))
+     if params.get("masstrace_snr_filtering"):
+         epd_par.setValue("masstrace_snr_filtering", "true")
+     if params.get("mz_scoring_13C"):
+         epd_par.setValue("mz_scoring_13C", "true")
+
+     epd.setParameters(epd_par)
+     epd.detectPeaks(mass_traces, mass_traces_deconvol)
+
+     # feature detection
+     feature_map = oms.FeatureMap()  # output features
+     chrom_out: list = []  # output chromatograms
+     ffm = oms.FeatureFindingMetabo()
+     ffm_par = ffm.getDefaults()
+
+     # Apply FFM parameters using our parameter class
+     ffm_par.setValue(
+         "remove_single_traces",
+         "true" if params.get("remove_single_traces") else "false",
+     )
+     ffm_par.setValue(
+         "report_convex_hulls",
+         "true" if params.get("report_convex_hulls") else "false",
+     )
+     ffm_par.setValue(
+         "report_summed_ints",
+         "true" if params.get("report_summed_ints") else "false",
+     )
+     ffm_par.setValue(
+         "report_chromatograms",
+         "true" if params.get("report_chromatograms") else "false",
+     )
+
+     ffm.setParameters(ffm_par)
+     self.logger.debug("Running feature finding with parameters:")
+     self.logger.debug(ffm_par)
+     ffm.run(mass_traces_deconvol, feature_map, chrom_out)
+     # Assigns a new, valid unique id per feature
+     feature_map.ensureUniqueId()
+     df = feature_map.get_df(export_peptide_identifications=False)  # type: ignore[attr-defined]
+     # Sets the file path to the primary MS run (usually the mzML file)
+     feature_map.setPrimaryMSRunPath([self.file_path.encode()])
+     self.features = feature_map
+     # remove peaks with quality == 0
+     df = self._clean_features_df(df)
+
+     # desotope features
+     df = self._features_deisotope(
+         df,
+         mz_tol=params.get("deisotope_mz_tol"),
+         rt_tol=params.get("chrom_fwhm_min") / 4 * params.get("deisotope_rt_tol_factor"),
+     )
+     if params.get("deisotope"):
+         # record size before deisotoping
+         size_before_deisotope = len(df)
+         df = df.filter(pl.col("iso") == 0)
+         self.logger.debug(
+             f"Deisotoping features: {size_before_deisotope - len(df)} features removed.",
+         )
+
+     # update eic - create lists to collect results
+     chroms: list[Chromatogram] = []
+     coherences: list[float] = []
+     prominences: list[float] = []
+     prominence_scaleds: list[float] = []
+     height_scaleds: list[float] = []
+
+     mz_tol = params.get("eic_mz_tol")
+     rt_tol = params.get("eic_rt_tol")
+
+     # iterate over all rows in df using polars iteration
+     self.logger.debug("Extracting EICs...")
+     for row in df.iter_rows(named=True):
+         # select data in ms1_df with mz in range [mz_start - mz_tol, mz_end + mz_tol] and rt in range [rt_start - rt_tol, rt_end + rt_tol]
+         d = self.ms1_df.filter(
+             (pl.col("rt") >= row["rt_start"] - rt_tol)
+             & (pl.col("rt") <= row["rt_end"] + rt_tol)
+             & (pl.col("mz") >= row["mz"] - mz_tol)
+             & (pl.col("mz") <= row["mz"] + mz_tol),
+         )
+         # for all unique rt values, find the maximum inty
+         eic_rt = d.group_by("rt").agg(pl.col("inty").max())
+         if len(eic_rt) < 4:
+             chroms.append(None)
+             coherences.append(None)
+             prominences.append(None)
+             prominence_scaleds.append(None)
+             height_scaleds.append(None)
+             continue
+
+         eic = Chromatogram(
+             eic_rt["rt"].to_numpy(),
+             eic_rt["inty"].to_numpy(),
+             label=f"EIC mz={row['mz']:.4f}",
+             file=self.file_path,
+             mz=row["mz"],
+             mz_tol=mz_tol,
+             feature_start=row["rt_start"],
+             feature_end=row["rt_end"],
+             feature_apex=row["rt"],
+         ).find_peaks()
+
+         # collect results
+         chroms.append(eic)
+         if len(eic.peak_widths) > 0:
+             coherences.append(round(eic.feature_coherence, 3))
+             prominences.append(round(eic.peak_prominences[0], 3))
+             prominence_scaleds.append(
+                 round(eic.peak_prominences[0] / (np.mean(eic.inty) + 1e-10), 3),
+             )
+             height_scaleds.append(
+                 round(eic.peak_heights[0] / (np.mean(eic.inty) + 1e-10), 3),
+             )
+         else:
+             coherences.append(None)
+             prominences.append(None)
+             prominence_scaleds.append(None)
+             height_scaleds.append(None)
+
+
# Add the computed columns to the dataframe
|
|
745
|
+
df = df.with_columns([
|
|
746
|
+
pl.Series("chrom", chroms, dtype=pl.Object),
|
|
747
|
+
pl.Series("chrom_coherence", coherences, dtype=pl.Float64),
|
|
748
|
+
pl.Series("chrom_prominence", prominences, dtype=pl.Float64),
|
|
749
|
+
pl.Series("chrom_prominence_scaled", prominence_scaleds, dtype=pl.Float64),
|
|
750
|
+
pl.Series("chrom_height_scaled", height_scaleds, dtype=pl.Float64),
|
|
751
|
+
])
|
|
752
|
+
|
|
753
|
+
self.features_df = df
|
|
754
|
+
self._features_sync()
|
|
755
|
+
self.logger.info(f"Feature detection completed. Total features: {len(df)}")
|
|
756
|
+
|
|
757
|
+
# store params
|
|
758
|
+
self.store_history(["find_features"], params.to_dict())
|
|
759
|
+
self.logger.debug(
|
|
760
|
+
"Parameters stored to find_features",
|
|
761
|
+
)
|
|
762
|
+
keys_to_remove = ["find_adducts", "find_ms2"]
|
|
763
|
+
for key in keys_to_remove:
|
|
764
|
+
if key in self.history:
|
|
765
|
+
del self.history[key]
|
|
766
|
+
self.logger.debug(f"Removed {key} from history")
|
|
767
|
+
|
|
768
|
+
|
|
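For each retained feature, find_features rebuilds an extracted ion chromatogram (EIC) by filtering the MS1 point cloud to a narrow rt/m/z window and keeping the most intense point per scan. A minimal standalone sketch of that windowing step, on made-up data and outside the Chromatogram class, looks like this:

```python
import polars as pl

# Toy MS1 point cloud (rt, mz, inty) and one feature's window; values are invented.
ms1 = pl.DataFrame({
    "rt":   [10.0, 10.0, 11.0, 11.0, 12.0, 13.0],
    "mz":   [301.141, 302.145, 301.142, 500.300, 301.140, 301.139],
    "inty": [1.0e4, 2.0e3, 5.0e4, 9.0e3, 2.5e4, 1.0e3],
})
mz0, mz_tol = 301.141, 0.01          # feature m/z and EIC m/z tolerance
rt_start, rt_end, rt_tol = 10.0, 12.5, 0.5

eic = (
    ms1.filter(
        (pl.col("rt") >= rt_start - rt_tol) & (pl.col("rt") <= rt_end + rt_tol)
        & (pl.col("mz") >= mz0 - mz_tol) & (pl.col("mz") <= mz0 + mz_tol)
    )
    .group_by("rt")
    .agg(pl.col("inty").max())       # one (rt, max intensity) point per scan
    .sort("rt")
)
print(eic)
```

The real method then hands these (rt, inty) arrays to Chromatogram(...).find_peaks() to score coherence and prominence; the tolerances above are illustrative only.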
+    def find_adducts(self, **kwargs):
+        """
+        Detect adducts in mass spectrometry features using OpenMS MetaboliteFeatureDeconvolution.
+
+        This method analyzes detected features to identify adduct relationships based on mass differences,
+        charge states, and retention time proximity. It groups features that likely represent the same
+        metabolite in different ionization states.
+
+        Parameters:
+            **kwargs: Keyword arguments for adduct detection parameters. Can include:
+                - A find_adducts_defaults instance to set all parameters at once
+                - Individual parameter names and values (see find_adducts_defaults for details)
+
+        Key Parameters:
+            adducts (Union[List[str], str, None]): List of potential adducts or ionization mode string.
+            charge_min (int): Minimal possible charge state (default: 1).
+            charge_max (int): Maximal possible charge state (default: 2).
+            retention_max_diff (float): Maximum retention time difference for grouping (default: 1.0).
+
+        Attributes set:
+            self.features_df: Updated with adduct information including 'adduct', 'adduct_mass',
+                and 'adduct_group' columns.
+        """
+        params = find_adducts_defaults()
+        for key, value in kwargs.items():
+            if isinstance(value, find_adducts_defaults):
+                # set
+                params = value
+                self.logger.debug("Using provided find_adducts_defaults parameters")
+            else:
+                if hasattr(params, key):
+                    if params.set(key, value, validate=True):
+                        self.logger.debug(f"Updated parameter {key} = {value}")
+                    else:
+                        self.logger.warning(
+                            f"Failed to set parameter {key} = {value} (validation failed)",
+                        )
+                else:
+                    self.logger.warning(f"Unknown parameter {key} ignored")
+
+        self.logger.debug("Starting adduct detection...")
+        self.logger.debug(
+            f"Parameters: adducts={params.get('adducts')}, charge_min={params.get('charge_min')}, charge_max={params.get('charge_max')}",
+        )
+
+        mfd = oms.MetaboliteFeatureDeconvolution()
+
+        openms_params = mfd.getDefaults()
+
+        # Set adducts using the helper method
+        adducts_list = params.get_openms_adducts()
+        openms_params.setValue("potential_adducts", [a.encode() for a in adducts_list])
+
+        # Apply other parameters
+        openms_params.setValue("charge_min", params.get("charge_min"))
+        openms_params.setValue("charge_max", params.get("charge_max"))
+        openms_params.setValue("charge_span_max", params.get("charge_span_max"))
+        openms_params.setValue("retention_max_diff", params.get("retention_max_diff"))
+        openms_params.setValue(
+            "retention_max_diff_local",
+            params.get("retention_max_diff_local"),
+        )
+
+        # set updated parameters object
+        mfd.setParameters(openms_params)
+        self.logger.debug("Running adduct detection with parameters:")
+        self.logger.debug(openms_params)
+        # result feature map: will store features with adduct information
+        feature_map_MFD = oms.FeatureMap()
+        # result consensus map: will store grouped features belonging to a charge group
+        groups = oms.ConsensusMap()
+        # result consensus map: will store paired features connected by an edge
+        edges = oms.ConsensusMap()
+
+        # compute adducts
+        mfd.compute(self.features, feature_map_MFD, groups, edges)
+        self.logger.debug("Extracting information.")
+
+        # export feature map as pandas DataFrame and append adduct information
+        adducts_map = feature_map_MFD.get_df(export_peptide_identifications=False)  # type: ignore[attr-defined]
+        adducts_map["adduct"] = [f.getMetaValue("dc_charge_adducts") for f in feature_map_MFD]
+        adducts_map["adduct_group_id"] = [f.getMetaValue("Group") for f in feature_map_MFD]
+        adducts_map["adduct_mass"] = [f.getMetaValue("dc_charge_adduct_mass") for f in feature_map_MFD]
+        # clean up the DataFrame
+
+        # Clean up 'None' strings that should be actual None values from OpenMS getMetaValue
+        for col in ["adduct", "adduct_group_id", "adduct_mass"]:
+            if col in adducts_map.columns:
+                adducts_map[col] = adducts_map[col].replace("None", None)
+
+        # Convert adducts_map to polars and merge
+        adducts_df = pl.DataFrame({
+            "index": range(len(adducts_map)),
+            "adduct": adducts_map["adduct"],
+            "adduct_mass": adducts_map["adduct_mass"],
+            "adduct_group_id": adducts_map["adduct_group_id"],
+        })
+        features_pl = self.features_df if isinstance(self.features_df, pl.DataFrame) else pl.from_pandas(self.features_df)
+
+        # Remove existing adduct columns if they exist (likely all null)
+        if "adduct" in features_pl.columns:
+            features_pl = features_pl.drop("adduct")
+        if "adduct_mass" in features_pl.columns:
+            features_pl = features_pl.drop("adduct_mass")
+        if "adduct_group" in features_pl.columns:
+            features_pl = features_pl.drop("adduct_group")
+
+        df = features_pl.join(
+            adducts_df,
+            left_on="feature_uid",
+            right_on="index",
+            how="left",
+        )
+
+        # Create adduct_group from adduct_group_id column
+        unique_groups = df["adduct_group_id"].unique().to_list()
+        group_mapping = {group: idx for idx, group in enumerate(unique_groups)}
+        df = df.with_columns(
+            pl.col("adduct_group_id")
+            .map_elements(lambda x: group_mapping.get(x, 0), return_dtype=pl.Int64)
+            .alias("adduct_group"),
+        )
+
+        # remove adduct_group_id
+        df = df.drop("adduct_group_id")
+        # move adduct, adduct_mass, and adduct_group after column iso_of
+        if "iso_of" in df.columns:
+            adduct_cols = ["adduct", "adduct_mass", "adduct_group"]
+            # Get all column names and reorder them
+            all_cols = df.columns
+            iso_of_idx = all_cols.index("iso_of")
+
+            # Create new column order: everything before iso_of, then iso_of, then adduct columns, then the rest
+            new_order = []
+            # columns up to and including iso_of
+            new_order.extend(all_cols[: iso_of_idx + 1])
+            # adduct columns that exist
+            new_order.extend([col for col in adduct_cols if col in all_cols])
+            new_order.extend([col for col in all_cols[iso_of_idx + 1 :] if col not in adduct_cols])  # remaining columns
+
+            df = df.select(new_order)
+        # Update the features_df attribute with the new DataFrame
+
+        self.features_df = df
+        total_adducts = df.filter(pl.col("adduct").is_not_null()).shape[0]
+        self.logger.info(f"Adduct detection completed. Total adducts: {total_adducts}")
+
+        # store params
+        self.store_history(["find_adducts"], params.to_dict())
+        self.logger.debug(
+            "Parameters stored to find_adducts",
+        )
+
+
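A hedged usage sketch for the method above, assuming features have already been detected; the import path and file name are illustrative assumptions, not confirmed API:

```python
# Illustrative only: import path and file name are assumptions.
from masster import Sample  # assumed re-export of masster/sample/sample.py

s = Sample("example.mzML")                 # hypothetical raw file
s.find_features()                          # find_adducts needs self.features / features_df
s.find_adducts(charge_min=1, charge_max=2, retention_max_diff=1.0)
print(s.features_df.select(["mz", "rt", "adduct", "adduct_mass", "adduct_group"]).head())
```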
+    def _clean_features_df(self, df):
+        # Convert pandas DataFrame to polars if needed
+        df["feature_id"] = df.index
+        if hasattr(df, "columns") and not isinstance(df, pl.DataFrame):
+            df_pl = pl.from_pandas(df)
+        else:
+            df_pl = df
+
+        # Filter out rows with quality == 0
+        df2 = df_pl.filter(pl.col("quality") != 0)
+
+        # Create new dataframe with required columns and transformations using select
+        df_result = df2.select([
+            pl.int_range(pl.len()).alias("feature_uid"),
+            pl.col("feature_id").cast(pl.String).alias("feature_id"),
+            pl.col("mz").round(5),
+            pl.col("RT").round(3).alias("rt"),
+            pl.col("RT").round(3).alias("rt_original"),  # keep original RT
+            pl.col("RTstart").round(3).alias("rt_start"),
+            pl.col("RTend").round(3).alias("rt_end"),
+            (pl.col("RTend") - pl.col("RTstart")).round(3).alias("rt_delta"),
+            pl.col("MZstart").round(5).alias("mz_start"),
+            pl.col("MZend").round(5).alias("mz_end"),
+            pl.col("intensity").alias("inty"),
+            pl.col("quality"),
+            pl.col("charge"),
+            pl.lit(0).alias("iso"),
+            pl.lit(None, dtype=pl.Int64).alias("iso_of"),
+            pl.lit(None, dtype=pl.Int64).alias("adduct_group"),
+            pl.lit(None, dtype=pl.Utf8).alias("adduct"),
+            pl.lit(None, dtype=pl.Float64).alias("adduct_mass"),
+            pl.lit(None, dtype=pl.Object).alias("chrom"),
+            pl.lit(None, dtype=pl.Float64).alias("chrom_coherence"),
+            pl.lit(None, dtype=pl.Float64).alias("chrom_prominence"),
+            pl.lit(None, dtype=pl.Float64).alias("chrom_prominence_scaled"),
+            pl.lit(None, dtype=pl.Float64).alias("chrom_height_scaled"),
+            pl.lit(None, dtype=pl.Object).alias("ms2_scans"),
+            pl.lit(None, dtype=pl.Object).alias("ms2_specs"),
+        ])
+
+        return df_result
+
+
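The helper above trims the OpenMS feature export down to the package's column layout: drop zero-quality rows, assign a dense feature_uid, and round/rename the numeric columns. A compact polars sketch of that select pattern on toy data (column names mirror the OpenMS export):

```python
import polars as pl

raw = pl.DataFrame({
    "RT": [63.2001, 75.4502], "mz": [301.141592, 455.233110],
    "intensity": [1.2e5, 8.4e4], "quality": [0.8, 0.0],
})
clean = raw.filter(pl.col("quality") != 0).select([
    pl.int_range(pl.len()).alias("feature_uid"),   # dense 0..n-1 id after filtering
    pl.col("mz").round(5),
    pl.col("RT").round(3).alias("rt"),
    pl.col("intensity").alias("inty"),
])
print(clean)   # one row survives; feature_uid restarts at 0
```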
+    def _features_deisotope(
+        self,
+        df,
+        mz_tol=None,
+        rt_tol=None,
+    ):
+        if mz_tol is None:
+            mz_tol = 0.02
+        if rt_tol is None:
+            rt_tol = 0.2
+
+        # Convert to polars if needed
+        if not isinstance(df, pl.DataFrame):
+            df = pl.from_pandas(df)
+
+        # Initialize new columns
+        df = df.with_columns([
+            pl.lit(0).alias("iso"),
+            pl.col("feature_uid").alias("iso_of"),
+        ])
+
+        # Sort by 'mz'
+        df = df.sort("mz")
+
+        # Get arrays for efficient processing
+        rt_arr = df["rt"].to_numpy()
+        mz_arr = df["mz"].to_numpy()
+        intensity_arr = df["inty"].to_numpy()
+        feature_uid_arr = df["feature_uid"].to_numpy()
+        n = len(df)
+        mz_diff = 1.003355
+
+        # Create arrays to track isotope assignments
+        iso_arr = np.zeros(n, dtype=int)
+        iso_of_arr = feature_uid_arr.copy()
+
+        for i in range(n):
+            base_rt = rt_arr[i]
+            base_mz = mz_arr[i]
+            base_int = intensity_arr[i]
+            base_feature_uid = feature_uid_arr[i]
+
+            # Search for first isotope candidate (offset = mz_diff)
+            t1_lower = base_mz + mz_diff - mz_tol
+            t1_upper = base_mz + mz_diff + mz_tol
+            li = np.searchsorted(mz_arr, t1_lower, side="left")
+            ri = np.searchsorted(mz_arr, t1_upper, side="right")
+            if li < ri:
+                cand_idx = np.arange(li, ri)
+                mask = (
+                    (rt_arr[cand_idx] > base_rt - rt_tol)
+                    & (rt_arr[cand_idx] < base_rt + rt_tol)
+                    & (intensity_arr[cand_idx] < 2 * base_int)
+                )
+                valid_cand = cand_idx[mask]
+                for cand in valid_cand:
+                    if cand != i and iso_of_arr[cand] == feature_uid_arr[cand]:
+                        iso_arr[cand] = iso_arr[i] + 1  # first isotope
+                        iso_of_arr[cand] = base_feature_uid
+
+            # Search for second isotope candidate (offset = 2*mz_diff)
+            t2_lower = base_mz + 2 * mz_diff - 1.5 * mz_tol
+            t2_upper = base_mz + 2 * mz_diff + 1.5 * mz_tol
+            li = np.searchsorted(mz_arr, t2_lower, side="left")
+            ri = np.searchsorted(mz_arr, t2_upper, side="right")
+            if li < ri:
+                cand_idx = np.arange(li, ri)
+                mask = (
+                    (rt_arr[cand_idx] > base_rt - rt_tol)
+                    & (rt_arr[cand_idx] < base_rt + rt_tol)
+                    & (intensity_arr[cand_idx] < 2 * base_int)
+                )
+                valid_cand = cand_idx[mask]
+                for cand in valid_cand:
+                    if cand != i and iso_of_arr[cand] == feature_uid_arr[cand]:
+                        iso_arr[cand] = iso_arr[i] + 2  # second isotope
+                        iso_of_arr[cand] = base_feature_uid
+
+            # Search for third isotope candidate (offset = 3*mz_diff)
+            t3_lower = base_mz + 3 * mz_diff - 1.5 * mz_tol
+            t3_upper = base_mz + 3 * mz_diff + 1.5 * mz_tol
+            li = np.searchsorted(mz_arr, t3_lower, side="left")
+            ri = np.searchsorted(mz_arr, t3_upper, side="right")
+            if li < ri:
+                cand_idx = np.arange(li, ri)
+                mask = (
+                    (rt_arr[cand_idx] > base_rt - rt_tol)
+                    & (rt_arr[cand_idx] < base_rt + rt_tol)
+                    & (intensity_arr[cand_idx] < 2 * base_int)
+                )
+                valid_cand = cand_idx[mask]
+                for cand in valid_cand:
+                    if cand != i and iso_of_arr[cand] == feature_uid_arr[cand]:
+                        iso_arr[cand] = iso_arr[i] + 3  # third isotope
+                        iso_of_arr[cand] = base_feature_uid
+
+        # Update the dataframe with isotope assignments
+        df = df.with_columns([
+            pl.Series("iso", iso_arr),
+            pl.Series("iso_of", iso_of_arr),
+        ])
+
+        return df
+
+
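_features_deisotope relies on the m/z column being sorted, so each isotope window (+1.003355, +2x, +3x Da) can be located with a binary search instead of a full scan, then narrowed by co-elution and intensity checks. A standalone sketch of one such window, with made-up values:

```python
import numpy as np

mz = np.array([200.000, 201.003, 202.007, 350.500])  # sorted feature m/z
rt = np.array([120.0, 120.1, 119.9, 240.0])
mz_diff, mz_tol, rt_tol = 1.003355, 0.02, 0.2

base = 0  # candidate monoisotopic feature
lo = np.searchsorted(mz, mz[base] + mz_diff - mz_tol, side="left")
hi = np.searchsorted(mz, mz[base] + mz_diff + mz_tol, side="right")
cand = np.arange(lo, hi)                            # features in the +1 isotope m/z window
cand = cand[np.abs(rt[cand] - rt[base]) < rt_tol]   # keep co-eluting candidates only
print(cand)  # -> [1]
```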
+    def analyze_dda(self):
+        # Preallocate variables
+        cycle_records = []
+        previous_rt = 0
+        previous_level = 0
+        ms1_index = None
+        cyclestart = None
+        ms2_n = 0
+        ms1_duration = 0
+        ms2_duration: list[float] = []
+
+        for row in self.scans_df.iter_rows(named=True):
+            if row["ms_level"] == 1:
+                if previous_level == 2:
+                    ms2_to_ms2 = float(np.mean(ms2_duration)) if ms2_duration else -1.0
+                    d = {
+                        "scan_uid": ms1_index,
+                        "ms2_n": ms2_n,
+                        "time_cycle": row["rt"] - cyclestart,
+                        "time_ms1_to_ms1": -1.0,
+                        "time_ms1_to_ms2": ms1_duration,
+                        "time_ms2_to_ms2": ms2_to_ms2,
+                        "time_ms2_to_ms1": row["rt"] - previous_rt,
+                    }
+                    cycle_records.append(d)
+                elif previous_level == 1:
+                    d = {
+                        "scan_uid": ms1_index,
+                        "ms2_n": 0,
+                        "time_cycle": row["rt"] - cyclestart,
+                        "time_ms1_to_ms1": row["rt"] - cyclestart,
+                        "time_ms1_to_ms2": -1.0,
+                        "time_ms2_to_ms2": -1.0,
+                        "time_ms2_to_ms1": -1.0,
+                    }
+                    cycle_records.append(d)
+
+                ms1_index = row["scan_uid"]
+                cyclestart = row["rt"]
+                ms2_n = 0
+                ms1_duration = 0
+                ms2_duration = []
+            elif previous_level == 2:
+                ms2_n += 1
+                ms2_duration.append(row["rt"] - previous_rt)
+            elif previous_level == 1:
+                ms1_duration = row["rt"] - cyclestart
+                ms2_n += 1
+            previous_level = row["ms_level"]
+            previous_rt = row["rt"]
+
+        # Create DataFrame once at the end
+        if cycle_records:
+            cycle_data = pl.DataFrame(cycle_records)
+            self.scans_df = self.scans_df.join(cycle_data, on="scan_uid", how="left")
+        else:
+            self.scans_df = self.scans_df.with_columns(
+                [
+                    pl.lit(None).alias("ms2_n"),
+                    pl.lit(None).alias("time_cycle"),
+                    pl.lit(None).alias("time_ms1_to_ms1"),
+                    pl.lit(None).alias("time_ms1_to_ms2"),
+                    pl.lit(None).alias("time_ms2_to_ms2"),
+                    pl.lit(None).alias("time_ms2_to_ms1"),
+                ],
+            )
+
+
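analyze_dda walks the scan table once and records, per MS1 cycle, how many MS2 scans were triggered and how long the cycle took. A compact polars approximation of the same bookkeeping on toy data (the real method additionally splits out MS1-to-MS2 and MS2-to-MS2 transition times scan by scan):

```python
import polars as pl

# Invented scan table: rt in minutes, ms_level 1/2, and the acquisition cycle id.
scans = pl.DataFrame({
    "rt":       [0.00, 0.05, 0.10, 0.50, 0.55, 1.00],
    "ms_level": [1,    2,    2,    1,    2,    1],
    "cycle":    [1,    1,    1,    2,    2,    3],
})
cycles = (
    scans.group_by("cycle")
    .agg([
        pl.col("rt").min().alias("cycle_start"),
        (pl.col("ms_level") == 2).sum().alias("ms2_n"),      # MS2 scans per cycle
    ])
    .sort("cycle")
    .with_columns((pl.col("cycle_start").shift(-1) - pl.col("cycle_start")).alias("time_cycle"))
)
print(cycles)
```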
+    def find_ms2(self, **kwargs):
+        """
+        Link MS2 spectra to features in the dataset.
+        This method matches MS2 spectra from the scans dataframe with features in the features dataframe
+        based on retention time (RT) and precursor m/z tolerance criteria. For each feature in the provided
+        or inferred list of feature ids (feature_uid), it computes the RT difference between the feature and available
+        MS2 spectra. It then selects MS2 spectra that fall within a computed RT radius (based on the feature's
+        start and end times) and a specified m/z tolerance. For each feature, it chooses one MS2 spectrum per
+        unique cycle based on the closest RT difference, and it updates the feature with the list of matched
+        scan ids and the spectrum corresponding to the first matching scan id. Additionally, the scan dataframe
+        is updated to associate matched scan ids with the corresponding feature id.
+
+        Parameters:
+            **kwargs: Keyword arguments for MS2 linking parameters. Can include:
+                - A find_ms2_defaults instance to set all parameters at once
+                - Individual parameter names and values (see find_ms2_defaults for details)
+
+        Key Parameters:
+            features (int or list of int, optional): A specific feature id or a list of feature ids to process.
+                If an individual feature_uid is provided and equals -1, all features with no associated MS2 data will be processed.
+                If None, all features in the features dataframe are processed.
+            mz_tol (float, optional): The precursor m/z tolerance to consider when matching MS2 spectra. If not provided,
+                it defaults to 0.5, except for certain file types ('ztscan' or 'dia') which set it to 4.
+            centroid (bool, optional): If True, the returned spectrum will be centroided. Default is True.
+            deisotope (bool, optional): Flag indicating whether deisotoping should be performed. Default is False.
+            dia_stats (bool, optional): A flag to collect additional DIA-related statistics when retrieving a spectrum.
+                Default is False.
+
+        Returns:
+            None
+
+        Side Effects:
+            Updates self.features_df with new columns 'ms2_scans' (a list of scan ids) and 'ms2_specs' (containing
+            the retrieved spectrum for the first matched scan id). Also, self.scans_df is updated by setting the 'feature_uid'
+            column for matched MS2 spectra.
+
+        Notes:
+            - The function uses vectorized operations to quickly filter MS2 spectra with ms_level equal to 2.
+            - If no MS2 spectra are available or if features_df is not loaded, appropriate messages are logged and the
+              method exits early.
+            - The function assumes that self.features_df and self.scans_df are already set up and contain the expected
+              columns ('feature_uid', 'rt', 'rt_start', 'rt_end', 'mz' for features and 'scan_uid', 'rt', 'prec_mz', 'cycle', 'ms_level'
+              for scans).
+
+        Examples:
+            Assume the current instance has features and scans data loaded, then to link MS2 spectra for all features:
+                instance.find_ms2()
+            To link MS2 spectra for a specific list of feature ids:
+                instance.find_ms2(features=[1, 3, 5])
+        """
+
+        # parameters initialization
+        params = find_ms2_defaults()
+        for key, value in kwargs.items():
+            if isinstance(value, find_ms2_defaults):
+                params = value
+                self.logger.debug("Using provided find_ms2_defaults parameters")
+            else:
+                if hasattr(params, key):
+                    if params.set(key, value, validate=True):
+                        self.logger.debug(f"Updated parameter {key} = {value}")
+                    else:
+                        self.logger.warning(
+                            f"Failed to set parameter {key} = {value} (validation failed)",
+                        )
+                else:
+                    self.logger.debug(f"Unknown parameter {key} ignored")
+        # end of parameter initialization
+
+        # Extract parameter values
+        features = params.get("features")
+        mz_tol = params.get_mz_tolerance(self.file_type)
+        centroid = params.get("centroid")
+        deisotope = params.get("deisotope")
+        dia_stats = params.get("dia_stats")
+
+        self.logger.debug("Starting MS2 spectra linking...")
+        self.logger.debug(
+            f"Parameters: mz_tol={mz_tol}, centroid={centroid}, deisotope={deisotope}",
+        )
+
+        # Ensure features_df is loaded and has the MS2 columns
+        if self.features_df is None:
+            self.logger.error("Please find features first.")
+            return
+        if "ms2_scans" not in self.features_df.columns:
+            self.features_df["ms2_scans"] = None
+        if "ms2_specs" not in self.features_df.columns:
+            self.features_df["ms2_specs"] = None
+
+        feature_uid_list = []
+        self.logger.debug("Building lookup lists")
+        if features == []:
+            features = None  # If empty list, treat as None
+        feature_uid_list = self._get_feature_uids(features)
+
+        if len(feature_uid_list) == 0:
+            self.logger.warning("No features to process.")
+            return
+
+        ms2_df = self.scans_df.filter(pl.col("ms_level") == 2)
+        if len(ms2_df) == 0:
+            self.logger.warning("No MS2 spectra found in file.")
+            return
+
+        ms2_index_arr = ms2_df["scan_uid"].to_numpy()
+        ms2_rt = ms2_df["rt"].to_numpy()
+        ms2_precursor = ms2_df["prec_mz"].to_numpy()
+        ms2_cycle = ms2_df["cycle"].to_numpy()
+
+        features_df = self.features_df
+        c = 0
+
+        if self.file_interface is None:
+            self.index_file()
+
+        # Vectorize the entire operation for better performance
+        features_subset = features_df.filter(pl.col("feature_uid").is_in(feature_uid_list))
+
+        if len(features_subset) == 0:
+            return
+
+        # Convert to numpy arrays for vectorized operations
+        feature_rt = features_subset.select("rt").to_numpy().flatten()
+        feature_mz = features_subset.select("mz").to_numpy().flatten()
+        feature_rt_start = features_subset.select("rt_start").to_numpy().flatten()
+        feature_rt_end = features_subset.select("rt_end").to_numpy().flatten()
+        feature_uids = features_subset.select("feature_uid").to_numpy().flatten()
+        feature_indices = features_subset.with_row_index().select("index").to_numpy().flatten()
+
+        # Pre-compute RT radius for all features
+        rt_radius = np.minimum(feature_rt - feature_rt_start, feature_rt_end - feature_rt)
+
+        # Batch process all features
+        scan_uid_lists: list[list[int]] = []
+        spec_lists: list[list[Spectrum]] = []
+        updated_feature_uids = []
+        updated_scan_uids = []
+
+        tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+        for i, (rt_center, mz_center, radius, feature_uid, idx) in enumerate(
+            tqdm(
+                zip(
+                    feature_rt,
+                    feature_mz,
+                    rt_radius,
+                    feature_uids,
+                    feature_indices,
+                    strict=False,
+                ),
+                total=len(features_subset),
+                desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Link MS2 spectra",
+                disable=tqdm_disable,
+            ),
+        ):
+            # Vectorized filtering
+            rt_mask = np.abs(ms2_rt - rt_center) <= radius
+            mz_mask = np.abs(ms2_precursor - mz_center) <= mz_tol
+            valid_mask = rt_mask & mz_mask
+
+            if not np.any(valid_mask):
+                scan_uid_lists.append(None)
+                spec_lists.append(None)
+                continue
+
+            valid_indices = np.nonzero(valid_mask)[0]
+            rt_diffs = np.abs(ms2_rt[valid_indices] - rt_center)
+            sorted_indices = valid_indices[np.argsort(rt_diffs)]
+
+            # Get unique cycles and their first occurrences
+            cycles = ms2_cycle[sorted_indices]
+            _, first_idx = np.unique(cycles, return_index=True)
+            final_indices = sorted_indices[first_idx]
+
+            # Sort by RT difference again
+            final_rt_diffs = np.abs(ms2_rt[final_indices] - rt_center)
+            final_indices = final_indices[np.argsort(final_rt_diffs)]
+
+            scan_uids = ms2_index_arr[final_indices].tolist()
+            scan_uid_lists.append(scan_uids)
+            spec_lists.append([
+                self.get_spectrum(
+                    scan_uids[0],
+                    centroid=centroid,
+                    deisotope=deisotope,
+                    dia_stats=dia_stats,
+                    feature_uid=feature_uid,
+                ),
+            ])
+
+            # Collect updates for batch processing
+            updated_feature_uids.extend([feature_uid] * len(final_indices))
+            updated_scan_uids.extend(ms2_index_arr[final_indices])
+            c += 1
+
+        self.logger.debug("Update features.")
+        # Convert to polars if needed and batch update features_df
+        if not isinstance(features_df, pl.DataFrame):
+            features_df = pl.from_pandas(features_df)
+
+        # Update the features_df
+        update_df = pl.DataFrame({
+            "temp_idx": feature_indices,
+            "ms2_scans": pl.Series("ms2_scans", scan_uid_lists, dtype=pl.Object),
+            "ms2_specs": pl.Series("ms2_specs", spec_lists, dtype=pl.Object),
+        })
+
+        # Join and update
+        features_df = (
+            features_df.with_row_index("temp_idx")
+            .join(
+                update_df,
+                on="temp_idx",
+                how="left",
+                suffix="_new",
+            )
+            .with_columns([
+                pl.when(pl.col("ms2_scans_new").is_not_null())
+                .then(pl.col("ms2_scans_new"))
+                .otherwise(pl.col("ms2_scans"))
+                .alias("ms2_scans"),
+                pl.when(pl.col("ms2_specs_new").is_not_null())
+                .then(pl.col("ms2_specs_new"))
+                .otherwise(pl.col("ms2_specs"))
+                .alias("ms2_specs"),
+            ])
+            .drop(["temp_idx", "ms2_scans_new", "ms2_specs_new"])
+        )
+
+        # Batch update scans_df
+        if updated_scan_uids:
+            scan_feature_uid_updates = dict(
+                zip(updated_scan_uids, updated_feature_uids, strict=True),
+            )
+            self.scans_df = (
+                self.scans_df.with_columns(
+                    pl.col("scan_uid")
+                    .map_elements(
+                        lambda x: scan_feature_uid_updates.get(x),
+                        return_dtype=pl.Int64,
+                    )
+                    .alias("feature_uid_update"),
+                )
+                .with_columns(
+                    pl.when(pl.col("feature_uid_update").is_not_null())
+                    .then(pl.col("feature_uid_update"))
+                    .otherwise(pl.col("feature_uid"))
+                    .alias("feature_uid"),
+                )
+                .drop("feature_uid_update")
+            )
+
+        # Log completion
+        self.logger.info(
+            f"MS2 linking completed. Total features with MS2 data: {c}",
+        )
+        self.features_df = features_df
+
+        # store params
+        self.store_history(["find_ms2"], params.to_dict())
+        self.logger.debug(
+            "Parameters stored to find_ms2",
+        )
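The core of find_ms2 is the vectorized match above: boolean RT and precursor m/z masks, closest-RT ordering, and np.unique(..., return_index=True) to keep a single MS2 scan per acquisition cycle. A self-contained sketch of that selection with made-up numbers:

```python
import numpy as np

rt_center, mz_center, radius, mz_tol = 120.0, 301.1416, 5.0, 0.5  # one feature

ms2_rt    = np.array([110.0, 118.5, 121.0, 121.5, 140.0])
ms2_prec  = np.array([301.14, 301.15, 301.13, 450.20, 301.14])
ms2_cycle = np.array([10, 11, 12, 12, 15])

mask = (np.abs(ms2_rt - rt_center) <= radius) & (np.abs(ms2_prec - mz_center) <= mz_tol)
idx = np.nonzero(mask)[0]
idx = idx[np.argsort(np.abs(ms2_rt[idx] - rt_center))]   # closest RT first
_, first = np.unique(ms2_cycle[idx], return_index=True)  # one scan per cycle
picked = idx[first]
print(picked)  # -> [1 2]: the MS2 scans that would be linked to this feature
```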