masster-0.2.4-py3-none-any.whl → masster-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/__init__.py +27 -27
- masster/_version.py +17 -17
- masster/chromatogram.py +497 -503
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
- masster/logger.py +318 -244
- masster/sample/__init__.py +9 -9
- masster/sample/defaults/__init__.py +15 -15
- masster/sample/defaults/find_adducts_def.py +325 -325
- masster/sample/defaults/find_features_def.py +366 -366
- masster/sample/defaults/find_ms2_def.py +285 -285
- masster/sample/defaults/get_spectrum_def.py +314 -318
- masster/sample/defaults/sample_def.py +374 -378
- masster/sample/h5.py +1321 -1297
- masster/sample/helpers.py +833 -364
- masster/sample/lib.py +762 -0
- masster/sample/load.py +1220 -1187
- masster/sample/parameters.py +131 -131
- masster/sample/plot.py +1610 -1622
- masster/sample/processing.py +1402 -1416
- masster/sample/quant.py +209 -0
- masster/sample/sample.py +391 -387
- masster/sample/sample5_schema.json +181 -181
- masster/sample/save.py +737 -719
- masster/sample/sciex.py +1213 -0
- masster/spectrum.py +1287 -1319
- masster/study/__init__.py +9 -9
- masster/study/defaults/__init__.py +21 -19
- masster/study/defaults/align_def.py +267 -267
- masster/study/defaults/export_def.py +41 -40
- masster/study/defaults/fill_chrom_def.py +264 -264
- masster/study/defaults/fill_def.py +260 -0
- masster/study/defaults/find_consensus_def.py +256 -256
- masster/study/defaults/find_ms2_def.py +163 -163
- masster/study/defaults/integrate_chrom_def.py +225 -225
- masster/study/defaults/integrate_def.py +221 -0
- masster/study/defaults/merge_def.py +256 -0
- masster/study/defaults/study_def.py +272 -269
- masster/study/export.py +674 -287
- masster/study/h5.py +1398 -886
- masster/study/helpers.py +1650 -433
- masster/study/helpers_optimized.py +317 -0
- masster/study/load.py +1201 -1078
- masster/study/parameters.py +99 -99
- masster/study/plot.py +632 -645
- masster/study/processing.py +1057 -1046
- masster/study/save.py +149 -134
- masster/study/study.py +606 -522
- masster/study/study5_schema.json +247 -241
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
- masster-0.3.0.dist-info/RECORD +59 -0
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
- masster-0.2.4.dist-info/RECORD +0 -50
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
masster/study/processing.py
CHANGED
|
@@ -1,1046 +1,1057 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
|
|
5
|
-
import numpy as np
|
|
6
|
-
import polars as pl
|
|
7
|
-
import pyopenms as oms
|
|
8
|
-
|
|
9
|
-
from tqdm import tqdm
|
|
10
|
-
|
|
11
|
-
from masster.study.defaults import (
|
|
12
|
-
align_defaults,
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def align(self, **kwargs):
|
|
20
|
-
"""
|
|
21
|
-
Aligns feature maps using pose clustering and updates retention times in the features DataFrame.
|
|
22
|
-
|
|
23
|
-
Parameters:
|
|
24
|
-
**kwargs: Keyword arguments for alignment parameters. Can include:
|
|
25
|
-
- An align_defaults instance to set all parameters at once
|
|
26
|
-
- Individual parameter names and values (see align_defaults for details)
|
|
27
|
-
|
|
28
|
-
Key Parameters:
|
|
29
|
-
rt_max_diff (float): Maximum RT difference for alignment (default: 60.0).
|
|
30
|
-
mz_max_diff (float): Maximum m/z difference for alignment (default: 0.01).
|
|
31
|
-
rt_pair_distance_frac (float): RT pair distance fraction for superimposer (default: 0.2).
|
|
32
|
-
mz_pair_max_distance (float): Maximum m/z pair distance for superimposer (default: 0.01).
|
|
33
|
-
num_used_points (int): Number of points used for superimposer (default: 1000).
|
|
34
|
-
save_features (bool): Whether to save features after alignment (default: True).
|
|
35
|
-
skip_blanks (bool): Whether to skip blank samples during alignment (default: True).
|
|
36
|
-
"""
|
|
37
|
-
# parameters initialization
|
|
38
|
-
params = align_defaults()
|
|
39
|
-
for key, value in kwargs.items():
|
|
40
|
-
if isinstance(value, align_defaults):
|
|
41
|
-
params = value
|
|
42
|
-
self.logger.debug("Using provided align_defaults parameters")
|
|
43
|
-
else:
|
|
44
|
-
if hasattr(params, key):
|
|
45
|
-
if params.set(key, value, validate=True):
|
|
46
|
-
self.logger.debug(f"Updated parameter {key} = {value}")
|
|
47
|
-
else:
|
|
48
|
-
self.logger.warning(
|
|
49
|
-
f"Failed to set parameter {key} = {value} (validation failed)",
|
|
50
|
-
)
|
|
51
|
-
else:
|
|
52
|
-
self.logger.debug(f"Unknown parameter {key} ignored")
|
|
53
|
-
# end of parameter initialization
|
|
54
|
-
|
|
55
|
-
# Store parameters in the Study object
|
|
56
|
-
self.store_history(["align"], params.to_dict())
|
|
57
|
-
self.logger.debug("Parameters stored to align")
|
|
58
|
-
|
|
59
|
-
if len(self.features_maps) < len(self.samples_df):
|
|
60
|
-
self.features_maps = []
|
|
61
|
-
self.load_features()
|
|
62
|
-
|
|
63
|
-
self.logger.
|
|
64
|
-
|
|
65
|
-
fmaps = self.features_maps
|
|
66
|
-
# set ref_index to feature map index with largest number of features
|
|
67
|
-
ref_index = [
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
)
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
params_oms
|
|
78
|
-
params_oms.setValue("
|
|
79
|
-
params_oms.setValue("pairfinder:
|
|
80
|
-
params_oms.setValue("
|
|
81
|
-
params_oms.setValue("
|
|
82
|
-
params_oms.setValue("
|
|
83
|
-
params_oms.setValue("superimposer:
|
|
84
|
-
params_oms.setValue("
|
|
85
|
-
params_oms.setValue("
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
b'superimposer:
|
|
92
|
-
b'superimposer:
|
|
93
|
-
b'superimposer:
|
|
94
|
-
b'superimposer:
|
|
95
|
-
b'superimposer:
|
|
96
|
-
b'superimposer:
|
|
97
|
-
b'superimposer:
|
|
98
|
-
b'
|
|
99
|
-
b'
|
|
100
|
-
b'pairfinder:
|
|
101
|
-
b'pairfinder:
|
|
102
|
-
b'pairfinder:
|
|
103
|
-
b'pairfinder:
|
|
104
|
-
b'pairfinder:distance_RT:
|
|
105
|
-
b'pairfinder:
|
|
106
|
-
b'pairfinder:
|
|
107
|
-
b'pairfinder:distance_MZ:
|
|
108
|
-
b'pairfinder:distance_MZ:
|
|
109
|
-
b'pairfinder:
|
|
110
|
-
b'pairfinder:
|
|
111
|
-
b'pairfinder:distance_intensity:
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
self.
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
if index ==
|
|
128
|
-
continue
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
)
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
#
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
self.features_df =
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
)
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
# parameters initialization
|
|
250
|
-
params =
|
|
251
|
-
for key, value in kwargs.items():
|
|
252
|
-
if isinstance(value,
|
|
253
|
-
params = value
|
|
254
|
-
self.logger.debug("Using provided
|
|
255
|
-
else:
|
|
256
|
-
if hasattr(params, key):
|
|
257
|
-
if params.set(key, value, validate=True):
|
|
258
|
-
self.logger.debug(f"Updated parameter {key} = {value}")
|
|
259
|
-
else:
|
|
260
|
-
self.logger.warning(
|
|
261
|
-
f"Failed to set parameter {key} = {value} (validation failed)",
|
|
262
|
-
)
|
|
263
|
-
else:
|
|
264
|
-
self.logger.debug(f"Unknown parameter {key} ignored")
|
|
265
|
-
# end of parameter initialization
|
|
266
|
-
|
|
267
|
-
# Store parameters in the Study object
|
|
268
|
-
self.store_history(["
|
|
269
|
-
self.logger.debug("Parameters stored to
|
|
270
|
-
|
|
271
|
-
# Get parameter values for use in the method
|
|
272
|
-
algorithm = params.get("algorithm")
|
|
273
|
-
min_samples = params.get("min_samples")
|
|
274
|
-
link_ms2 = params.get("link_ms2")
|
|
275
|
-
mz_tol = kwargs.get("mz_tol", 0.01) # Default values for parameters not in defaults class
|
|
276
|
-
rt_tol = kwargs.get("rt_tol", 1.0)
|
|
277
|
-
|
|
278
|
-
if len(self.samples_df) > 200 and algorithm ==
|
|
279
|
-
self.logger.warning(
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
params_oms.setValue("
|
|
296
|
-
|
|
297
|
-
params_oms.setValue("warp:
|
|
298
|
-
|
|
299
|
-
params_oms.setValue("
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
params_oms.setValue("
|
|
307
|
-
params_oms.setValue("
|
|
308
|
-
params_oms.setValue("
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
params_oms.setValue("
|
|
316
|
-
params_oms.setValue("
|
|
317
|
-
params_oms.setValue("
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
params_oms.setValue("
|
|
324
|
-
params_oms.setValue("
|
|
325
|
-
params_oms.setValue("
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
file_description
|
|
334
|
-
file_description.
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
#
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
feature_grouper.
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
"
|
|
392
|
-
"
|
|
393
|
-
"
|
|
394
|
-
"
|
|
395
|
-
"
|
|
396
|
-
"
|
|
397
|
-
"
|
|
398
|
-
"
|
|
399
|
-
"
|
|
400
|
-
"
|
|
401
|
-
"
|
|
402
|
-
"
|
|
403
|
-
"
|
|
404
|
-
"
|
|
405
|
-
"
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
fuid
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
"
|
|
441
|
-
"
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
feature_data
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
#
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
if fd.get("
|
|
467
|
-
])
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
if fd.get("
|
|
472
|
-
])
|
|
473
|
-
|
|
474
|
-
fd.get("
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
fd.get("
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
#
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
"
|
|
532
|
-
|
|
533
|
-
"
|
|
534
|
-
"
|
|
535
|
-
"
|
|
536
|
-
"
|
|
537
|
-
"
|
|
538
|
-
if len(rt_values) > 0
|
|
539
|
-
else 0.0,
|
|
540
|
-
"
|
|
541
|
-
if len(rt_start_values) > 0
|
|
542
|
-
else 0.0,
|
|
543
|
-
"
|
|
544
|
-
if len(
|
|
545
|
-
else 0.0,
|
|
546
|
-
"
|
|
547
|
-
if len(
|
|
548
|
-
else 0.0,
|
|
549
|
-
"
|
|
550
|
-
"
|
|
551
|
-
"
|
|
552
|
-
if len(
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
else 0.0,
|
|
563
|
-
"
|
|
564
|
-
"
|
|
565
|
-
if
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
self.
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
"
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
self.
|
|
723
|
-
|
|
724
|
-
)
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
)
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
)
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
""
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
"
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
.
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
.
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
)
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
f"
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import polars as pl
|
|
7
|
+
import pyopenms as oms
|
|
8
|
+
|
|
9
|
+
from tqdm import tqdm
|
|
10
|
+
|
|
11
|
+
from masster.study.defaults import (
|
|
12
|
+
align_defaults,
|
|
13
|
+
find_ms2_defaults,
|
|
14
|
+
integrate_defaults,
|
|
15
|
+
merge_defaults,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def align(self, **kwargs):
|
|
20
|
+
"""
|
|
21
|
+
Aligns feature maps using pose clustering and updates retention times in the features DataFrame.
|
|
22
|
+
|
|
23
|
+
Parameters:
|
|
24
|
+
**kwargs: Keyword arguments for alignment parameters. Can include:
|
|
25
|
+
- An align_defaults instance to set all parameters at once
|
|
26
|
+
- Individual parameter names and values (see align_defaults for details)
|
|
27
|
+
|
|
28
|
+
Key Parameters:
|
|
29
|
+
rt_max_diff (float): Maximum RT difference for alignment (default: 60.0).
|
|
30
|
+
mz_max_diff (float): Maximum m/z difference for alignment (default: 0.01).
|
|
31
|
+
rt_pair_distance_frac (float): RT pair distance fraction for superimposer (default: 0.2).
|
|
32
|
+
mz_pair_max_distance (float): Maximum m/z pair distance for superimposer (default: 0.01).
|
|
33
|
+
num_used_points (int): Number of points used for superimposer (default: 1000).
|
|
34
|
+
save_features (bool): Whether to save features after alignment (default: True).
|
|
35
|
+
skip_blanks (bool): Whether to skip blank samples during alignment (default: True).
|
|
36
|
+
"""
|
|
37
|
+
# parameters initialization
|
|
38
|
+
params = align_defaults()
|
|
39
|
+
for key, value in kwargs.items():
|
|
40
|
+
if isinstance(value, align_defaults):
|
|
41
|
+
params = value
|
|
42
|
+
self.logger.debug("Using provided align_defaults parameters")
|
|
43
|
+
else:
|
|
44
|
+
if hasattr(params, key):
|
|
45
|
+
if params.set(key, value, validate=True):
|
|
46
|
+
self.logger.debug(f"Updated parameter {key} = {value}")
|
|
47
|
+
else:
|
|
48
|
+
self.logger.warning(
|
|
49
|
+
f"Failed to set parameter {key} = {value} (validation failed)",
|
|
50
|
+
)
|
|
51
|
+
else:
|
|
52
|
+
self.logger.debug(f"Unknown parameter {key} ignored")
|
|
53
|
+
# end of parameter initialization
|
|
54
|
+
|
|
55
|
+
# Store parameters in the Study object
|
|
56
|
+
self.store_history(["align"], params.to_dict())
|
|
57
|
+
self.logger.debug("Parameters stored to align")
|
|
58
|
+
|
|
59
|
+
if len(self.features_maps) < len(self.samples_df):
|
|
60
|
+
self.features_maps = []
|
|
61
|
+
self.load_features()
|
|
62
|
+
|
|
63
|
+
self.logger.debug("Starting alignment")
|
|
64
|
+
|
|
65
|
+
fmaps = self.features_maps
|
|
66
|
+
# set ref_index to feature map index with largest number of features
|
|
67
|
+
ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
|
|
68
|
+
|
|
69
|
+
self.logger.info(
|
|
70
|
+
f"Align on {self.samples_df.row(ref_index, named=True)['sample_name']}",
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
aligner = oms.MapAlignmentAlgorithmPoseClustering()
|
|
74
|
+
|
|
75
|
+
params_oms = oms.Param()
|
|
76
|
+
params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
|
|
77
|
+
params_oms.setValue("pairfinder:ignore_charge", "true")
|
|
78
|
+
params_oms.setValue("max_num_peaks_considered", 1000)
|
|
79
|
+
params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
|
|
80
|
+
params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
|
|
81
|
+
params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
|
|
82
|
+
params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
|
|
83
|
+
params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
|
|
84
|
+
params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
|
|
85
|
+
params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
|
|
86
|
+
aligner.setParameters(params_oms)
|
|
87
|
+
"""
|
|
88
|
+
{b'max_num_peaks_considered': 1000,
|
|
89
|
+
b'superimposer:mz_pair_max_distance': 0.5,
|
|
90
|
+
b'superimposer:rt_pair_distance_fraction': 0.1,
|
|
91
|
+
b'superimposer:num_used_points': 2000,
|
|
92
|
+
b'superimposer:scaling_bucket_size': 0.005,
|
|
93
|
+
b'superimposer:shift_bucket_size': 3.0,
|
|
94
|
+
b'superimposer:max_shift': 1000.0,
|
|
95
|
+
b'superimposer:max_scaling': 2.0,
|
|
96
|
+
b'superimposer:dump_buckets': '',
|
|
97
|
+
b'superimposer:dump_pairs': '',
|
|
98
|
+
b'pairfinder:second_nearest_gap': 2.0,
|
|
99
|
+
b'pairfinder:use_identifications': 'false',
|
|
100
|
+
b'pairfinder:ignore_charge': 'false',
|
|
101
|
+
b'pairfinder:ignore_adduct': 'true',
|
|
102
|
+
b'pairfinder:distance_RT:max_difference': 100.0,
|
|
103
|
+
b'pairfinder:distance_RT:exponent': 1.0,
|
|
104
|
+
b'pairfinder:distance_RT:weight': 1.0,
|
|
105
|
+
b'pairfinder:distance_MZ:max_difference': 0.3,
|
|
106
|
+
b'pairfinder:distance_MZ:unit': 'Da',
|
|
107
|
+
b'pairfinder:distance_MZ:exponent': 2.0,
|
|
108
|
+
b'pairfinder:distance_MZ:weight': 1.0,
|
|
109
|
+
b'pairfinder:distance_intensity:exponent': 1.0,
|
|
110
|
+
b'pairfinder:distance_intensity:weight': 0.0,
|
|
111
|
+
b'pairfinder:distance_intensity:log_transform': 'disabled'} """
|
|
112
|
+
|
|
113
|
+
aligner.setReference(fmaps[ref_index])
|
|
114
|
+
|
|
115
|
+
self.logger.debug(f"Parameters for alignment: {params}")
|
|
116
|
+
|
|
117
|
+
tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
|
|
118
|
+
# perform alignment and transformation of feature maps to the reference map (exclude reference map)
|
|
119
|
+
for index, fm in tqdm(
|
|
120
|
+
list(enumerate(fmaps)),
|
|
121
|
+
total=len(fmaps),
|
|
122
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Align feature maps",
|
|
123
|
+
disable=tdqm_disable,
|
|
124
|
+
):
|
|
125
|
+
if index == ref_index:
|
|
126
|
+
continue
|
|
127
|
+
if params.get("skip_blanks") and self.samples_df.row(index, named=True)["sample_type"] == "blank":
|
|
128
|
+
continue
|
|
129
|
+
trafo = oms.TransformationDescription()
|
|
130
|
+
aligner.align(fm, trafo)
|
|
131
|
+
transformer = oms.MapAlignmentTransformer()
|
|
132
|
+
transformer.transformRetentionTimes(fm, trafo, True)
|
|
133
|
+
|
|
134
|
+
self.alignment_ref_index = ref_index
|
|
135
|
+
|
|
136
|
+
# check if rt_original exists in features_df, if not, add it after rt
|
|
137
|
+
if "rt_original" not in self.features_df.columns:
|
|
138
|
+
# add column 'rt_original' after 'rt'
|
|
139
|
+
rt_index = self.features_df.columns.get_loc("rt") + 1
|
|
140
|
+
self.features_df.insert(rt_index, "rt_original", 0)
|
|
141
|
+
self.features_df["rt_original"] = self.features_df["rt"]
|
|
142
|
+
|
|
143
|
+
# iterate through all feature_maps and add the transformed retention times to the features_df
|
|
144
|
+
|
|
145
|
+
# Build a fast lookup for (sample_uid, feature_uid) to index in features_df
|
|
146
|
+
feats = self.features_df
|
|
147
|
+
|
|
148
|
+
# Pre-build sample_uid lookup for faster access
|
|
149
|
+
self.logger.debug("Build sample_uid lookup for fast access...")
|
|
150
|
+
sample_uid_lookup = {
|
|
151
|
+
idx: row_dict["sample_uid"] for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
# Build the main lookup using feature_uid (not feature_id)
|
|
155
|
+
if "feature_id" in feats.columns:
|
|
156
|
+
# Create lookup mapping (sample_uid, feature_uid) to DataFrame index using Polars
|
|
157
|
+
# Since we need a pandas-style index lookup, we'll create a simple dict
|
|
158
|
+
sample_uids = feats.get_column("sample_uid").to_list()
|
|
159
|
+
|
|
160
|
+
# Handle feature_id column - it might be Object type due to conversion
|
|
161
|
+
feature_id_col = feats.get_column("feature_id")
|
|
162
|
+
if feature_id_col.dtype == pl.Object:
|
|
163
|
+
# If it's Object type, convert to list and let Python handle the conversion
|
|
164
|
+
feature_ids = feature_id_col.to_list()
|
|
165
|
+
# Convert to strings if they're not already
|
|
166
|
+
feature_ids = [str(fid) if fid is not None else None for fid in feature_ids]
|
|
167
|
+
else:
|
|
168
|
+
# Safe to cast normally
|
|
169
|
+
feature_ids = feature_id_col.cast(pl.Utf8).to_list()
|
|
170
|
+
|
|
171
|
+
lookup = {
|
|
172
|
+
(sample_uid, feature_id): idx
|
|
173
|
+
for idx, (sample_uid, feature_id) in enumerate(
|
|
174
|
+
zip(sample_uids, feature_ids, strict=True),
|
|
175
|
+
)
|
|
176
|
+
}
|
|
177
|
+
else:
|
|
178
|
+
# fallback: skip if feature_uid column missing
|
|
179
|
+
lookup = {}
|
|
180
|
+
self.logger.warning("feature_id column not found in features_df")
|
|
181
|
+
|
|
182
|
+
# Pre-allocate update lists for better performance
|
|
183
|
+
all_update_idx = []
|
|
184
|
+
all_update_rt = []
|
|
185
|
+
all_update_rt_original = []
|
|
186
|
+
|
|
187
|
+
tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
|
|
188
|
+
|
|
189
|
+
for index, fm in tqdm(
|
|
190
|
+
list(enumerate(fmaps)),
|
|
191
|
+
total=len(fmaps),
|
|
192
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract RTs",
|
|
193
|
+
disable=tdqm_disable,
|
|
194
|
+
):
|
|
195
|
+
sample_uid = sample_uid_lookup.get(index)
|
|
196
|
+
if sample_uid is None:
|
|
197
|
+
continue
|
|
198
|
+
|
|
199
|
+
# Collect all updates for this feature map
|
|
200
|
+
for f in fm:
|
|
201
|
+
feature_uid = str(f.getUniqueId())
|
|
202
|
+
idx = lookup.get((sample_uid, feature_uid))
|
|
203
|
+
if idx is not None:
|
|
204
|
+
rt = round(f.getRT(), 3)
|
|
205
|
+
# rt_or = round(f.getMetaValue("original_RT"), 3) if f.metaValueExists("original_RT") else rt
|
|
206
|
+
all_update_idx.append(idx)
|
|
207
|
+
all_update_rt.append(rt)
|
|
208
|
+
# all_update_rt_original.append(rt_or)
|
|
209
|
+
|
|
210
|
+
# Single batch update for all features at once
|
|
211
|
+
if all_update_idx:
|
|
212
|
+
# Update "rt" column for specified indices using Polars
|
|
213
|
+
self.features_df = self.features_df.with_columns(
|
|
214
|
+
pl.when(pl.int_range(0, self.features_df.height).is_in(all_update_idx))
|
|
215
|
+
.then(pl.Series("rt", all_update_rt))
|
|
216
|
+
.otherwise(pl.col("rt"))
|
|
217
|
+
.alias("rt"),
|
|
218
|
+
)
|
|
219
|
+
# self.features_df.loc[all_update_idx, "rt_original"] = all_update_rt_original
|
|
220
|
+
|
|
221
|
+
self.logger.debug("Alignment completed successfully.")
|
|
222
|
+
|
|
223
|
+
if params.get("save_features"):
|
|
224
|
+
self.save_samples()
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def merge(self, **kwargs):
|
|
228
|
+
"""
|
|
229
|
+
Groups features across samples into consensus features using the specified algorithm.
|
|
230
|
+
|
|
231
|
+
Parameters:
|
|
232
|
+
**kwargs: Keyword arguments for consensus parameters. Can include:
|
|
233
|
+
- A merge_defaults instance to set all parameters at once
|
|
234
|
+
- Individual parameter names and values (see merge_defaults for details)
|
|
235
|
+
|
|
236
|
+
Key Parameters:
|
|
237
|
+
algorithm (str): Feature grouping algorithm ('kd', 'unlabeled', 'sequential', or default 'qt').
|
|
238
|
+
min_samples (int): Minimum number of samples for a consensus feature.
|
|
239
|
+
link_ms2 (bool): Whether to link MS2 spectra to consensus features.
|
|
240
|
+
mz_tol (float): m/z tolerance for grouping (default: 0.01).
|
|
241
|
+
rt_tol (float): RT tolerance for grouping (default: 1.0).
|
|
242
|
+
"""
|
|
243
|
+
# Reset consensus-related DataFrames at the start
|
|
244
|
+
self.consensus_df = pl.DataFrame()
|
|
245
|
+
self.consensus_ms2 = pl.DataFrame()
|
|
246
|
+
self.consensus_mapping_df = pl.DataFrame()
|
|
247
|
+
|
|
248
|
+
self.logger.info('Merging...')
|
|
249
|
+
# parameters initialization
|
|
250
|
+
params = merge_defaults()
|
|
251
|
+
for key, value in kwargs.items():
|
|
252
|
+
if isinstance(value, merge_defaults):
|
|
253
|
+
params = value
|
|
254
|
+
self.logger.debug("Using provided merge_defaults parameters")
|
|
255
|
+
else:
|
|
256
|
+
if hasattr(params, key):
|
|
257
|
+
if params.set(key, value, validate=True):
|
|
258
|
+
self.logger.debug(f"Updated parameter {key} = {value}")
|
|
259
|
+
else:
|
|
260
|
+
self.logger.warning(
|
|
261
|
+
f"Failed to set parameter {key} = {value} (validation failed)",
|
|
262
|
+
)
|
|
263
|
+
else:
|
|
264
|
+
self.logger.debug(f"Unknown parameter {key} ignored")
|
|
265
|
+
# end of parameter initialization
|
|
266
|
+
|
|
267
|
+
# Store parameters in the Study object
|
|
268
|
+
self.store_history(["merge"], params.to_dict())
|
|
269
|
+
self.logger.debug("Parameters stored to merge")
|
|
270
|
+
|
|
271
|
+
# Get parameter values for use in the method
|
|
272
|
+
algorithm = params.get("algorithm")
|
|
273
|
+
min_samples = params.get("min_samples")
|
|
274
|
+
link_ms2 = params.get("link_ms2")
|
|
275
|
+
mz_tol = kwargs.get("mz_tol", 0.01) # Default values for parameters not in defaults class
|
|
276
|
+
rt_tol = kwargs.get("rt_tol", 1.0)
|
|
277
|
+
|
|
278
|
+
if len(self.samples_df) > 200 and algorithm == "qt":
|
|
279
|
+
self.logger.warning(
|
|
280
|
+
"Using QT for large datasets is NOT recommended [O(n²)], consider using KDTree instead [O(n log n)].",
|
|
281
|
+
)
|
|
282
|
+
|
|
283
|
+
# check that features_maps is not empty
|
|
284
|
+
if not self.features_maps or len(self.features_maps) == 0:
|
|
285
|
+
self.load_features()
|
|
286
|
+
params_oms = oms.Param()
|
|
287
|
+
## TODO expose these
|
|
288
|
+
|
|
289
|
+
feature_grouper: object # Use generic type for different OpenMS algorithms
|
|
290
|
+
match algorithm.lower():
|
|
291
|
+
case "kd":
|
|
292
|
+
feature_grouper = oms.FeatureGroupingAlgorithmKD()
|
|
293
|
+
self.logger.debug("Merging features with KDTree...")
|
|
294
|
+
params_oms.setValue("mz_unit", "Da")
|
|
295
|
+
params_oms.setValue("nr_partitions", len(self.samples_df))
|
|
296
|
+
|
|
297
|
+
params_oms.setValue("warp:enabled", "true")
|
|
298
|
+
params_oms.setValue("warp:rt_tol", rt_tol)
|
|
299
|
+
params_oms.setValue("warp:mz_tol", mz_tol)
|
|
300
|
+
|
|
301
|
+
params_oms.setValue("link:rt_tol", rt_tol)
|
|
302
|
+
params_oms.setValue("link:mz_tol", mz_tol)
|
|
303
|
+
case "unlabeled":
|
|
304
|
+
feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
|
|
305
|
+
self.logger.debug("Merging features with Unlabelled algorithm...")
|
|
306
|
+
params_oms.setValue("second_nearest_gap", 2.0)
|
|
307
|
+
params_oms.setValue("ignore_charge", "true")
|
|
308
|
+
params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
|
|
309
|
+
params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
|
|
310
|
+
params_oms.setValue("distance_MZ:unit", "Da")
|
|
311
|
+
case "sequential":
|
|
312
|
+
self.logger.debug(
|
|
313
|
+
"Merging features sequentially with Unlabelled algorithm...",
|
|
314
|
+
)
|
|
315
|
+
params_oms.setValue("second_nearest_gap", 2.0)
|
|
316
|
+
params_oms.setValue("ignore_charge", "true")
|
|
317
|
+
params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
|
|
318
|
+
params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
|
|
319
|
+
params_oms.setValue("distance_MZ:unit", "Da")
|
|
320
|
+
case "qt":
|
|
321
|
+
feature_grouper = oms.FeatureGroupingAlgorithmQT()
|
|
322
|
+
self.logger.debug("Grouping features with QT...")
|
|
323
|
+
params_oms.setValue("nr_partitions", len(self.samples_df))
|
|
324
|
+
params_oms.setValue("ignore_charge", "true")
|
|
325
|
+
params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
|
|
326
|
+
params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
|
|
327
|
+
params_oms.setValue("distance_MZ:unit", "Da")
|
|
328
|
+
self.logger.debug(f"Parameters for feature grouping: {params_oms}")
|
|
329
|
+
consensus_map = oms.ConsensusMap()
|
|
330
|
+
file_descriptions = consensus_map.getColumnHeaders() # type: ignore
|
|
331
|
+
feature_maps = self.features_maps
|
|
332
|
+
for i, feature_map in enumerate(feature_maps):
|
|
333
|
+
file_description = file_descriptions.get(i, oms.ColumnHeader())
|
|
334
|
+
file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
|
|
335
|
+
file_description.size = feature_map.size()
|
|
336
|
+
file_description.unique_id = feature_map.getUniqueId()
|
|
337
|
+
file_descriptions[i] = file_description
|
|
338
|
+
|
|
339
|
+
consensus_map.setColumnHeaders(file_descriptions) # type: ignore
|
|
340
|
+
|
|
341
|
+
# create a copy of the feature maps to store the original feature map information
|
|
342
|
+
match algorithm.lower():
|
|
343
|
+
case "sequential":
|
|
344
|
+
# set the reference map to self.alignment_ref_index
|
|
345
|
+
if self.alignment_ref_index is None:
|
|
346
|
+
# pick the feature map with the most features as reference
|
|
347
|
+
self.alignment_ref_index = max(
|
|
348
|
+
range(len(self.features_maps)),
|
|
349
|
+
key=lambda i: self.features_maps[i].size(),
|
|
350
|
+
)
|
|
351
|
+
feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
|
|
352
|
+
feature_grouper.setParameters(params_oms)
|
|
353
|
+
feature_grouper.setReference(
|
|
354
|
+
self.alignment_ref_index,
|
|
355
|
+
self.features_maps[self.alignment_ref_index],
|
|
356
|
+
)
|
|
357
|
+
self.logger.info(
|
|
358
|
+
f"Using feature map {self.samples_df.row(self.alignment_ref_index, named=True)['sample_name']} as reference.",
|
|
359
|
+
)
|
|
360
|
+
|
|
361
|
+
tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
|
|
362
|
+
for i, feature_map in tqdm(
|
|
363
|
+
enumerate(self.features_maps),
|
|
364
|
+
total=len(self.features_maps),
|
|
365
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add samples",
|
|
366
|
+
disable=tdqm_disable,
|
|
367
|
+
):
|
|
368
|
+
if i == self.alignment_ref_index:
|
|
369
|
+
continue
|
|
370
|
+
feature_grouper.addToGroup(i, feature_map)
|
|
371
|
+
self.logger.debug("Grouping features.")
|
|
372
|
+
consensus_map = feature_grouper.getResultMap()
|
|
373
|
+
if hasattr(consensus_map, "setUniqueIds"):
|
|
374
|
+
consensus_map.setUniqueIds()
|
|
375
|
+
case _:
|
|
376
|
+
feature_grouper.setParameters(params_oms) # type: ignore
|
|
377
|
+
# add all feature maps and group in one batch
|
|
378
|
+
self.logger.debug("Grouping features in one batch...")
|
|
379
|
+
feature_grouper.group(feature_maps, consensus_map) # type: ignore
|
|
380
|
+
if hasattr(consensus_map, "setUniqueIds"):
|
|
381
|
+
consensus_map.setUniqueIds()
|
|
382
|
+
|
|
383
|
+
# create a dict to map uid to feature_uid using self.features_df
|
|
384
|
+
feature_uid_map = {row["feature_id"]: row["feature_uid"] for row in self.features_df.iter_rows(named=True)}
|
|
385
|
+
imax = consensus_map.size()
|
|
386
|
+
|
|
387
|
+
# Pre-build fast lookup tables for features_df data
|
|
388
|
+
features_lookup = {}
|
|
389
|
+
feature_columns = [
|
|
390
|
+
"rt",
|
|
391
|
+
"mz",
|
|
392
|
+
"rt_start",
|
|
393
|
+
"rt_end",
|
|
394
|
+
"rt_delta",
|
|
395
|
+
"mz_start",
|
|
396
|
+
"mz_end",
|
|
397
|
+
"inty",
|
|
398
|
+
"chrom_coherence",
|
|
399
|
+
"chrom_prominence",
|
|
400
|
+
"chrom_prominence_scaled",
|
|
401
|
+
"chrom_height_scaled",
|
|
402
|
+
"iso",
|
|
403
|
+
"charge",
|
|
404
|
+
"ms2_scans",
|
|
405
|
+
"adduct",
|
|
406
|
+
"adduct_mass",
|
|
407
|
+
]
|
|
408
|
+
|
|
409
|
+
for row in self.features_df.iter_rows(named=True):
|
|
410
|
+
feature_uid = row["feature_uid"]
|
|
411
|
+
features_lookup[feature_uid] = {col: row[col] for col in feature_columns if col in self.features_df.columns}
|
|
412
|
+
|
|
413
|
+
# create a list to store the consensus mapping
|
|
414
|
+
consensus_mapping = []
|
|
415
|
+
metadata_list = []
|
|
416
|
+
|
|
417
|
+
tqdm_disable = self.log_level not in ["TRACE", "DEBUG"]
|
|
418
|
+
|
|
419
|
+
for i, feature in enumerate(
|
|
420
|
+
tqdm(
|
|
421
|
+
consensus_map,
|
|
422
|
+
total=imax,
|
|
423
|
+
disable=tqdm_disable,
|
|
424
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract metadata",
|
|
425
|
+
),
|
|
426
|
+
):
|
|
427
|
+
# get all features in the feature map with the same unique id as the consensus feature
|
|
428
|
+
features_list = feature.getFeatureList()
|
|
429
|
+
uids = []
|
|
430
|
+
feature_data_list = []
|
|
431
|
+
|
|
432
|
+
for _j, f in enumerate(features_list):
|
|
433
|
+
fuid = str(f.getUniqueId())
|
|
434
|
+
if fuid not in feature_uid_map:
|
|
435
|
+
# this is a feature that was removed but is still in the feature maps
|
|
436
|
+
continue
|
|
437
|
+
fuid = feature_uid_map[fuid]
|
|
438
|
+
consensus_mapping.append({
|
|
439
|
+
"consensus_uid": i,
|
|
440
|
+
"sample_uid": f.getMapIndex() + 1,
|
|
441
|
+
"feature_uid": fuid,
|
|
442
|
+
})
|
|
443
|
+
uids.append(fuid)
|
|
444
|
+
|
|
445
|
+
# Get feature data from lookup instead of DataFrame filtering
|
|
446
|
+
feature_data = features_lookup.get(fuid)
|
|
447
|
+
if feature_data:
|
|
448
|
+
feature_data_list.append(feature_data)
|
|
449
|
+
|
|
450
|
+
if not feature_data_list:
|
|
451
|
+
# Skip this consensus feature if no valid features found
|
|
452
|
+
continue
|
|
453
|
+
|
|
454
|
+
# Compute statistics using vectorized operations on collected data
|
|
455
|
+
# Convert to numpy arrays for faster computation
|
|
456
|
+
rt_values = np.array([fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None])
|
|
457
|
+
mz_values = np.array([fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None])
|
|
458
|
+
rt_start_values = np.array([
|
|
459
|
+
fd.get("rt_start", 0) for fd in feature_data_list if fd.get("rt_start") is not None
|
|
460
|
+
])
|
|
461
|
+
rt_end_values = np.array([fd.get("rt_end", 0) for fd in feature_data_list if fd.get("rt_end") is not None])
|
|
462
|
+
rt_delta_values = np.array([
|
|
463
|
+
fd.get("rt_delta", 0) for fd in feature_data_list if fd.get("rt_delta") is not None
|
|
464
|
+
])
|
|
465
|
+
mz_start_values = np.array([
|
|
466
|
+
fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None
|
|
467
|
+
])
|
|
468
|
+
mz_end_values = np.array([fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None])
|
|
469
|
+
inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
|
|
470
|
+
coherence_values = np.array([
|
|
471
|
+
fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None
|
|
472
|
+
])
|
|
473
|
+
prominence_values = np.array([
|
|
474
|
+
fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None
|
|
475
|
+
])
|
|
476
|
+
prominence_scaled_values = np.array([
|
|
477
|
+
fd.get("chrom_prominence_scaled", 0)
|
|
478
|
+
for fd in feature_data_list
|
|
479
|
+
if fd.get("chrom_prominence_scaled") is not None
|
|
480
|
+
])
|
|
481
|
+
height_scaled_values = np.array([
|
|
482
|
+
fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None
|
|
483
|
+
])
|
|
484
|
+
iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
|
|
485
|
+
charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])
|
|
486
|
+
|
|
487
|
+
# adduct_values
|
|
488
|
+
# Collect all adducts from feature_data_list to create consensus adduct information
|
|
489
|
+
all_adducts = []
|
|
490
|
+
adduct_masses = {}
|
|
491
|
+
|
|
492
|
+
for fd in feature_data_list:
|
|
493
|
+
# Get individual adduct and mass from each feature data (fd)
|
|
494
|
+
adduct = fd.get("adduct")
|
|
495
|
+
adduct_mass = fd.get("adduct_mass")
|
|
496
|
+
|
|
497
|
+
if adduct is not None:
|
|
498
|
+
all_adducts.append(adduct)
|
|
499
|
+
if adduct_mass is not None:
|
|
500
|
+
adduct_masses[adduct] = adduct_mass
|
|
501
|
+
|
|
502
|
+
# Calculate adduct_values for the consensus feature
|
|
503
|
+
adduct_values = []
|
|
504
|
+
if all_adducts:
|
|
505
|
+
adduct_counts = {adduct: all_adducts.count(adduct) for adduct in set(all_adducts)}
|
|
506
|
+
total_count = sum(adduct_counts.values())
|
|
507
|
+
for adduct, count in adduct_counts.items():
|
|
508
|
+
percentage = (count / total_count) * 100 if total_count > 0 else 0
|
|
509
|
+
mass = adduct_masses.get(adduct, None)
|
|
510
|
+
# Store as dict instead of tuple to avoid type confusion
|
|
511
|
+
adduct_values.append({
|
|
512
|
+
"adduct": str(adduct),
|
|
513
|
+
"count": int(count),
|
|
514
|
+
"percentage": float(round(percentage, 2)),
|
|
515
|
+
"mass": float(mass) if mass is not None else None
|
|
516
|
+
})
|
|
517
|
+
|
|
518
|
+
# Sort adduct_values by count in descending order
|
|
519
|
+
adduct_values.sort(key=lambda x: x["count"], reverse=True) # type: ignore[arg-type,return-value]
|
|
520
|
+
# Store adduct_values for use in metadata
|
|
521
|
+
consensus_adduct_values = adduct_values
|
|
522
|
+
|
|
523
|
+
# Calculate number of MS2 spectra
|
|
524
|
+
ms2_count = 0
|
|
525
|
+
for fd in feature_data_list:
|
|
526
|
+
ms2_scans = fd.get("ms2_scans")
|
|
527
|
+
if ms2_scans is not None:
|
|
528
|
+
ms2_count += len(ms2_scans)
|
|
529
|
+
|
|
530
|
+
metadata_list.append({
|
|
531
|
+
"consensus_uid": int(i), # "consensus_id": i,
|
|
532
|
+
"consensus_id": str(feature.getUniqueId()),
|
|
533
|
+
"quality": round(float(feature.getQuality()), 3),
|
|
534
|
+
"number_samples": len(feature_data_list),
|
|
535
|
+
# "number_ext": int(len(features_list)),
|
|
536
|
+
"rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
|
|
537
|
+
"mz": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
|
|
538
|
+
"rt_min": round(float(np.min(rt_values)), 3) if len(rt_values) > 0 else 0.0,
|
|
539
|
+
"rt_max": round(float(np.max(rt_values)), 3) if len(rt_values) > 0 else 0.0,
|
|
540
|
+
"rt_mean": round(float(np.mean(rt_values)), 3) if len(rt_values) > 0 else 0.0,
|
|
541
|
+
"rt_start_mean": round(float(np.mean(rt_start_values)), 3) if len(rt_start_values) > 0 else 0.0,
|
|
542
|
+
"rt_end_mean": round(float(np.mean(rt_end_values)), 3) if len(rt_end_values) > 0 else 0.0,
|
|
543
|
+
"rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3) if len(rt_delta_values) > 0 else 0.0,
|
|
544
|
+
"mz_min": round(float(np.min(mz_values)), 4) if len(mz_values) > 0 else 0.0,
|
|
545
|
+
"mz_max": round(float(np.max(mz_values)), 4) if len(mz_values) > 0 else 0.0,
|
|
546
|
+
"mz_mean": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
|
|
547
|
+
"mz_start_mean": round(float(np.mean(mz_start_values)), 4) if len(mz_start_values) > 0 else 0.0,
|
|
548
|
+
"mz_end_mean": round(float(np.mean(mz_end_values)), 4) if len(mz_end_values) > 0 else 0.0,
|
|
549
|
+
"inty_mean": round(float(np.mean(inty_values)), 0) if len(inty_values) > 0 else 0.0,
|
|
550
|
+
"bl": -1.0,
|
|
551
|
+
"chrom_coherence_mean": round(float(np.mean(coherence_values)), 3) if len(coherence_values) > 0 else 0.0,
|
|
552
|
+
"chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
|
|
553
|
+
"chrom_prominence_scaled_mean": round(
|
|
554
|
+
float(np.mean(prominence_scaled_values)),
|
|
555
|
+
3,
|
|
556
|
+
)
|
|
557
|
+
if len(prominence_scaled_values) > 0
|
|
558
|
+
else 0.0,
|
|
559
|
+
"chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3)
|
|
560
|
+
if len(height_scaled_values) > 0
|
|
561
|
+
else 0.0,
|
|
562
|
+
"iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
|
|
563
|
+
"charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
|
|
564
|
+
"number_ms2": int(ms2_count),
|
|
565
|
+
"adducts": consensus_adduct_values if consensus_adduct_values else [], # Ensure it's always a list
|
|
566
|
+
})
|
|
567
|
+
|
|
568
|
+
consensus_mapping_df = pl.DataFrame(consensus_mapping)
|
|
569
|
+
# remove all rows in consensus_mapping_df where consensus_id is not in self.featured_df['uid']
|
|
570
|
+
l1 = len(consensus_mapping_df)
|
|
571
|
+
consensus_mapping_df = consensus_mapping_df.filter(
|
|
572
|
+
pl.col("feature_uid").is_in(self.features_df["feature_uid"].to_list()),
|
|
573
|
+
)
|
|
574
|
+
self.logger.debug(
|
|
575
|
+
f"Filtered {l1 - len(consensus_mapping_df)} orphan features from maps.",
|
|
576
|
+
)
|
|
577
|
+
self.consensus_mapping_df = consensus_mapping_df
|
|
578
|
+
self.consensus_df = pl.DataFrame(metadata_list, strict=False)
|
|
579
|
+
|
|
580
|
+
if min_samples is None:
|
|
581
|
+
min_samples = 1
|
|
582
|
+
if min_samples < 1:
|
|
583
|
+
min_samples = int(min_samples * len(self.samples_df))
|
|
584
|
+
# filter out consensus features with less than min_samples features
|
|
585
|
+
l1 = len(self.consensus_df)
|
|
586
|
+
self.consensus_df = self.consensus_df.filter(
|
|
587
|
+
pl.col("number_samples") >= min_samples,
|
|
588
|
+
)
|
|
589
|
+
self.logger.debug(
|
|
590
|
+
f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
|
|
591
|
+
)
|
|
592
|
+
# filter out consensus mapping with less than min_samples features
|
|
593
|
+
self.consensus_mapping_df = self.consensus_mapping_df.filter(
|
|
594
|
+
pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
|
|
595
|
+
)
|
|
596
|
+
|
|
597
|
+
self.consensus_map = consensus_map
|
|
598
|
+
# calculate the completeness of the consensus map
|
|
599
|
+
c = len(self.consensus_mapping_df) / len(self.consensus_df) / len(self.samples_df)
|
|
600
|
+
self.logger.info(
|
|
601
|
+
f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
|
|
602
|
+
)
|
|
603
|
+
if link_ms2:
|
|
604
|
+
self.find_ms2()
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
# Backward compatibility alias
|
|
608
|
+
find_consensus = merge
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
def find_ms2(self, **kwargs):
|
|
612
|
+
"""
|
|
613
|
+
Links MS2 spectra to consensus features and stores the result in self.consensus_ms2.
|
|
614
|
+
|
|
615
|
+
Parameters:
|
|
616
|
+
**kwargs: Keyword arguments for MS2 linking parameters. Can include:
|
|
617
|
+
- A find_ms2_defaults instance to set all parameters at once
|
|
618
|
+
- Individual parameter names and values (see find_ms2_defaults for details)
|
|
619
|
+
"""
|
|
620
|
+
# Reset consensus_ms2 DataFrame at the start
|
|
621
|
+
self.consensus_ms2 = pl.DataFrame()
|
|
622
|
+
|
|
623
|
+
# parameters initialization
|
|
624
|
+
params = find_ms2_defaults()
|
|
625
|
+
for key, value in kwargs.items():
|
|
626
|
+
if isinstance(value, find_ms2_defaults):
|
|
627
|
+
params = value
|
|
628
|
+
self.logger.debug("Using provided find_ms2_defaults parameters")
|
|
629
|
+
else:
|
|
630
|
+
if hasattr(params, key):
|
|
631
|
+
if params.set(key, value, validate=True):
|
|
632
|
+
self.logger.debug(f"Updated parameter {key} = {value}")
|
|
633
|
+
else:
|
|
634
|
+
self.logger.warning(
|
|
635
|
+
f"Failed to set parameter {key} = {value} (validation failed)",
|
|
636
|
+
)
|
|
637
|
+
else:
|
|
638
|
+
self.logger.debug(f"Unknown parameter {key} ignored")
|
|
639
|
+
# end of parameter initialization
|
|
640
|
+
|
|
641
|
+
# Store parameters in the Study object
|
|
642
|
+
self.store_history(["find_ms2"], params.to_dict())
|
|
643
|
+
self.logger.debug("Parameters stored to find_ms2")
|
|
644
|
+
|
|
645
|
+
data = []
|
|
646
|
+
if self.consensus_mapping_df.is_empty():
|
|
647
|
+
self.logger.error(
|
|
648
|
+
"No consensus mapping found. Please run merge() first.",
|
|
649
|
+
)
|
|
650
|
+
return
|
|
651
|
+
self.logger.info("Linking MS2 spectra to consensus features...")
|
|
652
|
+
|
|
653
|
+
# Build fast lookup for feature_uid to features_df row data
|
|
654
|
+
feats = self.features_df
|
|
655
|
+
feature_lookup = {}
|
|
656
|
+
relevant_cols = [
|
|
657
|
+
"ms2_specs",
|
|
658
|
+
"ms2_scans",
|
|
659
|
+
"inty",
|
|
660
|
+
"chrom_coherence",
|
|
661
|
+
"chrom_prominence_scaled",
|
|
662
|
+
]
|
|
663
|
+
for row in feats.iter_rows(named=True):
|
|
664
|
+
feature_uid = row["feature_uid"]
|
|
665
|
+
feature_lookup[feature_uid] = {col: row[col] for col in relevant_cols if col in feats.columns}
|
|
666
|
+
tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
|
|
667
|
+
|
|
668
|
+
# Process consensus mapping in batch
|
|
669
|
+
for mapping_row in tqdm(
|
|
670
|
+
self.consensus_mapping_df.iter_rows(named=True),
|
|
671
|
+
total=self.consensus_mapping_df.shape[0],
|
|
672
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}MS2 spectra",
|
|
673
|
+
disable=tdqm_disable,
|
|
674
|
+
):
|
|
675
|
+
feature_uid = mapping_row["feature_uid"]
|
|
676
|
+
feature_data = feature_lookup.get(feature_uid)
|
|
677
|
+
if feature_data is None or feature_data.get("ms2_specs") is None:
|
|
678
|
+
continue
|
|
679
|
+
ms2_specs = feature_data["ms2_specs"]
|
|
680
|
+
ms2_scans = feature_data["ms2_scans"]
|
|
681
|
+
inty = feature_data.get("inty")
|
|
682
|
+
chrom_coherence = feature_data.get("chrom_coherence")
|
|
683
|
+
chrom_prominence_scaled = feature_data.get("chrom_prominence_scaled")
|
|
684
|
+
for j in range(len(ms2_specs)):
|
|
685
|
+
spec = ms2_specs[j]
|
|
686
|
+
scanid = ms2_scans[j]
|
|
687
|
+
data.append({
|
|
688
|
+
"consensus_uid": int(mapping_row["consensus_uid"]),
|
|
689
|
+
"feature_uid": int(mapping_row["feature_uid"]),
|
|
690
|
+
"sample_uid": int(mapping_row["sample_uid"]),
|
|
691
|
+
"scan_id": int(scanid),
|
|
692
|
+
"energy": round(spec.energy, 1) if hasattr(spec, "energy") and spec.energy is not None else None,
|
|
693
|
+
"prec_inty": round(inty, 0) if inty is not None else None,
|
|
694
|
+
"prec_coherence": round(chrom_coherence, 3) if chrom_coherence is not None else None,
|
|
695
|
+
"prec_prominence_scaled": round(chrom_prominence_scaled, 3)
|
|
696
|
+
if chrom_prominence_scaled is not None
|
|
697
|
+
else None,
|
|
698
|
+
"number_frags": len(spec.mz),
|
|
699
|
+
"spec": spec,
|
|
700
|
+
})
|
|
701
|
+
self.consensus_ms2 = pl.DataFrame(data)
|
|
702
|
+
if not self.consensus_ms2.is_empty():
|
|
703
|
+
unique_consensus_features = self.consensus_ms2["consensus_uid"].n_unique()
|
|
704
|
+
else:
|
|
705
|
+
unique_consensus_features = 0
|
|
706
|
+
self.logger.info(
|
|
707
|
+
f"Linking completed. {len(self.consensus_ms2)} MS2 spectra associated to {unique_consensus_features} consensus features.",
|
|
708
|
+
)
|
|
709
|
+
|
|
710
|
+
|
|
711
|
+
## TODO these are not modelled the same way as other ranges, harmonize for tuples
|
|
712
|
+
def filter_consensus(
|
|
713
|
+
self,
|
|
714
|
+
inplace=True,
|
|
715
|
+
number_samples=None,
|
|
716
|
+
quality=None,
|
|
717
|
+
coherence=None,
|
|
718
|
+
):
|
|
719
|
+
if self.consensus_df is None:
|
|
720
|
+
self.logger.error("No consensus found.")
|
|
721
|
+
return
|
|
722
|
+
cons = self.consensus_df if inplace else self.consensus_df.copy()
|
|
723
|
+
l = len(cons)
|
|
724
|
+
self.logger.info(f"Filtering consensus features with {l} entries...")
|
|
725
|
+
if coherence is not None:
|
|
726
|
+
if "chrom_coherence" not in cons.columns:
|
|
727
|
+
self.logger.warning("No coherence data found in features.")
|
|
728
|
+
else:
|
|
729
|
+
if isinstance(coherence, tuple) and len(coherence) == 2:
|
|
730
|
+
min_coherence, max_coherence = coherence
|
|
731
|
+
cons = cons[(cons["chrom_coherence"] >= min_coherence) & (cons["chrom_coherence"] <= max_coherence)]
|
|
732
|
+
else:
|
|
733
|
+
cons = cons[cons["chrom_coherence"] >= coherence]
|
|
734
|
+
l2 = len(cons)
|
|
735
|
+
self.logger.info(
|
|
736
|
+
f"Filtered {l - l2} entries based on coherence. Remaining {l2} entries.",
|
|
737
|
+
)
|
|
738
|
+
|
|
739
|
+
if quality is not None:
|
|
740
|
+
if isinstance(quality, tuple) and len(quality) == 2:
|
|
741
|
+
min_quality, max_quality = quality
|
|
742
|
+
cons = cons[(cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)]
|
|
743
|
+
else:
|
|
744
|
+
cons = cons[cons["quality"] >= quality]
|
|
745
|
+
l3 = len(cons)
|
|
746
|
+
self.logger.info(
|
|
747
|
+
f"Filtered {l2 - l3} entries based on quality. Remaining {l3} entries.",
|
|
748
|
+
)
|
|
749
|
+
|
|
750
|
+
if number_samples is not None:
|
|
751
|
+
if isinstance(number_samples, tuple) and len(number_samples) == 2:
|
|
752
|
+
min_number, max_number = number_samples
|
|
753
|
+
cons = cons[(cons["number_samples"] >= min_number) & (cons["number_samples"] <= max_number)]
|
|
754
|
+
else:
|
|
755
|
+
cons = cons[cons["number_samples"] >= number_samples]
|
|
756
|
+
l4 = len(cons)
|
|
757
|
+
self.logger.info(
|
|
758
|
+
f"Filtered {l3 - l4} entries based on number of samples. Remaining {l4} entries.",
|
|
759
|
+
)
|
|
760
|
+
|
|
761
|
+
self.logger.info(f"Filtering completed. {len(cons)} entries remaining.")
|
|
762
|
+
|
|
763
|
+
if inplace:
|
|
764
|
+
self.consensus_df = cons
|
|
765
|
+
else:
|
|
766
|
+
return cons
|
|
767
|
+
|
|
768
|
+
|
|
## TODO is uid supposed to be a list? rt_tol 0?
def _integrate_chrom_impl(self, **kwargs):
    """
    Integrate the intensity of every feature belonging to the selected consensus features.

    Parameters:
        **kwargs: Keyword arguments for integration parameters. Can include:
            - An integrate_defaults instance to set all parameters at once
            - Individual parameter names and values (see integrate_defaults for details)

    Key Parameters:
        uids: List of consensus UIDs to integrate (default: all consensus features).
        rt_tol: RT tolerance applied to the mean RT start/end when searching for
            integration boundaries.
    """
    # parameters initialization
    params = integrate_defaults()
    for key, value in kwargs.items():
        if isinstance(value, integrate_defaults):
            params = value
            self.logger.debug("Using provided integrate_defaults parameters")
        else:
            if hasattr(params, key):
                if params.set(key, value, validate=True):
                    self.logger.debug(f"Updated parameter {key} = {value}")
                else:
                    self.logger.warning(
                        f"Failed to set parameter {key} = {value} (validation failed)",
                    )
            else:
                self.logger.debug(f"Unknown parameter {key} ignored")
    # end of parameter initialization

    # Store parameters in the Study object
    self.store_history(["integrate_chrom"], params.to_dict())
    self.logger.debug("Parameters stored to integrate_chrom")

    # Get parameter values for use in the method
    uids = params.get("uids")
    rt_tol = params.get("rt_tol")

    if self.consensus_map is None:
        self.logger.error("No consensus map found.")
        return
    if uids is None:
        # get all consensus uids from consensus_df
        ids = self.consensus_df["consensus_uid"].to_list()
    else:
        # keep only uids that are present in consensus_df
        # (test membership against a set instead of rebuilding the list per item)
        known_uids = set(self.consensus_df["consensus_uid"].to_list())
        ids = [i for i in uids if i in known_uids]

    # Ensure chrom_area column is Float64 to avoid dtype conflicts
    if "chrom_area" in self.features_df.columns:
        self.features_df = self.features_df.with_columns(
            pl.col("chrom_area").cast(pl.Float64, strict=False),
        )

    # Merge consensus_mapping with consensus_df to get rt_start_mean and rt_end_mean
    # (Polars join rather than a pandas merge)
    consensus_subset = self.consensus_df.select([
        "consensus_uid",
        "rt_start_mean",
        "rt_end_mean",
    ])
    df1 = self.consensus_mapping_df.join(
        consensus_subset,
        on="consensus_uid",
        how="left",
    )
    df1 = df1.filter(pl.col("consensus_uid").is_in(ids))
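    # At this point df1 is expected to hold one row per (consensus_uid, feature_uid)
    # pair that survived the uid filter, carrying the mapping columns plus the
    # consensus-level rt_start_mean / rt_end_mean used below to seed the per-feature
    # integration boundaries.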

    # Build a fast lookup from feature_uid to row index in features_df
    # (Polars has no pandas-style index, so we key on row position)
    feature_uid_to_row = {}
    for i, row_dict in enumerate(self.features_df.iter_rows(named=True)):
        if "feature_uid" in row_dict:
            feature_uid_to_row[row_dict["feature_uid"]] = i
        elif "uid" in row_dict:  # fallback column name
            feature_uid_to_row[row_dict["uid"]] = i

    # Prepare lists for batch update
    update_rows = []
    chroms: list = []
    rt_starts: list[float] = []
    rt_ends: list[float] = []
    rt_deltas: list[float] = []
    chrom_areas = []

    self.logger.debug(f"Integrating {df1.shape[0]} features using consensus...")
    tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
    for row in tqdm(
        df1.iter_rows(named=True),
        total=df1.shape[0],
        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Integrate EICs by consensus",
        disable=tqdm_disable,
    ):
        feature_uid = row["feature_uid"]
        row_idx = feature_uid_to_row.get(feature_uid)
        if row_idx is None:
            continue

        # Get the feature row from the Polars DataFrame
        feature_row = self.features_df.row(row_idx, named=True)
        # get the chromatogram for the feature
        chrom = feature_row["chrom"]
        if chrom is None or len(chrom) == 0:
            update_rows.append(row_idx)
            chroms.append(None)
            rt_starts.append(None)
            rt_ends.append(None)
            rt_deltas.append(None)
            chrom_areas.append(-1.0)
            continue
        ## TODO expose parameters
        rt_start = _find_closest_valley(
            chrom,
            row["rt_start_mean"] - rt_tol,
            dir="left",
            threshold=0.9,
        )
        rt_end = _find_closest_valley(
            chrom,
            row["rt_end_mean"] + rt_tol,
            dir="right",
            threshold=0.9,
        )
        chrom.feature_start = rt_start
        chrom.feature_end = rt_end
        chrom.integrate()
        update_rows.append(row_idx)
        chroms.append(chrom)
        rt_starts.append(rt_start)
        rt_ends.append(rt_end)
        rt_deltas.append(rt_end - rt_start)
        chrom_areas.append(float(chrom.feature_area))

    # Batch update DataFrame - Polars style
    if update_rows:
        # Map each row index to its new values
        row_to_chrom = {update_rows[i]: chroms[i] for i in range(len(update_rows))}
        row_to_rt_start = {update_rows[i]: rt_starts[i] for i in range(len(update_rows))}
        row_to_rt_end = {update_rows[i]: rt_ends[i] for i in range(len(update_rows))}
        row_to_rt_delta = {update_rows[i]: rt_deltas[i] for i in range(len(update_rows))}
        row_to_chrom_area = {
            update_rows[i]: float(chrom_areas[i]) if chrom_areas[i] is not None else 0.0
            for i in range(len(update_rows))
        }

        # Use with_row_index to create a temporary row index column
        df_with_index = self.features_df.with_row_index("__row_idx")

        # Rows to update are selected by this mask
        update_mask = pl.col("__row_idx").is_in(update_rows)

        # Update columns conditionally
        try:
            self.features_df = df_with_index.with_columns([
                # Update chrom column - use when() to touch only the selected rows
                pl.when(update_mask)
                .then(
                    pl.col("__row_idx").map_elements(
                        lambda x: row_to_chrom.get(x, None),
                        return_dtype=pl.Object,
                    ),
                )
                .otherwise(pl.col("chrom"))
                .alias("chrom"),
                # Update rt_start column
                pl.when(update_mask)
                .then(
                    pl.col("__row_idx").map_elements(
                        lambda x: row_to_rt_start.get(x, None),
                        return_dtype=pl.Float64,
                    ),
                )
                .otherwise(pl.col("rt_start"))
                .alias("rt_start"),
                # Update rt_end column
                pl.when(update_mask)
                .then(
                    pl.col("__row_idx").map_elements(
                        lambda x: row_to_rt_end.get(x, None),
                        return_dtype=pl.Float64,
                    ),
                )
                .otherwise(pl.col("rt_end"))
                .alias("rt_end"),
                # Update rt_delta column
                pl.when(update_mask)
                .then(
                    pl.col("__row_idx").map_elements(
                        lambda x: row_to_rt_delta.get(x, None),
                        return_dtype=pl.Float64,
                    ),
                )
                .otherwise(pl.col("rt_delta"))
                .alias("rt_delta"),
                # Update chrom_area column
                pl.when(update_mask)
                .then(
                    pl.col("__row_idx").map_elements(
                        lambda x: row_to_chrom_area.get(x, 0.0),
                        return_dtype=pl.Float64,
                    ),
                )
                .otherwise(pl.col("chrom_area"))
                .alias("chrom_area"),
            ]).drop("__row_idx")  # Remove the temporary row index column

            self.logger.debug(
                f"Integration completed. Updated {len(update_rows)} features with chromatogram data.",
            )
        except Exception as e:
            self.logger.error(f"Failed to update features DataFrame: {e}")
    else:
        self.logger.debug("No features were updated during integration.")

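The when/then/otherwise plus map_elements idiom above generalizes to any row-targeted update. A minimal, self-contained sketch on a toy frame (illustrative only, not masster API):

import polars as pl

# Update a value column for an arbitrary set of row positions
# without rewriting the whole frame.
df = pl.DataFrame({"name": ["a", "b", "c", "d"], "value": [1.0, 2.0, 3.0, 4.0]})
new_values = {0: 10.0, 2: 30.0}  # row index -> replacement value

df = (
    df.with_row_index("__row_idx")
    .with_columns(
        pl.when(pl.col("__row_idx").is_in(list(new_values)))
        .then(
            pl.col("__row_idx").map_elements(
                lambda i: new_values.get(i),
                return_dtype=pl.Float64,
            ),
        )
        .otherwise(pl.col("value"))
        .alias("value"),
    )
    .drop("__row_idx")
)
print(df)  # rows 0 and 2 now hold 10.0 and 30.0
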
def integrate(self, **kwargs):
    """
    Integrate chromatograms across consensus features.

    Parameters:
        **kwargs: Keyword arguments for integration parameters. Can include:
            - An integrate_defaults instance to set all parameters at once
            - Individual parameter names and values (see integrate_defaults for details)

    Key Parameters:
        uids (Optional[list]): List of consensus UIDs to integrate (None for all).
        rt_tol (float): RT tolerance for integration boundaries (default: 0.0).
    """
    # parameters initialization
    params = integrate_defaults()
    for key, value in kwargs.items():
        if isinstance(value, integrate_defaults):
            params = value
            self.logger.debug("Using provided integrate_defaults parameters")
        else:
            if hasattr(params, key):
                if params.set(key, value, validate=True):
                    self.logger.debug(f"Updated parameter {key} = {value}")
                else:
                    self.logger.warning(
                        f"Failed to set parameter {key} = {value} (validation failed)",
                    )
            else:
                self.logger.debug(f"Unknown parameter {key} ignored")
    # end of parameter initialization

    # Store parameters in the Study object
    self.store_history(["integrate"], params.to_dict())
    self.logger.debug("Parameters stored to integrate")

    # Delegate to the implementation with the extracted parameters
    return _integrate_chrom_impl(
        self,
        uids=params.get("uids"),
        rt_tol=params.get("rt_tol"),
    )


# Backward compatibility alias
integrate_chrom = integrate

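A hedged usage sketch of the kwargs protocol shared by integrate() and _integrate_chrom_impl(); `study` and the tolerance values are assumptions for illustration:

# Hypothetical usage; assumes `study` has a consensus map built and that
# integrate_defaults is importable from masster.study.defaults.
study.integrate()                       # integrate every consensus feature
study.integrate(rt_tol=2.0)             # widen the boundary search window

params = integrate_defaults()           # or configure via a defaults instance
params.set("rt_tol", 1.0, validate=True)
study.integrate(params=params)          # any kwarg holding a defaults instance is adopted
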
def _find_closest_valley(chrom, rt, dir="left", threshold=0.9):
    # ensure rt and inty are float64
    chrom.rt = chrom.rt.astype(np.float64)
    chrom.inty = chrom.inty.astype(np.float64)
    # find the index closest to rt in chrom.rt
    idx = np.abs(chrom.rt - rt).argmin()
    if dir == "left":
        inty = np.inf
        # walk left from idx towards the start of the trace until the intensity stops falling
        for i in range(idx, -1, -1):  # stop value -1 so index 0 is also visited
            if chrom.inty[i] < inty * threshold:
                idx = i
                inty = chrom.inty[i]
            else:
                break
    if dir == "right":
        inty = np.inf
        # walk right from idx towards the end of the trace until the intensity stops falling
        for i in range(idx, len(chrom.inty)):
            if chrom.inty[i] < inty * threshold:
                idx = i
                inty = chrom.inty[i]
            else:
                break
    return chrom.rt[idx]
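A worked example of the valley walk: starting from the point nearest rt, the loop keeps moving while each next intensity drops below threshold times the last accepted one, and stops at the first rise. The `_Chrom` class below is a hypothetical stand-in for the package's chromatogram object, used only to exercise the function:

import numpy as np

class _Chrom:
    # Minimal stand-in exposing the rt/inty arrays the function reads.
    def __init__(self, rt, inty):
        self.rt = np.asarray(rt)
        self.inty = np.asarray(inty)

c = _Chrom(rt=[0, 1, 2, 3, 4, 5], inty=[5.0, 2.0, 9.0, 4.0, 1.0, 3.0])
# Starting near rt=3 and walking right: 4.0 -> 1.0 keeps falling (1.0 < 0.9 * 4.0),
# but 3.0 is not < 0.9 * 1.0, so the walk stops and rt=4 is the right boundary.
print(_find_closest_valley(c, rt=3, dir="right", threshold=0.9))  # -> 4.0
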