masster-0.2.5-py3-none-any.whl → masster-0.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/__init__.py +27 -27
- masster/_version.py +17 -17
- masster/chromatogram.py +497 -503
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
- masster/logger.py +318 -244
- masster/sample/__init__.py +9 -9
- masster/sample/defaults/__init__.py +15 -15
- masster/sample/defaults/find_adducts_def.py +325 -325
- masster/sample/defaults/find_features_def.py +366 -366
- masster/sample/defaults/find_ms2_def.py +285 -285
- masster/sample/defaults/get_spectrum_def.py +314 -318
- masster/sample/defaults/sample_def.py +374 -378
- masster/sample/h5.py +1321 -1297
- masster/sample/helpers.py +833 -364
- masster/sample/lib.py +762 -0
- masster/sample/load.py +1220 -1187
- masster/sample/parameters.py +131 -131
- masster/sample/plot.py +1685 -1622
- masster/sample/processing.py +1402 -1416
- masster/sample/quant.py +209 -0
- masster/sample/sample.py +393 -387
- masster/sample/sample5_schema.json +181 -181
- masster/sample/save.py +737 -736
- masster/sample/sciex.py +1213 -0
- masster/spectrum.py +1287 -1319
- masster/study/__init__.py +9 -9
- masster/study/defaults/__init__.py +21 -19
- masster/study/defaults/align_def.py +267 -267
- masster/study/defaults/export_def.py +41 -40
- masster/study/defaults/fill_chrom_def.py +264 -264
- masster/study/defaults/fill_def.py +260 -0
- masster/study/defaults/find_consensus_def.py +256 -256
- masster/study/defaults/find_ms2_def.py +163 -163
- masster/study/defaults/integrate_chrom_def.py +225 -225
- masster/study/defaults/integrate_def.py +221 -0
- masster/study/defaults/merge_def.py +256 -0
- masster/study/defaults/study_def.py +272 -269
- masster/study/export.py +674 -287
- masster/study/h5.py +1406 -886
- masster/study/helpers.py +1713 -433
- masster/study/helpers_optimized.py +317 -0
- masster/study/load.py +1231 -1078
- masster/study/parameters.py +99 -99
- masster/study/plot.py +632 -645
- masster/study/processing.py +1057 -1046
- masster/study/save.py +161 -134
- masster/study/study.py +612 -522
- masster/study/study5_schema.json +253 -241
- {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/METADATA +15 -10
- masster-0.3.1.dist-info/RECORD +59 -0
- {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/licenses/LICENSE +661 -661
- masster-0.2.5.dist-info/RECORD +0 -50
- {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/WHEEL +0 -0
- {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/entry_points.txt +0 -0
masster/study/load.py
CHANGED
@@ -1,1078 +1,1231 @@
- [1,078 removed lines: the previous implementation was not rendered by the diff viewer. The few fragments that survive (the module imports and the opening of the add() function) match the replacement version below.]
+from __future__ import annotations
+
+import os
+import concurrent.futures
+from datetime import datetime
+
+import numpy as np
+import polars as pl
+import pyopenms as oms
+
+from tqdm import tqdm
+
+from masster.chromatogram import Chromatogram
+from masster.study.defaults import fill_defaults
+from masster.sample.sample import Sample
+from masster.spectrum import Spectrum
+
+
+# Pre-import heavy modules to avoid repeated loading in add_sample()
+try:
+    import alpharaw.sciex
+
+    ALPHARAW_AVAILABLE = True
+except ImportError:
+    ALPHARAW_AVAILABLE = False
+
+try:
+    import pythonnet
+
+    PYTHONNET_AVAILABLE = True
+except ImportError:
+    PYTHONNET_AVAILABLE = False
+
+import glob
+
+
+def add(
+    self,
+    folder=None,
+    reset=False,
+    adducts=None,
+    max_files=None,
+):
+    if folder is None:
+        if self.folder is not None:
+            folder = self.folder
+        else:
+            folder = os.getcwd()
+
+    self.logger.debug(f"Adding files from: {folder}")
+
+    # Define file extensions to search for in order of priority
+    extensions = [".sample5", ".wiff", ".raw", ".mzML"]
+
+    # Check if folder contains glob patterns
+    if not any(char in folder for char in ["*", "?", "[", "]"]):
+        search_folder = folder
+    else:
+        search_folder = os.path.dirname(folder) if os.path.dirname(folder) else folder
+
+    # Blacklist to track filenames without extensions that have already been processed
+    blacklist = set()
+    counter = 0
+    not_zero = False
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+    # Search for files in order of priority
+    for ext in extensions:
+        if max_files is not None and counter >= max_files:
+            break
+
+        # Build search pattern
+        if any(char in folder for char in ["*", "?", "[", "]"]):
+            # If folder already contains glob patterns, modify the extension
+            if folder.endswith("*.sample5"):
+                pattern = folder.replace("*.sample5", f"*{ext}")
+            else:
+                pattern = os.path.join(search_folder, "**", f"*{ext}")
+        else:
+            pattern = os.path.join(search_folder, "**", f"*{ext}")
+
+        files = glob.glob(pattern, recursive=True)
+
+        if len(files) > 0:
+            # Limit files if max_files is specified
+            remaining_slots = max_files - counter if max_files is not None else len(files)
+            files = files[:remaining_slots]
+
+            self.logger.debug(f"Found {len(files)} {ext} files")
+
+            # Process files
+            for i, file in enumerate(
+                tqdm(
+                    files,
+                    total=len(files),
+                    desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add *{ext}",
+                    disable=tdqm_disable,
+                ),
+            ):
+                if max_files is not None and counter >= max_files:
+                    break
+
+                # Get filename without extension for blacklist check
+                basename = os.path.basename(file)
+                filename_no_ext = os.path.splitext(basename)[0]
+
+                # Check if this filename (without extension) is already in blacklist
+                if filename_no_ext in blacklist:
+                    self.logger.debug(f"Skipping {file} - filename already processed")
+                    continue
+
+                self.logger.debug(f"Add file {counter + 1}: {file}")
+
+                # Try to add the sample
+                try:
+                    self.add_sample(file=file, reset=reset, adducts=adducts)
+                    # If successful, add to blacklist and increment counter
+                    blacklist.add(filename_no_ext)
+                    counter += 1
+                    not_zero = True
+                except Exception as e:
+                    self.logger.warning(f"Failed to add sample {file}: {e}")
+                    continue
+
+    if max_files is not None and counter >= max_files:
+        self.logger.debug(
+            f"Reached maximum number of files to add: {max_files}. Stopping further additions.",
+        )
+
+    if not not_zero:
+        self.logger.warning(
+            f"No files found in {folder}. Please check the folder path or file patterns.",
+        )
+    else:
+        self.logger.debug(f"Successfully added {counter} samples to the study.")
+
+
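For orientation, a minimal usage sketch of the add() entry point above. It assumes add() is bound as a method of the Study class (the self parameter and the masster.study package layout suggest this); the import path and folder are illustrative, not taken from the package:

    from masster import Study  # import path assumed, not verified

    study = Study()  # constructor arguments omitted
    # Recursively scan for .sample5/.wiff/.raw/.mzML files (in that priority
    # order) and add at most 10 of them; duplicate base names are skipped.
    study.add(folder="./raw_data", max_files=10)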
+# TODO type is not used
+def add_sample(self, file, type=None, reset=False, adducts=None):
+    self.logger.debug(f"Adding: {file}")
+
+    # Extract sample name by removing any known extension
+    basename = os.path.basename(file)
+    sample_name = os.path.splitext(basename)[0]
+
+    # check if sample_name is already in the samples_df
+    if sample_name in self.samples_df["sample_name"].to_list():
+        self.logger.warning(
+            f"Sample {sample_name} already exists in the study. Skipping.",
+        )
+        return
+
+    # check if file exists
+    if not os.path.exists(file):
+        self.logger.error(f"File {file} does not exist.")
+        return
+
+    # Check for supported file extensions
+    if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
+        self.logger.error(f"File {file} is not a supported file type. Supported: .sample5, .wiff, .raw, .mzML")
+        return
+
+    # Load the sample based on file type
+    ddaobj = Sample()
+    ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
+
+    if file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
+        ddaobj.load(file)
+    else:
+        self.logger.error(f"Unsupported file format: {file}")
+        return
+    if ddaobj.features_df is None and not reset:
+        self.logger.warning(
+            f"File {file} will be newly processed.",
+        )
+        ddaobj.features = None
+
+    if ddaobj.features is None or reset:
+        ddaobj.find_features()
+        ddaobj.find_adducts(adducts=adducts)
+        ddaobj.find_ms2()
+
+    self.features_maps.append(ddaobj.features)
+
+    sample_type = "sample" if type is None else type
+    if "qc" in sample_name.lower():
+        sample_type = "qc"
+    if "blank" in sample_name.lower():
+        sample_type = "blank"
+    map_id_value = str(ddaobj.features.getUniqueId())
+
+    # Determine the final sample path based on file type
+    if file.endswith(".sample5"):
+        # If input is already .sample5, keep it in original location
+        final_sample_path = file
+        self.logger.debug(f"Using existing .sample5 file at original location: {final_sample_path}")
+
+        # Check if there's a corresponding featureXML file in the same directory
+        featurexml_path = file.replace(".sample5", ".featureXML")
+        if os.path.exists(featurexml_path):
+            self.logger.debug(f"Found corresponding featureXML file: {featurexml_path}")
+        else:
+            self.logger.debug(f"No corresponding featureXML file found at: {featurexml_path}")
+    else:
+        # For .wiff, .mzML, .raw files, save to study folder (original behavior)
+        if self.folder is not None:
+            if not os.path.exists(self.folder):
+                os.makedirs(self.folder)
+            final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+            ddaobj.save(final_sample_path)
+            self.logger.debug(f"Saved converted sample to study folder: {final_sample_path}")
+        else:
+            # If no study folder is set, save in current directory
+            final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+            ddaobj.save(final_sample_path)
+            self.logger.debug(f"Saved converted sample to current directory: {final_sample_path}")
+
+    # Count MS1 and MS2 scans from the loaded sample
+    ms1_count = 0
+    ms2_count = 0
+    if hasattr(ddaobj, 'scans_df') and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+        ms1_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 1).height)
+        ms2_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 2).height)
+
+    new_sample = pl.DataFrame(
+        {
+            "sample_uid": [int(len(self.samples_df) + 1)],
+            "sample_name": [sample_name],
+            "sample_path": [final_sample_path],  # Use the determined path
+            "sample_type": [sample_type],
+            "size": [int(ddaobj.features.size())],
+            "map_id": [map_id_value],
+            "file_source": [getattr(ddaobj, 'file_source', file)],
+            "ms1": [ms1_count],
+            "ms2": [ms2_count],
+        },
+        schema={
+            "sample_uid": pl.Int64,
+            "sample_name": pl.Utf8,
+            "sample_path": pl.Utf8,
+            "sample_type": pl.Utf8,
+            "size": pl.Int64,
+            "map_id": pl.Utf8,
+            "file_source": pl.Utf8,
+            "ms1": pl.Int64,
+            "ms2": pl.Int64,
+        },
+    )
+    self.samples_df = pl.concat([self.samples_df, new_sample])
+
+    # Optimized DataFrame operations - chain operations instead of multiple clones
+    columns_to_add = [
+        pl.lit(len(self.samples_df)).alias("sample_uid"),
+        pl.lit(False).alias("filled"),
+        pl.lit(-1.0).alias("chrom_area"),
+    ]
+
+    # Only add rt_original if it doesn't exist
+    if "rt_original" not in ddaobj.features_df.columns:
+        columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+    f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+    if self.features_df.is_empty():
+        # Create new features_df with feature_uid column
+        self.features_df = f_df.with_columns(
+            pl.int_range(pl.len()).add(1).alias("feature_uid"),
+        ).select(
+            ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
+        )
+    else:
+        offset = self.features_df["feature_uid"].max() + 1 if not self.features_df.is_empty() else 1
+        # Chain operations and add to existing DataFrame
+        f_df = f_df.with_columns(
+            pl.int_range(pl.len()).add(offset).alias("feature_uid"),
+        ).select(
+            ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
+        )
+        self.features_df = pl.concat([self.features_df, f_df])
+    self.logger.debug(
+        f"Added sample {sample_name} with {ddaobj.features.size()} features to the study.",
+    )
+
+
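Note that add_sample() infers the sample type from the file name: "qc" or "blank" anywhere in the name (case-insensitive) overrides the type argument, and since the blank check runs last, a name containing both ends up as "blank". A standalone sketch of that rule, using a hypothetical helper name:

    def infer_sample_type(sample_name: str, type_arg: str | None = None) -> str:
        # Mirrors the logic in add_sample() above (hypothetical helper).
        sample_type = "sample" if type_arg is None else type_arg
        if "qc" in sample_name.lower():
            sample_type = "qc"
        if "blank" in sample_name.lower():
            sample_type = "blank"
        return sample_type

    assert infer_sample_type("QC_pool_01") == "qc"
    assert infer_sample_type("qc_blank_wash") == "blank"  # blank wins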
+def load(self, filename=None):
+    """
+    Load a study from an HDF5 file.
+
+    Args:
+        study: The study object to load into
+        filename (str, optional): The path to the HDF5 file to load the study from.
+    """
+
+    # Handle default filename
+    if filename is None:
+        if self.folder is not None:
+            # search for *.study5 in folder
+            study5_files = glob.glob(os.path.join(self.folder, "*.study5"))
+            if study5_files:
+                filename = study5_files[-1]
+            else:
+                self.logger.error("No .study5 files found in folder")
+                return
+        else:
+            self.logger.error("Either filename or folder must be provided")
+            return
+
+    #self.logger.info(f"Loading study from {filename}")
+    self._load_study5(filename)
+    # After loading the study, check if consensus XML exists and load it
+    consensus_xml_path = filename.replace(".study5", ".consensusXML")
+    if os.path.exists(consensus_xml_path):
+        self._load_consensusXML(filename=consensus_xml_path)
+        # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
+    else:
+        self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
+    self.filename = filename
+
+
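A usage sketch for load() above (paths illustrative, Study construction assumed as before). With filename=None it takes the last *.study5 match that glob returns for the study folder (glob order, not necessarily the newest file), and it auto-loads a sibling .consensusXML when one exists:

    study = Study()               # constructor assumed
    study.folder = "./my_study"
    study.load()                  # picks ./my_study/*.study5
    # or explicitly; ./my_study/experiment.consensusXML loads automatically:
    study.load(filename="./my_study/experiment.study5")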
+def _fill_chrom_single_impl(
+    self,
+    uids=None,
+    mz_tol: float = 0.010,
+    rt_tol: float = 10.0,
+    min_samples_rel: float = 0.0,
+    min_samples_abs: int = 2,
+):
+    """Fill missing chromatograms by extracting from raw data.
+
+    Simplified version that loads one sample at a time without preloading or batching.
+
+    Args:
+        uids: Consensus UIDs to process (default: all)
+        mz_tol: m/z tolerance for extraction (default: 0.010 Da)
+        rt_tol: RT tolerance for extraction (default: 10.0 seconds)
+        min_samples_rel: Relative minimum sample threshold (default: 0.0)
+        min_samples_abs: Absolute minimum sample threshold (default: 2)
+    """
+    uids = self._get_consensus_uids(uids)
+
+    self.logger.info("Gap filling...")
+    self.logger.debug(
+        f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}",
+    )
+
+    # Apply minimum sample filters
+    min_number_rel = 1
+    min_number_abs = 1
+    if isinstance(min_samples_rel, float) and min_samples_rel > 0:
+        min_number_rel = int(min_samples_rel * len(self.samples_df))
+    if isinstance(min_samples_abs, int) and min_samples_abs > 0:
+        min_number_abs = int(min_samples_abs)
+    min_number = max(min_number_rel, min_number_abs)
+    self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
+
+    if min_number > 0:
+        original_count = len(uids)
+        uids = self.consensus_df.filter(
+            (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
+        )["consensus_uid"].to_list()
+        self.logger.debug(
+            f"Features to fill: {original_count} -> {len(uids)}",
+        )
+    self.logger.debug("Identifying missing features...")
+    # Instead of building full chromatogram matrix, identify missing consensus/sample combinations directly
+    missing_combinations = self._get_missing_consensus_sample_combinations(uids)
+    if not missing_combinations:
+        self.logger.info("No missing features found to fill.")
+        return
+
+    # Build lookup dictionaries
+    self.logger.debug("Building lookup dictionaries...")
+    consensus_info = {}
+    consensus_subset = self.consensus_df.select([
+        "consensus_uid",
+        "rt_start_mean",
+        "rt_end_mean",
+        "mz",
+        "rt",
+    ]).filter(pl.col("consensus_uid").is_in(uids))
+
+    for row in consensus_subset.iter_rows(named=True):
+        consensus_info[row["consensus_uid"]] = {
+            "rt_start_mean": row["rt_start_mean"],
+            "rt_end_mean": row["rt_end_mean"],
+            "mz": row["mz"],
+            "rt": row["rt"],
+        }
+
+    # Process each sample individually
+    # Group missing combinations by sample for efficient processing
+    missing_by_sample = {}
+    for consensus_uid, sample_uid, sample_name, sample_path in missing_combinations:
+        if sample_name not in missing_by_sample:
+            missing_by_sample[sample_name] = {
+                "sample_uid": sample_uid,
+                "sample_path": sample_path,
+                "missing_consensus_uids": [],
+            }
+        missing_by_sample[sample_name]["missing_consensus_uids"].append(consensus_uid)
+
+    new_features: list[dict] = []
+    new_mapping: list[dict] = []
+    counter = 0
+
+    self.logger.debug(
+        f"Missing features: {len(missing_combinations)} in {len(missing_by_sample)} samples...",
+    )
+
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+    for sample_name, sample_info in tqdm(
+        missing_by_sample.items(),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}File",
+        disable=tdqm_disable,
+    ):
+        # Load this sample
+        sample_uid = sample_info["sample_uid"]
+        sample_path = sample_info["sample_path"]
+        missing_consensus_uids = sample_info["missing_consensus_uids"]
+
+        try:
+            # self.logger.debug(f"Loading sample: {sample_path}")
+            file = Sample()
+            file.logger_update("WARNING")
+            file.load(sample_path)
+        except Exception as e:
+            self.logger.warning(f"Failed to load sample {sample_name}: {e}")
+            continue
+
+        self.logger.debug(
+            f"Sample {sample_name}: Processing {len(missing_consensus_uids)} missing features",
+        )
+
+        # Process each missing feature
+        for consensus_uid in missing_consensus_uids:
+            cons = consensus_info[consensus_uid]
+            mz = cons["mz"]
+            rt = cons["rt"]
+            rt_start_mean = cons["rt_start_mean"]
+            rt_end_mean = cons["rt_end_mean"]
+
+            # Filter MS1 data for this feature
+            if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
+                d = file.ms1_df.filter(
+                    (pl.col("mz") >= mz - mz_tol)
+                    & (pl.col("mz") <= mz + mz_tol)
+                    & (pl.col("rt") >= rt_start_mean - rt_tol)
+                    & (pl.col("rt") <= rt_end_mean + rt_tol),
+                )
+            else:
+                d = pl.DataFrame()
+
+            # Create chromatogram
+            if d.is_empty():
+                self.logger.debug(
+                    f"Feature {consensus_uid}: No MS1 data found, creating empty chromatogram",
+                )
+                eic = Chromatogram(
+                    rt=np.array([rt_start_mean, rt_end_mean]),
+                    inty=np.array([0.0, 0.0]),
+                    label=f"EIC mz={mz:.4f}",
+                    file=sample_path,
+                    mz=mz,
+                    mz_tol=mz_tol,
+                    feature_start=rt_start_mean,
+                    feature_end=rt_end_mean,
+                    feature_apex=rt,
+                )
+                max_inty = 0.0
+                area = 0.0
+            else:
+                self.logger.debug(
+                    f"Feature {consensus_uid}: Found {len(d)} MS1 points, creating EIC",
+                )
+                eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
+
+                if len(eic_rt) > 4:
+                    eic = Chromatogram(
+                        eic_rt["rt"].to_numpy(),
+                        eic_rt["inty"].to_numpy(),
+                        label=f"EIC mz={mz:.4f}",
+                        file=sample_path,
+                        mz=mz,
+                        mz_tol=mz_tol,
+                        feature_start=rt_start_mean,
+                        feature_end=rt_end_mean,
+                        feature_apex=rt,
+                    ).find_peaks()
+                    max_inty = np.max(eic.inty)
+                    area = eic.feature_area
+                else:
+                    eic = Chromatogram(
+                        eic_rt["rt"].to_numpy(),
+                        eic_rt["inty"].to_numpy(),
+                        label=f"EIC mz={mz:.4f}",
+                        file=sample_path,
+                        mz=mz,
+                        mz_tol=mz_tol,
+                        feature_start=rt_start_mean,
+                        feature_end=rt_end_mean,
+                        feature_apex=rt,
+                    )
+                    max_inty = 0.0
+                    area = 0.0
+
+            # Generate feature UID
+            feature_uid = (
+                self.features_df["feature_uid"].max() + len(new_features) + 1
+                if not self.features_df.is_empty()
+                else len(new_features) + 1
+            )
+
+            # Create new feature entry
+            new_feature = {
+                "sample_uid": sample_uid,
+                "feature_uid": feature_uid,
+                "feature_id": None,
+                "mz": mz,
+                "rt": rt,
+                "rt_original": None,
+                "rt_start": rt_start_mean,
+                "rt_end": rt_end_mean,
+                "rt_delta": rt_end_mean - rt_start_mean,
+                "mz_start": None,
+                "mz_end": None,
+                "inty": max_inty,
+                "quality": None,
+                "charge": None,
+                "iso": None,
+                "iso_of": None,
+                "adduct": None,
+                "adduct_mass": None,
+                "adduct_group": None,
+                "chrom": eic,
+                "chrom_coherence": None,
+                "chrom_prominence": None,
+                "chrom_prominence_scaled": None,
+                "chrom_height_scaled": None,
+                "ms2_scans": None,
+                "ms2_specs": None,
+                "filled": True,
+                "chrom_area": area,
+            }
+
+            new_features.append(new_feature)
+            new_mapping.append({
+                "consensus_uid": consensus_uid,
+                "sample_uid": sample_uid,
+                "feature_uid": feature_uid,
+            })
+            counter += 1
+
+    # Add new features to DataFrames
+    self.logger.debug(f"Adding {len(new_features)} new features to DataFrame...")
+    if new_features:
+        # Create properly formatted rows
+        rows_to_add = []
+        for feature_dict in new_features:
+            new_row = {}
+            for col in self.features_df.columns:
+                if col in feature_dict:
+                    new_row[col] = feature_dict[col]
+                else:
+                    new_row[col] = None
+            rows_to_add.append(new_row)
+
+        # Create and add new DataFrame
+        new_df = pl.from_dicts(rows_to_add)
+
+        # Cast columns to match existing schema
+        cast_exprs = []
+        for col in self.features_df.columns:
+            existing_dtype = self.features_df[col].dtype
+            cast_exprs.append(pl.col(col).cast(existing_dtype, strict=False))
+
+        new_df = new_df.with_columns(cast_exprs)
+        self.features_df = self.features_df.vstack(new_df)
+
+        # Add consensus mapping
+        new_mapping_df = pl.DataFrame(new_mapping)
+        self.consensus_mapping_df = pl.concat(
+            [self.consensus_mapping_df, new_mapping_df],
+            how="diagonal",
+        )
+
+    self.logger.info(f"Filled {counter} chromatograms from raw data.")
+
+
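The gating arithmetic in _fill_chrom_single_impl() above combines the relative and absolute thresholds by taking their maximum. A worked sketch of the same computation with illustrative numbers:

    n_samples = 40          # stands in for len(self.samples_df)
    min_samples_rel = 0.1   # int(0.1 * 40) = 4
    min_samples_abs = 2
    min_number = max(int(min_samples_rel * n_samples), min_samples_abs)
    assert min_number == 4  # only consensus features seen in >= 4 samples get filled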
+def fill_single(self, **kwargs):
+    """Fill missing chromatograms by extracting from raw data.
+
+    Simplified version that loads one sample at a time without preloading or batching.
+
+    Parameters:
+        **kwargs: Keyword arguments for fill_single parameters. Can include:
+            - A fill_defaults instance to set all parameters at once
+            - Individual parameter names and values (see fill_defaults for details)
+
+    Key Parameters:
+        uids: Consensus UIDs to process (default: all)
+        mz_tol: m/z tolerance for extraction (default: 0.010 Da)
+        rt_tol: RT tolerance for extraction (default: 10.0 seconds)
+        min_samples_rel: Relative minimum sample threshold (default: 0.0)
+        min_samples_abs: Absolute minimum sample threshold (default: 2)
+    """
+    # parameters initialization
+    from masster.study.defaults import fill_defaults
+    params = fill_defaults()
+
+    for key, value in kwargs.items():
+        if isinstance(value, fill_defaults):
+            params = value
+            self.logger.debug("Using provided fill_defaults parameters")
+        else:
+            if hasattr(params, key):
+                if params.set(key, value, validate=True):
+                    self.logger.debug(f"Updated parameter {key} = {value}")
+                else:
+                    self.logger.warning(
+                        f"Failed to set parameter {key} = {value} (validation failed)",
+                    )
+            else:
+                self.logger.debug(f"Unknown parameter {key} ignored")
+    # end of parameter initialization
+
+    # Store parameters in the Study object
+    self.store_history(["fill_single"], params.to_dict())
+    self.logger.debug("Parameters stored to fill_single")
+
+    # Call the original fill_chrom_single function with extracted parameters
+    return _fill_chrom_single_impl(
+        self,
+        uids=params.get("uids"),
+        mz_tol=params.get("mz_tol"),
+        rt_tol=params.get("rt_tol"),
+        min_samples_rel=params.get("min_samples_rel"),
+        min_samples_abs=params.get("min_samples_abs"),
+    )
+
+
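As the parameter loop above shows, fill_single() accepts either individual keyword overrides or a fill_defaults instance passed under any keyword (the isinstance check catches it regardless of the key name). A sketch of both calling styles; values are illustrative:

    # Individual overrides:
    study.fill_single(mz_tol=0.005, rt_tol=5.0, min_samples_abs=3)

    # Or a pre-built defaults object:
    from masster.study.defaults import fill_defaults
    params = fill_defaults()
    params.set("mz_tol", 0.005, validate=True)
    study.fill_single(params=params)  # the keyword name itself is arbitrary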
+def _process_sample_for_parallel_fill(
+    self,
+    sample_info,
+    consensus_info,
+    uids,
+    mz_tol,
+    rt_tol,
+    missing_combinations_df,
+    features_df_max_uid,
+):
+    """Process a single sample for parallel gap filling."""
+    sample_uid = sample_info["sample_uid"]
+    sample_path = sample_info["sample_path"]
+
+    new_features: list[dict] = []
+    new_mapping: list[dict] = []
+    counter = 0
+
+    try:
+        # Load this sample
+        file = Sample()
+        file.logger_update(level="WARNING")
+        file.load(sample_path)
+    except Exception:
+        # Skip this sample if loading fails
+        return new_features, new_mapping, counter
+
+    # Find missing features for this sample from precomputed combinations
+    sample_missing = missing_combinations_df.filter(
+        pl.col("sample_uid") == sample_uid,
+    )["consensus_uid"].to_list()
+
+    if not sample_missing:
+        return new_features, new_mapping, counter
+
+    # Process each missing feature
+    for consensus_uid in sample_missing:
+        cons = consensus_info[consensus_uid]
+        mz = cons["mz"]
+        rt = cons["rt"]
+        rt_start_mean = cons["rt_start_mean"]
+        rt_end_mean = cons["rt_end_mean"]
+
+        # Filter MS1 data for this feature
+        if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
+            d = file.ms1_df.filter(
+                (pl.col("mz") >= mz - mz_tol)
+                & (pl.col("mz") <= mz + mz_tol)
+                & (pl.col("rt") >= rt_start_mean - rt_tol)
+                & (pl.col("rt") <= rt_end_mean + rt_tol),
+            )
+        else:
+            d = pl.DataFrame()
+
+        # Create chromatogram
+        if d.is_empty():
+            eic = Chromatogram(
+                rt=np.array([rt_start_mean, rt_end_mean]),
+                inty=np.array([0.0, 0.0]),
+                label=f"EIC mz={mz:.4f}",
+                file=sample_path,
+                mz=mz,
+                mz_tol=mz_tol,
+                feature_start=rt_start_mean,
+                feature_end=rt_end_mean,
+                feature_apex=rt,
+            )
+            max_inty = 0.0
+            area = 0.0
+        else:
+            eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
+
+            if len(eic_rt) > 4:
+                eic = Chromatogram(
+                    eic_rt["rt"].to_numpy(),
+                    eic_rt["inty"].to_numpy(),
+                    label=f"EIC mz={mz:.4f}",
+                    file=sample_path,
+                    mz=mz,
+                    mz_tol=mz_tol,
+                    feature_start=rt_start_mean,
+                    feature_end=rt_end_mean,
+                    feature_apex=rt,
+                ).find_peaks()
+                max_inty = np.max(eic.inty)
+                area = eic.feature_area
+            else:
+                eic = Chromatogram(
+                    eic_rt["rt"].to_numpy(),
+                    eic_rt["inty"].to_numpy(),
+                    label=f"EIC mz={mz:.4f}",
+                    file=sample_path,
+                    mz=mz,
+                    mz_tol=mz_tol,
+                    feature_start=rt_start_mean,
+                    feature_end=rt_end_mean,
+                    feature_apex=rt,
+                )
+                max_inty = 0.0
+                area = 0.0
+
+        # Generate feature UID (will be adjusted later to ensure global uniqueness)
+        feature_uid = features_df_max_uid + len(new_features) + 1
+
+        # Create new feature entry
+        new_feature = {
+            "sample_uid": sample_uid,
+            "feature_uid": feature_uid,
+            "feature_id": None,
+            "mz": mz,
+            "rt": rt,
+            "rt_original": None,
+            "rt_start": rt_start_mean,
+            "rt_end": rt_end_mean,
+            "rt_delta": rt_end_mean - rt_start_mean,
+            "mz_start": None,
+            "mz_end": None,
+            "inty": max_inty,
+            "quality": None,
+            "charge": None,
+            "iso": None,
+            "iso_of": None,
+            "adduct": None,
+            "adduct_mass": None,
+            "adduct_group": None,
+            "chrom": eic,
+            "filled": True,
+            "chrom_area": area,
+            "chrom_coherence": None,
+            "chrom_prominence": None,
+            "chrom_prominence_scaled": None,
+            "chrom_height_scaled": None,
+            "ms2_scans": None,
+            "ms2_specs": None,
+        }
+
+        new_features.append(new_feature)
+        new_mapping.append({
+            "consensus_uid": consensus_uid,
+            "sample_uid": sample_uid,
+            "feature_uid": feature_uid,
+        })
+        counter += 1
+
+    return new_features, new_mapping, counter
+
+
+def _fill_chrom_impl(
+    self,
+    uids=None,
+    mz_tol: float = 0.010,
+    rt_tol: float = 10.0,
+    min_samples_rel: float = 0.0,
+    min_samples_abs: int = 2,
+    num_workers=4,
+):
+    """Fill missing chromatograms by extracting from raw data using parallel processing.
+
+    Args:
+        uids: Consensus UIDs to process (default: all)
+        mz_tol: m/z tolerance for extraction (default: 0.010 Da)
+        rt_tol: RT tolerance for extraction (default: 10.0 seconds)
+        min_samples_rel: Relative minimum sample threshold (default: 0.0)
+        min_samples_abs: Absolute minimum sample threshold (default: 2)
+        num_workers: Number of parallel workers (default: 4)
+    """
+    uids = self._get_consensus_uids(uids)
+
+    self.logger.info(f"Gap filling with {num_workers} workers...")
+    self.logger.debug(
+        f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}, num_workers={num_workers}",
+    )
+
+    # Apply minimum sample filters
+    min_number_rel = 1
+    min_number_abs = 1
+    if isinstance(min_samples_rel, float) and min_samples_rel > 0:
+        min_number_rel = int(min_samples_rel * len(self.samples_df))
+    if isinstance(min_samples_abs, int) and min_samples_abs > 0:
+        min_number_abs = int(min_samples_abs)
+    min_number = max(min_number_rel, min_number_abs)
+
+    self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
+
+    if min_number > 0:
+        original_count = len(uids)
+        uids = self.consensus_df.filter(
+            (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
+        )["consensus_uid"].to_list()
+        self.logger.debug(f"Features to fill: {original_count} -> {len(uids)}")
+
+    # Get missing consensus/sample combinations using the optimized method
+    self.logger.debug("Identifying missing features...")
+    missing_combinations = self._get_missing_consensus_sample_combinations(uids)
+
+    if not missing_combinations or len(missing_combinations) == 0:
+        self.logger.info("No missing features found to fill.")
+        return
+
+    # Convert to DataFrame for easier processing
+    missing_combinations_df = pl.DataFrame(
+        missing_combinations,
+        schema={
+            "consensus_uid": pl.Int64,
+            "sample_uid": pl.Int64,
+            "sample_name": pl.Utf8,
+            "sample_path": pl.Utf8,
+        },
+        orient="row",
+    )
+
+    # Build lookup dictionaries
+    self.logger.debug("Building lookup dictionaries...")
+    consensus_info = {}
+    consensus_subset = self.consensus_df.select([
+        "consensus_uid",
+        "rt_start_mean",
+        "rt_end_mean",
+        "mz",
+        "rt",
+    ]).filter(pl.col("consensus_uid").is_in(uids))
+
+    for row in consensus_subset.iter_rows(named=True):
+        consensus_info[row["consensus_uid"]] = {
+            "rt_start_mean": row["rt_start_mean"],
+            "rt_end_mean": row["rt_end_mean"],
+            "mz": row["mz"],
+            "rt": row["rt"],
+        }
+
+    # Get sample info for all samples that need processing
+    samples_to_process = []
+    unique_sample_uids = missing_combinations_df["sample_uid"].unique().to_list()
+
+    for row in self.samples_df.filter(
+        pl.col("sample_uid").is_in(unique_sample_uids),
+    ).iter_rows(named=True):
+        samples_to_process.append({
+            "sample_name": row["sample_name"],
+            "sample_uid": row["sample_uid"],
+            "sample_path": row["sample_path"],
+        })
+
+    total_missing = len(missing_combinations_df)
+    total_samples = len(samples_to_process)
+
+    self.logger.debug(
+        f"Gap filling for {total_missing} missing features...",
+    )
+
+    # Calculate current max feature_uid to avoid conflicts
+    features_df_max_uid = self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
+
+    # Process samples in parallel
+    all_new_features: list[dict] = []
+    all_new_mapping: list[dict] = []
+    total_counter = 0
+
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+        # Submit all samples for processing
+        future_to_sample = {}
+        for sample_info in samples_to_process:
+            future = executor.submit(
+                self._process_sample_for_parallel_fill,
+                sample_info,
+                consensus_info,
+                uids,
+                mz_tol,
+                rt_tol,
+                missing_combinations_df,
+                features_df_max_uid,
+            )
+            future_to_sample[future] = sample_info
+
+        # Collect results with progress bar
+        with tqdm(
+            total=len(samples_to_process),
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Processing samples",
+            disable=tdqm_disable,
+        ) as pbar:
+            for future in concurrent.futures.as_completed(future_to_sample):
+                try:
+                    new_features, new_mapping, counter = future.result()
+
+                    # Adjust feature UIDs to ensure global uniqueness
+                    uid_offset = features_df_max_uid + len(all_new_features)
+                    for i, feature in enumerate(new_features):
+                        feature["feature_uid"] = uid_offset + i + 1
+                    for i, mapping in enumerate(new_mapping):
+                        mapping["feature_uid"] = uid_offset + i + 1
+
+                    all_new_features.extend(new_features)
+                    all_new_mapping.extend(new_mapping)
+                    total_counter += counter
+
+                except Exception as e:
+                    sample_info = future_to_sample[future]
+                    self.logger.warning(
+                        f"Sample {sample_info['sample_name']} failed: {e}",
+                    )
+
+                pbar.update(1)
+
+    # Add new features to DataFrames
+    self.logger.debug(f"Adding {len(all_new_features)} new features to DataFrame...")
+    if all_new_features:
+        # Create properly formatted rows
+        rows_to_add = []
+        for feature_dict in all_new_features:
+            new_row = {}
+            for col in self.features_df.columns:
+                if col in feature_dict:
+                    new_row[col] = feature_dict[col]
+                else:
+                    new_row[col] = None
+            rows_to_add.append(new_row)
+
+        # Create and add new DataFrame
+        new_df = pl.from_dicts(rows_to_add)
+
+        # Cast columns to match existing schema
+        cast_exprs = []
+        for col in self.features_df.columns:
+            existing_dtype = self.features_df[col].dtype
+            cast_exprs.append(pl.col(col).cast(existing_dtype, strict=False))
+
+        new_df = new_df.with_columns(cast_exprs)
+        self.features_df = self.features_df.vstack(new_df)
+
+        # Add consensus mapping
+        new_mapping_df = pl.DataFrame(all_new_mapping)
+        self.consensus_mapping_df = pl.concat(
+            [self.consensus_mapping_df, new_mapping_df],
+            how="diagonal",
+        )
+
+    self.logger.info(
+        f"Filled {total_counter} chromatograms from raw data using {num_workers} parallel workers.",
+    )
+
+
985
|
+
def fill(self, **kwargs):
    """Fill missing chromatograms by extracting from raw data using parallel processing.

    Parameters:
        **kwargs: Keyword arguments for fill parameters. Can include:
            - A fill_defaults instance to set all parameters at once
            - Individual parameter names and values (see fill_defaults for details)

    Key Parameters:
        uids: Consensus UIDs to process (default: all)
        mz_tol: m/z tolerance for extraction (default: 0.010 Da)
        rt_tol: RT tolerance for extraction (default: 10.0 seconds)
        min_samples_rel: Relative minimum sample threshold (default: 0.05)
        min_samples_abs: Absolute minimum sample threshold (default: 5)
        num_workers: Number of parallel workers (default: 4)
    """
    # parameters initialization
    params = fill_defaults()
    num_workers = kwargs.get("num_workers", 4)  # not part of the fill_defaults class

    for key, value in kwargs.items():
        if isinstance(value, fill_defaults):
            params = value
            self.logger.debug("Using provided fill_defaults parameters")
        else:
            if hasattr(params, key):
                if params.set(key, value, validate=True):
                    self.logger.debug(f"Updated parameter {key} = {value}")
                else:
                    self.logger.warning(
                        f"Failed to set parameter {key} = {value} (validation failed)",
                    )
            elif key != "num_workers":  # num_workers is handled above
                self.logger.debug(f"Unknown parameter {key} ignored")
    # end of parameter initialization

    # Store parameters in the Study object
    self.store_history(["fill"], params.to_dict())
    self.logger.debug("Parameters stored to fill")

    # Call the underlying implementation with the resolved parameters
    return _fill_chrom_impl(
        self,
        uids=params.get("uids"),
        mz_tol=params.get("mz_tol"),
        rt_tol=params.get("rt_tol"),
        min_samples_rel=params.get("min_samples_rel"),
        min_samples_abs=params.get("min_samples_abs"),
        num_workers=num_workers,
    )


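# Illustrative usage (the `study` object and values are hypothetical):
#
#     study.fill(mz_tol=0.005, rt_tol=5.0, num_workers=8)
#
# Passing a fill_defaults instance under any keyword applies the whole set:
#
#     params = fill_defaults()
#     params.set("rt_tol", 5.0, validate=True)
#     study.fill(defaults=params)
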
# Backward compatibility alias
fill_chrom = fill


def _get_missing_consensus_sample_combinations(self, uids):
    """
    Efficiently identify which consensus_uid/sample combinations are missing.
    Returns a list of tuples: (consensus_uid, sample_uid, sample_name, sample_path)
    """
    # Get all consensus UIDs we're interested in
    consensus_uids_set = set(uids)

    # Get all sample UIDs and create a lookup
    all_sample_info = {}
    for row in self.samples_df.select([
        "sample_uid",
        "sample_name",
        "sample_path",
    ]).iter_rows(named=True):
        all_sample_info[row["sample_uid"]] = {
            "sample_name": row["sample_name"],
            "sample_path": row["sample_path"],
        }

    # Get existing consensus/sample combinations from consensus_mapping_df
    existing_combinations = set()
    consensus_mapping_filtered = self.consensus_mapping_df.filter(
        pl.col("consensus_uid").is_in(list(consensus_uids_set)),
    )

    # Join with features_df to get sample_uid information
    existing_features = consensus_mapping_filtered.join(
        self.features_df.select(["feature_uid", "sample_uid"]),
        on="feature_uid",
        how="inner",
    )

    for row in existing_features.select(["consensus_uid", "sample_uid"]).iter_rows():
        existing_combinations.add((row[0], row[1]))  # (consensus_uid, sample_uid)

    # Find missing combinations
    missing_combinations = []
    for consensus_uid in consensus_uids_set:
        for sample_uid, sample_info in all_sample_info.items():
            if (consensus_uid, sample_uid) not in existing_combinations:
                missing_combinations.append((
                    consensus_uid,
                    sample_uid,
                    sample_info["sample_name"],
                    sample_info["sample_path"],
                ))

    return missing_combinations


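# The gap detection above is a set difference over (consensus_uid, sample_uid)
# pairs. The same idea with toy data:
#
#     consensus = {1, 2}
#     samples = {"s1", "s2"}
#     existing = {(1, "s1"), (2, "s2")}
#     missing = [(c, s) for c in consensus for s in samples
#                if (c, s) not in existing]
#     # -> [(1, "s2"), (2, "s1")] (order may vary with set iteration)
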
def sanitize(self):
    """
    Sanitize the features DataFrame to ensure all complex objects are properly typed.
    Converts serialized objects back to their proper types (Chromatogram, Spectrum).
    """
    if self.features_df is None or self.features_df.is_empty():
        return

    self.logger.debug(
        "Sanitizing features DataFrame to ensure all complex objects are properly typed.",
    )
    tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

    # Check if we have object columns that need sanitization
    has_chrom = "chrom" in self.features_df.columns
    has_ms2_specs = "ms2_specs" in self.features_df.columns

    if not has_chrom and not has_ms2_specs:
        self.logger.debug("No object columns found that need sanitization.")
        return

    # Convert to a list of dictionaries for easier manipulation
    rows_data = []

    for row_dict in tqdm(
        self.features_df.iter_rows(named=True),
        total=len(self.features_df),
        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Sanitize features",
        disable=tqdm_disable,
    ):
        row_data = dict(row_dict)

        # Sanitize chrom column
        if has_chrom and row_data["chrom"] is not None:
            if not isinstance(row_data["chrom"], Chromatogram):
                try:
                    # Create a new Chromatogram and populate it from the serialized form
                    new_chrom = Chromatogram(rt=np.array([]), inty=np.array([]))
                    if hasattr(row_data["chrom"], "__dict__"):
                        new_chrom.from_dict(row_data["chrom"].__dict__)
                    else:
                        # It is already a dict
                        new_chrom.from_dict(row_data["chrom"])
                    row_data["chrom"] = new_chrom
                except Exception as e:
                    self.logger.warning(f"Failed to sanitize chrom object: {e}")
                    row_data["chrom"] = None

        # Sanitize ms2_specs column
        if has_ms2_specs and row_data["ms2_specs"] is not None:
            if isinstance(row_data["ms2_specs"], list):
                sanitized_specs = []
                for ms2_spec in row_data["ms2_specs"]:
                    if not isinstance(ms2_spec, Spectrum):
                        try:
                            new_ms2_spec = Spectrum(mz=np.array([0]), inty=np.array([0]))
                            if hasattr(ms2_spec, "__dict__"):
                                new_ms2_spec.from_dict(ms2_spec.__dict__)
                            else:
                                new_ms2_spec.from_dict(ms2_spec)
                            sanitized_specs.append(new_ms2_spec)
                        except Exception as e:
                            self.logger.warning(
                                f"Failed to sanitize ms2_specs object: {e}",
                            )
                            sanitized_specs.append(None)
                    else:
                        sanitized_specs.append(ms2_spec)
                row_data["ms2_specs"] = sanitized_specs
            elif not isinstance(row_data["ms2_specs"], Spectrum):
                try:
                    new_ms2_spec = Spectrum(mz=np.array([0]), inty=np.array([0]))
                    if hasattr(row_data["ms2_specs"], "__dict__"):
                        new_ms2_spec.from_dict(row_data["ms2_specs"].__dict__)
                    else:
                        new_ms2_spec.from_dict(row_data["ms2_specs"])
                    row_data["ms2_specs"] = new_ms2_spec
                except Exception as e:
                    self.logger.warning(f"Failed to sanitize ms2_specs object: {e}")
                    row_data["ms2_specs"] = None

        rows_data.append(row_data)

    # Recreate the DataFrame with sanitized data
    try:
        self.features_df = pl.DataFrame(rows_data)
        self.logger.success("Features DataFrame sanitization completed successfully.")
    except Exception as e:
        self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")


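# Rehydration pattern used by sanitize(), sketched with a hypothetical
# serialized payload (assumes Chromatogram.from_dict accepts this shape):
#
#     payload = {"rt": [0.1, 0.2], "inty": [10.0, 12.0]}
#     chrom = Chromatogram(rt=np.array([]), inty=np.array([]))
#     chrom.from_dict(payload)
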
def load_features(self):
    # Iterate over all samples in samples_df
    self.features_maps = []
    self.logger.debug("Loading features from featureXML files.")
    tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
    for _index, row_dict in tqdm(
        enumerate(self.samples_df.iter_rows(named=True)),
        total=len(self.samples_df),
        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Load feature maps from XML",
        disable=tqdm_disable,
    ):
        if self.folder is not None:
            filename = os.path.join(
                self.folder,
                row_dict["sample_name"] + ".featureXML",
            )
        else:
            filename = os.path.join(
                os.getcwd(),
                row_dict["sample_name"] + ".featureXML",
            )
        # If the file does not exist there, fall back to the sample's own path
        if not os.path.exists(filename):
            filename = row_dict["sample_path"].replace(".sample5", ".featureXML")

        if not os.path.exists(filename):
            self.features_maps.append(None)
            continue

        fh = oms.FeatureXMLFile()
        fm = oms.FeatureMap()
        fh.load(filename, fm)
        self.features_maps.append(fm)
    self.logger.debug("Features loaded successfully.")


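# pyopenms loading pattern used above, as a standalone sketch (the file name
# is hypothetical):
#
#     fm = oms.FeatureMap()
#     oms.FeatureXMLFile().load("sample.featureXML", fm)
#     print(fm.size())  # number of features in the map
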
def _load_consensusXML(self, filename="alignment.consensusXML"):
    """
    Load a consensus map from a file.
    """
    if not os.path.exists(filename):
        self.logger.error(f"File {filename} does not exist.")
        return
    fh = oms.ConsensusXMLFile()
    self.consensus_map = oms.ConsensusMap()
    fh.load(filename, self.consensus_map)
    self.logger.debug(f"Loaded consensus map from {filename}.")
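
# Companion sketch for consensus maps, mirroring the loader above:
#
#     cm = oms.ConsensusMap()
#     oms.ConsensusXMLFile().load("alignment.consensusXML", cm)
#     print(cm.size())  # number of consensus features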