masster 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of masster has been flagged as potentially problematic; see the registry listing for details.
- masster/__init__.py +27 -27
- masster/_version.py +17 -17
- masster/chromatogram.py +497 -503
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
- masster/logger.py +318 -244
- masster/sample/__init__.py +9 -9
- masster/sample/defaults/__init__.py +15 -15
- masster/sample/defaults/find_adducts_def.py +325 -325
- masster/sample/defaults/find_features_def.py +366 -366
- masster/sample/defaults/find_ms2_def.py +285 -285
- masster/sample/defaults/get_spectrum_def.py +314 -318
- masster/sample/defaults/sample_def.py +374 -378
- masster/sample/h5.py +1321 -1297
- masster/sample/helpers.py +833 -364
- masster/sample/lib.py +762 -0
- masster/sample/load.py +1220 -1187
- masster/sample/parameters.py +131 -131
- masster/sample/plot.py +1610 -1622
- masster/sample/processing.py +1402 -1416
- masster/sample/quant.py +209 -0
- masster/sample/sample.py +391 -387
- masster/sample/sample5_schema.json +181 -181
- masster/sample/save.py +737 -719
- masster/sample/sciex.py +1213 -0
- masster/spectrum.py +1287 -1319
- masster/study/__init__.py +9 -9
- masster/study/defaults/__init__.py +21 -19
- masster/study/defaults/align_def.py +267 -267
- masster/study/defaults/export_def.py +41 -40
- masster/study/defaults/fill_chrom_def.py +264 -264
- masster/study/defaults/fill_def.py +260 -0
- masster/study/defaults/find_consensus_def.py +256 -256
- masster/study/defaults/find_ms2_def.py +163 -163
- masster/study/defaults/integrate_chrom_def.py +225 -225
- masster/study/defaults/integrate_def.py +221 -0
- masster/study/defaults/merge_def.py +256 -0
- masster/study/defaults/study_def.py +272 -269
- masster/study/export.py +674 -287
- masster/study/h5.py +1398 -886
- masster/study/helpers.py +1650 -433
- masster/study/helpers_optimized.py +317 -0
- masster/study/load.py +1201 -1078
- masster/study/parameters.py +99 -99
- masster/study/plot.py +632 -645
- masster/study/processing.py +1057 -1046
- masster/study/save.py +149 -134
- masster/study/study.py +606 -522
- masster/study/study5_schema.json +247 -241
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
- masster-0.3.0.dist-info/RECORD +59 -0
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
- masster-0.2.4.dist-info/RECORD +0 -50
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
masster/sample/save.py
CHANGED
@@ -1,719 +1,737 @@
[Removed file content (old lines 1-719): this rendering preserves only fragments of the removed side. Old lines 1-105 (module docstring, imports, and save()) match the corresponding lines of the new file below; the remainder of the removed content is not recoverable from this view.]
"""
_export.py

This module provides data export functionality for mass spectrometry analysis results.
It handles saving processed data in various formats for downstream analysis, sharing,
and archival purposes, including spectrum files, feature tables, and custom formats.

Key Features:
- **Multi-Format Export**: Save data as MGF, mzML, CSV, FeatureXML, and custom formats.
- **Spectrum Export**: Export MS/MS spectra for database searching and identification.
- **Feature Export**: Save detected features with quantitative information.
- **Custom Formats**: Support for compressed pickle formats (mzpkl) for fast storage.
- **Metadata Preservation**: Maintain acquisition parameters and processing history.
- **Batch Export**: Export multiple samples or studies simultaneously.

Dependencies:
- `pyopenms`: For standard mass spectrometry file format export.
- `polars` and `pandas`: For tabular data export and manipulation.
- `numpy`: For numerical array operations.
- `pickle` and `bz2`: For custom format compression and serialization.
- `loguru`: For logging export operations and error handling.

Functions:
- `save()`: Main export function with format detection.
- `save_mzpkl()`: Export to compressed pickle format for fast loading.
- `save_featureXML()`: Export features in OpenMS FeatureXML format.
- `export_mgf()`: Export MS/MS spectra in MGF format for database searching.
- `export_csv()`: Export features and metadata in CSV format.

Supported Export Formats:
- MGF (Mascot Generic Format) for MS/MS spectra
- mzML (open standard format) for spectral data
- CSV for tabular feature data
- FeatureXML (OpenMS format) for feature data
- mzpkl (custom compressed format) for complete analysis results

Example Usage:
```python
from _export import save, export_mgf

# Save complete analysis in custom format
save(self, filename="analysis_results.mzpkl")

# Export MS/MS spectra for database searching
export_mgf(self, filename="ms2_spectra.mgf", export_type="all")

# Export feature table
export_csv(self, filename="features.csv", data_type="features")
```

See Also:
- `parameters._export_parameters`: For export-specific parameter configuration.
- `_import.py`: For data import functionality.
- `single.py`: For using export methods with ddafile class.

"""

import os

from datetime import datetime

import numpy as np
import pandas as pd
import polars as pl
import pyopenms as oms

from tqdm import tqdm

# Parameters removed - using hardcoded defaults
from masster.spectrum import combine_peaks

def save(self, filename=None):
    """
    Save the current object to a file in the '.sample5' format.

    If `filename` is not provided, the method attempts to use `self.file_path` as the base name,
    replacing its extension with '.sample5'. If neither `filename` nor `self.file_path` is available,
    a ValueError is raised.

    If `filename` is provided and `self.file_path` is an absolute path, the extension of `filename`
    is replaced with '.sample5'. Otherwise, if `self.file_path` is available, its extension is replaced
    with '.sample5'. If neither is available, a ValueError is raised.

    Parameters:
        filename (str, optional): The name of the file to save to. If not provided, uses `self.file_path`.

    Returns:
        None
    """
    if filename is None:
        # save to default file name
        if self.file_path is not None:
            filename = os.path.splitext(self.file_path)[0] + ".sample5"
        else:
            raise ValueError("either filename or file_path must be provided")
    else:
        # check if filename includes an absolute path
        if os.path.isabs(self.file_path):
            filename = os.path.splitext(filename)[0] + ".sample5"
        elif self.file_path is not None:
            filename = os.path.splitext(self.file_path)[0] + ".sample5"
        else:
            raise ValueError("either filename or file_path must be provided")
    self._save_sample5(filename=filename)
    self.file_path = filename

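For orientation, a minimal usage sketch of the filename handling above. Only `save()` itself is defined in this diff; the `Sample` import and constructor below are assumptions for illustration.

```python
# Hypothetical setup: the loader is an assumption; this diff defines save(),
# not how a sample object is constructed or what sets file_path.
from masster.sample import Sample  # assumed import path

sample = Sample("data/run_01.mzML")   # assumed constructor; sets sample.file_path
sample.save()                         # writes data/run_01.sample5 next to the input file
sample.save(filename="backup.xyz")    # with an absolute file_path this writes backup.sample5;
                                      # otherwise the name is derived from file_path instead
```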
def _save_featureXML(self, filename="features.featureXML"):
    if self.features is None:
        self.logger.warning("No features found.")
        return
    fh = oms.FeatureXMLFile()
    fh.store(filename, self.features)
    self.logger.debug(f"Features Map saved to {filename}")

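Because `_save_featureXML()` writes a standard OpenMS FeatureXML via `oms.FeatureXMLFile().store()`, the output can be loaded back with pyopenms. A small sketch using only pyopenms calls; the filename is the helper's default.

```python
import pyopenms as oms

# Read a FeatureXML file written by _save_featureXML() back into a FeatureMap.
fmap = oms.FeatureMap()
oms.FeatureXMLFile().load("features.featureXML", fmap)
print(fmap.size())  # number of features stored in the file
```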
def export_features(self, filename="features.csv"):
    """
    Export the features DataFrame to a CSV or Excel file.

    This method clones the internal features DataFrame, adds a boolean column 'has_ms2' indicating
    whether the 'ms2_scans' column is not null, and exports the resulting DataFrame to the specified file.
    Columns with data types 'List' or 'Object' are excluded from the export.

    Parameters:
        filename (str): The path to the output file. If the filename ends with '.xls' or '.xlsx',
            the data is exported in Excel format; otherwise, it is exported as CSV.
            Defaults to 'features.csv'.

    Side Effects:
        Writes the exported data to the specified file and logs the export operation.
    """
    # clone df
    clean_df = self.features_df.clone()
    filename = os.path.abspath(filename)
    # add a column has_ms2=True if column ms2_scans is not None
    if "ms2_scans" in clean_df.columns:
        clean_df = clean_df.with_columns(
            (pl.col("ms2_scans").is_not_null()).alias("has_ms2")
        )
    clean_df = self.features_df.select([
        col for col in self.features_df.columns if self.features_df[col].dtype not in (pl.List, pl.Object)
    ])
    if filename.lower().endswith((".xls", ".xlsx")):
        clean_df.to_pandas().to_excel(filename, index=False)
        self.logger.info(f"Features exported to {filename} (Excel format)")
    else:
        clean_df.write_csv(filename)
        self.logger.info(f"Features exported to {filename}")

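A short usage sketch for `export_features()`: the file extension selects CSV or Excel output, and list/object-typed columns are dropped. `sample` is the same hypothetical object assumed in the earlier sketch.

```python
# `sample` as in the earlier hypothetical sketch; a populated features_df is assumed.
sample.export_features("run_01_features.csv")   # CSV via polars write_csv
sample.export_features("run_01_features.xlsx")  # Excel via pandas to_excel
```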
def export_mgf(
    self,
    filename: str = "features.mgf",
    use_cache=True,
    selection="best",
    split_energy=True,
    merge=False,
    mz_start=None,
    mz_end=None,
    rt_start=None,
    rt_end=None,
    include_all_ms1=False,
    full_ms1=False,
    centroid=True,
    inty_min=float("-inf"),
    q1_ratio_min=None,
    q1_ratio_max=None,
    eic_corr_min=None,
    deisotope=True,
    precursor_trim=10.0,
    centroid_algo=None,
):
    """
    Export features as an MGF file with MS1 and MS2 spectra.

    Iterates over all features in `self.features_df` (or `self.features` if the former is None),
    retrieves the corresponding MS1 and MS2 spectra, applies peak filtering, and writes them in MGF format.

    Args:
        filename (str, optional): Output MGF file name. Defaults to "features.mgf".
        use_cache (bool, optional): Use cached MS2 spectra from the features DataFrame. Defaults to True.
        selection (str, optional): "best" for first scan, "all" for every scan. Defaults to "best".
        split_energy (bool, optional): Process MS2 scans by unique collision energy. Defaults to True.
        merge (bool, optional): If selection="all", merge MS2 scans into one spectrum. Defaults to False.
        mz_start (float, optional): Minimum m/z for feature selection.
        mz_end (float, optional): Maximum m/z for feature selection.
        rt_start (float, optional): Minimum RT for feature selection.
        rt_end (float, optional): Maximum RT for feature selection.
        include_all_ms1 (bool, optional): Include MS1 spectra even if no MS2 scan. Defaults to False.
        full_ms1 (bool, optional): Export full MS1 spectrum or trim around precursor. Defaults to False.
        centroid (bool, optional): Centroid the spectrum. Defaults to True.
        inty_min (float, optional): Minimum intensity threshold for peaks.
        q1_ratio_min (float, optional): Minimum q1_ratio for peaks.
        q1_ratio_max (float, optional): Maximum q1_ratio for peaks.
        eic_corr_min (float, optional): Minimum EIC correlation for peaks.
        deisotope (bool, optional): Perform deisotoping. Defaults to True.
        precursor_trim (float, optional): Trimming parameter for precursor peaks. Defaults to 10.0.
        centroid_algo (str, optional): Centroiding algorithm to use.

    Returns:
        None

    Notes:
        - If neither `self.features_df` nor `self.features` is available, the method logs a warning and returns.
        - Uses internal helpers for peak filtering and MGF formatting.
        - For each feature, writes MS1 spectrum first, then MS2 spectra if available.
    """

    if self.features_df is None:
        if self.features is None:
            self.logger.warning("Please find features first.")
            return
        else:
            self.features_df = self.features.get_df()

    # Apply filtering at DataFrame level for better performance
    features = self.features_df
    if mz_start is not None:
        features = features.filter(pl.col("mz") >= mz_start)
    if mz_end is not None:
        features = features.filter(pl.col("mz") <= mz_end)
    if rt_start is not None:
        features = features.filter(pl.col("rt") >= rt_start)
    if rt_end is not None:
        features = features.filter(pl.col("rt") <= rt_end)
    if not include_all_ms1:
        features = features.filter(pl.col("ms2_scans").is_not_null())

    # Convert to list of dictionaries for faster iteration
    features_list = features.to_dicts()

    def filter_peaks(spec, inty_min=None, q1_min=None, eic_min=None, q1_max=None):
        # create a copy of the spectrum
        spec = spec.copy()
        spec_len = len(spec.mz)
        mask = [True] * spec_len
        if inty_min is not None and inty_min > 0:
            mask = np.array(mask) & (spec.inty >= inty_min)
        # check if q1_ratio is an attribute of spec (lower bound)
        if q1_min is not None and hasattr(spec, "q1_ratio"):
            mask = mask & (spec.q1_ratio >= q1_min)
        # check if q1_ratio is an attribute of spec (upper bound)
        if q1_max is not None and hasattr(spec, "q1_ratio"):
            mask = mask & (spec.q1_ratio <= q1_max)
        # check if eic_corr is an attribute of spec
        if eic_min is not None and hasattr(spec, "eic_corr"):
            mask = mask & (spec.eic_corr >= eic_min)
        # apply mask to all attributes of spec with the same length as mz
        for attr in spec.__dict__:
            # check if attr is a list or an array:
            if isinstance(getattr(spec, attr), list) or isinstance(
                getattr(spec, attr),
                np.ndarray,
            ):
                # check if attr has attribute 0 and its length is equal to spec_len:
                if hasattr(getattr(spec, attr), "__len__"):
                    if len(getattr(spec, attr)) == spec_len:
                        setattr(spec, attr, getattr(spec, attr)[mask])
        return spec

    def write_ion(f, title, fid, mz, rt, charge, spect):
        if spect is None:
            return
        f.write(f"BEGIN IONS\nTITLE={title}\n")
        f.write(f"FEATURE_ID={fid}\n")
        f.write(f"CHARGE={charge}\nPEPMASS={mz}\nRTINSECONDS={rt}\n")
        if spect.ms_level is None:
            f.write("MSLEVEL=1\n")
        else:
            f.write(f"MSLEVEL={spect.ms_level}\n")
        if spect.ms_level is not None:
            if spect.ms_level > 1 and hasattr(spect, "energy"):
                f.write(f"ENERGY={spect.energy}\n")
        # Use list comprehension for better performance
        peak_lines = [f"{mz_val:.5f} {inty_val:.0f}\n" for mz_val, inty_val in zip(spect.mz, spect.inty, strict=False)]
        f.writelines(peak_lines)
        f.write("END IONS\n\n")

    if centroid_algo is None:
        if hasattr(self.parameters, "centroid_algo"):
            centroid_algo = self.parameters.centroid_algo
        else:
            centroid_algo = "cr"

    # pick the preferred charge sign from whichever polarity dominates among features
    if self.features_df.filter(pl.col("charge") < 0).shape[0] - self.features_df.filter(pl.col("charge") > 0).shape[0] > 0:
        preferred_charge = -1
    else:
        preferred_charge = 1

    c = 0
    skip = 0
    # check if features is empty
    if len(features_list) == 0:
        self.logger.warning("No features found.")
        return
    filename = os.path.abspath(filename)
    with open(filename, "w", encoding="utf-8") as f:
        tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
        for row in tqdm(
            features_list,
            total=len(features_list),
            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export MGF",
            disable=tdqm_disable,
        ):
            # Pre-calculate common values
            feature_uid = row["feature_uid"]
            mz = row["mz"]
            rt = row["rt"]
            rt_str = f"{rt:.2f}"
            mz_str = f"{mz:.4f}"

            # Filtering is now done at DataFrame level, so we can skip these checks
            if row["ms2_scans"] is None and not include_all_ms1:
                skip = skip + 1
                continue

            # write MS1 spectrum
            ms1_scan_uid = self.select_closest_scan(rt=rt)["scan_uid"][0]
            spect = self.get_spectrum(
                ms1_scan_uid,
                centroid=centroid,
                deisotope=deisotope,
                centroid_algo=centroid_algo,
            )

            spect = filter_peaks(spect, inty_min=inty_min)

            if not full_ms1:
                # trim the spectrum to a wide region around the precursor to help identify adducts
                spect = spect.trim(
                    mz_min=mz - 50,
                    mz_max=mz + 50,
                )

            charge = preferred_charge
            if row["charge"] is not None and row["charge"] != 0:
                charge = row["charge"]

            write_ion(
                f,
                f"feature_uid:{feature_uid}, rt:{rt_str}, mz:{mz_str}",
                feature_uid,
                mz,
                rt,
                charge,
                spect,
            )

            if row["ms2_scans"] is None:
                continue
            elif use_cache:
                spect = row["ms2_specs"]
                if spect is None:
                    # No cached spectra, fall through to fetch from scan_uid
                    use_cache = False
                else:
                    # check if spec is a list of spectra
                    if isinstance(spect, list):
                        if selection == "best":
                            s = spect[0]
                            scan_uid = row["ms2_scans"][0]
                            s.energy = self.get_spectrum(scan_uid).energy
                            spect = [s]
                            scan_uids = [scan_uid]
                        else:
                            scan_uids = row["ms2_scans"]

                    for i, s in enumerate(spect):
                        if s is None:
                            print(
                                f"No MS2 spectrum for feature {feature_uid} is cached.",
                            )
                            continue
                        # check if s is a spectrum
                        if type(s).__name__ == "Spectrum":
                            s = filter_peaks(
                                s,
                                inty_min=inty_min,
                                q1_min=q1_ratio_min,
                                eic_min=eic_corr_min,
                                q1_max=q1_ratio_max,
                            )
                        # Get the corresponding scan_uid from the list
                        current_scan_uid = scan_uids[i] if i < len(scan_uids) else "unknown"
                        write_ion(
                            f,
                            f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{current_scan_uid}",
                            feature_uid,
                            mz,
                            rt,
                            charge,
                            s,
                        )
                        c += 1
                    continue  # Skip the rest of the processing for this feature

            # If we reach here, either use_cache=False or no cached spectra were available
            if split_energy:
                # get energy of all scans with scan_uid in ms2_scans by fetching them
                ms2_scan_uids = row["ms2_scans"]
                if isinstance(ms2_scan_uids, list) and len(ms2_scan_uids) > 0:
                    # Fetch spectra to get energy information
                    spectra_with_energy = []
                    for scan_uid in ms2_scan_uids:
                        spec = self.get_spectrum(scan_uid)
                        if spec is not None:
                            spectra_with_energy.append((scan_uid, spec.energy if hasattr(spec, 'energy') else 0))

                    # Group by energy
                    energy_groups: dict[float, list[int]] = {}
                    for scan_uid, energy in spectra_with_energy:
                        if energy not in energy_groups:
                            energy_groups[energy] = []
                        energy_groups[energy].append(scan_uid)

                    for energy, scan_uids_for_energy in energy_groups.items():
                        if selection == "best":
                            # Keep only the first scan for this energy
                            scan_uids_for_energy = [scan_uids_for_energy[0]]

                        for scan_uid in scan_uids_for_energy:
                            spect = self.get_spectrum(
                                scan_uid,
                                centroid=centroid,
                                deisotope=deisotope,
                                precursor_trim=precursor_trim,
                                centroid_algo=centroid_algo,
                            )
                            spect = filter_peaks(
                                spect,
                                inty_min=inty_min,
                                q1_min=q1_ratio_min,
                                eic_min=eic_corr_min,
                                q1_max=q1_ratio_max,
                            )
                            write_ion(
                                f,
                                f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{scan_uid}, energy:{energy}",
                                feature_uid,
                                mz,
                                rt,
                                charge,
                                spect,
                            )
                            c += 1
            else:
                if selection == "best":
                    ms2_scans = row["ms2_scans"][0]
                    spect = self.get_spectrum(
                        ms2_scans,
                        centroid=centroid,
                        deisotope=deisotope,
                        precursor_trim=precursor_trim,
                        centroid_algo=centroid_algo,
                    )
                    spect = filter_peaks(
                        spect,
                        inty_min=inty_min,
                        q1_min=q1_ratio_min,
                        eic_min=eic_corr_min,
                        q1_max=q1_ratio_max,
                    )
                    write_ion(
                        f,
                        f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
                        feature_uid,
                        mz,
                        rt,
                        charge,
                        spect,
                    )
                    c += 1
                elif selection == "all":
                    if merge:
                        specs = []
                        for ms2_scans in row["ms2_scans"]:
                            specs.append(
                                self.get_spectrum(
                                    ms2_scans,
                                    centroid=centroid,
                                    deisotope=deisotope,
                                    precursor_trim=precursor_trim,
                                ),
                            )
                        spect = combine_peaks(specs)
                        if centroid:
                            spect = spect.denoise()
                            if spect.ms_level == 1:
                                spect = spect.centroid(
                                    tolerance=self.parameters["mz_tol_ms1_da"],
                                    ppm=self.parameters["mz_tol_ms1_ppm"],
                                    min_points=self.parameters["centroid_min_points_ms1"],
                                    algo=centroid_algo,
                                )
                            elif spect.ms_level == 2:
                                spect = spect.centroid(
                                    tolerance=self.parameters["mz_tol_ms2_da"],
                                    ppm=self.parameters["mz_tol_ms2_ppm"],
                                    min_points=self.parameters["centroid_min_points_ms2"],
                                    algo=centroid_algo,
                                )
                        if deisotope:
                            spect = spect.deisotope()
                        title = f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, merged"
                        spect = filter_peaks(
                            spect,
                            inty_min=inty_min,
                            q1_min=q1_ratio_min,
                            eic_min=eic_corr_min,
                            q1_max=q1_ratio_max,
                        )
                        write_ion(
                            f,
                            title,
                            feature_uid,
                            mz,
                            rt,
                            charge,
                            spect,
                        )
                        c += 1
                    else:
                        for ms2_scans in row["ms2_scans"]:
                            spect = self.get_spectrum(
                                ms2_scans,
                                centroid=centroid,
                                deisotope=deisotope,
                                precursor_trim=precursor_trim,
                                centroid_algo=centroid_algo,
                            )
                            spect = filter_peaks(
                                spect,
                                inty_min=inty_min,
                                q1_min=q1_ratio_min,
                                eic_min=eic_corr_min,
                                q1_max=q1_ratio_max,
                            )
                            write_ion(
                                f,
                                f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
                                feature_uid,
                                mz,
                                rt,
                                charge,
                                spect,
                            )
                            c += 1

    self.logger.info(f"Exported {c} features to {filename}")

    # Handle None values in logging
    inty_min_str = f"{inty_min:.3f}" if inty_min != float("-inf") else "None"
    q1_ratio_min_str = f"{q1_ratio_min:.3f}" if q1_ratio_min is not None else "None"
    eic_corr_min_str = f"{eic_corr_min:.3f}" if eic_corr_min is not None else "None"

    self.logger.debug(
        f"MGF created with int>{inty_min_str}, q1_ratio>{q1_ratio_min_str}, eic_corr>{eic_corr_min_str}",
    )
    self.logger.debug(
        f"- Exported {c} MS2 spectra for {len(features_list) - skip} precursors. Average spectra/feature is {c / (len(features_list) - skip + 0.000000001):.0f}",
    )
    self.logger.debug(
        f"- Skipped {skip} features because no MS2 scans were available.",
    )

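A usage sketch for `export_mgf()` with the same hypothetical `sample` object; parameter names and defaults are taken from the signature above. Note that `merge` only takes effect on the `selection="all"` path when `split_energy` is disabled.

```python
# `sample` as in the earlier hypothetical sketches.
sample.export_mgf(
    filename="run_01_best.mgf",
    selection="best",    # first scan per feature (per collision energy, since split_energy=True)
    rt_start=60.0,       # only features eluting between 60 s and 600 s
    rt_end=600.0,
    inty_min=100.0,      # drop fragment peaks below this intensity
)

# Export every MS2 scan per feature and merge them into a single spectrum.
sample.export_mgf(
    filename="run_01_merged.mgf",
    selection="all",
    merge=True,
    split_energy=False,  # merge is only honoured on the non-split code path
)
```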
def export_dda_stats(self, filename="stats.csv"):
    """
    Save DDA statistics into a CSV file.

    This method computes basic statistics from the DDA analysis, such as:
    - Total number of MS1 scans.
    - Total number of MS2 scans.
    - Total number of detected features.
    - Number of features linked with MS2 data.
    - Average cycle time (if available in the scans data).

    The resulting statistics are saved in CSV format.

    Parameters:
        filename (str): The name/path of the CSV file to be saved. Defaults to "stats.csv".

    Returns:
        None
    """
    # Compute counts from scans_df and features_df
    ms1_count = len(self.scans_df.filter(pl.col("ms_level") == 1))
    ms2_count = len(self.scans_df.filter(pl.col("ms_level") == 2))
    features_count = len(self.features_df) if self.features_df is not None else 0
    features_with_ms2 = (
        self.features_df.filter(pl.col("ms2_scans").is_not_null()).height if self.features_df is not None else 0
    )

    # Initialize a dictionary to hold statistics
    stats = {
        "MS1_scans": ms1_count,
        "MS2_scans": ms2_count,
        "Total_features": features_count,
        "Features_with_MS2": features_with_ms2,
    }

    # Calculate the average cycle time if available.
    if "time_cycle" in self.scans_df.columns:
        ms1_df = self.scans_df.filter(pl.col("ms_level") == 1)
        avg_cycle_time = ms1_df["time_cycle"].mean()
        stats["Average_cycle_time"] = avg_cycle_time if avg_cycle_time is not None else ""
    else:
        stats["Average_cycle_time"] = 0

    # Convert stats dict to a Pandas DataFrame and save as CSV.
    df_stats = pd.DataFrame(list(stats.items()), columns=["Metric", "Value"])
    df_stats.to_csv(filename, index=False)
    lines = []
    lines.append(f"Filename,{self.file_path}")
    lines.append(
        f"Number of cycles,{len(self.scans_df.filter(pl.col('ms_level') == 1))}",
    )
    lines.append(
        f"Number of MS2 scans,{len(self.scans_df.filter(pl.col('ms_level') == 2))}",
    )
    # retrieve scans with ms_level 1 from scans_df
    ms1 = self.scans_df.filter(pl.col("ms_level") == 1)
    lines.append(f"Maximal number of MS2 scans per cycle (N),{ms1['ms2_n'].max()}")
    # average number of MS2 scans per cycle, skip null values
    ms2n_mean = ms1.filter(pl.col("ms2_n") >= 0)["ms2_n"].mean()
    lines.append(f"Average number of MS2 scans per cycle,{ms2n_mean:.0f}")
    lines.append(f"Maximal cycle time,{ms1['time_cycle'].max():.3f}")
    # find spectra with ms2_n = 0
    ms1_ms2_0 = ms1.filter(pl.col("ms2_n") == 0)
    if len(ms1_ms2_0) > 0:
        lines.append(
            f"Average cycle time at MS1-only,{ms1_ms2_0['time_cycle'].mean():.3f}",
        )
    else:
        lines.append("Average cycle time at MS1-only,")
    # find spectra with ms2_n = 1
    ms1_ms2_1 = ms1.filter(pl.col("ms2_n") == 1)
    if len(ms1_ms2_1) > 0:
        lines.append(
            f"Average cycle time with 1 MS2,{ms1_ms2_1['time_cycle'].mean():.3f}",
        )
    else:
        lines.append("Average cycle time with 1 MS2,")
    # find spectra with ms2_n = 2
    ms1_ms2_2 = ms1.filter(pl.col("ms2_n") == 2)
    if len(ms1_ms2_2) > 0:
        lines.append(
            f"Average cycle time with 2 MS2,{ms1_ms2_2['time_cycle'].mean():.3f}",
        )
    else:
        lines.append("Average cycle time with 2 MS2,")
    # find spectra with ms2_n = 3
    ms1_ms2_3 = ms1.filter(pl.col("ms2_n") == 3)
    if len(ms1_ms2_3) > 0:
        lines.append(
            f"Average cycle time with 3 MS2,{ms1_ms2_3['time_cycle'].mean():.3f}",
        )
    else:
        lines.append("Average cycle time with 3 MS2,")
    max_ms2_n = ms1["ms2_n"].max()
    ms1_ms2_n1 = ms1.filter(pl.col("ms2_n") == max_ms2_n - 1)
    if len(ms1_ms2_n1) > 0:
        lines.append(
            f"Average cycle time with N-1 MS2,{ms1_ms2_n1['time_cycle'].mean():.3f}",
        )
    else:
        lines.append("Average cycle time with N-1 MS2,")
    # find spectra with maximal ms2_n
    ms1_max_ms2_n = ms1.filter(pl.col("ms2_n") == max_ms2_n)
    lines.append(
        f"Average cycle time with N MS2,{ms1_max_ms2_n['time_cycle'].mean():.3f}",
    )
    # average time_MS1, skip null values
    a = ms1.filter(pl.col("time_ms1_to_ms1") >= 0)["time_ms1_to_ms1"].mean()
    if a is not None:
        lines.append(f"Average MS1-to-MS1 scan time,{a:.3f}")
    else:
        lines.append("Average MS1-to-MS1 scan time,")
    a = ms1.filter(pl.col("time_ms1_to_ms2") >= 0)["time_ms1_to_ms2"].mean()
    if a is not None:
        lines.append(f"Average MS1-to-MS2 scan time,{a:.3f}")
    else:
        lines.append("Average MS1-to-MS2 scan time,")
    ms2_mean = ms1.filter(pl.col("time_ms2_to_ms2") >= 0)["time_ms2_to_ms2"].mean()
    if ms2_mean is not None:
        lines.append(f"Average MS2-to-MS2 scan time,{ms2_mean:.3f}")
    else:
        lines.append("Average MS2-to-MS2 scan time,")
    a = ms1.filter(pl.col("time_ms2_to_ms1") >= 0)["time_ms2_to_ms1"].mean()
    if a is not None:
        lines.append(f"Average MS2-to-MS1 scan time,{a:.3f}")
    else:
        lines.append("Average MS2-to-MS1 scan time,")
    # number of features
    if self.features_df is not None:
        lines.append(f"Number of features,{self.features_df.height}")
        a = self.features_df.filter(pl.col("ms2_scans").is_not_null()).height
        lines.append(f"Number of features with MS2 data,{a}")
        b = self.scans_df.filter(pl.col("feature_uid") >= 0).height
        lines.append(f"Number of MS2 scans with features,{b}")
        if a > 0:
            lines.append(f"Redundancy of MS2 scans with features,{b / a:.3f}")
        else:
            lines.append("Redundancy of MS2 scans with features,")
    else:
        lines.append("Number of features,")
        lines.append("Number of features with MS2 data,")
        lines.append("Number of MS2 scans with features,")
        lines.append("Redundancy of MS2 scans with features,")

    # write to file
    with open(filename, "w") as f:
        for line in lines:
            f.write(line + "\n")

    self.logger.info(f"DDA statistics exported to {filename}")

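A usage sketch for `export_dda_stats()`, again with the hypothetical `sample`. As written above, the later `open(filename, "w")` block overwrites the pandas CSV written first, so the resulting file is a header-less metric,value listing and is read back accordingly.

```python
import pandas as pd

# `sample` as in the earlier hypothetical sketches.
sample.export_dda_stats("run_01_stats.csv")

# The final file has no header row, so supply column names when reading it back.
stats = pd.read_csv("run_01_stats.csv", header=None, names=["Metric", "Value"])
print(stats.head())
```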
def export_chrom(self, filename="chrom.csv"):
    # saves self.chrom_df to a csv file, dropping the scan_uid and chrom columns if they exist
    if self.chrom_df is None:
        self.logger.warning("No chromatogram definitions found.")
        return
    data = self.chrom_df.clone()
    # Convert to pandas for CSV export
    if hasattr(data, "to_pandas"):
        data = data.to_pandas()
    # remove scan_uid and chrom columns if they exist
    if "scan_uid" in data.columns:
        data = data.drop("scan_uid")
    if "chrom" in data.columns:
        data = data.drop("chrom")
    data.to_csv(filename, index=False)
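Finally, a usage sketch for `export_chrom()` with the same hypothetical `sample`; the exported table can be read back with polars.

```python
import polars as pl

# `sample` as in the earlier hypothetical sketches.
sample.export_chrom("run_01_chrom.csv")
chrom = pl.read_csv("run_01_chrom.csv")  # chromatogram definitions, scan_uid/chrom removed on export
print(chrom.columns)
```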