masster-0.2.5-py3-none-any.whl → masster-0.3.0-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release.

Files changed (55)
  1. masster/__init__.py +27 -27
  2. masster/_version.py +17 -17
  3. masster/chromatogram.py +497 -503
  4. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
  5. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
  6. masster/logger.py +318 -244
  7. masster/sample/__init__.py +9 -9
  8. masster/sample/defaults/__init__.py +15 -15
  9. masster/sample/defaults/find_adducts_def.py +325 -325
  10. masster/sample/defaults/find_features_def.py +366 -366
  11. masster/sample/defaults/find_ms2_def.py +285 -285
  12. masster/sample/defaults/get_spectrum_def.py +314 -318
  13. masster/sample/defaults/sample_def.py +374 -378
  14. masster/sample/h5.py +1321 -1297
  15. masster/sample/helpers.py +833 -364
  16. masster/sample/lib.py +762 -0
  17. masster/sample/load.py +1220 -1187
  18. masster/sample/parameters.py +131 -131
  19. masster/sample/plot.py +1610 -1622
  20. masster/sample/processing.py +1402 -1416
  21. masster/sample/quant.py +209 -0
  22. masster/sample/sample.py +391 -387
  23. masster/sample/sample5_schema.json +181 -181
  24. masster/sample/save.py +737 -736
  25. masster/sample/sciex.py +1213 -0
  26. masster/spectrum.py +1287 -1319
  27. masster/study/__init__.py +9 -9
  28. masster/study/defaults/__init__.py +21 -19
  29. masster/study/defaults/align_def.py +267 -267
  30. masster/study/defaults/export_def.py +41 -40
  31. masster/study/defaults/fill_chrom_def.py +264 -264
  32. masster/study/defaults/fill_def.py +260 -0
  33. masster/study/defaults/find_consensus_def.py +256 -256
  34. masster/study/defaults/find_ms2_def.py +163 -163
  35. masster/study/defaults/integrate_chrom_def.py +225 -225
  36. masster/study/defaults/integrate_def.py +221 -0
  37. masster/study/defaults/merge_def.py +256 -0
  38. masster/study/defaults/study_def.py +272 -269
  39. masster/study/export.py +674 -287
  40. masster/study/h5.py +1398 -886
  41. masster/study/helpers.py +1650 -433
  42. masster/study/helpers_optimized.py +317 -0
  43. masster/study/load.py +1201 -1078
  44. masster/study/parameters.py +99 -99
  45. masster/study/plot.py +632 -645
  46. masster/study/processing.py +1057 -1046
  47. masster/study/save.py +149 -134
  48. masster/study/study.py +606 -522
  49. masster/study/study5_schema.json +247 -241
  50. {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
  51. masster-0.3.0.dist-info/RECORD +59 -0
  52. {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
  53. masster-0.2.5.dist-info/RECORD +0 -50
  54. {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
  55. {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
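
Only one source file from the changed set is reproduced below. The `masster/sample/save.py` diff is dominated by lines that are identical in both versions; the substantive changes are that `save()` now records the written path in `self.file_path`, and that `export_mgf()` looks up the MS1 scan via `select_closest_scan(rt=rt)["scan_uid"][0]` instead of `find_closest_scan(rt=rt)["scan_uid"]`. A minimal usage sketch of the exported functions follows; the `Sample` class name, the loader call, and `find_features()` are assumptions not shown in this diff:

```python
from masster.sample import Sample  # assumed import; masster/sample/__init__.py exists

sample = Sample("run_01.mzML")                 # assumed loader for raw data
sample.find_features()                         # assumed; populates sample.features_df
sample.save()                                  # writes run_01.sample5; in 0.3.0 also sets sample.file_path
sample.export_mgf("run_01.mgf",                # MS/MS spectra for database searching
                  selection="best",            # first scan per feature (or "all")
                  split_energy=True)           # one MGF entry per collision energy
sample.export_features("run_01_features.csv")  # feature table without List/Object columns
```
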
masster/sample/save.py CHANGED
@@ -1,736 +1,737 @@
1
- """
2
- save.py
3
-
4
- This module provides data export functionality for mass spectrometry analysis results.
5
- It handles saving processed data in various formats for downstream analysis, sharing,
6
- and archival purposes, including spectrum files, feature tables, and custom formats.
7
-
8
- Key Features:
9
- - **Multi-Format Export**: Save data as MGF, CSV, Excel, FeatureXML, and the custom '.sample5' format.
10
- - **Spectrum Export**: Export MS/MS spectra for database searching and identification.
11
- - **Feature Export**: Save detected features with quantitative information.
12
- - **Custom Formats**: Support for the custom '.sample5' container for fast storage.
13
- - **Metadata Preservation**: Maintain acquisition parameters and processing history.
14
- - **Batch Export**: Export multiple samples or studies simultaneously.
15
-
16
- Dependencies:
17
- - `pyopenms`: For standard mass spectrometry file format export.
18
- - `polars` and `pandas`: For tabular data export and manipulation.
19
- - `numpy`: For numerical array operations.
20
- - `tqdm`: For progress reporting during long exports.
21
- - `loguru`: For logging export operations and error handling.
22
-
23
- Functions:
24
- - `save()`: Main save entry point; writes the '.sample5' format.
25
- - `_save_featureXML()`: Export features in OpenMS FeatureXML format.
26
- - `export_features()`: Export the feature table as CSV or Excel.
27
- - `export_mgf()`: Export MS/MS spectra in MGF format for database searching.
28
- - `export_dda_stats()`: Export DDA acquisition statistics as CSV.
29
-
30
- Supported Export Formats:
31
- - MGF (Mascot Generic Format) for MS/MS spectra
32
- - CSV for feature tables, DDA statistics, and chromatogram definitions
33
- - Excel (.xls/.xlsx) for feature tables
34
- - FeatureXML (OpenMS format) for feature data
35
- - sample5 (custom format) for complete analysis results
36
-
37
- Example Usage:
38
- ```python
39
- from masster.sample.save import save, export_mgf
40
-
41
- # Save complete analysis in custom format
42
- save(self, filename="analysis_results.sample5")
43
-
44
- # Export MS/MS spectra for database searching
45
- export_mgf(self, filename="ms2_spectra.mgf", selection="all")
46
-
47
- # Export feature table
48
- export_features(self, filename="features.csv")
49
- ```
50
-
51
- See Also:
52
- - `parameters.py`: For export-specific parameter configuration.
53
- - `load.py`: For data import functionality.
54
- - `sample.py`: For the sample class that exposes these export methods.
55
-
56
- """
57
-
58
- import os
59
-
60
- from datetime import datetime
61
-
62
- import numpy as np
63
- import pandas as pd
64
- import polars as pl
65
- import pyopenms as oms
66
-
67
- from tqdm import tqdm
68
-
69
- # Parameters removed - using hardcoded defaults
70
- from masster.spectrum import combine_peaks
71
-
72
-
73
- def save(self, filename=None):
74
- """
75
- Save the current object to a file in the '.sample5' format.
76
-
77
- If `filename` is not provided, the method attempts to use `self.file_path` as the base name,
78
- replacing its extension with '.sample5'. If neither `filename` nor `self.file_path` is available,
79
- a ValueError is raised.
80
-
81
- If `filename` is provided and is an absolute path, its extension is replaced with
82
- '.sample5' and it is used directly. Otherwise, if `self.file_path` is available, the output
83
- name is derived from it. If `self.file_path` is not set either, a ValueError is raised.
84
-
85
- Parameters:
86
- filename (str, optional): The name of the file to save to. If not provided, uses `self.file_path`.
87
-
88
- Returns:
89
- None
90
- """
91
- if filename is None:
92
- # save to default file name
93
- if self.file_path is not None:
94
- filename = os.path.splitext(self.file_path)[0] + ".sample5"
95
- else:
96
- raise ValueError("either filename or file_path must be provided")
97
- else:
98
- # check if filename includes an absolute path
99
- if os.path.isabs(filename):
100
- filename = os.path.splitext(filename)[0] + ".sample5"
101
- elif self.file_path is not None:
102
- filename = os.path.splitext(self.file_path)[0] + ".sample5"
103
- else:
104
- raise ValueError("either filename or file_path must be provided")
105
- self._save_sample5(filename=filename)
106
-
107
-
108
- def _save_featureXML(self, filename="features.featureXML"):
109
- if self.features is None:
110
- self.logger.warning("No features found.")
111
- return
112
- fh = oms.FeatureXMLFile()
113
- fh.store(filename, self.features)
114
- self.logger.debug(f"Features Map saved to {filename}")
115
-
116
-
117
- def export_features(self, filename="features.csv"):
118
- """
119
- Export the features DataFrame to a CSV or Excel file.
120
-
121
- This method clones the internal features DataFrame, adds a boolean column 'has_ms2' indicating
122
- whether the 'ms2_scans' column is not null, and exports the resulting DataFrame to the specified file.
123
- Columns with data types 'List' or 'Object' are excluded from the export.
124
-
125
- Parameters:
126
- filename (str): The path to the output file. If the filename ends with '.xls' or '.xlsx',
127
- the data is exported in Excel format; otherwise, it is exported as CSV.
128
- Defaults to 'features.csv'.
129
-
130
- Side Effects:
131
- Writes the exported data to the specified file and logs the export operation.
132
- """
133
- # clone df
134
- clean_df = self.features_df.clone()
135
- filename = os.path.abspath(filename)
136
- # add a column has_ms2=True if column ms2_scans is not null
137
- if "ms2_scans" in clean_df.columns:
138
- clean_df = clean_df.with_columns(
139
- (pl.col("ms2_scans").is_not_null()).alias("has_ms2")
140
- )
141
- clean_df = clean_df.select([
142
- col for col in clean_df.columns if clean_df[col].dtype not in (pl.List, pl.Object)
143
- ])
144
- if filename.lower().endswith((".xls", ".xlsx")):
145
- clean_df.to_pandas().to_excel(filename, index=False)
146
- self.logger.info(f"Features exported to {filename} (Excel format)")
147
- else:
148
- clean_df.write_csv(filename)
149
- self.logger.info(f"Features exported to {filename}")
150
-
151
-
152
- def export_mgf(
153
- self,
154
- filename: str = "features.mgf",
155
- use_cache=True,
156
- selection="best",
157
- split_energy=True,
158
- merge=False,
159
- mz_start=None,
160
- mz_end=None,
161
- rt_start=None,
162
- rt_end=None,
163
- include_all_ms1=False,
164
- full_ms1=False,
165
- centroid=True,
166
- inty_min=float("-inf"),
167
- q1_ratio_min=None,
168
- q1_ratio_max=None,
169
- eic_corr_min=None,
170
- deisotope=True,
171
- precursor_trim=10.0,
172
- centroid_algo=None,
173
- ):
174
- """
175
- Export features as an MGF file with MS1 and MS2 spectra.
176
-
177
- Iterates over all features in `self.features_df` (or `self.features` if the former is None),
178
- retrieves the corresponding MS1 and MS2 spectra, applies peak filtering, and writes them in MGF format.
179
-
180
- Args:
181
- filename (str, optional): Output MGF file name. Defaults to "features.mgf".
182
- use_cache (bool, optional): Use cached MS2 spectra from the features DataFrame. Defaults to True.
183
- selection (str, optional): "best" for first scan, "all" for every scan. Defaults to "best".
184
- split_energy (bool, optional): Process MS2 scans by unique energy. Defaults to True.
185
- merge (bool, optional): If selection="all", merge MS2 scans into one spectrum. Defaults to False.
186
- mz_start (float, optional): Minimum m/z for feature selection.
187
- mz_end (float, optional): Maximum m/z for feature selection.
188
- rt_start (float, optional): Minimum RT for feature selection.
189
- rt_end (float, optional): Maximum RT for feature selection.
190
- include_all_ms1 (bool, optional): Include MS1 spectra even if no MS2 scan. Defaults to False.
191
- full_ms1 (bool, optional): Export full MS1 spectrum or trim around precursor. Defaults to False.
192
- centroid (bool, optional): Centroid the spectrum. Defaults to True.
193
- inty_min (float, optional): Minimum intensity threshold for peaks.
194
- q1_ratio_min (float, optional): Minimum q1_ratio for peaks.
195
- q1_ratio_max (float, optional): Maximum q1_ratio for peaks.
196
- eic_corr_min (float, optional): Minimum EIC correlation for peaks.
197
- deisotope (bool, optional): Perform deisotoping. Defaults to True.
198
- precursor_trim (float, optional): Trimming parameter for precursor peaks, passed to
199
- `get_spectrum()`. Defaults to 10.0.
200
- centroid_algo (str, optional): Centroiding algorithm to use.
201
-
202
- Returns:
203
- None
204
-
205
- Notes:
206
- - If neither `self.features_df` nor `self.features` are available, the method logs a warning and returns.
207
- - Uses internal helpers for peak filtering and MGF formatting.
208
- - For each feature, writes MS1 spectrum first, then MS2 spectra if available.
209
- """
210
-
211
- if self.features_df is None:
212
- if self.features is None:
213
- self.logger.warning("Please find features first.")
214
- return
215
- else:
216
- self.features_df = self.features.get_df()
217
-
218
- # Apply filtering at DataFrame level for better performance
219
- features = self.features_df
220
- if mz_start is not None:
221
- features = features.filter(pl.col("mz") >= mz_start)
222
- if mz_end is not None:
223
- features = features.filter(pl.col("mz") <= mz_end)
224
- if rt_start is not None:
225
- features = features.filter(pl.col("rt") >= rt_start)
226
- if rt_end is not None:
227
- features = features.filter(pl.col("rt") <= rt_end)
228
- if not include_all_ms1:
229
- features = features.filter(pl.col("ms2_scans").is_not_null())
230
-
231
- # Convert to list of dictionaries for faster iteration
232
- features_list = features.to_dicts()
233
-
234
- def filter_peaks(spec, inty_min=None, q1_min=None, eic_min=None, q1_max=None):
235
- # create a copy of the spectrum
236
- spec = spec.copy()
237
- spec_len = len(spec.mz)
238
- mask = [True] * spec_len
239
- if inty_min is not None and inty_min > 0:
240
- mask = np.array(mask) & (spec.inty >= inty_min)
241
- # check if q1_ratio is an attribute of spec
242
- if q1_min is not None and hasattr(spec, "q1_ratio"):
243
- mask = mask & (spec.q1_ratio >= q1_min)
244
- # check if q1_ratio is an attribute of spec
245
- if q1_max is not None and hasattr(spec, "q1_ratio"):
246
- mask = mask & (spec.q1_ratio <= q1_max)
247
- # check if eic_corr is an attribute of spec
248
- if eic_min is not None and hasattr(spec, "eic_corr"):
249
- mask = mask & (spec.eic_corr >= eic_min)
250
- # apply mask to all attributes of spec with the same length as mz
251
- for attr in spec.__dict__:
252
- # check if attr is a list or an array:
253
- if isinstance(getattr(spec, attr), list) or isinstance(
254
- getattr(spec, attr),
255
- np.ndarray,
256
- ):
257
- # check if attr has a length equal to spec_len:
258
- if hasattr(getattr(spec, attr), "__len__"):
259
- if len(getattr(spec, attr)) == spec_len:
260
- setattr(spec, attr, getattr(spec, attr)[mask])
261
- return spec
262
-
263
- def write_ion(f, title, fid, mz, rt, charge, spect):
264
- if spect is None:
265
- return
266
- f.write(f"BEGIN IONS\nTITLE={title}\n")
267
- f.write(f"FEATURE_ID={fid}\n")
268
- f.write(f"CHARGE={charge}\nPEPMASS={mz}\nRTINSECONDS={rt}\n")
269
- if spect.ms_level is None:
270
- f.write("MSLEVEL=1\n")
271
- else:
272
- f.write(f"MSLEVEL={spect.ms_level}\n")
273
- if spect.ms_level is not None:
274
- if spect.ms_level > 1 and hasattr(spect, "energy"):
275
- f.write(f"ENERGY={spect.energy}\n")
276
- # Use list comprehension for better performance
277
- peak_lines = [f"{mz_val:.5f} {inty_val:.0f}\n" for mz_val, inty_val in zip(spect.mz, spect.inty, strict=False)]
278
- f.writelines(peak_lines)
279
- f.write("END IONS\n\n")
280
-
281
- if centroid_algo is None:
282
- if hasattr(self.parameters, "centroid_algo"):
283
- centroid_algo = self.parameters.centroid_algo
284
- else:
285
- centroid_algo = "cr"
286
-
287
- # prefer negative charge if more features have charge < 0 than charge > 0
288
- if self.features_df.filter(pl.col("charge") < 0).shape[0] - self.features_df.filter(pl.col("charge") > 0).shape[0] > 0:
289
- preferred_charge = -1
290
- else:
291
- preferred_charge = 1
292
-
293
- c = 0
294
- skip = 0
295
- # check if features is empty
296
- if len(features_list) == 0:
297
- self.logger.warning("No features found.")
298
- return
299
- filename = os.path.abspath(filename)
300
- with open(filename, "w", encoding="utf-8") as f:
301
- tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
302
- for row in tqdm(
303
- features_list,
304
- total=len(features_list),
305
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export MGF",
306
- disable=tqdm_disable,
307
- ):
308
- # Pre-calculate common values
309
- feature_uid = row["feature_uid"]
310
- mz = row["mz"]
311
- rt = row["rt"]
312
- rt_str = f"{rt:.2f}"
313
- mz_str = f"{mz:.4f}"
314
-
315
- # Filtering is now done at DataFrame level, so we can skip these checks
316
- if row["ms2_scans"] is None and not include_all_ms1:
317
- skip = skip + 1
318
- continue
319
-
320
- # write MS1 spectrum
321
- ms1_scan_uid = self.find_closest_scan(rt=rt)["scan_uid"]
322
- spect = self.get_spectrum(
323
- ms1_scan_uid,
324
- centroid=centroid,
325
- deisotope=deisotope,
326
- centroid_algo=centroid_algo,
327
- )
328
-
329
- spect = filter_peaks(spect, inty_min=inty_min)
330
-
331
- if not full_ms1:
332
- # trim spectrum to a wide window around the precursor so adducts can still be identified
333
- spect = spect.trim(
334
- mz_min=mz - 50,
335
- mz_max=mz + 50,
336
- )
337
-
338
- charge = preferred_charge
339
- if row["charge"] is not None and row["charge"] != 0:
340
- charge = row["charge"]
341
-
342
- write_ion(
343
- f,
344
- f"feature_uid:{feature_uid}, rt:{rt_str}, mz:{mz_str}",
345
- feature_uid,
346
- mz,
347
- rt,
348
- charge,
349
- spect,
350
- )
351
-
352
- if row["ms2_scans"] is None:
353
- continue
354
- elif use_cache:
355
- spect = row["ms2_specs"]
356
- if spect is None:
357
- # No cached spectra, fall through to fetch from scan_uid
358
- use_cache = False
359
- else:
360
- # check if spec is a list of spectra
361
- if isinstance(spect, list):
362
- if selection == "best":
363
- s = spect[0]
364
- scan_uid = row["ms2_scans"][0]
365
- s.energy = self.get_spectrum(scan_uid).energy
366
- spect = [s]
367
- scan_uids = [scan_uid]
368
- else:
369
- scan_uids = row["ms2_scans"]
370
-
371
- for i, s in enumerate(spect):
372
- if s is None:
373
- print(
374
- f"No MS2 spectrum for feature {feature_uid} is cached.",
375
- )
376
- continue
377
- # check if s is a spectrum
378
- if type(s).__name__ == "Spectrum":
379
- s = filter_peaks(
380
- s,
381
- inty_min=inty_min,
382
- q1_min=q1_ratio_min,
383
- eic_min=eic_corr_min,
384
- q1_max=q1_ratio_max,
385
- )
386
- # Get the corresponding scan_uid from the list
387
- current_scan_uid = scan_uids[i] if i < len(scan_uids) else "unknown"
388
- write_ion(
389
- f,
390
- f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{current_scan_uid}",
391
- feature_uid,
392
- mz,
393
- rt,
394
- charge,
395
- s,
396
- )
397
- c += 1
398
- continue # Skip the rest of the processing for this feature
399
-
400
- # If we reach here, either use_cache=False or no cached spectra were available
401
- if split_energy:
402
- # get energy of all scans with scan_uid in ms2_scans by fetching them
403
- ms2_scan_uids = row["ms2_scans"]
404
- if isinstance(ms2_scan_uids, list) and len(ms2_scan_uids) > 0:
405
- # Fetch spectra to get energy information
406
- spectra_with_energy = []
407
- for scan_uid in ms2_scan_uids:
408
- spec = self.get_spectrum(scan_uid)
409
- if spec is not None:
410
- spectra_with_energy.append((scan_uid, spec.energy if hasattr(spec, 'energy') else 0))
411
-
412
- # Group by energy
413
- energy_groups: dict[float, list[int]] = {}
414
- for scan_uid, energy in spectra_with_energy:
415
- if energy not in energy_groups:
416
- energy_groups[energy] = []
417
- energy_groups[energy].append(scan_uid)
418
-
419
- for energy, scan_uids_for_energy in energy_groups.items():
420
- if selection == "best":
421
- # Keep only the first scan for this energy
422
- scan_uids_for_energy = [scan_uids_for_energy[0]]
423
-
424
- for scan_uid in scan_uids_for_energy:
425
- spect = self.get_spectrum(
426
- scan_uid,
427
- centroid=centroid,
428
- deisotope=deisotope,
429
- precursor_trim=precursor_trim,
430
- centroid_algo=centroid_algo,
431
- )
432
- spect = filter_peaks(
433
- spect,
434
- inty_min=inty_min,
435
- q1_min=q1_ratio_min,
436
- eic_min=eic_corr_min,
437
- q1_max=q1_ratio_max,
438
- )
439
- write_ion(
440
- f,
441
- f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{scan_uid}, energy:{energy}",
442
- feature_uid,
443
- mz,
444
- rt,
445
- charge,
446
- spect,
447
- )
448
- c += 1
449
- else:
450
- if selection == "best":
451
- ms2_scans = row["ms2_scans"][0]
452
- spect = self.get_spectrum(
453
- ms2_scans,
454
- centroid=centroid,
455
- deisotope=deisotope,
456
- precursor_trim=precursor_trim,
457
- centroid_algo=centroid_algo,
458
- )
459
- spect = filter_peaks(
460
- spect,
461
- inty_min=inty_min,
462
- q1_min=q1_ratio_min,
463
- eic_min=eic_corr_min,
464
- q1_max=q1_ratio_max,
465
- )
466
- write_ion(
467
- f,
468
- f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
469
- feature_uid,
470
- mz,
471
- rt,
472
- charge,
473
- spect,
474
- )
475
- c += 1
476
- elif selection == "all":
477
- if merge:
478
- specs = []
479
- for ms2_scans in row["ms2_scans"]:
480
- specs.append(
481
- self.get_spectrum(
482
- ms2_scans,
483
- centroid=centroid,
484
- deisotope=deisotope,
485
- precursor_trim=precursor_trim,
486
- ),
487
- )
488
- spect = combine_peaks(specs)
489
- if centroid:
490
- spect = spect.denoise()
491
- if spect.ms_level == 1:
492
- spect = spect.centroid(
493
- tolerance=self.parameters["mz_tol_ms1_da"],
494
- ppm=self.parameters["mz_tol_ms1_ppm"],
495
- min_points=self.parameters["centroid_min_points_ms1"],
496
- algo=centroid_algo,
497
- )
498
- elif spect.ms_level == 2:
499
- spect = spect.centroid(
500
- tolerance=self.parameters["mz_tol_ms2_da"],
501
- ppm=self.parameters["mz_tol_ms2_ppm"],
502
- min_points=self.parameters["centroid_min_points_ms2"],
503
- algo=centroid_algo,
504
- )
505
- if deisotope:
506
- spect = spect.deisotope()
507
- title = f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, merged"
508
- spect = filter_peaks(
509
- spect,
510
- inty_min=inty_min,
511
- q1_min=q1_ratio_min,
512
- eic_min=eic_corr_min,
513
- q1_max=q1_ratio_max,
514
- )
515
- write_ion(
516
- f,
517
- title,
518
- feature_uid,
519
- mz,
520
- rt,
521
- charge,
522
- spect,
523
- )
524
- c += 1
525
- else:
526
- for ms2_scans in row["ms2_scans"]:
527
- spect = self.get_spectrum(
528
- ms2_scans,
529
- centroid=centroid,
530
- deisotope=deisotope,
531
- precursor_trim=precursor_trim,
532
- centroid_algo=centroid_algo,
533
- )
534
- spect = filter_peaks(
535
- spect,
536
- inty_min=inty_min,
537
- q1_min=q1_ratio_min,
538
- eic_min=eic_corr_min,
539
- q1_max=q1_ratio_max,
540
- )
541
- write_ion(
542
- f,
543
- f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
544
- feature_uid,
545
- mz,
546
- rt,
547
- charge,
548
- spect,
549
- )
550
- c += 1
551
-
552
- self.logger.info(f"Exported {c} features to {filename}")
553
-
554
- # Handle None values in logging
555
- inty_min_str = f"{inty_min:.3f}" if inty_min != float("-inf") else "None"
556
- q1_ratio_min_str = f"{q1_ratio_min:.3f}" if q1_ratio_min is not None else "None"
557
- eic_corr_min_str = f"{eic_corr_min:.3f}" if eic_corr_min is not None else "None"
558
-
559
- self.logger.debug(
560
- f"MGF created with int>{inty_min_str}, q1_ratio>{q1_ratio_min_str}, eic_corr>{eic_corr_min_str}",
561
- )
562
- self.logger.debug(
563
- f"- Exported {c} MS2 spectra for {len(features_list) - skip} precursors. Average spectra/feature is {c / (len(features_list) - skip + 0.000000001):.0f}",
564
- )
565
- self.logger.debug(
566
- f"- Skipped {skip} features because no MS2 scans were available.",
567
- )
568
-
569
-
570
- def export_dda_stats(self, filename="stats.csv"):
571
- """
572
- Save DDA statistics into a CSV file.
573
-
574
- This method computes basic statistics from the DDA analysis, such as:
575
- - Total number of MS1 scans.
576
- - Total number of MS2 scans.
577
- - Total number of detected features.
578
- - Number of features linked with MS2 data.
579
- - Average cycle time (if available in the scans data).
580
-
581
- The resulting statistics are saved in CSV format.
582
-
583
- Parameters:
584
- filename (str): The name/path of the CSV file to be saved. Defaults to "stats.csv".
585
-
586
- Returns:
587
- None
588
- """
589
- # Compute counts from scans_df and features_df
590
- ms1_count = len(self.scans_df.filter(pl.col("ms_level") == 1))
591
- ms2_count = len(self.scans_df.filter(pl.col("ms_level") == 2))
592
- features_count = len(self.features_df) if self.features_df is not None else 0
593
- features_with_ms2 = (
594
- self.features_df.filter(pl.col("ms2_scans").is_not_null()).height if self.features_df is not None else 0
595
- )
596
-
597
- # Initialize a dictionary to hold statistics
598
- stats = {
599
- "MS1_scans": ms1_count,
600
- "MS2_scans": ms2_count,
601
- "Total_features": features_count,
602
- "Features_with_MS2": features_with_ms2,
603
- }
604
-
605
- # Calculate the average cycle time if available.
606
- if "time_cycle" in self.scans_df.columns:
607
- ms1_df = self.scans_df.filter(pl.col("ms_level") == 1)
608
- avg_cycle_time = ms1_df["time_cycle"].mean()
609
- stats["Average_cycle_time"] = avg_cycle_time if avg_cycle_time is not None else ""
610
- else:
611
- stats["Average_cycle_time"] = 0
612
-
613
- # Convert stats dict to a Pandas DataFrame and save as CSV.
614
- df_stats = pd.DataFrame(list(stats.items()), columns=["Metric", "Value"])
615
- df_stats.to_csv(filename, index=False)
616
- lines = []
617
- lines.append(f"Filename,{self.file_path}")
618
- lines.append(
619
- f"Number of cycles,{len(self.scans_df.filter(pl.col('ms_level') == 1))}",
620
- )
621
- lines.append(
622
- f"Number of MS2 scans,{len(self.scans_df.filter(pl.col('ms_level') == 2))}",
623
- )
624
- # retrieve scans with ms_level == 1 from scans_df
625
- ms1 = self.scans_df.filter(pl.col("ms_level") == 1)
626
- lines.append(f"Maximal number of MS2 scans per cycle (N),{ms1['ms2_n'].max()}")
627
- # average number of MS2 scans per cycle, skip null values
628
- ms2n_mean = ms1.filter(pl.col("ms2_n") >= 0)["ms2_n"].mean()
629
- lines.append(f"Average number of MS2 scans per cycle,{ms2n_mean:.0f}")
630
- lines.append(f"Maximal cycle time,{ms1['time_cycle'].max():.3f}")
631
- # find spectra with ms2_n = 0
632
- ms1_ms2_0 = ms1.filter(pl.col("ms2_n") == 0)
633
- if len(ms1_ms2_0) > 0:
634
- lines.append(
635
- f"Average cycle time at MS1-only,{ms1_ms2_0['time_cycle'].mean():.3f}",
636
- )
637
- else:
638
- lines.append("Average cycle time at MS1-only,")
639
- # find spectra with ms2_n = 1
640
- ms1_ms2_1 = ms1.filter(pl.col("ms2_n") == 1)
641
- if len(ms1_ms2_1) > 0:
642
- lines.append(
643
- f"Average cycle time with 1 MS2,{ms1_ms2_1['time_cycle'].mean():.3f}",
644
- )
645
- else:
646
- lines.append("Average cycle time with 1 MS2,")
647
- # find spectra with ms2_n = 2
648
- ms1_ms2_2 = ms1.filter(pl.col("ms2_n") == 2)
649
- if len(ms1_ms2_2) > 0:
650
- lines.append(
651
- f"Average cycle time with 2 MS2,{ms1_ms2_2['time_cycle'].mean():.3f}",
652
- )
653
- else:
654
- lines.append("Average cycle time with 2 MS2,")
655
- # find spectra with ms2_n = 3
656
- ms1_ms2_3 = ms1.filter(pl.col("ms2_n") == 3)
657
- if len(ms1_ms2_3) > 0:
658
- lines.append(
659
- f"Average cycle time with 3 MS2,{ms1_ms2_3['time_cycle'].mean():.3f}",
660
- )
661
- else:
662
- lines.append("Average cycle time with 3 MS2,")
663
- max_ms2_n = ms1["ms2_n"].max()
664
- ms1_ms2_n1 = ms1.filter(pl.col("ms2_n") == max_ms2_n - 1)
665
- if len(ms1_ms2_n1) > 0:
666
- lines.append(
667
- f"Average cycle time with N-1 MS2,{ms1_ms2_n1['time_cycle'].mean():.3f}",
668
- )
669
- else:
670
- lines.append("Average cycle time with N-1 MS2,")
671
- # find spectra with maximal ms2_n
672
- ms1_max_ms2_n = ms1.filter(pl.col("ms2_n") == max_ms2_n)
673
- lines.append(
674
- f"Average cycle time with N MS2,{ms1_max_ms2_n['time_cycle'].mean():.3f}",
675
- )
676
- # average MS1-to-MS1 scan time, skip null values
677
- a = ms1.filter(pl.col("time_ms1_to_ms1") >= 0)["time_ms1_to_ms1"].mean()
678
- if a is not None:
679
- lines.append(f"Average MS1-to-MS1 scan time,{a:.3f}")
680
- else:
681
- lines.append("Average MS1-to-MS1 scan time,")
682
- a = ms1.filter(pl.col("time_ms1_to_ms2") >= 0)["time_ms1_to_ms2"].mean()
683
- if a is not None:
684
- lines.append(f"Average MS1-to-MS2 scan time,{a:.3f}")
685
- else:
686
- lines.append("Average MS1-to-MS2 scan time,")
687
- ms2_mean = ms1.filter(pl.col("time_ms2_to_ms2") >= 0)["time_ms2_to_ms2"].mean()
688
- if ms2_mean is not None:
689
- lines.append(f"Average MS2-to-MS2 scan time,{ms2_mean:.3f}")
690
- else:
691
- lines.append("Average MS2-to-MS2 scan time,")
692
- a = ms1.filter(pl.col("time_ms2_to_ms1") >= 0)["time_ms2_to_ms1"].mean()
693
- if a is not None:
694
- lines.append(f"Average MS2-to-MS1 scan time,{a:.3f}")
695
- else:
696
- lines.append("Average MS2-to-MS1 scan time,")
697
- # number of features
698
- if self.features_df is not None:
699
- lines.append(f"Number of features,{self.features_df.height}")
700
- a = self.features_df.filter(pl.col("ms2_scans").is_not_null()).height
701
- lines.append(f"Number of features with MS2 data,{a}")
702
- b = self.scans_df.filter(pl.col("feature_uid") >= 0).height
703
- lines.append(f"Number of MS2 scans with features,{b}")
704
- if a > 0:
705
- lines.append(f"Redundancy of MS2 scans with features,{b / a:.3f}")
706
- else:
707
- lines.append("Redundancy of MS2 scans with features,")
708
- else:
709
- lines.append("Number of features,")
710
- lines.append("Number of features with MS2 data,")
711
- lines.append("Number of MS2 scans with features,")
712
- lines.append("Redundancy of MS2 scans with features,")
713
-
714
- # write to file
715
- with open(filename, "a") as f:  # append, keeping the stats table written above
716
- for line in lines:
717
- f.write(line + "\n")
718
-
719
- self.logger.info(f"DDA statistics exported to {filename}")
720
-
721
-
722
- def export_chrom(self, filename="chrom.csv"):
723
- # saves self.chrom_df to a csv file, removing the scan_uid and chrom columns if they exist
724
- if self.chrom_df is None:
725
- self.logger.warning("No chromatogram definitions found.")
726
- return
727
- data = self.chrom_df.clone()
728
- # Convert to pandas for CSV export
729
- if hasattr(data, "to_pandas"):
730
- data = data.to_pandas()
731
- # remove scan_uid and chrom columns if they exist
732
- if "scan_uid" in data.columns:
733
- data = data.drop("scan_uid")
734
- if "chrom" in data.columns:
735
- data = data.drop("chrom")
736
- data.to_csv(filename, index=False)
1
+ """
2
+ save.py
3
+
4
+ This module provides data export functionality for mass spectrometry analysis results.
5
+ It handles saving processed data in various formats for downstream analysis, sharing,
6
+ and archival purposes, including spectrum files, feature tables, and custom formats.
7
+
8
+ Key Features:
9
+ - **Multi-Format Export**: Save data as MGF, CSV, Excel, FeatureXML, and the custom '.sample5' format.
10
+ - **Spectrum Export**: Export MS/MS spectra for database searching and identification.
11
+ - **Feature Export**: Save detected features with quantitative information.
12
+ - **Custom Formats**: Support for the custom '.sample5' container for fast storage.
13
+ - **Metadata Preservation**: Maintain acquisition parameters and processing history.
14
+ - **Batch Export**: Export multiple samples or studies simultaneously.
15
+
16
+ Dependencies:
17
+ - `pyopenms`: For standard mass spectrometry file format export.
18
+ - `polars` and `pandas`: For tabular data export and manipulation.
19
+ - `numpy`: For numerical array operations.
20
+ - `tqdm`: For progress reporting during long exports.
21
+ - `loguru`: For logging export operations and error handling.
22
+
23
+ Functions:
24
+ - `save()`: Main save entry point; writes the '.sample5' format.
25
+ - `_save_featureXML()`: Export features in OpenMS FeatureXML format.
26
+ - `export_features()`: Export the feature table as CSV or Excel.
27
+ - `export_mgf()`: Export MS/MS spectra in MGF format for database searching.
28
+ - `export_dda_stats()`: Export DDA acquisition statistics as CSV.
29
+
30
+ Supported Export Formats:
31
+ - MGF (Mascot Generic Format) for MS/MS spectra
32
+ - CSV for feature tables, DDA statistics, and chromatogram definitions
33
+ - Excel (.xls/.xlsx) for feature tables
34
+ - FeatureXML (OpenMS format) for feature data
35
+ - sample5 (custom format) for complete analysis results
36
+
37
+ Example Usage:
38
+ ```python
39
+ from masster.sample.save import save, export_mgf
40
+
41
+ # Save complete analysis in custom format
42
+ save(self, filename="analysis_results.sample5")
43
+
44
+ # Export MS/MS spectra for database searching
45
+ export_mgf(self, filename="ms2_spectra.mgf", selection="all")
46
+
47
+ # Export feature table
48
+ export_features(self, filename="features.csv")
49
+ ```
50
+
51
+ See Also:
52
+ - `parameters.py`: For export-specific parameter configuration.
53
+ - `load.py`: For data import functionality.
54
+ - `sample.py`: For the sample class that exposes these export methods.
55
+
56
+ """
57
+
58
+ import os
59
+
60
+ from datetime import datetime
61
+
62
+ import numpy as np
63
+ import pandas as pd
64
+ import polars as pl
65
+ import pyopenms as oms
66
+
67
+ from tqdm import tqdm
68
+
69
+ # Parameters removed - using hardcoded defaults
70
+ from masster.spectrum import combine_peaks
71
+
72
+
73
+ def save(self, filename=None):
74
+ """
75
+ Save the current object to a file in the '.sample5' format.
76
+
77
+ If `filename` is not provided, the method attempts to use `self.file_path` as the base name,
78
+ replacing its extension with '.sample5'. If neither `filename` nor `self.file_path` is available,
79
+ a ValueError is raised.
80
+
81
+ If `filename` is provided and is an absolute path, its extension is replaced with
82
+ '.sample5' and it is used directly. Otherwise, if `self.file_path` is available, the output
83
+ name is derived from it. If `self.file_path` is not set either, a ValueError is raised.
84
+
85
+ Parameters:
86
+ filename (str, optional): The name of the file to save to. If not provided, uses `self.file_path`.
87
+
88
+ Returns:
89
+ None
90
+ """
91
+ if filename is None:
92
+ # save to default file name
93
+ if self.file_path is not None:
94
+ filename = os.path.splitext(self.file_path)[0] + ".sample5"
95
+ else:
96
+ raise ValueError("either filename or file_path must be provided")
97
+ else:
98
+ # check if filename includes an absolute path
99
+ if os.path.isabs(filename):
100
+ filename = os.path.splitext(filename)[0] + ".sample5"
101
+ elif self.file_path is not None:
102
+ filename = os.path.splitext(self.file_path)[0] + ".sample5"
103
+ else:
104
+ raise ValueError("either filename or file_path must be provided")
105
+ self._save_sample5(filename=filename)
106
+ self.file_path = filename
107
+
108
+
109
+ def _save_featureXML(self, filename="features.featureXML"):
110
+ if self.features is None:
111
+ self.logger.warning("No features found.")
112
+ return
113
+ fh = oms.FeatureXMLFile()
114
+ fh.store(filename, self.features)
115
+ self.logger.debug(f"Features Map saved to {filename}")
116
+
117
+
118
+ def export_features(self, filename="features.csv"):
119
+ """
120
+ Export the features DataFrame to a CSV or Excel file.
121
+
122
+ This method clones the internal features DataFrame, adds a boolean column 'has_ms2' indicating
123
+ whether the 'ms2_scans' column is not null, and exports the resulting DataFrame to the specified file.
124
+ Columns with data types 'List' or 'Object' are excluded from the export.
125
+
126
+ Parameters:
127
+ filename (str): The path to the output file. If the filename ends with '.xls' or '.xlsx',
128
+ the data is exported in Excel format; otherwise, it is exported as CSV.
129
+ Defaults to 'features.csv'.
130
+
131
+ Side Effects:
132
+ Writes the exported data to the specified file and logs the export operation.
133
+ """
134
+ # clone df
135
+ clean_df = self.features_df.clone()
136
+ filename = os.path.abspath(filename)
137
+ # add a column has_ms2=True if column ms2_scans is not null
138
+ if "ms2_scans" in clean_df.columns:
139
+ clean_df = clean_df.with_columns(
140
+ (pl.col("ms2_scans").is_not_null()).alias("has_ms2")
141
+ )
142
+ clean_df = clean_df.select([
143
+ col for col in clean_df.columns if clean_df[col].dtype not in (pl.List, pl.Object)
144
+ ])
145
+ if filename.lower().endswith((".xls", ".xlsx")):
146
+ clean_df.to_pandas().to_excel(filename, index=False)
147
+ self.logger.info(f"Features exported to {filename} (Excel format)")
148
+ else:
149
+ clean_df.write_csv(filename)
150
+ self.logger.info(f"Features exported to {filename}")
151
+
152
+
153
+ def export_mgf(
154
+ self,
155
+ filename: str = "features.mgf",
156
+ use_cache=True,
157
+ selection="best",
158
+ split_energy=True,
159
+ merge=False,
160
+ mz_start=None,
161
+ mz_end=None,
162
+ rt_start=None,
163
+ rt_end=None,
164
+ include_all_ms1=False,
165
+ full_ms1=False,
166
+ centroid=True,
167
+ inty_min=float("-inf"),
168
+ q1_ratio_min=None,
169
+ q1_ratio_max=None,
170
+ eic_corr_min=None,
171
+ deisotope=True,
172
+ precursor_trim=10.0,
173
+ centroid_algo=None,
174
+ ):
175
+ """
176
+ Export features as an MGF file with MS1 and MS2 spectra.
177
+
178
+ Iterates over all features in `self.features_df` (or `self.features` if the former is None),
179
+ retrieves the corresponding MS1 and MS2 spectra, applies peak filtering, and writes them in MGF format.
180
+
181
+ Args:
182
+ filename (str, optional): Output MGF file name. Defaults to "features.mgf".
183
+ use_cache (bool, optional): Use cached MS2 spectra from the features DataFrame. Defaults to True.
184
+ selection (str, optional): "best" for first scan, "all" for every scan. Defaults to "best".
185
+ split_energy (bool, optional): Process MS2 scans by unique energy. Defaults to True.
186
+ merge (bool, optional): If selection="all", merge MS2 scans into one spectrum. Defaults to False.
187
+ mz_start (float, optional): Minimum m/z for feature selection.
188
+ mz_end (float, optional): Maximum m/z for feature selection.
189
+ rt_start (float, optional): Minimum RT for feature selection.
190
+ rt_end (float, optional): Maximum RT for feature selection.
191
+ include_all_ms1 (bool, optional): Include MS1 spectra even if no MS2 scan. Defaults to False.
192
+ full_ms1 (bool, optional): Export full MS1 spectrum or trim around precursor. Defaults to False.
193
+ centroid (bool, optional): Centroid the spectrum. Defaults to True.
194
+ inty_min (float, optional): Minimum intensity threshold for peaks.
195
+ q1_ratio_min (float, optional): Minimum q1_ratio for peaks.
196
+ q1_ratio_max (float, optional): Maximum q1_ratio for peaks.
197
+ eic_corr_min (float, optional): Minimum EIC correlation for peaks.
198
+ deisotope (bool, optional): Perform deisotoping. Defaults to True.
199
+ precursor_trim (float, optional): Trimming parameter for precursor peaks, passed to
200
+ `get_spectrum()`. Defaults to 10.0.
201
+ centroid_algo (str, optional): Centroiding algorithm to use.
202
+
203
+ Returns:
204
+ None
205
+
206
+ Notes:
207
+ - If neither `self.features_df` nor `self.features` are available, the method logs a warning and returns.
208
+ - Uses internal helpers for peak filtering and MGF formatting.
209
+ - For each feature, writes MS1 spectrum first, then MS2 spectra if available.
210
+ """
211
+
212
+ if self.features_df is None:
213
+ if self.features is None:
214
+ self.logger.warning("Please find features first.")
215
+ return
216
+ else:
217
+ self.features_df = self.features.get_df()
218
+
219
+ # Apply filtering at DataFrame level for better performance
220
+ features = self.features_df
221
+ if mz_start is not None:
222
+ features = features.filter(pl.col("mz") >= mz_start)
223
+ if mz_end is not None:
224
+ features = features.filter(pl.col("mz") <= mz_end)
225
+ if rt_start is not None:
226
+ features = features.filter(pl.col("rt") >= rt_start)
227
+ if rt_end is not None:
228
+ features = features.filter(pl.col("rt") <= rt_end)
229
+ if not include_all_ms1:
230
+ features = features.filter(pl.col("ms2_scans").is_not_null())
231
+
232
+ # Convert to list of dictionaries for faster iteration
233
+ features_list = features.to_dicts()
234
+
235
+ def filter_peaks(spec, inty_min=None, q1_min=None, eic_min=None, q1_max=None):
236
+ # create a copy of the spectrum
237
+ spec = spec.copy()
238
+ spec_len = len(spec.mz)
239
+ mask = [True] * spec_len
240
+ if inty_min is not None and inty_min > 0:
241
+ mask = np.array(mask) & (spec.inty >= inty_min)
242
+ # check if q1_ratio is an attribute of spec
243
+ if q1_min is not None and hasattr(spec, "q1_ratio"):
244
+ mask = mask & (spec.q1_ratio >= q1_min)
245
+ # check if q1_ratio is an attribute of spec
246
+ if q1_max is not None and hasattr(spec, "q1_ratio"):
247
+ mask = mask & (spec.q1_ratio <= q1_max)
248
+ # check if eic_corr is an attribute of spec
249
+ if eic_min is not None and hasattr(spec, "eic_corr"):
250
+ mask = mask & (spec.eic_corr >= eic_min)
251
+ # apply mask to all attributes of spec with the same length as mz
252
+ for attr in spec.__dict__:
253
+ # check if attr is a list or an array:
254
+ if isinstance(getattr(spec, attr), list) or isinstance(
255
+ getattr(spec, attr),
256
+ np.ndarray,
257
+ ):
258
+ # check if attr has a length equal to spec_len:
259
+ if hasattr(getattr(spec, attr), "__len__"):
260
+ if len(getattr(spec, attr)) == spec_len:
261
+ setattr(spec, attr, getattr(spec, attr)[mask])
262
+ return spec
263
+
264
+ def write_ion(f, title, fid, mz, rt, charge, spect):
265
+ if spect is None:
266
+ return
267
+ f.write(f"BEGIN IONS\nTITLE={title}\n")
268
+ f.write(f"FEATURE_ID={fid}\n")
269
+ f.write(f"CHARGE={charge}\nPEPMASS={mz}\nRTINSECONDS={rt}\n")
270
+ if spect.ms_level is None:
271
+ f.write("MSLEVEL=1\n")
272
+ else:
273
+ f.write(f"MSLEVEL={spect.ms_level}\n")
274
+ if spect.ms_level is not None:
275
+ if spect.ms_level > 1 and hasattr(spect, "energy"):
276
+ f.write(f"ENERGY={spect.energy}\n")
277
+ # Use list comprehension for better performance
278
+ peak_lines = [f"{mz_val:.5f} {inty_val:.0f}\n" for mz_val, inty_val in zip(spect.mz, spect.inty, strict=False)]
279
+ f.writelines(peak_lines)
280
+ f.write("END IONS\n\n")
281
+
282
+ if centroid_algo is None:
283
+ if hasattr(self.parameters, "centroid_algo"):
284
+ centroid_algo = self.parameters.centroid_algo
285
+ else:
286
+ centroid_algo = "cr"
287
+
288
+ # prefer negative charge if more features have charge < 0 than charge > 0
289
+ if self.features_df.filter(pl.col("charge") < 0).shape[0] - self.features_df.filter(pl.col("charge") > 0).shape[0] > 0:
290
+ preferred_charge = -1
291
+ else:
292
+ preferred_charge = 1
293
+
294
+ c = 0
295
+ skip = 0
296
+ # check if features is empty
297
+ if len(features_list) == 0:
298
+ self.logger.warning("No features found.")
299
+ return
300
+ filename = os.path.abspath(filename)
301
+ with open(filename, "w", encoding="utf-8") as f:
302
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
303
+ for row in tqdm(
304
+ features_list,
305
+ total=len(features_list),
306
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export MGF",
307
+ disable=tqdm_disable,
308
+ ):
309
+ # Pre-calculate common values
310
+ feature_uid = row["feature_uid"]
311
+ mz = row["mz"]
312
+ rt = row["rt"]
313
+ rt_str = f"{rt:.2f}"
314
+ mz_str = f"{mz:.4f}"
315
+
316
+ # Filtering is now done at DataFrame level, so we can skip these checks
317
+ if row["ms2_scans"] is None and not include_all_ms1:
318
+ skip = skip + 1
319
+ continue
320
+
321
+ # write MS1 spectrum
322
+ ms1_scan_uid = self.select_closest_scan(rt=rt)["scan_uid"][0]
323
+ spect = self.get_spectrum(
324
+ ms1_scan_uid,
325
+ centroid=centroid,
326
+ deisotope=deisotope,
327
+ centroid_algo=centroid_algo,
328
+ )
329
+
330
+ spect = filter_peaks(spect, inty_min=inty_min)
331
+
332
+ if not full_ms1:
333
+ # trim spectrum to a wide window around the precursor so adducts can still be identified
334
+ spect = spect.trim(
335
+ mz_min=mz - 50,
336
+ mz_max=mz + 50,
337
+ )
338
+
339
+ charge = preferred_charge
340
+ if row["charge"] is not None and row["charge"] != 0:
341
+ charge = row["charge"]
342
+
343
+ write_ion(
344
+ f,
345
+ f"feature_uid:{feature_uid}, rt:{rt_str}, mz:{mz_str}",
346
+ feature_uid,
347
+ mz,
348
+ rt,
349
+ charge,
350
+ spect,
351
+ )
352
+
353
+ if row["ms2_scans"] is None:
354
+ continue
355
+ elif use_cache:
356
+ spect = row["ms2_specs"]
357
+ if spect is None:
358
+ # No cached spectra, fall through to fetch from scan_uid
359
+ use_cache = False
360
+ else:
361
+ # check if spec is a list of spectra
362
+ if isinstance(spect, list):
363
+ if selection == "best":
364
+ s = spect[0]
365
+ scan_uid = row["ms2_scans"][0]
366
+ s.energy = self.get_spectrum(scan_uid).energy
367
+ spect = [s]
368
+ scan_uids = [scan_uid]
369
+ else:
370
+ scan_uids = row["ms2_scans"]
371
+
372
+ for i, s in enumerate(spect):
373
+ if s is None:
374
+ print(
375
+ f"No MS2 spectrum for feature {feature_uid} is cached.",
376
+ )
377
+ continue
378
+ # check if s is a spectrum
379
+ if type(s).__name__ == "Spectrum":
380
+ s = filter_peaks(
381
+ s,
382
+ inty_min=inty_min,
383
+ q1_min=q1_ratio_min,
384
+ eic_min=eic_corr_min,
385
+ q1_max=q1_ratio_max,
386
+ )
387
+ # Get the corresponding scan_uid from the list
388
+ current_scan_uid = scan_uids[i] if i < len(scan_uids) else "unknown"
389
+ write_ion(
390
+ f,
391
+ f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{current_scan_uid}",
392
+ feature_uid,
393
+ mz,
394
+ rt,
395
+ charge,
396
+ s,
397
+ )
398
+ c += 1
399
+ continue # Skip the rest of the processing for this feature
400
+
401
+ # If we reach here, either use_cache=False or no cached spectra were available
402
+ if split_energy:
403
+ # get energy of all scans with scan_uid in ms2_scans by fetching them
404
+ ms2_scan_uids = row["ms2_scans"]
405
+ if isinstance(ms2_scan_uids, list) and len(ms2_scan_uids) > 0:
406
+ # Fetch spectra to get energy information
407
+ spectra_with_energy = []
408
+ for scan_uid in ms2_scan_uids:
409
+ spec = self.get_spectrum(scan_uid)
410
+ if spec is not None:
411
+ spectra_with_energy.append((scan_uid, spec.energy if hasattr(spec, 'energy') else 0))
412
+
413
+ # Group by energy
414
+ energy_groups: dict[float, list[int]] = {}
415
+ for scan_uid, energy in spectra_with_energy:
416
+ if energy not in energy_groups:
417
+ energy_groups[energy] = []
418
+ energy_groups[energy].append(scan_uid)
419
+
420
+ for energy, scan_uids_for_energy in energy_groups.items():
421
+ if selection == "best":
422
+ # Keep only the first scan for this energy
423
+ scan_uids_for_energy = [scan_uids_for_energy[0]]
424
+
425
+ for scan_uid in scan_uids_for_energy:
426
+ spect = self.get_spectrum(
427
+ scan_uid,
428
+ centroid=centroid,
429
+ deisotope=deisotope,
430
+ precursor_trim=precursor_trim,
431
+ centroid_algo=centroid_algo,
432
+ )
433
+ spect = filter_peaks(
434
+ spect,
435
+ inty_min=inty_min,
436
+ q1_min=q1_ratio_min,
437
+ eic_min=eic_corr_min,
438
+ q1_max=q1_ratio_max,
439
+ )
440
+ write_ion(
441
+ f,
442
+ f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{scan_uid}, energy:{energy}",
443
+ feature_uid,
444
+ mz,
445
+ rt,
446
+ charge,
447
+ spect,
448
+ )
449
+ c += 1
450
+ else:
451
+ if selection == "best":
452
+ ms2_scans = row["ms2_scans"][0]
453
+ spect = self.get_spectrum(
454
+ ms2_scans,
455
+ centroid=centroid,
456
+ deisotope=deisotope,
457
+ precursor_trim=precursor_trim,
458
+ centroid_algo=centroid_algo,
459
+ )
460
+ spect = filter_peaks(
461
+ spect,
462
+ inty_min=inty_min,
463
+ q1_min=q1_ratio_min,
464
+ eic_min=eic_corr_min,
465
+ q1_max=q1_ratio_max,
466
+ )
467
+ write_ion(
468
+ f,
469
+ f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
470
+ feature_uid,
471
+ mz,
472
+ rt,
473
+ charge,
474
+ spect,
475
+ )
476
+ c += 1
477
+ elif selection == "all":
478
+ if merge:
479
+ specs = []
480
+ for ms2_scans in row["ms2_scans"]:
481
+ specs.append(
482
+ self.get_spectrum(
483
+ ms2_scans,
484
+ centroid=centroid,
485
+ deisotope=deisotope,
486
+ precursor_trim=precursor_trim,
487
+ ),
488
+ )
489
+ spect = combine_peaks(specs)
490
+ if centroid:
491
+ spect = spect.denoise()
492
+ if spect.ms_level == 1:
493
+ spect = spect.centroid(
494
+ tolerance=self.parameters["mz_tol_ms1_da"],
495
+ ppm=self.parameters["mz_tol_ms1_ppm"],
496
+ min_points=self.parameters["centroid_min_points_ms1"],
497
+ algo=centroid_algo,
498
+ )
499
+ elif spect.ms_level == 2:
500
+ spect = spect.centroid(
501
+ tolerance=self.parameters["mz_tol_ms2_da"],
502
+ ppm=self.parameters["mz_tol_ms2_ppm"],
503
+ min_points=self.parameters["centroid_min_points_ms2"],
504
+ algo=centroid_algo,
505
+ )
506
+ if deisotope:
507
+ spect = spect.deisotope()
508
+ title = f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, merged"
509
+ spect = filter_peaks(
510
+ spect,
511
+ inty_min=inty_min,
512
+ q1_min=q1_ratio_min,
513
+ eic_min=eic_corr_min,
514
+ q1_max=q1_ratio_max,
515
+ )
516
+ write_ion(
517
+ f,
518
+ title,
519
+ feature_uid,
520
+ mz,
521
+ rt,
522
+ charge,
523
+ spect,
524
+ )
525
+ c += 1
526
+ else:
527
+ for ms2_scans in row["ms2_scans"]:
528
+ spect = self.get_spectrum(
529
+ ms2_scans,
530
+ centroid=centroid,
531
+ deisotope=deisotope,
532
+ precursor_trim=precursor_trim,
533
+ centroid_algo=centroid_algo,
534
+ )
535
+ spect = filter_peaks(
536
+ spect,
537
+ inty_min=inty_min,
538
+ q1_min=q1_ratio_min,
539
+ eic_min=eic_corr_min,
540
+ q1_max=q1_ratio_max,
541
+ )
542
+ write_ion(
543
+ f,
544
+ f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
545
+ feature_uid,
546
+ mz,
547
+ rt,
548
+ charge,
549
+ spect,
550
+ )
551
+ c += 1
552
+
553
+ self.logger.info(f"Exported {c} features to {filename}")
554
+
555
+ # Handle None values in logging
556
+ inty_min_str = f"{inty_min:.3f}" if inty_min != float("-inf") else "None"
557
+ q1_ratio_min_str = f"{q1_ratio_min:.3f}" if q1_ratio_min is not None else "None"
558
+ eic_corr_min_str = f"{eic_corr_min:.3f}" if eic_corr_min is not None else "None"
559
+
560
+ self.logger.debug(
561
+ f"MGF created with int>{inty_min_str}, q1_ratio>{q1_ratio_min_str}, eic_corr>{eic_corr_min_str}",
562
+ )
563
+ self.logger.debug(
564
+ f"- Exported {c} MS2 spectra for {len(features_list) - skip} precursors. Average spectra/feature is {c / (len(features_list) - skip + 0.000000001):.0f}",
565
+ )
566
+ self.logger.debug(
567
+ f"- Skipped {skip} features because no MS2 scans were available.",
568
+ )
569
+
570
+
571
+ def export_dda_stats(self, filename="stats.csv"):
572
+ """
573
+ Save DDA statistics into a CSV file.
574
+
575
+ This method computes basic statistics from the DDA analysis, such as:
576
+ - Total number of MS1 scans.
577
+ - Total number of MS2 scans.
578
+ - Total number of detected features.
579
+ - Number of features linked with MS2 data.
580
+ - Average cycle time (if available in the scans data).
581
+
582
+ The resulting statistics are saved in CSV format.
583
+
584
+ Parameters:
585
+ filename (str): The name/path of the CSV file to be saved. Defaults to "stats.csv".
586
+
587
+ Returns:
588
+ None
589
+ """
590
+ # Compute counts from scans_df and features_df
591
+ ms1_count = len(self.scans_df.filter(pl.col("ms_level") == 1))
592
+ ms2_count = len(self.scans_df.filter(pl.col("ms_level") == 2))
593
+ features_count = len(self.features_df) if self.features_df is not None else 0
594
+ features_with_ms2 = (
595
+ self.features_df.filter(pl.col("ms2_scans").is_not_null()).height if self.features_df is not None else 0
596
+ )
597
+
598
+ # Initialize a dictionary to hold statistics
599
+ stats = {
600
+ "MS1_scans": ms1_count,
601
+ "MS2_scans": ms2_count,
602
+ "Total_features": features_count,
603
+ "Features_with_MS2": features_with_ms2,
604
+ }
605
+
606
+ # Calculate the average cycle time if available.
607
+ if "time_cycle" in self.scans_df.columns:
608
+ ms1_df = self.scans_df.filter(pl.col("ms_level") == 1)
609
+ avg_cycle_time = ms1_df["time_cycle"].mean()
610
+ stats["Average_cycle_time"] = avg_cycle_time if avg_cycle_time is not None else ""
611
+ else:
612
+ stats["Average_cycle_time"] = 0
613
+
614
+ # Convert stats dict to a Pandas DataFrame and save as CSV.
615
+ df_stats = pd.DataFrame(list(stats.items()), columns=["Metric", "Value"])
616
+ df_stats.to_csv(filename, index=False)
617
+ lines = []
618
+ lines.append(f"Filename,{self.file_path}")
619
+ lines.append(
620
+ f"Number of cycles,{len(self.scans_df.filter(pl.col('ms_level') == 1))}",
621
+ )
622
+ lines.append(
623
+ f"Number of MS2 scans,{len(self.scans_df.filter(pl.col('ms_level') == 2))}",
624
+ )
625
+ # retrieve scans with ms_level == 1 from scans_df
626
+ ms1 = self.scans_df.filter(pl.col("ms_level") == 1)
627
+ lines.append(f"Maximal number of MS2 scans per cycle (N),{ms1['ms2_n'].max()}")
628
+ # average number of MS2 scans per cycle, skip null values
629
+ ms2n_mean = ms1.filter(pl.col("ms2_n") >= 0)["ms2_n"].mean()
630
+ lines.append(f"Average number of MS2 scans per cycle,{ms2n_mean:.0f}")
631
+ lines.append(f"Maximal cycle time,{ms1['time_cycle'].max():.3f}")
632
+ # find spectra with ms2_n = 0
633
+ ms1_ms2_0 = ms1.filter(pl.col("ms2_n") == 0)
634
+ if len(ms1_ms2_0) > 0:
635
+ lines.append(
636
+ f"Average cycle time at MS1-only,{ms1_ms2_0['time_cycle'].mean():.3f}",
637
+ )
638
+ else:
639
+ lines.append("Average cycle time at MS1-only,")
640
+ # find spectra with ms2_n = 1
641
+ ms1_ms2_1 = ms1.filter(pl.col("ms2_n") == 1)
642
+ if len(ms1_ms2_1) > 0:
643
+ lines.append(
644
+ f"Average cycle time with 1 MS2,{ms1_ms2_1['time_cycle'].mean():.3f}",
645
+ )
646
+ else:
647
+ lines.append("Average cycle time with 1 MS2,")
648
+ # find spectra with ms2_n = 2
649
+ ms1_ms2_2 = ms1.filter(pl.col("ms2_n") == 2)
650
+ if len(ms1_ms2_2) > 0:
651
+ lines.append(
652
+ f"Average cycle time with 2 MS2,{ms1_ms2_2['time_cycle'].mean():.3f}",
653
+ )
654
+ else:
655
+ lines.append("Average cycle time with 2 MS2,")
656
+ # find spectra with ms2_n = 3
657
+ ms1_ms2_3 = ms1.filter(pl.col("ms2_n") == 3)
658
+ if len(ms1_ms2_3) > 0:
659
+ lines.append(
660
+ f"Average cycle time with 3 MS2,{ms1_ms2_3['time_cycle'].mean():.3f}",
661
+ )
662
+ else:
663
+ lines.append("Average cycle time with 3 MS2,")
664
+ max_ms2_n = ms1["ms2_n"].max()
665
+ ms1_ms2_n1 = ms1.filter(pl.col("ms2_n") == max_ms2_n - 1)
666
+ if len(ms1_ms2_n1) > 0:
667
+ lines.append(
668
+ f"Average cycle time with N-1 MS2,{ms1_ms2_n1['time_cycle'].mean():.3f}",
669
+ )
670
+ else:
671
+ lines.append("Average cycle time with N-1 MS2,")
672
+ # find spectra with maximal ms2_n
673
+ ms1_max_ms2_n = ms1.filter(pl.col("ms2_n") == max_ms2_n)
674
+ lines.append(
675
+ f"Average cycle time with N MS2,{ms1_max_ms2_n['time_cycle'].mean():.3f}",
676
+ )
677
+ # average MS1-to-MS1 scan time, skip null values
678
+ a = ms1.filter(pl.col("time_ms1_to_ms1") >= 0)["time_ms1_to_ms1"].mean()
679
+ if a is not None:
680
+ lines.append(f"Average MS1-to-MS1 scan time,{a:.3f}")
681
+ else:
682
+ lines.append("Average MS1-to-MS1 scan time,")
683
+ a = ms1.filter(pl.col("time_ms1_to_ms2") >= 0)["time_ms1_to_ms2"].mean()
684
+ if a is not None:
685
+ lines.append(f"Average MS1-to-MS2 scan time,{a:.3f}")
686
+ else:
687
+ lines.append("Average MS1-to-MS2 scan time,")
688
+ ms2_mean = ms1.filter(pl.col("time_ms2_to_ms2") >= 0)["time_ms2_to_ms2"].mean()
689
+ if ms2_mean is not None:
690
+ lines.append(f"Average MS2-to-MS2 scan time,{ms2_mean:.3f}")
691
+ else:
692
+ lines.append("Average MS2-to-MS2 scan time,")
693
+ a = ms1.filter(pl.col("time_ms2_to_ms1") >= 0)["time_ms2_to_ms1"].mean()
694
+ if a is not None:
695
+ lines.append(f"Average MS2-to-MS1 scan time,{a:.3f}")
696
+ else:
697
+ lines.append("Average MS2-to-MS1 scan time,")
698
+ # number of features
699
+ if self.features_df is not None:
700
+ lines.append(f"Number of features,{self.features_df.height}")
701
+ a = self.features_df.filter(pl.col("ms2_scans").is_not_null()).height
702
+ lines.append(f"Number of features with MS2 data,{a}")
703
+ b = self.scans_df.filter(pl.col("feature_uid") >= 0).height
704
+ lines.append(f"Number of MS2 scans with features,{b}")
705
+ if a > 0:
706
+ lines.append(f"Redundancy of MS2 scans with features,{b / a:.3f}")
707
+ else:
708
+ lines.append("Redundancy of MS2 scans with features,")
709
+ else:
710
+ lines.append("Number of features,")
711
+ lines.append("Number of features with MS2 data,")
712
+ lines.append("Number of MS2 scans with features,")
713
+ lines.append("Redundancy of MS2 scans with features,")
714
+
715
+ # write to file
716
+ with open(filename, "a") as f:  # append, keeping the stats table written above
717
+ for line in lines:
718
+ f.write(line + "\n")
719
+
720
+ self.logger.info(f"DDA statistics exported to {filename}")
721
+
722
+
723
+ def export_chrom(self, filename="chrom.csv"):
724
+ # saves self.chrom_df to a csv file, removing the scan_uid and chrom columns if they exist
725
+ if self.chrom_df is None:
726
+ self.logger.warning("No chromatogram definitions found.")
727
+ return
728
+ data = self.chrom_df.clone()
729
+ # Convert to pandas for CSV export
730
+ if hasattr(data, "to_pandas"):
731
+ data = data.to_pandas()
732
+ # remove scan_uid and chrom columns if they exist
733
+ if "scan_uid" in data.columns:
734
+ data = data.drop("scan_uid")
735
+ if "chrom" in data.columns:
736
+ data = data.drop("chrom")
737
+ data.to_csv(filename, index=False)
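
For readers skimming the `filter_peaks` helper above (unchanged between 0.2.5 and 0.3.0): it builds one boolean mask from the intensity, `q1_ratio`, and `eic_corr` thresholds, then applies that mask to every list- or array-valued attribute whose length matches the peak count, so all per-peak arrays stay aligned. A self-contained sketch of the same pattern, using a stand-in `Spec` class (hypothetical, simplified to two thresholds and mutating in place rather than copying):

```python
import numpy as np

class Spec:
    """Stand-in for masster's Spectrum: parallel per-peak arrays."""
    def __init__(self, mz, inty, q1_ratio):
        self.mz = np.asarray(mz, dtype=float)
        self.inty = np.asarray(inty, dtype=float)
        self.q1_ratio = np.asarray(q1_ratio, dtype=float)

def filter_peaks(spec, inty_min=None, q1_min=None):
    n = len(spec.mz)
    mask = np.ones(n, dtype=bool)           # start by keeping every peak
    if inty_min is not None:
        mask &= spec.inty >= inty_min       # intensity threshold
    if q1_min is not None and hasattr(spec, "q1_ratio"):
        mask &= spec.q1_ratio >= q1_min     # quality threshold
    # apply the same mask to every array attribute of matching length,
    # so mz, inty, q1_ratio, ... stay aligned peak-for-peak
    for attr, val in vars(spec).items():
        if isinstance(val, (list, np.ndarray)) and len(val) == n:
            setattr(spec, attr, np.asarray(val)[mask])
    return spec

s = filter_peaks(
    Spec(mz=[100.1, 200.2, 300.3], inty=[5, 50, 500], q1_ratio=[0.1, 0.9, 0.8]),
    inty_min=10, q1_min=0.5,
)
print(s.mz, s.inty)  # [200.2 300.3] [ 50. 500.]
```

Masking all same-length attributes in a single pass is what keeps auxiliary per-peak arrays such as `q1_ratio` and `eic_corr` consistent with `mz` and `inty` after filtering, which the MGF writer then relies on when emitting peak lines.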