masster 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

Files changed (55):
  1. masster/__init__.py +27 -27
  2. masster/_version.py +17 -17
  3. masster/chromatogram.py +497 -503
  4. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
  5. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
  6. masster/logger.py +318 -244
  7. masster/sample/__init__.py +9 -9
  8. masster/sample/defaults/__init__.py +15 -15
  9. masster/sample/defaults/find_adducts_def.py +325 -325
  10. masster/sample/defaults/find_features_def.py +366 -366
  11. masster/sample/defaults/find_ms2_def.py +285 -285
  12. masster/sample/defaults/get_spectrum_def.py +314 -318
  13. masster/sample/defaults/sample_def.py +374 -378
  14. masster/sample/h5.py +1321 -1297
  15. masster/sample/helpers.py +833 -364
  16. masster/sample/lib.py +762 -0
  17. masster/sample/load.py +1220 -1187
  18. masster/sample/parameters.py +131 -131
  19. masster/sample/plot.py +1610 -1622
  20. masster/sample/processing.py +1402 -1416
  21. masster/sample/quant.py +209 -0
  22. masster/sample/sample.py +391 -387
  23. masster/sample/sample5_schema.json +181 -181
  24. masster/sample/save.py +737 -719
  25. masster/sample/sciex.py +1213 -0
  26. masster/spectrum.py +1287 -1319
  27. masster/study/__init__.py +9 -9
  28. masster/study/defaults/__init__.py +21 -19
  29. masster/study/defaults/align_def.py +267 -267
  30. masster/study/defaults/export_def.py +41 -40
  31. masster/study/defaults/fill_chrom_def.py +264 -264
  32. masster/study/defaults/fill_def.py +260 -0
  33. masster/study/defaults/find_consensus_def.py +256 -256
  34. masster/study/defaults/find_ms2_def.py +163 -163
  35. masster/study/defaults/integrate_chrom_def.py +225 -225
  36. masster/study/defaults/integrate_def.py +221 -0
  37. masster/study/defaults/merge_def.py +256 -0
  38. masster/study/defaults/study_def.py +272 -269
  39. masster/study/export.py +674 -287
  40. masster/study/h5.py +1398 -886
  41. masster/study/helpers.py +1650 -433
  42. masster/study/helpers_optimized.py +317 -0
  43. masster/study/load.py +1201 -1078
  44. masster/study/parameters.py +99 -99
  45. masster/study/plot.py +632 -645
  46. masster/study/processing.py +1057 -1046
  47. masster/study/save.py +149 -134
  48. masster/study/study.py +606 -522
  49. masster/study/study5_schema.json +247 -241
  50. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
  51. masster-0.3.0.dist-info/RECORD +59 -0
  52. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
  53. masster-0.2.4.dist-info/RECORD +0 -50
  54. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
  55. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
masster/sample/load.py CHANGED
@@ -1,1187 +1,1220 @@
1
- """
2
- _import.py
3
-
4
- This module provides data import functionality for mass spectrometry files.
5
- It handles loading and processing of various mass spectrometry file formats
6
- including mzML, vendor formats (WIFF, RAW).
7
-
8
- Key Features:
9
- - **Multi-Format Support**: Load mzML, WIFF (SCIEX), and RAW (Thermo) files.
10
- - **File Validation**: Check file existence and format compatibility.
11
- - **Memory Management**: Support for on-disk and in-memory data handling.
12
- - **Metadata Extraction**: Extract acquisition parameters and instrument information.
13
- - **Error Handling**: Comprehensive error reporting for file loading issues.
14
- - **Raw Data Processing**: Handle centroided and profile data with signal smoothing.
15
-
16
- Dependencies:
17
- - `pyopenms`: For standard mass spectrometry file format support.
18
- - `polars` and `pandas`: For efficient data handling and manipulation.
19
- - `numpy`: For numerical array operations.
20
- - `pickle` and `bz2`: For custom format compression and serialization.
21
-
22
- Functions:
23
- - `load()`: Main file loading function with format detection.
24
- - `_load_mzML()`: Specialized mzML file loader.
25
- - `_load_wiff()`: SCIEX WIFF file loader.
26
- - `_load_raw()`: Thermo RAW file loader.
27
- - `_load_raw()`: Thermo RAW file loader.
28
-
29
- Supported File Formats:
30
- - mzML (open standard format)
31
- - WIFF (SCIEX vendor format)
32
- - RAW (Thermo proprietary format)
33
-
34
- See Also:
35
- - `parameters._import_parameters`: For import-specific parameter configuration.
36
- - `_export.py`: For data export functionality.
37
- - `single.py`: For using imported data with ddafile class.
38
-
39
- """
40
-
41
- import bz2
42
- import os
43
- import pickle
44
-
45
- from datetime import datetime
46
-
47
- import numpy as np
48
- import pandas as pd
49
- import polars as pl
50
- import pyopenms as oms
51
-
52
- from tqdm import tqdm
53
-
54
- from masster.chromatogram import Chromatogram
55
- # Parameters removed - using hardcoded defaults
56
- from masster.spectrum import Spectrum
57
-
58
-
59
- def load(
60
- self,
61
- file=None,
62
- ondisk=False,
63
- type=None,
64
- label=None,
65
- ):
66
- '''
67
- Load file content from a specified filename.
68
- Parameters:
69
- filename (str): The path to the file to load. The file must exist and have one of the following extensions:
70
- .mzML, .wiff, or .raw.
71
- ondisk (bool, optional): Indicates whether the file should be treated as on disk. Defaults to False.
72
- type (str, optional): Specifies the type of file. If provided and set to 'ztscan' (case-insensitive), the file_type
73
- attribute will be adjusted accordingly. Defaults to None.
74
- label (Any, optional): An optional label to associate with the loaded file. Defaults to None.
75
- Raises:
76
- FileNotFoundError: If the file specified by filename does not exist.
77
- ValueError: If the file extension is not one of the supported types (.mzML, .wiff, or .raw).
78
- Notes:
79
- The function determines the appropriate internal loading mechanism based on the file extension:
80
- - ".mzml": Calls _load_mzML(filename)
81
- - ".wiff": Calls _load_wiff(filename)
82
- - ".raw": Calls _load_raw(filename)
83
- After loading, the file_type attribute is set to 'dda', unless the optional 'type' parameter is provided as 'ztscan',
84
- in which case it is updated to 'ztscan'. The label attribute is updated if a label is provided.
85
- '''
86
-
87
- self.ondisk = ondisk
88
- file = file.strip()
89
- # check if file exists
90
- if not os.path.exists(file):
91
- raise FileNotFoundError(f"File {file} not found.")
92
- # check if file is mzML
93
- if file.lower().endswith(".mzml"):
94
- self._load_mzML(file)
95
- elif file.lower().endswith(".wiff") or file.lower().endswith(".wiff2"):
96
- self._load_wiff(file)
97
- elif file.lower().endswith(".raw"):
98
- self._load_raw(file)
99
- elif file.lower().endswith(".sample5"):
100
- self._load_sample5(file)
101
- elif file.lower().endswith(".h5"):
102
- self._load_h5(file)
103
- else:
104
- raise ValueError("File must be .mzML, .wiff, .sample5")
105
-
106
- self.file_type = "dda"
107
- if type is not None and type.lower() in ["ztscan"]:
108
- self.file_type = "ztscan"
109
-
110
- if label is not None:
111
- self.label = label
112
-
113
-
114
- def _load_mzML(
115
- self,
116
- filename=None,
117
- ):
118
- """
119
- Load an mzML file and process its spectra.
120
- This method loads an mzML file (if a filename is provided, it will update the internal file path) using either an on-disk or in-memory MS experiment depending on the object's "ondisk" flag. It then iterates over all the spectra in the experiment:
121
- - For MS level 1 spectra, it increments a cycle counter and creates a polars DataFrame containing the retention time, m/z values, and intensity values.
122
- - For higher MS level spectra, it processes precursor-related information such as precursor m/z, isolation window offsets, intensity, and activation energy.
123
- Each spectrum is further processed by computing its baseline, denoising based on the baseline, and extracting various scan properties (such as TIC, minimum/maximum intensity, m/z bounds, etc.). This scan information is appended to a list.
124
- After processing all spectra, the method consolidates the collected scan data into a polars DataFrame with an explicit schema. It also assigns the on-disk/in-memory experiment object and corresponding file interface to instance attributes. The method sets a label based on the file basename, and, unless the scan type is 'ztscan', calls an additional analysis routine (analyze_dda).
125
- Parameters:
126
- filename (str, optional): The path to the mzML file to load. If None, the existing file path attribute is used.
127
- Returns:
128
- None
129
- Side Effects:
130
- - Updates self.file_path if a new filename is provided.
131
- - Loads and stores the MS experiment in self.file_obj.
132
- - Sets self.file_interface to the string 'oms'.
133
- - Stores the processed scan data in self.scans_df.
134
- - Maintains MS1-specific data in self.ms1_df.
135
- - Updates the instance label based on the loaded file's basename.
136
- - Invokes the analyze_dda method if the scan type is not 'ztscan'.
137
- """
138
- if filename is not None:
139
- self.file_path = filename
140
-
141
- self.logger.info(f"Loading {filename}")
142
-
143
- omsexp: oms.OnDiscMSExperiment | oms.MSExperiment
144
- if self.ondisk:
145
- omsexp = oms.OnDiscMSExperiment()
146
- self.file_obj = omsexp
147
- else:
148
- omsexp = oms.MSExperiment()
149
- oms.MzMLFile().load(self.file_path, omsexp)
150
- self.file_obj = omsexp
151
-
152
- scans = []
153
- cycle = 0
154
- schema = {
155
- "cycle": pl.Int32,
156
- "scan_uid": pl.Int64,
157
- "rt": pl.Float64,
158
- "mz": pl.Float64,
159
- "inty": pl.Float64,
160
- }
161
- # create a polars DataFrame with explicit schema: cycle: int, rt: float, mz: float, intensity: float
162
- ms1_df = pl.DataFrame(
163
- {"cycle": [], "scan_uid": [], "rt": [], "mz": [], "inty": []},
164
- schema=schema,
165
- )
166
-
167
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
168
- # iterate over all spectra
169
- for i, s in tqdm(
170
- enumerate(omsexp.getSpectra()), # type: ignore[union-attr]
171
- total=omsexp.getNrSpectra(),
172
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Scans",
173
- disable=tdqm_disable,
174
- ):
175
- # create a dict
176
- if s.getMSLevel() == 1:
177
- cycle += 1
178
- prec_mz = None
179
- precursorIsolationWindowLowerMZ = None
180
- precursorIsolationWindowUpperMZ = None
181
- prec_intyensity = None
182
- energy = None
183
- else:
184
- prec_mz = s.getPrecursors()[0].getMZ()
185
- precursorIsolationWindowLowerMZ = s.getPrecursors()[
186
- 0
187
- ].getIsolationWindowLowerOffset()
188
- precursorIsolationWindowUpperMZ = s.getPrecursors()[
189
- 0
190
- ].getIsolationWindowUpperOffset()
191
- prec_intyensity = s.getPrecursors()[0].getIntensity()
192
- energy = s.getPrecursors()[0].getActivationEnergy()
193
-
194
- peaks = s.get_peaks()
195
- spect = Spectrum(mz=peaks[0], inty=peaks[1], ms_level=s.getMSLevel())
196
-
197
- bl = spect.baseline()
198
- spect = spect.denoise(threshold=bl)
199
-
200
- if spect.ms_level == 1:
201
- mz = np.array(spect.mz)
202
- median_diff = np.median(np.diff(np.sort(mz))) if mz.size > 1 else None
203
-
204
- if median_diff is not None and median_diff < 0.01:
205
- spect = spect.centroid(
206
- tolerance=self.parameters.mz_tol_ms1_da,
207
- ppm=self.parameters.mz_tol_ms1_ppm,
208
- min_points=self.parameters.centroid_min_points_ms1,
209
- )
210
-
211
- newscan = {
212
- "scan_uid": i,
213
- "cycle": cycle,
214
- "ms_level": int(s.getMSLevel()),
215
- "rt": s.getRT(),
216
- "inty_tot": spect.tic(),
217
- "inty_min": spect.inty_min(),
218
- "inty_max": spect.inty_max(),
219
- "bl": bl,
220
- "mz_min": spect.mz_min(),
221
- "mz_max": spect.mz_max(),
222
- "comment": s.getComment(),
223
- "name": s.getName(),
224
- "id": s.getNativeID(),
225
- "prec_mz": prec_mz,
226
- "prec_mz_min": precursorIsolationWindowLowerMZ,
227
- "prec_mz_max": precursorIsolationWindowUpperMZ,
228
- "prec_inty": prec_intyensity,
229
- "energy": energy,
230
- "feature_uid": -1,
231
- }
232
-
233
- scans.append(newscan)
234
-
235
- if s.getMSLevel() == 1 and len(peaks) > 0:
236
- newms1_df = pl.DataFrame(
237
- {
238
- "cycle": cycle,
239
- "scan_uid": i,
240
- "rt": s.getRT(),
241
- "mz": spect.mz,
242
- "inty": spect.inty,
243
- },
244
- schema=schema,
245
- )
246
- ms1_df = pl.concat([ms1_df, newms1_df])
247
-
248
- # convert to polars DataFrame with explicit schema and store in self.scans_df
249
- self.scans_df = pl.DataFrame(
250
- scans,
251
- schema={
252
- "scan_uid": pl.Int64,
253
- "cycle": pl.Int64,
254
- "ms_level": pl.Int64,
255
- "rt": pl.Float64,
256
- "inty_tot": pl.Float64,
257
- "inty_min": pl.Float64,
258
- "inty_max": pl.Float64,
259
- "bl": pl.Float64,
260
- "mz_min": pl.Float64,
261
- "mz_max": pl.Float64,
262
- "comment": pl.Utf8,
263
- "name": pl.Utf8,
264
- "id": pl.Utf8,
265
- "prec_mz": pl.Float64,
266
- "prec_mz_min": pl.Float64,
267
- "prec_mz_max": pl.Float64,
268
- "prec_inty": pl.Float64,
269
- "energy": pl.Float64,
270
- "feature_uid": pl.Int64,
271
- },
272
- infer_schema_length=None,
273
- )
274
- self.file_interface = "oms"
275
- self.ms1_df = ms1_df
276
- self.label = os.path.basename(filename)
277
- if self.file_type != "ztscan":
278
- self.analyze_dda()
279
-
280
-
281
- def _load_raw(
282
- self,
283
- filename=None,
284
- ):
285
- """
286
- Load and process raw spectral data from the given file.
287
- This method reads a Thermo raw file (with '.raw' extension) by utilizing the ThermoRawData class from
288
- the alpharaw.thermo module. It validates the filename, checks for file existence, and then imports and processes
289
- the raw data. The method performs the following tasks:
290
- - Converts retention times (rt) from minutes to seconds and rounds them to 4 decimal places.
291
- - Iterates over each spectrum in the raw data and constructs a list of scan dictionaries.
292
- - For MS level 1 scans, performs centroiding if peaks with intensities > 0 after denoising.
293
- - Creates a Polars DataFrame for all scans (self.scans_df) with detailed spectrum information.
294
- - Aggregates MS1 spectrum peak data into a separate Polars DataFrame (self.ms1_df).
295
- - Sets additional attributes such as file path, raw data object, interface label, and file label.
296
- - Calls the analyze_dda method for further processed data analysis.
297
- Parameters:
298
- filename (str): The path to the raw data file. Must end with ".raw".
299
- Raises:
300
- ValueError: If the provided filename does not end with ".raw".
301
- FileNotFoundError: If the file specified by filename does not exist.
302
- Side Effects:
303
- - Populates self.scans_df with scan data in a Polars DataFrame.
304
- - Populates self.ms1_df with MS1 scan data.
305
- - Updates instance attributes including self.file_path, self.file_obj, self.file_interface, and self.label.
306
- - Initiates further analysis by invoking analyze_dda().
307
- """
308
- from alpharaw.thermo import ThermoRawData
309
-
310
- raw_data = ThermoRawData(centroided=False)
311
- raw_data.keep_k_peaks_per_spec = self.parameters.max_points_per_spectrum
312
- # check thatupdat filename ends with .raw
313
- if not filename.endswith(".raw"):
314
- raise ValueError("filename must end with .raw")
315
- # check that the file exists
316
- if not os.path.exists(filename):
317
- raise FileNotFoundError(f"File {filename} not found.")
318
- self.logger.info(f"Loading {filename}")
319
- raw_data.import_raw(filename)
320
- specs = raw_data.spectrum_df
321
- # convert rt from minutes to seconds, round to 4 decimal places
322
- specs.rt = specs.rt * 60
323
- # TODO this should be an external param
324
- specs.rt = specs.rt.round(4)
325
-
326
- scans = []
327
- cycle = 0
328
- schema = {
329
- "cycle": pl.Int32,
330
- "scan_uid": pl.Int64,
331
- "rt": pl.Float64,
332
- "mz": pl.Float64,
333
- "inty": pl.Float64,
334
- }
335
- # create a polars DataFrame with explicit schema: cycle: int, rt: float, mz: float, intensity: float
336
- ms1_df = pl.DataFrame(
337
- {"cycle": [], "scan_uid": [], "rt": [], "mz": [], "inty": []},
338
- schema=schema,
339
- )
340
- # iterate over rows of specs
341
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
342
- for i, s in tqdm(
343
- specs.iterrows(),
344
- total=len(specs),
345
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Scans",
346
- disable=tdqm_disable,
347
- ):
348
- # create a dict
349
- if s["ms_level"] == 1:
350
- cycle += 1
351
- prec_mz = None
352
- precursorIsolationWindowLowerMZ = None
353
- precursorIsolationWindowUpperMZ = None
354
- prec_intyensity = None
355
- energy = None
356
- else:
357
- prec_mz = s["precursor_mz"]
358
- precursorIsolationWindowLowerMZ = s["isolation_lower_mz"]
359
- precursorIsolationWindowUpperMZ = s["isolation_upper_mz"]
360
- prec_intyensity = None
361
- energy = s["nce"]
362
-
363
- peak_start_idx = s["peak_start_idx"]
364
- peak_stop_idx = s["peak_stop_idx"]
365
- peaks = raw_data.peak_df.loc[peak_start_idx : peak_stop_idx - 1]
366
- spect = Spectrum(
367
- mz=peaks.mz.values,
368
- inty=peaks.intensity.values,
369
- ms_level=s["ms_level"],
370
- centroided=False,
371
- )
372
- # remove peaks with intensity <= 0
373
-
374
- bl = spect.baseline()
375
- spect = spect.denoise(threshold=bl)
376
- if spect.ms_level == 1:
377
- spect = spect.centroid(
378
- tolerance=self.parameters.mz_tol_ms1_da,
379
- ppm=self.parameters.mz_tol_ms1_ppm,
380
- min_points=self.parameters.centroid_min_points_ms1,
381
- )
382
- newscan = {
383
- "scan_uid": i,
384
- "cycle": cycle,
385
- "ms_level": int(s["ms_level"]),
386
- "rt": s["rt"],
387
- "inty_tot": spect.tic(),
388
- "inty_min": spect.inty_min(),
389
- "inty_max": spect.inty_max(),
390
- "bl": bl,
391
- "mz_min": spect.mz_min(),
392
- "mz_max": spect.mz_max(),
393
- "comment": "",
394
- "name": "",
395
- "id": "",
396
- "prec_mz": prec_mz,
397
- "prec_mz_min": precursorIsolationWindowLowerMZ,
398
- "prec_mz_max": precursorIsolationWindowUpperMZ,
399
- "prec_inty": prec_intyensity,
400
- "energy": energy,
401
- "feature_uid": -1,
402
- }
403
-
404
- scans.append(newscan)
405
-
406
- if s["ms_level"] == 1 and len(peaks) > 0:
407
- newms1_df = pl.DataFrame(
408
- {
409
- "cycle": cycle,
410
- "scan_uid": i,
411
- "rt": s["rt"],
412
- "mz": spect.mz,
413
- "inty": spect.inty,
414
- },
415
- schema=schema,
416
- )
417
- ms1_df = pl.concat([ms1_df, newms1_df])
418
-
419
- # convert to polars DataFrame with explicit schema and store in self.scans_df
420
- self.scans_df = pl.DataFrame(
421
- scans,
422
- schema={
423
- "scan_uid": pl.Int64,
424
- "cycle": pl.Int64,
425
- "ms_level": pl.Int64,
426
- "rt": pl.Float64,
427
- "inty_tot": pl.Float64,
428
- "inty_min": pl.Float64,
429
- "inty_max": pl.Float64,
430
- "bl": pl.Float64,
431
- "mz_min": pl.Float64,
432
- "mz_max": pl.Float64,
433
- "comment": pl.Utf8,
434
- "name": pl.Utf8,
435
- "id": pl.Utf8,
436
- "prec_mz": pl.Float64,
437
- "prec_mz_min": pl.Float64,
438
- "prec_mz_max": pl.Float64,
439
- "prec_inty": pl.Float64,
440
- "energy": pl.Float64,
441
- "feature_uid": pl.Int64,
442
- },
443
- infer_schema_length=None,
444
- )
445
- self.file_path = filename
446
- self.file_obj = raw_data
447
- self.file_interface = "alpharaw"
448
- self.label = os.path.basename(filename)
449
- self.ms1_df = ms1_df
450
- self.analyze_dda()
451
-
452
-
453
- def _load_wiff(
454
- self,
455
- filename=None,
456
- ):
457
- try:
458
- from alpharaw.sciex import SciexWiffData
459
- except ImportError:
460
- # Fallback to masster's own implementation
461
- from masster.sample.sciex import SciexWiffData
462
-
463
- raw_data = SciexWiffData(centroided=False)
464
- raw_data.keep_k_peaks_per_spec = self.parameters.max_points_per_spectrum
465
-
466
- if not filename.endswith(".wiff"):
467
- raise ValueError("filename must end with .wiff")
468
- if not os.path.exists(filename):
469
- raise FileNotFoundError(f"File {filename} not found.")
470
-
471
- self.logger.info(f"Loading {filename}")
472
- raw_data.import_raw(filename)
473
- # metadata = _wiff_to_dict(filename)
474
-
475
- specs = raw_data.spectrum_df
476
- specs.rt = specs.rt * 60
477
- # TODO this should be an external param
478
- specs.rt = specs.rt.round(4)
479
-
480
- algo = self.parameters.centroid_algo
481
-
482
- scans = []
483
- ms1_df_records = []
484
- cycle = 0
485
- schema = {
486
- "cycle": pl.Int32,
487
- "scan_uid": pl.Int64,
488
- "rt": pl.Float64,
489
- "mz": pl.Float64,
490
- "inty": pl.Float64,
491
- }
492
-
493
- # iterate over rows of specs
494
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
495
- for i, s in tqdm(
496
- specs.iterrows(),
497
- total=len(specs),
498
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Scans",
499
- disable=tdqm_disable,
500
- ):
501
- ms_level = s["ms_level"]
502
- if ms_level == 1:
503
- cycle += 1
504
- prec_mz = None
505
- precursorIsolationWindowLowerMZ = None
506
- precursorIsolationWindowUpperMZ = None
507
- prec_intyensity = None
508
- energy = None
509
- else:
510
- prec_mz = s["precursor_mz"]
511
- precursorIsolationWindowLowerMZ = s["isolation_lower_mz"]
512
- precursorIsolationWindowUpperMZ = s["isolation_upper_mz"]
513
- prec_intyensity = None
514
- energy = s["nce"]
515
-
516
- peak_start_idx = s["peak_start_idx"]
517
- peak_stop_idx = s["peak_stop_idx"]
518
- peaks = raw_data.peak_df.loc[peak_start_idx : peak_stop_idx - 1]
519
- spect = Spectrum(
520
- mz=peaks.mz.values,
521
- inty=peaks.intensity.values,
522
- ms_level=ms_level,
523
- centroided=False,
524
- )
525
- bl = spect.baseline()
526
- spect = spect.denoise(threshold=bl)
527
- if ms_level == 1:
528
- spect = spect.centroid(
529
- algo=algo,
530
- tolerance=self.parameters.mz_tol_ms1_da,
531
- ppm=self.parameters.mz_tol_ms1_ppm,
532
- min_points=self.parameters.centroid_min_points_ms1,
533
- )
534
- scans.append(
535
- {
536
- "scan_uid": i,
537
- "cycle": cycle,
538
- "ms_level": int(ms_level),
539
- "rt": s["rt"],
540
- "inty_tot": spect.tic(),
541
- "inty_min": spect.inty_min(),
542
- "inty_max": spect.inty_max(),
543
- "bl": bl,
544
- "mz_min": spect.mz_min(),
545
- "mz_max": spect.mz_max(),
546
- "comment": "",
547
- "name": "",
548
- "id": "",
549
- "prec_mz": prec_mz,
550
- "prec_mz_min": precursorIsolationWindowLowerMZ,
551
- "prec_mz_max": precursorIsolationWindowUpperMZ,
552
- "prec_inty": prec_intyensity,
553
- "energy": energy,
554
- "feature_uid": -1,
555
- },
556
- )
557
-
558
- if ms_level == 1 and len(peaks) > 0:
559
- # Use extend for all mz/int pairs at once
560
- ms1_df_records.extend(
561
- [
562
- {
563
- "cycle": cycle,
564
- "scan_uid": i,
565
- "rt": s["rt"],
566
- "mz": mz,
567
- "inty": inty,
568
- }
569
- for mz, inty in zip(spect.mz, spect.inty, strict=False)
570
- ],
571
- )
572
-
573
- # Create DataFrames in one go
574
- self.scans_df = pl.DataFrame(
575
- scans,
576
- schema={
577
- "scan_uid": pl.Int64,
578
- "cycle": pl.Int64,
579
- "ms_level": pl.Int64,
580
- "rt": pl.Float64,
581
- "inty_tot": pl.Float64,
582
- "inty_min": pl.Float64,
583
- "inty_max": pl.Float64,
584
- "bl": pl.Float64,
585
- "mz_min": pl.Float64,
586
- "mz_max": pl.Float64,
587
- "comment": pl.Utf8,
588
- "name": pl.Utf8,
589
- "id": pl.Utf8,
590
- "prec_mz": pl.Float64,
591
- "prec_mz_min": pl.Float64,
592
- "prec_mz_max": pl.Float64,
593
- "prec_inty": pl.Float64,
594
- "energy": pl.Float64,
595
- "feature_uid": pl.Int64,
596
- },
597
- infer_schema_length=None,
598
- )
599
- self.file_path = filename
600
- self.file_obj = raw_data
601
- self.file_interface = "alpharaw"
602
- self.label = os.path.basename(filename)
603
- self.ms1_df = pl.DataFrame(ms1_df_records, schema=schema)
604
- if self.file_type != "ztscan":
605
- self.analyze_dda()
606
-
607
-
608
- def _load_featureXML(
609
- self,
610
- filename="features.featureXML",
611
- ):
612
- """
613
- Load feature data from a FeatureXML file.
614
-
615
- This method reads a FeatureXML file (defaulting to "features.featureXML") using the
616
- OMS library's FeatureXMLFile and FeatureMap objects. The loaded feature data is stored
617
- in the instance variable 'features'. The method then converts the feature data into a
618
- DataFrame, optionally excluding peptide identification data, and cleans it using the
619
- '__oms_clean_df' method, saving the cleaned DataFrame into 'features_df'.
620
-
621
- Parameters:
622
- filename (str): The path to the FeatureXML file to load. Defaults to "features.featureXML".
623
-
624
- Returns:
625
- None
626
- """
627
- fh = oms.FeatureXMLFile()
628
- fm = oms.FeatureMap()
629
- fh.load(filename, fm)
630
- self.features = fm
631
- """if self.features_df is None:
632
- df = self.features.get_df(export_peptide_identifications=False)
633
- df = self._clean_features_df(df)
634
-
635
- # desotope features
636
- df = self._features_deisotope(df, mz_tol=0.02, rt_tol=0.5)
637
-
638
- # update eic
639
- df["chrom"] = None
640
- mz_tol = 0.01
641
- rt_tol = 10
642
- # iterate over all rows in df
643
- for i, row in df.iterrows():
644
- # select data in ms1_df with mz in range [mz_start - mz_tol, mz_end + mz_tol] and rt in range [rt_start - rt_tol, rt_end + rt_tol]
645
- d = self.ms1_df.filter(
646
- (pl.col("rt") >= row["rt_start"] - rt_tol)
647
- & (pl.col("rt") <= row["rt_end"] + rt_tol)
648
- & (pl.col("mz") >= row["mz"] - mz_tol)
649
- & (pl.col("mz") <= row["mz"] + mz_tol)
650
- )
651
- # for all unique rt values, find the maximum inty
652
- eic_rt = d.group_by("rt").agg(pl.col("inty").max())
653
- if len(eic_rt) < 4:
654
- continue
655
- eic = Chromatogram(
656
- eic_rt["rt"].to_numpy(),
657
- eic_rt["inty"].to_numpy(),
658
- label=f"EIC mz={row['mz']:.4f}",
659
- feature_start=row["rt_start"],
660
- feature_end=row["rt_end"],
661
- feature_apex=row["rt"],
662
- ).find_peaks()
663
- # set eic in df
664
- df.at[i, "chrom"] = eic
665
- if len(eic.peak_widths) > 0:
666
- df.at[i, "chrom_coherence"] = round(eic.feature_coherence, 3)
667
- df.at[i, "chrom_prominence"] = round(
668
- eic.peak_prominences[0], 3
669
- ) # eic.peak_prominences[0]
670
- df.at[i, "chrom_prominence_scaled"] = round(
671
- eic.peak_prominences[0] / (np.mean(eic.inty) + 1e-10), 3
672
- )
673
- df.at[i, "chrom_height_scaled"] = round(
674
- eic.peak_heights[0] / (np.mean(eic.inty) + 1e-10), 3
675
- )
676
-
677
- self.features_df = df"""
678
-
679
- '''
680
- def _load_mzpkl(
681
- self,
682
- filename="sample.mzpkl",
683
- ondisk=False,
684
- ):
685
- """
686
- Load the mzpkl data file, initialize the experiment attributes, and set up the file object.
687
- Parameters:
688
- filename (str, optional): The path to the .mzpkl file to be loaded. Defaults to "data.mzpkl".
689
- ondisk (bool, optional): A flag indicating whether the data should be loaded for on-disk usage.
690
- If True, self.ondisk is set to True and an OnDiscMSExperiment is used.
691
- Otherwise, an MSExperiment is used.
692
- Side Effects:
693
- - Decompresses and unpickles the specified file.
694
- - Sets attributes on self for each key in the loaded data dictionary, except for keys named 'format'.
695
- - Renames the attribute 'spectra_df' to 'scans_df' if present.
696
- - Initializes self.file_obj as either an OnDiscMSExperiment or MSExperiment based on the ondisk flag.
697
- - Checks for an associated featureXML file (with the same base name as the input file) and loads it if found.
698
- """
699
-
700
- if ondisk is True:
701
- self.ondisk = True
702
-
703
- with bz2.BZ2File(filename, "rb") as f:
704
- data = pickle.load(f)
705
-
706
- for k, v in data.items():
707
- if k in ["format"]:
708
- continue
709
- if k == "spectra_df":
710
- k = "scans_df"
711
- setattr(self, k, v)
712
-
713
- self.sanitize()
714
-
715
- if self.ondisk:
716
- self.file_obj = oms.OnDiscMSExperiment()
717
- else:
718
- self.file_obj = oms.MSExperiment()
719
-
720
- # check if *.featureXML exists
721
- featureXML = filename.replace(".mzpkl", ".featureXML")
722
- if os.path.exists(featureXML):
723
- self._load_featureXML(featureXML)
724
-
725
- '''
726
- def _wiff_to_dict(
727
- filename=None,
728
- ):
729
- from alpharaw.raw_access.pysciexwifffilereader import WillFileReader
730
-
731
- file_reader = WillFileReader(filename)
732
- number_of_samples = len(file_reader.sample_names)
733
- metadata = []
734
- for si in range(number_of_samples):
735
- sample_reader = file_reader._wiff_file.GetSample(si)
736
- number_of_exps = sample_reader.MassSpectrometerSample.ExperimentCount
737
- for ei in range(number_of_exps):
738
- exp_reader = sample_reader.MassSpectrometerSample.GetMSExperiment(ei)
739
-
740
- exp_info = exp_reader.GetMassSpectrumInfo(ei)
741
-
742
- # get the details of the experiment
743
- exp_name = exp_reader.Details.get_ExperimentName()
744
- exp_type = exp_reader.Details.get_ExperimentType()
745
-
746
- IDA_type = exp_reader.Details.get_IDAType()
747
- has_MRM_Pro_Data = exp_reader.Details.get_HasMRMProData()
748
- has_SMRM_Data = exp_reader.Details.get_HasSMRMData()
749
- is_swath = exp_reader.Details.get_IsSwath()
750
- has_dyn_fill_time = exp_reader.Details.get_HasDynamicFillTime()
751
- method_fill_time = exp_reader.Details.get_MethodFillTime()
752
- default_resolution = exp_reader.Details.get_DefaultResolution()
753
- parameters = exp_reader.Details.get_Parameters()
754
- targeted_compound_info = exp_reader.Details.get_TargetedCompoundInfo()
755
- source_type = exp_reader.Details.get_SourceType()
756
- raw_data_type = exp_reader.Details.get_RawDataType()
757
-
758
- number_of_scans = exp_reader.Details.get_NumberOfScans()
759
- scan_group = exp_reader.Details.get_ScanGroup()
760
- spectrum_type = exp_reader.Details.get_SpectrumType()
761
- saturatrion_threshold = exp_reader.Details.get_SaturationThreshold()
762
- polarity = exp_reader.Details.get_Polarity()
763
- mass_range_info = exp_reader.Details.get_MassRangeInfo()
764
- start_mass = exp_reader.Details.get_StartMass()
765
- end_mass = exp_reader.Details.get_EndMass()
766
-
767
- mslevel = exp_info.MSLevel
768
- if mslevel > 1:
769
- # get the precursor information
770
- parent_mz = exp_info.ParentMZ
771
- collision_energy = exp_info.CollisionEnergy
772
- parent_charge_state = exp_info.ParentChargeState
773
- else:
774
- parent_mz = None
775
- collision_energy = None
776
- parent_charge_state = None
777
-
778
- # create a dict with the details
779
- exp_dict = {
780
- "instrument_name": sample_reader.MassSpectrometerSample.get_InstrumentName(),
781
- "sample_id": si,
782
- "experiment_id": ei,
783
- "experiment_name": exp_name,
784
- "experiment_type": exp_type,
785
- "IDA_type": IDA_type,
786
- "has_MRM_Pro_Data": has_MRM_Pro_Data,
787
- "has_SMRM_Data": has_SMRM_Data,
788
- "is_swath": is_swath,
789
- "has_dyn_fill_time": has_dyn_fill_time,
790
- "method_fill_time": method_fill_time,
791
- "default_resolution": default_resolution,
792
- "parameters": parameters,
793
- "targeted_compound_info": targeted_compound_info,
794
- "source_type": source_type,
795
- "raw_data_type": raw_data_type,
796
- "number_of_scans": number_of_scans,
797
- "scan_group": scan_group,
798
- "spectrum_type": spectrum_type,
799
- "saturatrion_threshold": saturatrion_threshold,
800
- "polarity": polarity,
801
- "mass_range_info": mass_range_info,
802
- "start_mass": start_mass,
803
- "end_mass": end_mass,
804
- "mslevel": mslevel,
805
- "parent_mz": parent_mz,
806
- "collision_energy": collision_energy,
807
- "parent_charge_state": parent_charge_state,
808
- }
809
- metadata.append(exp_dict)
810
- # convert to pandas DataFrame
811
- metadata = pd.DataFrame(metadata)
812
-
813
- return metadata
814
-
815
-
816
def sanitize(self):
    """
    Rebuild stale chromatogram and MS2 spectrum objects stored in ``self.features_df``.

    Every row is inspected. A ``chrom`` entry that is set but is not a
    ``Chromatogram`` instance is replaced by a fresh ``Chromatogram`` populated
    from the old object's ``__dict__``. When ``ms2_specs`` holds a list, each
    element that is not a ``Spectrum`` instance is replaced in that list the
    same way. Nothing happens when ``self.features_df`` is None.

    Returns:
        None: ``self.features_df`` is updated in place.
    """
    if self.features_df is None:
        return

    for idx, row in self.features_df.iterrows():
        chrom = row["chrom"]
        if not (chrom is None or isinstance(chrom, Chromatogram)):
            # start from an empty chromatogram and copy the old state over
            rebuilt_chrom = Chromatogram(rt=np.array([]), inty=np.array([]))
            rebuilt_chrom.from_dict(chrom.__dict__)
            self.features_df.at[idx, "chrom"] = rebuilt_chrom

        specs = row["ms2_specs"]
        if not isinstance(specs, list):
            # None (or any non-list value) carries no spectra to rebuild
            continue

        for pos, spec in enumerate(specs):
            if isinstance(spec, Spectrum):
                continue
            rebuilt_spec = Spectrum(mz=np.array([0]), inty=np.array([0]))
            rebuilt_spec.from_dict(spec.__dict__)
            self.features_df.at[idx, "ms2_specs"][pos] = rebuilt_spec
834
-
835
-
836
def index_file(self):
    """
    Reload raw data from ``self.file_path`` based on its extension.

    The loader is chosen from the (case-insensitive) file extension:
        - ".wiff": ``SciexWiffData`` (alpharaw, falling back to masster's own
          implementation when alpharaw cannot be imported).
        - ".raw": ``ThermoRawData`` from alpharaw.
        - ".mzml": an on-disk or in-memory pyopenms experiment, depending on
          ``self.ondisk``.

    On success ``self.file_interface`` ("alpharaw" or "oms") and
    ``self.file_obj`` are set. For the vendor formats the number of peaks kept
    per spectrum is taken from the ``max_points_per_spectrum`` parameter.

    Raises:
        FileNotFoundError: If the file does not exist, or if it exists but has
            an unsupported extension (the exception type is shared so existing
            callers keep working; the message tells the two cases apart).
    """
    path = self.file_path
    # one existence check up front instead of one per extension branch
    if not os.path.exists(path):
        raise FileNotFoundError(f"File {self.file_path} not found.")

    lowered = path.lower()
    if lowered.endswith(".wiff"):
        self.file_interface = "alpharaw"
        try:
            from alpharaw.sciex import SciexWiffData
        except ImportError:
            # Fallback to masster's own implementation
            from masster.sample.sciex import SciexWiffData

        raw_data = SciexWiffData(centroided=False)
        raw_data.keep_k_peaks_per_spec = self.parameters.max_points_per_spectrum
        self.logger.info("Index raw data...")
        raw_data.import_raw(self.file_path)
        self.file_obj = raw_data
    elif lowered.endswith(".raw"):
        self.file_interface = "alpharaw"
        from alpharaw.thermo import ThermoRawData

        raw_data = ThermoRawData(centroided=False)
        # NOTE(review): the .wiff branch reads this parameter as an attribute while this
        # branch uses .get(); left untouched because the parameters API is not visible here.
        raw_data.keep_k_peaks_per_spec = self.parameters.get("max_points_per_spectrum")
        self.logger.info("Index raw data...")
        raw_data.import_raw(self.file_path)
        self.file_obj = raw_data
    elif lowered.endswith(".mzml"):
        self.file_interface = "oms"
        omsexp: oms.OnDiscMSExperiment | oms.MSExperiment
        if self.ondisk:
            # NOTE(review): the on-disk experiment is created but no file is opened on it
            # here — confirm whether the file is expected to be opened elsewhere.
            omsexp = oms.OnDiscMSExperiment()
            self.file_obj = omsexp
        else:
            omsexp = oms.MSExperiment()
            oms.MzMLFile().load(self.file_path, omsexp)
            self.file_obj = omsexp
    else:
        # the file exists, so "not found" would be misleading
        raise FileNotFoundError(
            f"File {self.file_path} has an unsupported extension; expected .wiff, .raw or .mzML.",
        )
889
-
890
-
891
def _load_ms2data(
    self,
    scans=None,
):
    """
    Load MS2 data for the given scans through the active file interface.

    Parameters:
        scans (list, optional): Scan uids to load. When None or empty, every
            ``scan_uid`` in ``self.scans_df`` is used.

    Returns:
        None: Does nothing when ``self.file_obj`` is None. For the "alpharaw"
        interface the work is delegated to ``_load_ms2data_alpharaw``.

    Raises:
        NotImplementedError: For the "oms" interface. The previous code called
            this very function again with the same arguments, which could only
            end in a RecursionError; both are RuntimeError subclasses.
    """
    # check if file_obj is set
    if self.file_obj is None:
        return

    # fall back to all known scans when none were requested
    if scans is None or len(scans) == 0:
        scans = self.scans_df["scan_uid"].to_list()

    # check the file interface
    if self.file_interface == "oms":
        raise NotImplementedError(
            "Loading MS2 data is not implemented for the 'oms' file interface.",
        )
    elif self.file_interface == "alpharaw":
        _load_ms2data_alpharaw(self, scan_uid=scans)

    return
913
-
914
-
915
def _load_ms2data_alpharaw(
    self,
    scan_uid=None,
):
    """
    Read the MS2 peaks from an alpharaw file object into ``self.ms2data``.

    Every spectrum row with ``ms_level == 2`` in ``self.file_obj.spectrum_df`` is
    turned into a ``Spectrum``, denoised at its own baseline and appended to a
    polars DataFrame with the columns ``scan_uid``, ``rt``, ``prec_mz``, ``mz``
    and ``inty``.

    Parameters:
        scan_uid (list, optional): Defaults to all ``scan_uid`` values of
            ``self.scans_df``.
            NOTE(review): this value is only used for the log message; the loop
            below reads every MS2 spectrum of the file. Confirm whether the
            selection was meant to restrict the scans that are loaded.

    Returns:
        None: The result is stored in ``self.ms2data``. Nothing is stored when
        ``self.file_obj`` is None.

    Side Effects:
        Rounds the ``rt`` column of ``self.file_obj.spectrum_df`` to 4 decimals
        in place.
    """
    scan_uid = self.scans_df["scan_uid"].to_list() if scan_uid is None else scan_uid
    self.logger.info(f"Loading MS2 data for {len(scan_uid)} scans...")
    if self.file_obj is None:
        return

    raw_data = self.file_obj
    scans = raw_data.spectrum_df
    # scans.rt = scans.rt * 60
    scans.rt = scans.rt.round(4)

    schema = {
        "scan_uid": pl.Int64,
        "rt": pl.Float64,
        "prec_mz": pl.Float64,
        "mz": pl.Float64,
        "inty": pl.Float64,
    }
    # Start with an empty, correctly typed frame so the result keeps its schema even when
    # no MS2 scan contributes; per-scan frames are collected and concatenated once at the
    # end instead of re-copying the growing frame on every iteration.
    frames = [
        pl.DataFrame(
            {"scan_uid": [], "rt": [], "prec_mz": [], "mz": [], "inty": []},
            schema=schema,
        ),
    ]

    # iterate over rows of specs
    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
    for i, s in tqdm(
        scans.iterrows(),
        total=len(scans),
        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Centroid",
        disable=tdqm_disable,
    ):
        # keep only scans with ms_level == 2
        if s["ms_level"] != 2:
            continue

        prec_mz = s["precursor_mz"]
        peak_start_idx = s["peak_start_idx"]
        peak_stop_idx = s["peak_stop_idx"]
        # .loc slicing is end-inclusive, hence the -1 on the stop index
        peaks = raw_data.peak_df.loc[peak_start_idx : peak_stop_idx - 1]
        spect = Spectrum(
            mz=peaks.mz.values,
            inty=peaks.intensity.values,
            ms_level=s["ms_level"],
            centroided=False,
        )
        # drop everything at or below the estimated baseline
        bl = spect.baseline()
        spect = spect.denoise(threshold=bl)

        if len(peaks) > 0:
            frames.append(
                pl.DataFrame(
                    {
                        "scan_uid": i,
                        "rt": s["rt"],
                        "prec_mz": prec_mz,
                        "mz": spect.mz,
                        "inty": spect.inty,
                    },
                    schema=schema,
                ),
            )

    self.ms2data = pl.concat(frames)
983
-
984
-
985
- # TODO this should go to chrom?
986
def _chrom_trace_rt_window(trace, half_width=3):
    """Return ``(rt_start, rt_end)`` of a trace, using ``rt -/+ half_width`` for bounds that are None."""
    rt = trace["rt"]
    rt_start = trace["rt_start"]
    if rt_start is None:
        rt_start = rt - half_width
    rt_end = trace["rt_end"]
    if rt_end is None:
        rt_end = rt + half_width
    return rt_start, rt_end


def chrom_extract(
    self,
    rt_tol=6.0,
    mz_tol=0.005,
):
    """
    Extract MS1, MRM and MS2 chromatograms for every trace listed in ``self.chrom_df``.

    The method works in two passes. First, for each trace the matching scans in
    ``self.scans_df`` are identified and their uids are stored in the trace's
    ``scan_uid`` column. Second, after the MS2 data of those scans has been
    loaded, the signal around the target m/z is reduced to the maximum intensity
    per retention time and stored as a ``Chromatogram`` in the ``chrom`` column.

    Parameters:
        rt_tol (float, optional): Retention time tolerance added on both sides of
            the trace's retention time window when selecting scans. Defaults to 6.0.
        mz_tol (float, optional): m/z tolerance around the target m/z (precursor
            for "ms1" traces, product for "mrm"/"ms2" traces) when extracting
            intensities. Defaults to 0.005. It is not used for the precursor
            selection, which relies on fixed windows (see below).

    Returns:
        None: Updates ``self.chrom_df`` in place. Does nothing when
        ``self.file_obj`` or ``self.chrom_df`` is None.

    Notes:
        - Traces without ``rt_start``/``rt_end`` use ``rt -/+ 3``.
        - For "mrm"/"ms2" traces, MS2 scans within +/-5 of the trace's precursor
          m/z are considered; of those, only scans within +/-0.2 of the closest
          precursor m/z are kept.
        - Traces for which no scan was found keep ``scan_uid`` and ``chrom`` as None.
    """
    if self.file_obj is None:
        return

    if self.chrom_df is None:
        return

    chrom_df = self.chrom_df

    chrom_df["scan_uid"] = None
    chrom_df["chrom"] = None
    all_scan_uids = []

    # fixed precursor windows used for the MS2/MRM scan selection
    prec_search_width = 5
    prec_group_width = 0.2

    # pass 1: iterate over all traces and identify the scans
    for i, trace in chrom_df.iterrows():
        if trace["type"] in ["ms1"]:
            rt_start, rt_end = _chrom_trace_rt_window(trace)
            # MS1 scans inside the tolerance-widened retention time window
            mask = (
                (self.scans_df["rt"] >= rt_start - rt_tol)
                & (self.scans_df["rt"] <= rt_end + rt_tol)
                & (self.scans_df["ms_level"] == 1)
            )
            scans_df = self.scans_df.filter(mask)

        elif trace["type"] in ["mrm", "ms2"]:
            rt_start, rt_end = _chrom_trace_rt_window(trace)
            q1 = trace["prec_mz"]
            # MS2 scans inside the retention time window with a roughly matching precursor
            mask = (
                (self.scans_df["rt"] >= rt_start - rt_tol)
                & (self.scans_df["rt"] <= rt_end + rt_tol)
                & (self.scans_df["ms_level"] == 2)
                & (self.scans_df["prec_mz"] >= q1 - prec_search_width)
                & (self.scans_df["prec_mz"] <= q1 + prec_search_width)
            )
            scans_df = self.scans_df.filter(mask)
            if scans_df.is_empty():
                continue
            # sort by abs(prec_mz - q1) and take the first row:
            # this is the closest precursor m/z to q1
            closest_prec_mz = scans_df.sort(abs(pl.col("prec_mz") - q1)).select(
                pl.col("prec_mz").first(),
            )
            # keep only the scans whose precursor sits next to that closest precursor
            scans_df = scans_df.filter(
                (pl.col("prec_mz") >= closest_prec_mz["prec_mz"][0] - prec_group_width)
                & (pl.col("prec_mz") <= closest_prec_mz["prec_mz"][0] + prec_group_width),
            )

        else:
            # unknown trace types are left untouched
            continue

        scan_ids = scans_df["scan_uid"].to_list()
        all_scan_uids.extend(scan_ids)
        chrom_df.at[i, "scan_uid"] = scan_ids

    # get the ms2data
    _load_ms2data(self, scans=list(set(all_scan_uids)) if all_scan_uids else None)
    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

    # pass 2: build one chromatogram per trace
    for i, trace in tqdm(
        chrom_df.iterrows(),
        total=len(chrom_df),
        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract EICs",
        disable=tdqm_disable,
    ):
        trace_scans = trace["scan_uid"]
        if trace_scans is None:
            # no scan matched in pass 1; passing None to is_in() below would fail
            continue

        if trace["type"] in ["ms1"]:
            q1 = trace["prec_mz"]
            name = trace["name"]
            # find all ms1 data with scan_uid and mz between q1-mz_tol and q1+mz_tol
            d = self.ms1_df.filter(
                (pl.col("scan_uid").is_in(trace_scans))
                & (pl.col("mz") >= q1 - mz_tol)
                & (pl.col("mz") <= q1 + mz_tol),
            )
            # for all unique rt values, find the maximum inty
            eic_rt = d.group_by("rt").agg(pl.col("inty").max())
            eic = Chromatogram(
                eic_rt["rt"].to_numpy(),
                inty=eic_rt["inty"].to_numpy(),
                label=f"MS1 {name} ({q1:0.3f})",
                lib_rt=trace["rt"],
            )
            chrom_df.at[i, "chrom"] = eic

        elif trace["type"] in ["mrm", "ms2"]:
            q1 = trace["prec_mz"]
            q3 = trace["prod_mz"]
            name = trace["name"]
            # find all ms2 data with scan_uid and mz between q3-mz_tol and q3+mz_tol
            d = self.ms2data.filter(
                (pl.col("scan_uid").is_in(trace_scans))
                & (pl.col("mz") >= q3 - mz_tol)
                & (pl.col("mz") <= q3 + mz_tol),
            )
            # for all unique rt values, find the maximum inty
            eic_rt = d.group_by("rt").agg(pl.col("inty").max())
            eic = Chromatogram(
                eic_rt["rt"].to_numpy(),
                inty=eic_rt["inty"].to_numpy(),
                label=f"MRM {name} ({q1:0.3f}>{q3:0.3f})",
                lib_rt=trace["rt"],
            )
            chrom_df.at[i, "chrom"] = eic

    self.chrom_df = chrom_df
1129
-
1130
-
1131
- # TODO no self?
1132
def _oms_clean_df(self, df):
    """
    Convert a feature table with OpenMS-style columns into masster's feature layout.

    Rows with ``quality == 0`` are dropped. The remaining rows are copied into a
    new DataFrame with a fresh 0..n-1 index, renamed and rounded columns, and
    placeholder columns for values that are filled in later.

    Parameters:
        df (pandas.DataFrame): Input table. The columns ``mz``, ``RT``,
            ``RTstart``, ``RTend``, ``MZstart``, ``MZend``, ``intensity``,
            ``quality`` and ``charge`` are read; its index values become ``uid``.

    Returns:
        pandas.DataFrame: One row per kept feature with the columns listed in
        ``columns`` below. ``feature_uid`` counts from 1, ``iso`` is 0 and the
        remaining placeholder columns are None.

    Notes:
        Values are assigned positionally. Assigning the filtered Series directly
        would align them by index label against the new 0..n-1 index, which
        shifts or blanks values as soon as a quality-0 row was dropped or the
        input index is not 0..n-1.
    """
    kept = df[df["quality"] != 0]
    # change columns and order
    columns = [
        "feature_uid",
        "uid",
        "mz",
        "rt",
        "rt_start",
        "rt_end",
        "rt_delta",
        "mz_start",
        "mz_end",
        "inty",
        "quality",
        "charge",
        "iso",
        "iso_of",
        "chrom",
        "chrom_coherence",
        "chrom_prominence",
        "chrom_prominence_scaled",
        "chrom_height_scaled",
        "ms2_scans",
        "ms2_specs",
    ]
    out = pd.DataFrame(index=pd.RangeIndex(len(kept)), columns=columns)

    # uid keeps the original index label of each kept row
    out["uid"] = kept.index.to_list()
    out["mz"] = kept["mz"].round(5).to_numpy()
    out["rt"] = kept["RT"].round(3).to_numpy()
    out["rt_start"] = kept["RTstart"].round(3).to_numpy()
    out["rt_end"] = kept["RTend"].round(3).to_numpy()
    out["rt_delta"] = (kept["RTend"] - kept["RTstart"]).round(3).to_numpy()
    out["mz_start"] = kept["MZstart"].round(5).to_numpy()
    out["mz_end"] = kept["MZend"].round(5).to_numpy()
    out["inty"] = kept["intensity"].to_numpy()
    out["quality"] = kept["quality"].to_numpy()
    out["charge"] = kept["charge"].to_numpy()
    out["iso"] = 0
    # placeholders that are populated by later processing steps
    for placeholder in (
        "iso_of",
        "chrom",
        "chrom_coherence",
        "chrom_prominence",
        "chrom_prominence_scaled",
        "chrom_height_scaled",
        "ms2_scans",
        "ms2_specs",
    ):
        out[placeholder] = None
    # feature_uid is a 1-based running number
    out["feature_uid"] = range(1, len(out) + 1)

    return out
1
+ """
2
+ _import.py
3
+
4
+ This module provides data import functionality for mass spectrometry files.
5
+ It handles loading and processing of various mass spectrometry file formats
6
+ including mzML and vendor formats (WIFF, RAW).
7
+
8
+ Key Features:
9
+ - **Multi-Format Support**: Load mzML, WIFF (SCIEX), and RAW (Thermo) files.
10
+ - **File Validation**: Check file existence and format compatibility.
11
+ - **Memory Management**: Support for on-disk and in-memory data handling.
12
+ - **Metadata Extraction**: Extract acquisition parameters and instrument information.
13
+ - **Error Handling**: Comprehensive error reporting for file loading issues.
14
+ - **Raw Data Processing**: Handle centroided and profile data with signal smoothing.
15
+
16
+ Dependencies:
17
+ - `pyopenms`: For standard mass spectrometry file format support.
18
+ - `polars` and `pandas`: For efficient data handling and manipulation.
19
+ - `numpy`: For numerical array operations.
20
+
21
+ Functions:
22
+ - `load()`: Main file loading function with format detection.
23
+ - `_load_mzML()`: Specialized mzML file loader.
24
+ - `_load_wiff()`: SCIEX WIFF file loader.
25
+ - `_load_raw()`: Thermo RAW file loader.
26
+
27
+ Supported File Formats:
28
+ - mzML (open standard format)
29
+ - WIFF (SCIEX vendor format)
30
+ - RAW (Thermo proprietary format)
31
+
32
+ See Also:
33
+ - `parameters._import_parameters`: For import-specific parameter configuration.
34
+ - `_export.py`: For data export functionality.
35
+ - `single.py`: For using imported data with ddafile class.
36
+
37
+ """
38
+
39
+ import os
40
+
41
+ from datetime import datetime
42
+
43
+ import numpy as np
44
+ import pandas as pd
45
+ import polars as pl
46
+ import pyopenms as oms
47
+
48
+ from tqdm import tqdm
49
+
50
+ from masster.chromatogram import Chromatogram
51
+
52
+ # Parameters removed - using hardcoded defaults
53
+ from masster.spectrum import Spectrum
54
+
55
+
56
def load(
    self,
    filename=None,
    ondisk=False,
    type=None,
    label=None,
):
    """
    Load file content from a specified filename.

    Parameters:
        filename (str, optional): Path of the file to load. Falls back to
            ``self.file_path`` when None. The file must exist and end with
            .mzML, .wiff, .wiff2, .raw or .sample5 (case-insensitive).
        ondisk (bool, optional): Stored in ``self.ondisk`` before loading.
            Defaults to False.
        type (str, optional): When set to 'ztscan' (case-insensitive),
            ``self.file_type`` becomes 'ztscan'; otherwise it is 'dda'.
            Defaults to None.
        label (Any, optional): Stored in ``self.label`` after loading when not
            None. Defaults to None.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file extension is not supported.

    Notes:
        The loader is selected from the file extension:
            - ".mzml": ``self._load_mzML(filename)``
            - ".wiff" / ".wiff2": ``self._load_wiff(filename)``
            - ".raw": ``self._load_raw(filename)``
            - ".sample5": ``self._load_sample5(filename)``
        ``self.file_type`` is set before the loader runs, because loaders consult
        it (``_load_mzML`` skips ``analyze_dda`` for 'ztscan'), and is applied
        again afterwards so the final value does not depend on what a loader
        wrote to it.
    """
    if filename is None:
        filename = self.file_path
    filename = os.path.abspath(filename)
    if not os.path.exists(filename):
        raise FileNotFoundError("Filename not valid. Provide a valid file path.")
    self.ondisk = ondisk

    # pick the loader first so an unsupported extension fails before any state changes
    lowered = filename.lower()
    if lowered.endswith(".mzml"):
        loader = self._load_mzML
    elif lowered.endswith((".wiff", ".wiff2")):
        loader = self._load_wiff
    elif lowered.endswith(".raw"):
        loader = self._load_raw
    elif lowered.endswith(".sample5"):
        loader = self._load_sample5
    else:
        raise ValueError("File must be .mzML, .wiff, *.raw, or .sample5")

    file_type = "dda"
    if type is not None and type.lower() in ["ztscan"]:
        file_type = "ztscan"

    # make the requested type visible to the loader ...
    self.file_type = file_type
    loader(filename)
    # ... and re-apply it afterwards, as before
    self.file_type = file_type

    if label is not None:
        self.label = label
111
+
112
+
113
def _load_mzML(
    self,
    filename=None,
):
    """
    Load an mzML file and process its spectra.

    The file is loaded into an in-memory pyopenms experiment (or an on-disk
    experiment object when ``self.ondisk`` is set) and every spectrum is
    visited once:
        - MS level 1 spectra advance the cycle counter; other spectra take the
          precursor m/z, isolation window offsets, precursor intensity and
          activation energy from their first precursor. Spectra without any
          precursor record get None for these fields.
        - Each spectrum is denoised at its own baseline. MS1 spectra whose
          median m/z spacing is below 0.01 are additionally centroided.
        - One scan record (TIC, intensity and m/z bounds, identifiers, ...) is
          collected per spectrum; the peaks of non-empty MS1 spectra are
          collected for ``self.ms1_df``.

    Parameters:
        filename (str): Path of the mzML file to load.

    Returns:
        None

    Raises:
        ValueError: If ``filename`` is None.
        FileNotFoundError: If the file does not exist.

    Side Effects:
        - Sets ``self.file_path`` and ``self.file_source`` to the absolute path.
        - Stores the experiment in ``self.file_obj`` and sets
          ``self.file_interface`` to 'oms'.
        - Stores the scan table in ``self.scans_df`` and the MS1 peaks in
          ``self.ms1_df``.
        - Sets ``self.label`` to the file's basename.
        - Calls ``self.analyze_dda()`` unless ``self.file_type`` is 'ztscan'.
    """
    if filename is None:
        raise ValueError("Filename must be provided.")

    filename = os.path.abspath(filename)
    # check if it exists
    if not os.path.exists(filename):
        raise FileNotFoundError(f"File {filename} not found.")
    self.file_path = filename
    self.file_source = filename

    self.logger.info(f"Loading {filename}")

    omsexp: oms.OnDiscMSExperiment | oms.MSExperiment
    if self.ondisk:
        # NOTE(review): no file is opened on the on-disk experiment here, and the loop
        # below relies on getSpectra(); confirm that the on-disk path works as intended.
        omsexp = oms.OnDiscMSExperiment()
        self.file_obj = omsexp
    else:
        omsexp = oms.MSExperiment()
        oms.MzMLFile().load(self.file_path, omsexp)
        self.file_obj = omsexp

    scans = []
    cycle = 0
    schema = {
        "cycle": pl.Int32,
        "scan_uid": pl.Int64,
        "rt": pl.Float64,
        "mz": pl.Float64,
        "inty": pl.Float64,
    }
    # Start with an empty, correctly typed frame so ms1_df keeps its schema even without
    # MS1 peaks; per-scan frames are collected and concatenated once after the loop
    # instead of re-copying the growing frame for every spectrum.
    ms1_frames = [
        pl.DataFrame(
            {"cycle": [], "scan_uid": [], "rt": [], "mz": [], "inty": []},
            schema=schema,
        ),
    ]

    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
    # iterate over all spectra
    for i, s in tqdm(
        enumerate(omsexp.getSpectra()),  # type: ignore[union-attr]
        total=omsexp.getNrSpectra(),
        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Scans",
        disable=tdqm_disable,
    ):
        ms_level = s.getMSLevel()

        precursor = None
        if ms_level == 1:
            cycle += 1
        else:
            # look the precursor list up once; it may be empty
            precursors = s.getPrecursors()
            if precursors:
                precursor = precursors[0]

        if precursor is None:
            prec_mz = None
            precursorIsolationWindowLowerMZ = None
            precursorIsolationWindowUpperMZ = None
            prec_intyensity = None
            energy = None
        else:
            prec_mz = precursor.getMZ()
            precursorIsolationWindowLowerMZ = precursor.getIsolationWindowLowerOffset()
            precursorIsolationWindowUpperMZ = precursor.getIsolationWindowUpperOffset()
            prec_intyensity = precursor.getIntensity()
            energy = precursor.getActivationEnergy()

        # peaks is a (mz, intensity) pair of arrays
        peaks = s.get_peaks()
        spect = Spectrum(mz=peaks[0], inty=peaks[1], ms_level=ms_level)

        bl = spect.baseline()
        spect = spect.denoise(threshold=bl)

        if spect.ms_level == 1:
            mz = np.array(spect.mz)
            median_diff = np.median(np.diff(np.sort(mz))) if mz.size > 1 else None

            # densely spaced m/z values indicate profile data that still needs centroiding
            if median_diff is not None and median_diff < 0.01:
                spect = spect.centroid(
                    tolerance=self.parameters.mz_tol_ms1_da,
                    ppm=self.parameters.mz_tol_ms1_ppm,
                    min_points=self.parameters.centroid_min_points_ms1,
                )

        newscan = {
            "scan_uid": i,
            "cycle": cycle,
            "ms_level": int(ms_level),
            "rt": s.getRT(),
            "inty_tot": spect.tic(),
            "inty_min": spect.inty_min(),
            "inty_max": spect.inty_max(),
            "bl": bl,
            "mz_min": spect.mz_min(),
            "mz_max": spect.mz_max(),
            "comment": s.getComment(),
            "name": s.getName(),
            "id": s.getNativeID(),
            "prec_mz": prec_mz,
            "prec_mz_min": precursorIsolationWindowLowerMZ,
            "prec_mz_max": precursorIsolationWindowUpperMZ,
            "prec_inty": prec_intyensity,
            "energy": energy,
            "feature_uid": -1,
        }

        scans.append(newscan)

        # len(peaks) is always 2 (the pair itself), so test the m/z array instead
        if ms_level == 1 and len(peaks[0]) > 0:
            ms1_frames.append(
                pl.DataFrame(
                    {
                        "cycle": cycle,
                        "scan_uid": i,
                        "rt": s.getRT(),
                        "mz": spect.mz,
                        "inty": spect.inty,
                    },
                    schema=schema,
                ),
            )

    # convert to polars DataFrame with explicit schema and store in self.scans_df
    self.scans_df = pl.DataFrame(
        scans,
        schema={
            "scan_uid": pl.Int64,
            "cycle": pl.Int64,
            "ms_level": pl.Int64,
            "rt": pl.Float64,
            "inty_tot": pl.Float64,
            "inty_min": pl.Float64,
            "inty_max": pl.Float64,
            "bl": pl.Float64,
            "mz_min": pl.Float64,
            "mz_max": pl.Float64,
            "comment": pl.Utf8,
            "name": pl.Utf8,
            "id": pl.Utf8,
            "prec_mz": pl.Float64,
            "prec_mz_min": pl.Float64,
            "prec_mz_max": pl.Float64,
            "prec_inty": pl.Float64,
            "energy": pl.Float64,
            "feature_uid": pl.Int64,
        },
        infer_schema_length=None,
    )
    self.file_interface = "oms"
    self.ms1_df = pl.concat(ms1_frames)
    self.label = os.path.basename(filename)
    if self.file_type != "ztscan":
        self.analyze_dda()
283
+
284
+
285
def _load_raw(
    self,
    filename=None,
):
    """
    Load and process raw spectral data from a Thermo '.raw' file.

    The file is imported with ``ThermoRawData`` from ``alpharaw.thermo`` and
    every spectrum is visited once:
        - Retention times are multiplied by 60 and rounded to 4 decimals.
        - MS level 1 spectra advance the cycle counter; other spectra take
          precursor m/z, isolation window bounds and ``nce`` from the spectrum
          table (no precursor intensity is available).
        - Each spectrum is denoised at its own baseline; MS1 spectra are
          centroided afterwards.
        - One scan record is collected per spectrum; the peaks of non-empty MS1
          spectra are collected for ``self.ms1_df``.

    Parameters:
        filename (str): Path of the raw data file. Must end with ".raw"
            (case-insensitive, matching the dispatch in ``load``).

    Raises:
        ValueError: If no filename is given or it does not end with ".raw".
        FileNotFoundError: If the file does not exist.

    Side Effects:
        - Populates ``self.scans_df`` and ``self.ms1_df`` (polars DataFrames).
        - Sets ``self.file_path``, ``self.file_source``, ``self.file_obj``,
          ``self.file_interface`` ('alpharaw') and ``self.label``.
        - Calls ``self.analyze_dda()``.
    """
    if not filename:
        raise ValueError("Filename must be provided.")

    filename = os.path.abspath(filename)
    # check that the file exists
    if not os.path.exists(filename):
        raise FileNotFoundError(f"File {filename} not found.")
    # check that filename ends with .raw; load() dispatches case-insensitively, so do the same
    if not filename.lower().endswith(".raw"):
        raise ValueError("filename must end with .raw")

    # import the optional dependency only after the cheap argument checks have passed
    from alpharaw.thermo import ThermoRawData

    raw_data = ThermoRawData(centroided=False)
    raw_data.keep_k_peaks_per_spec = self.parameters.max_points_per_spectrum
    self.logger.info(f"Loading {filename}")
    raw_data.import_raw(filename)
    specs = raw_data.spectrum_df
    # convert rt from minutes to seconds, round to 4 decimal places
    specs.rt = specs.rt * 60
    # TODO this should be an external param
    specs.rt = specs.rt.round(4)

    scans = []
    cycle = 0
    schema = {
        "cycle": pl.Int32,
        "scan_uid": pl.Int64,
        "rt": pl.Float64,
        "mz": pl.Float64,
        "inty": pl.Float64,
    }
    # Start with an empty, correctly typed frame so ms1_df keeps its schema even without
    # MS1 peaks; per-scan frames are collected and concatenated once after the loop
    # instead of re-copying the growing frame for every spectrum.
    ms1_frames = [
        pl.DataFrame(
            {"cycle": [], "scan_uid": [], "rt": [], "mz": [], "inty": []},
            schema=schema,
        ),
    ]

    # iterate over rows of specs
    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
    for i, s in tqdm(
        specs.iterrows(),
        total=len(specs),
        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Scans",
        disable=tdqm_disable,
    ):
        if s["ms_level"] == 1:
            cycle += 1
            prec_mz = None
            precursorIsolationWindowLowerMZ = None
            precursorIsolationWindowUpperMZ = None
            prec_intyensity = None
            energy = None
        else:
            prec_mz = s["precursor_mz"]
            precursorIsolationWindowLowerMZ = s["isolation_lower_mz"]
            precursorIsolationWindowUpperMZ = s["isolation_upper_mz"]
            prec_intyensity = None
            energy = s["nce"]

        peak_start_idx = s["peak_start_idx"]
        peak_stop_idx = s["peak_stop_idx"]
        # .loc slicing is end-inclusive, hence the -1 on the stop index
        peaks = raw_data.peak_df.loc[peak_start_idx : peak_stop_idx - 1]
        spect = Spectrum(
            mz=peaks.mz.values,
            inty=peaks.intensity.values,
            ms_level=s["ms_level"],
            centroided=False,
        )

        # drop everything at or below the estimated baseline
        bl = spect.baseline()
        spect = spect.denoise(threshold=bl)
        if spect.ms_level == 1:
            spect = spect.centroid(
                tolerance=self.parameters.mz_tol_ms1_da,
                ppm=self.parameters.mz_tol_ms1_ppm,
                min_points=self.parameters.centroid_min_points_ms1,
            )
        newscan = {
            "scan_uid": i,
            "cycle": cycle,
            "ms_level": int(s["ms_level"]),
            "rt": s["rt"],
            "inty_tot": spect.tic(),
            "inty_min": spect.inty_min(),
            "inty_max": spect.inty_max(),
            "bl": bl,
            "mz_min": spect.mz_min(),
            "mz_max": spect.mz_max(),
            "comment": "",
            "name": "",
            "id": "",
            "prec_mz": prec_mz,
            "prec_mz_min": precursorIsolationWindowLowerMZ,
            "prec_mz_max": precursorIsolationWindowUpperMZ,
            "prec_inty": prec_intyensity,
            "energy": energy,
            "feature_uid": -1,
        }

        scans.append(newscan)

        if s["ms_level"] == 1 and len(peaks) > 0:
            ms1_frames.append(
                pl.DataFrame(
                    {
                        "cycle": cycle,
                        "scan_uid": i,
                        "rt": s["rt"],
                        "mz": spect.mz,
                        "inty": spect.inty,
                    },
                    schema=schema,
                ),
            )

    # convert to polars DataFrame with explicit schema and store in self.scans_df
    self.scans_df = pl.DataFrame(
        scans,
        schema={
            "scan_uid": pl.Int64,
            "cycle": pl.Int64,
            "ms_level": pl.Int64,
            "rt": pl.Float64,
            "inty_tot": pl.Float64,
            "inty_min": pl.Float64,
            "inty_max": pl.Float64,
            "bl": pl.Float64,
            "mz_min": pl.Float64,
            "mz_max": pl.Float64,
            "comment": pl.Utf8,
            "name": pl.Utf8,
            "id": pl.Utf8,
            "prec_mz": pl.Float64,
            "prec_mz_min": pl.Float64,
            "prec_mz_max": pl.Float64,
            "prec_inty": pl.Float64,
            "energy": pl.Float64,
            "feature_uid": pl.Int64,
        },
        infer_schema_length=None,
    )
    self.file_path = filename
    self.file_source = filename
    self.file_obj = raw_data
    self.file_interface = "alpharaw"
    self.label = os.path.basename(filename)
    self.ms1_df = pl.concat(ms1_frames)
    self.analyze_dda()
464
+
465
+
466
+ def _load_wiff(
467
+ self,
468
+ filename=None,
469
+ ):
470
+ try:
471
+ # Use masster's own implementation first
472
+ from masster.sample.sciex import SciexWiffData as MassterSciexWiffData
473
+ SciexWiffDataClass = MassterSciexWiffData
474
+ except ImportError:
475
+ # Fallback to alpharaw if masster implementation fails
476
+ from alpharaw.sciex import SciexWiffData as AlpharawSciexWiffData
477
+ SciexWiffDataClass = AlpharawSciexWiffData
478
+
479
+ if not filename:
480
+ raise ValueError("Filename must be provided.")
481
+
482
+ filename = os.path.abspath(filename)
483
+ # check if it exists
484
+ if not os.path.exists(filename):
485
+ raise FileNotFoundError(f"File {filename} not found.")
486
+
487
+ raw_data = SciexWiffDataClass(centroided=False)
488
+ raw_data.keep_k_peaks_per_spec = self.parameters.max_points_per_spectrum
489
+
490
+ if not filename.endswith(".wiff"):
491
+ raise ValueError("filename must end with .wiff")
492
+ if not os.path.exists(filename):
493
+ raise FileNotFoundError(f"File {filename} not found.")
494
+
495
+ self.logger.info(f"Loading {filename}")
496
+ raw_data.import_raw(filename)
497
+
498
+ specs = raw_data.spectrum_df
499
+ specs.rt = specs.rt * 60
500
+ specs.rt = specs.rt.round(4)
501
+
502
+ algo = self.parameters.centroid_algo
503
+
504
+ scans = []
505
+ ms1_df_records = []
506
+ cycle = 0
507
+ schema = {
508
+ "cycle": pl.Int32,
509
+ "scan_uid": pl.Int64,
510
+ "rt": pl.Float64,
511
+ "mz": pl.Float64,
512
+ "inty": pl.Float64,
513
+ }
514
+
515
+ # iterate over rows of specs
516
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
517
+ for i, s in tqdm(
518
+ specs.iterrows(),
519
+ total=len(specs),
520
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Scans",
521
+ disable=tdqm_disable,
522
+ ):
523
+ ms_level = s["ms_level"]
524
+ if ms_level == 1:
525
+ cycle += 1
526
+ prec_mz = None
527
+ precursorIsolationWindowLowerMZ = None
528
+ precursorIsolationWindowUpperMZ = None
529
+ prec_intyensity = None
530
+ energy = None
531
+ else:
532
+ prec_mz = s["precursor_mz"]
533
+ precursorIsolationWindowLowerMZ = s["isolation_lower_mz"]
534
+ precursorIsolationWindowUpperMZ = s["isolation_upper_mz"]
535
+ prec_intyensity = None
536
+ energy = s["nce"]
537
+
538
+ peak_start_idx = s["peak_start_idx"]
539
+ peak_stop_idx = s["peak_stop_idx"]
540
+ peaks = raw_data.peak_df.loc[peak_start_idx : peak_stop_idx - 1]
541
+ spect = Spectrum(
542
+ mz=peaks.mz.values,
543
+ inty=peaks.intensity.values,
544
+ ms_level=ms_level,
545
+ centroided=False,
546
+ )
547
+ bl = spect.baseline()
548
+ spect = spect.denoise(threshold=bl)
549
+ if ms_level == 1:
550
+ spect = spect.centroid(
551
+ algo=algo,
552
+ tolerance=self.parameters.mz_tol_ms1_da,
553
+ ppm=self.parameters.mz_tol_ms1_ppm,
554
+ min_points=self.parameters.centroid_min_points_ms1,
555
+ )
556
+ scans.append(
557
+ {
558
+ "scan_uid": i,
559
+ "cycle": cycle,
560
+ "ms_level": int(ms_level),
561
+ "rt": s["rt"],
562
+ "inty_tot": spect.tic(),
563
+ "inty_min": spect.inty_min(),
564
+ "inty_max": spect.inty_max(),
565
+ "bl": bl,
566
+ "mz_min": spect.mz_min(),
567
+ "mz_max": spect.mz_max(),
568
+ "comment": "",
569
+ "name": "",
570
+ "id": "",
571
+ "prec_mz": prec_mz,
572
+ "prec_mz_min": precursorIsolationWindowLowerMZ,
573
+ "prec_mz_max": precursorIsolationWindowUpperMZ,
574
+ "prec_inty": prec_intyensity,
575
+ "energy": energy,
576
+ "feature_uid": -1,
577
+ },
578
+ )
579
+
580
+ if ms_level == 1 and len(peaks) > 0:
581
+ # Use extend for all mz/int pairs at once
582
+ ms1_df_records.extend(
583
+ [
584
+ {
585
+ "cycle": cycle,
586
+ "scan_uid": i,
587
+ "rt": s["rt"],
588
+ "mz": mz,
589
+ "inty": inty,
590
+ }
591
+ for mz, inty in zip(spect.mz, spect.inty, strict=False)
592
+ ],
593
+ )
594
+
595
+ # Create DataFrames in one go
596
+ self.scans_df = pl.DataFrame(
597
+ scans,
598
+ schema={
599
+ "scan_uid": pl.Int64,
600
+ "cycle": pl.Int64,
601
+ "ms_level": pl.Int64,
602
+ "rt": pl.Float64,
603
+ "inty_tot": pl.Float64,
604
+ "inty_min": pl.Float64,
605
+ "inty_max": pl.Float64,
606
+ "bl": pl.Float64,
607
+ "mz_min": pl.Float64,
608
+ "mz_max": pl.Float64,
609
+ "comment": pl.Utf8,
610
+ "name": pl.Utf8,
611
+ "id": pl.Utf8,
612
+ "prec_mz": pl.Float64,
613
+ "prec_mz_min": pl.Float64,
614
+ "prec_mz_max": pl.Float64,
615
+ "prec_inty": pl.Float64,
616
+ "energy": pl.Float64,
617
+ "feature_uid": pl.Int64,
618
+ },
619
+ infer_schema_length=None,
620
+ )
621
+ self.file_path = filename
622
+ self.file_source = filename
623
+ self.file_obj = raw_data
624
+ self.file_interface = "alpharaw"
625
+ self.label = os.path.basename(filename)
626
+ self.ms1_df = pl.DataFrame(ms1_df_records, schema=schema)
627
+ if self.file_type != "ztscan":
628
+ self.analyze_dda()
629
+
630
+
631
def _load_featureXML(
    self,
    filename="features.featureXML",
):
    """
    Load an OpenMS FeatureXML file into ``self.features``.

    The file is read with ``oms.FeatureXMLFile`` into a new ``oms.FeatureMap``,
    which is stored on the instance as ``features``. ``self.features_df`` is not
    modified: the DataFrame conversion, de-isotoping and EIC extraction that once
    followed the load were already disabled (kept only as an unused string
    literal) and have been removed from this function.

    Parameters:
        filename (str): The path to the FeatureXML file to load. Defaults to "features.featureXML".

    Returns:
        None
    """
    fh = oms.FeatureXMLFile()
    fm = oms.FeatureMap()
    fh.load(filename, fm)
    self.features = fm
701
+
702
+
703
+ '''
704
+ def _load_mzpkl(
705
+ self,
706
+ filename="sample.mzpkl",
707
+ ondisk=False,
708
+ ):
709
+ """
710
+ Load the mzpkl data file, initialize the experiment attributes, and set up the file object.
711
+ Parameters:
712
+ filename (str, optional): The path to the .mzpkl file to be loaded. Defaults to "data.mzpkl".
713
+ ondisk (bool, optional): A flag indicating whether the data should be loaded for on-disk usage.
714
+ If True, self.ondisk is set to True and an OnDiscMSExperiment is used.
715
+ Otherwise, an MSExperiment is used.
716
+ Side Effects:
717
+ - Decompresses and unpickles the specified file.
718
+ - Sets attributes on self for each key in the loaded data dictionary, except for keys named 'format'.
719
+ - Renames the attribute 'spectra_df' to 'scans_df' if present.
720
+ - Initializes self.file_obj as either an OnDiscMSExperiment or MSExperiment based on the ondisk flag.
721
+ - Checks for an associated featureXML file (with the same base name as the input file) and loads it if found.
722
+ """
723
+
724
+ if ondisk is True:
725
+ self.ondisk = True
726
+
727
+ with bz2.BZ2File(filename, "rb") as f:
728
+ data = pickle.load(f)
729
+
730
+ for k, v in data.items():
731
+ if k in ["format"]:
732
+ continue
733
+ if k == "spectra_df":
734
+ k = "scans_df"
735
+ setattr(self, k, v)
736
+
737
+ self.sanitize()
738
+
739
+ if self.ondisk:
740
+ self.file_obj = oms.OnDiscMSExperiment()
741
+ else:
742
+ self.file_obj = oms.MSExperiment()
743
+
744
+ # check if *.featureXML exists
745
+ featureXML = filename.replace(".mzpkl", ".featureXML")
746
+ if os.path.exists(featureXML):
747
+ self._load_featureXML(featureXML)
748
+
749
+ '''
750
+
751
+
752
def _wiff_to_dict(
    filename=None,
):
    """
    Collect per-experiment metadata of a .wiff file.

    Walks every sample exposed by alpharaw's ``WillFileReader`` and every MS
    experiment of that sample, and records the experiment details together with
    the MS level and, for MS levels above 1, the precursor information.

    Parameters:
        filename (str): Path handed to ``WillFileReader``.

    Returns:
        pandas.DataFrame: One row per (sample, experiment) pair. Despite the
        function name, the result is a DataFrame and not a dict.
    """
    from alpharaw.raw_access.pysciexwifffilereader import WillFileReader

    reader = WillFileReader(filename)
    rows = []
    for si in range(len(reader.sample_names)):
        sample = reader._wiff_file.GetSample(si)
        for ei in range(sample.MassSpectrometerSample.ExperimentCount):
            experiment = sample.MassSpectrometerSample.GetMSExperiment(ei)
            info = experiment.GetMassSpectrumInfo(ei)

            mslevel = info.MSLevel
            # precursor fields are only read for fragment-level experiments
            if mslevel > 1:
                precursor = (info.ParentMZ, info.CollisionEnergy, info.ParentChargeState)
            else:
                precursor = (None, None, None)
            parent_mz, collision_energy, parent_charge_state = precursor

            rows.append(
                {
                    "instrument_name": sample.MassSpectrometerSample.get_InstrumentName(),
                    "sample_id": si,
                    "experiment_id": ei,
                    "experiment_name": experiment.Details.get_ExperimentName(),
                    "experiment_type": experiment.Details.get_ExperimentType(),
                    "IDA_type": experiment.Details.get_IDAType(),
                    "has_MRM_Pro_Data": experiment.Details.get_HasMRMProData(),
                    "has_SMRM_Data": experiment.Details.get_HasSMRMData(),
                    "is_swath": experiment.Details.get_IsSwath(),
                    "has_dyn_fill_time": experiment.Details.get_HasDynamicFillTime(),
                    "method_fill_time": experiment.Details.get_MethodFillTime(),
                    "default_resolution": experiment.Details.get_DefaultResolution(),
                    "parameters": experiment.Details.get_Parameters(),
                    "targeted_compound_info": experiment.Details.get_TargetedCompoundInfo(),
                    "source_type": experiment.Details.get_SourceType(),
                    "raw_data_type": experiment.Details.get_RawDataType(),
                    "number_of_scans": experiment.Details.get_NumberOfScans(),
                    "scan_group": experiment.Details.get_ScanGroup(),
                    "spectrum_type": experiment.Details.get_SpectrumType(),
                    "saturatrion_threshold": experiment.Details.get_SaturationThreshold(),
                    "polarity": experiment.Details.get_Polarity(),
                    "mass_range_info": experiment.Details.get_MassRangeInfo(),
                    "start_mass": experiment.Details.get_StartMass(),
                    "end_mass": experiment.Details.get_EndMass(),
                    "mslevel": mslevel,
                    "parent_mz": parent_mz,
                    "collision_energy": collision_energy,
                    "parent_charge_state": parent_charge_state,
                },
            )

    # convert to pandas DataFrame
    return pd.DataFrame(rows)
840
+
841
+
842
def sanitize(self):
    """
    Rebuild stale ``Chromatogram`` / ``Spectrum`` objects stored in ``features_df``.

    For every row of ``self.features_df``, a non-None ``chrom`` value that is not
    an instance of the current ``Chromatogram`` class is replaced by a new
    ``Chromatogram`` populated from the old object's ``__dict__``. Likewise,
    every entry of a list-valued ``ms2_specs`` cell that is not an instance of
    the current ``Spectrum`` class is replaced in place by a new ``Spectrum``
    populated from the old object's ``__dict__``.

    Does nothing when ``self.features_df`` is None.

    Returns:
        None
    """
    table = self.features_df
    if table is None:
        return

    for row_label, record in table.iterrows():
        chrom = record["chrom"]
        if chrom is not None and not isinstance(chrom, Chromatogram):
            # start from an empty chromatogram and copy the old state over
            rebuilt_chrom = Chromatogram(rt=np.array([]), inty=np.array([]))
            rebuilt_chrom.from_dict(chrom.__dict__)
            self.features_df.at[row_label, "chrom"] = rebuilt_chrom

        spectra = record["ms2_specs"]
        # None and any non-list value are left untouched
        if not isinstance(spectra, list):
            continue
        for position, spectrum in enumerate(spectra):
            if isinstance(spectrum, Spectrum):
                continue
            rebuilt_spec = Spectrum(mz=np.array([0]), inty=np.array([0]))
            rebuilt_spec.from_dict(spectrum.__dict__)
            self.features_df.at[row_label, "ms2_specs"][position] = rebuilt_spec
860
+
861
+
862
def index_file(self):
    """
    Reload the raw data behind ``self.file_source`` and attach it as ``self.file_obj``.

    The file must exist; the loader is chosen from the (case-insensitive) extension:
        - ".wiff": SciexWiffData. masster's own implementation is preferred and
          alpharaw's is the fallback, the same preference order as ``_load_wiff``.
        - ".raw": alpharaw's ThermoRawData.
        - ".mzml": an OpenMS experiment, on-disk or in-memory depending on ``self.ondisk``.
        - ".sample5": no raw data is read from this file itself. A sibling file with
          the same stem and a ".wiff", ".raw", ".mzml" or ".mzML" extension is
          looked up (in that order), registered through ``self.set_source`` and
          then indexed by calling ``self.index_file()`` again.

    For the alpharaw-style readers the number of peaks kept per spectrum is taken
    from the 'max_points_per_spectrum' parameter of ``self.parameters``.

    Raises:
        FileNotFoundError: If the file does not exist, has an unsupported extension,
            or is a ".sample5" file without a raw-data sibling.
    """
    source = self.file_source
    not_found = f"File {self.file_source} not found. Did the path change? Consider running source()."
    if not os.path.exists(source):
        raise FileNotFoundError(not_found)

    lowered = source.lower()
    if lowered.endswith(".wiff"):
        self.file_interface = "alpharaw"
        try:
            # Same preference as _load_wiff, so the re-indexed file is read by the
            # same backend that produced scans_df.
            from masster.sample.sciex import SciexWiffData
        except ImportError:
            from alpharaw.sciex import SciexWiffData

        raw_data = SciexWiffData(centroided=False)
        raw_data.keep_k_peaks_per_spec = self.parameters.max_points_per_spectrum
        self.logger.info("Index raw data...")
        raw_data.import_raw(self.file_source)
        self.file_obj = raw_data
    elif lowered.endswith(".raw"):
        self.file_interface = "alpharaw"
        from alpharaw.thermo import ThermoRawData

        raw_data = ThermoRawData(centroided=False)
        # NOTE(review): this branch reads the parameter through .get() while the
        # .wiff branch uses attribute access - confirm both are supported.
        raw_data.keep_k_peaks_per_spec = self.parameters.get("max_points_per_spectrum")
        self.logger.info("Index raw data...")
        raw_data.import_raw(self.file_source)
        self.file_obj = raw_data
    elif lowered.endswith(".mzml"):
        self.file_interface = "oms"
        omsexp: oms.OnDiscMSExperiment | oms.MSExperiment
        if self.ondisk:
            # NOTE(review): the on-disk experiment is created but no file is
            # opened on it here - confirm this is intended.
            omsexp = oms.OnDiscMSExperiment()
            self.file_obj = omsexp
        else:
            omsexp = oms.MSExperiment()
            oms.MzMLFile().load(self.file_source, omsexp)
            self.file_obj = omsexp
    elif lowered.endswith(".sample5"):
        # Strip the extension by length: this works for any letter case and only
        # touches the trailing suffix. A case-sensitive str.replace would hand back
        # an upper-case path unchanged and make this method recurse forever.
        stem = source[: -len(".sample5")]
        for extension in (".wiff", ".raw", ".mzml", ".mzML"):
            candidate = stem + extension
            if os.path.exists(candidate):
                self.set_source(candidate)
                break
        else:
            raise FileNotFoundError(not_found)
        self.index_file()
    else:
        raise FileNotFoundError(not_found)
926
+
927
+
928
+ def _load_ms2data(
929
+ self,
930
+ scans=None,
931
+ ):
932
+ # reads all ms2 data from the file object and returns a polars DataFrame
933
+
934
+ # check if file_obj is set
935
+ if self.file_obj is None:
936
+ return
937
+ # check if scan_uid is set
938
+ if scans is None:
939
+ scans = self.scans_df["scan_uid"].to_list()
940
+ if len(scans) == 0:
941
+ scans = self.scans_df["scan_uid"].to_list()
942
+
943
+ # check the file interface
944
+ if self.file_interface == "oms":
945
+ _load_ms2data(self, scans=scans)
946
+ elif self.file_interface == "alpharaw":
947
+ _load_ms2data_alpharaw(self, scan_uid=scans)
948
+
949
+ return
950
+
951
+
952
+ def _load_ms2data_alpharaw(
953
+ self,
954
+ scan_uid=None,
955
+ ):
956
+ # reads all ms data from the file object and returns a polars DataFrame
957
+
958
+ # TODO not used
959
+ ms2data = None
960
+ scan_uid = self.scans_df["scan_uid"].to_list() if scan_uid is None else scan_uid
961
+ self.logger.info(f"Loading MS2 data for {len(scan_uid)} scans...")
962
+ # keep only scans with ms_level == 2
963
+ if self.file_obj is None:
964
+ return
965
+
966
+ raw_data = self.file_obj
967
+ scans = raw_data.spectrum_df
968
+ # scans.rt = scans.rt * 60
969
+ scans.rt = scans.rt.round(4)
970
+
971
+ schema = {
972
+ "scan_uid": pl.Int64,
973
+ "rt": pl.Float64,
974
+ "prec_mz": pl.Float64,
975
+ "mz": pl.Float64,
976
+ "inty": pl.Float64,
977
+ }
978
+ # create a polars DataFrame with explicit schema: cycle: int, rt: float, mz: float, intensity: float
979
+ ms2data = pl.DataFrame(
980
+ {"scan_uid": [], "rt": [], "prec_mz": [], "mz": [], "inty": []},
981
+ schema=schema,
982
+ )
983
+ # iterate over rows of specs
984
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
985
+ for i, s in tqdm(
986
+ scans.iterrows(),
987
+ total=len(scans),
988
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Centroid",
989
+ disable=tdqm_disable,
990
+ ):
991
+ # create a dict
992
+ if s["ms_level"] == 2:
993
+ prec_mz = s["precursor_mz"]
994
+ peak_start_idx = s["peak_start_idx"]
995
+ peak_stop_idx = s["peak_stop_idx"]
996
+ peaks = raw_data.peak_df.loc[peak_start_idx : peak_stop_idx - 1]
997
+ spect = Spectrum(
998
+ mz=peaks.mz.values,
999
+ inty=peaks.intensity.values,
1000
+ ms_level=s["ms_level"],
1001
+ centroided=False,
1002
+ )
1003
+ # remove peaks with intensity <= 0
1004
+ bl = spect.baseline()
1005
+ spect = spect.denoise(threshold=bl)
1006
+
1007
+ if len(peaks) > 0:
1008
+ newms2data = pl.DataFrame(
1009
+ {
1010
+ "scan_uid": i,
1011
+ "rt": s["rt"],
1012
+ "prec_mz": prec_mz,
1013
+ "mz": spect.mz,
1014
+ "inty": spect.inty,
1015
+ },
1016
+ schema=schema,
1017
+ )
1018
+ ms2data = pl.concat([ms2data, newms2data])
1019
+ self.ms2data = ms2data
1020
+
1021
+
1022
+ # TODO this should go to chrom?
1023
+ def chrom_extract(
1024
+ self,
1025
+ rt_tol=6.0,
1026
+ mz_tol=0.005,
1027
+ ):
1028
+ """
1029
+ Extracts MRM (Multiple Reaction Monitoring) and EIC (Extracted Ion Chromatogram) data from the file object.
1030
+
1031
+ This method processes the `chrom_df` DataFrame, identifying relevant scans in `scans_df` and extracting chromatograms
1032
+ for MS1, MRM, and MS2 traces. It updates `chrom_df` with scan IDs and extracted chromatogram objects.
1033
+
1034
+ Parameters:
1035
+ rt_tol (float, optional): Retention time tolerance for scan selection. Defaults to RtParameters().rt_tol.
1036
+ mz_tol (float, optional): m/z tolerance for scan selection. Defaults to MzParameters().mz_tol_ms1_da.
1037
+
1038
+ Returns:
1039
+ None: Updates self.chrom_df in place with extracted chromatogram data.
1040
+ """
1041
+ if self.file_obj is None:
1042
+ return
1043
+
1044
+ if self.chrom_df is None:
1045
+ return
1046
+
1047
+ # check if mrm_df is dict, if so convert to DataFrame
1048
+ chrom_df = self.chrom_df
1049
+
1050
+ chrom_df["scan_uid"] = None
1051
+ chrom_df["chrom"] = None
1052
+ scan_uid = []
1053
+
1054
+ # iterate over all mrms and identidy the scans
1055
+ for i, trace in chrom_df.iterrows():
1056
+ if trace["type"] in ["ms1"]:
1057
+ rt = trace["rt"]
1058
+ rt_start = trace["rt_start"]
1059
+ if rt_start is None:
1060
+ rt_start = rt - 3
1061
+ rt_end = trace["rt_end"]
1062
+ if rt_end is None:
1063
+ rt_end = rt + 3
1064
+ # TODO not used
1065
+ q1 = trace["prec_mz"]
1066
+ # find all rows in self.scans_df that have rt between rt_start-rt_tol and rt_end+rt_tol and mz between q1-mz_tol and q1+mz_tol
1067
+ mask = (
1068
+ (self.scans_df["rt"] >= rt_start - rt_tol)
1069
+ & (self.scans_df["rt"] <= rt_end + rt_tol)
1070
+ & (self.scans_df["ms_level"] == 1)
1071
+ )
1072
+ scans_df = self.scans_df.filter(mask)
1073
+ scan_ids = scans_df["scan_uid"].to_list()
1074
+ scan_uid.extend(scan_ids)
1075
+ chrom_df.at[i, "scan_uid"] = scan_ids
1076
+
1077
+ elif trace["type"] in ["mrm", "ms2"]:
1078
+ rt = trace["rt"]
1079
+ rt_start = trace["rt_start"]
1080
+ if rt_start is None:
1081
+ rt_start = rt - 3
1082
+ rt_end = trace["rt_end"]
1083
+ if rt_end is None:
1084
+ rt_end = rt + 3
1085
+ q1 = trace["prec_mz"]
1086
+ # find all rows in self.scans_df that have rt between rt_start-rt_tol and rt_end+rt_tol and mz between q1-mz_tol and q1+mz_tol
1087
+ mask = (
1088
+ (self.scans_df["rt"] >= rt_start - rt_tol)
1089
+ & (self.scans_df["rt"] <= rt_end + rt_tol)
1090
+ & (self.scans_df["ms_level"] == 2)
1091
+ & (self.scans_df["prec_mz"] >= q1 - 5)
1092
+ & (self.scans_df["prec_mz"] <= q1 + 5)
1093
+ )
1094
+ scans_df = self.scans_df.filter(mask)
1095
+ # find the closes prec_mz to q1
1096
+ if scans_df.is_empty():
1097
+ continue
1098
+ # find the closest prec_mz to q1
1099
+ # sort by abs(prec_mz - q1) and take the first row
1100
+ # this is the closest precursor m/z to q1
1101
+ closest_prec_mz = scans_df.sort(abs(pl.col("prec_mz") - q1)).select(
1102
+ pl.col("prec_mz").first(),
1103
+ )
1104
+ # keep only the scans with prec_mz within mz_tol of closest_prec_mz
1105
+ scans_df = scans_df.filter(
1106
+ (pl.col("prec_mz") >= closest_prec_mz["prec_mz"][0] - 0.2)
1107
+ & (pl.col("prec_mz") <= closest_prec_mz["prec_mz"][0] + 0.2),
1108
+ )
1109
+
1110
+ scan_ids = scans_df["scan_uid"].to_list()
1111
+ scan_uid.extend(scan_ids)
1112
+ chrom_df.at[i, "scan_uid"] = scan_ids
1113
+
1114
+ # get the ms2data
1115
+ _load_ms2data(self, scans=list(set(scan_uid)) if scan_uid else None)
1116
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
1117
+
1118
+ for i, trace in tqdm(
1119
+ chrom_df.iterrows(),
1120
+ total=len(chrom_df),
1121
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract EICs",
1122
+ disable=tdqm_disable,
1123
+ ):
1124
+ if trace["type"] in ["ms1"]:
1125
+ q1 = trace["prec_mz"]
1126
+ name = trace["name"]
1127
+ scan_uid = trace["scan_uid"]
1128
+ # find all ms1 data with scan_uid and mz between q1-mz_tol and q1+mz_tol
1129
+ d = self.ms1_df.filter(
1130
+ (pl.col("scan_uid").is_in(scan_uid)) & (pl.col("mz") >= q1 - mz_tol) & (pl.col("mz") <= q1 + mz_tol),
1131
+ )
1132
+ # for all unique rt values, find the maximum inty
1133
+ eic_rt = d.group_by("rt").agg(pl.col("inty").max())
1134
+ eic = Chromatogram(
1135
+ eic_rt["rt"].to_numpy(),
1136
+ inty=eic_rt["inty"].to_numpy(),
1137
+ label=f"MS1 {name} ({q1:0.3f})",
1138
+ lib_rt=trace["rt"],
1139
+ )
1140
+ chrom_df.at[i, "chrom"] = eic
1141
+
1142
+ elif trace["type"] in ["mrm", "ms2"]:
1143
+ q1 = trace["prec_mz"]
1144
+ q3 = trace["prod_mz"]
1145
+ name = trace["name"]
1146
+ scan_uid = trace["scan_uid"]
1147
+ # find all ms2 data with scan_uid and mz between q3-mz_tol and q3+mz_tol
1148
+ d = self.ms2data.filter(
1149
+ (pl.col("scan_uid").is_in(scan_uid)) & (pl.col("mz") >= q3 - mz_tol) & (pl.col("mz") <= q3 + mz_tol),
1150
+ )
1151
+ # for all unique rt values, find the maximum inty
1152
+ eic_rt = d.group_by("rt").agg(pl.col("inty").max())
1153
+ eic = Chromatogram(
1154
+ eic_rt["rt"].to_numpy(),
1155
+ inty=eic_rt["inty"].to_numpy(),
1156
+ label=f"MRM {name} ({q1:0.3f}>{q3:0.3f})",
1157
+ lib_rt=trace["rt"],
1158
+ )
1159
+ chrom_df.at[i, "chrom"] = eic
1160
+
1161
+ self.chrom_df = chrom_df
1162
+
1163
+
1164
+ # TODO no self?
1165
+ def _oms_clean_df(self, df):
1166
+ df2 = df[df["quality"] != 0]
1167
+ # change columns and order
1168
+ df = pd.DataFrame(
1169
+ columns=[
1170
+ "feature_uid",
1171
+ "uid",
1172
+ "mz",
1173
+ "rt",
1174
+ "rt_start",
1175
+ "rt_end",
1176
+ "rt_delta",
1177
+ "mz_start",
1178
+ "mz_end",
1179
+ "inty",
1180
+ "quality",
1181
+ "charge",
1182
+ "iso",
1183
+ "iso_of",
1184
+ "chrom",
1185
+ "chrom_coherence",
1186
+ "chrom_prominence",
1187
+ "chrom_prominence_scaled",
1188
+ "chrom_height_scaled",
1189
+ "ms2_scans",
1190
+ "ms2_specs",
1191
+ ],
1192
+ )
1193
+
1194
+ # set values of fid to 0:len(df)
1195
+ df["uid"] = df2.index.to_list()
1196
+ df["mz"] = (df2["mz"]).round(5)
1197
+ df["rt"] = (df2["RT"]).round(3)
1198
+ df["rt_start"] = (df2["RTstart"]).round(3)
1199
+ df["rt_end"] = (df2["RTend"]).round(3)
1200
+ df["rt_delta"] = (df2["RTend"] - df2["RTstart"]).round(3)
1201
+ df["mz_start"] = (df2["MZstart"]).round(5)
1202
+ df["mz_end"] = (df2["MZend"]).round(5) # df2["MZend"]
1203
+ df["inty"] = df2["intensity"]
1204
+ df["quality"] = df2["quality"]
1205
+ df["charge"] = df2["charge"]
1206
+ df["iso"] = 0
1207
+ df["iso_of"] = None
1208
+ df["chrom"] = None
1209
+ df["chrom_coherence"] = None
1210
+ df["chrom_prominence"] = None
1211
+ df["chrom_prominence_scaled"] = None
1212
+ df["chrom_height_scaled"] = None
1213
+ df["ms2_scans"] = None
1214
+ df["ms2_specs"] = None
1215
+ df["feature_uid"] = range(1, len(df) + 1)
1216
+ # df.set_index('fid', inplace=True)
1217
+ # rests index
1218
+ # df.reset_index(drop=True, inplace=True)
1219
+
1220
+ return df