masster 0.5.17__py3-none-any.whl → 0.5.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/sample/adducts.py +12 -0
- masster/sample/defaults/find_ms2_def.py +5 -5
- masster/sample/defaults/sample_def.py +30 -6
- masster/sample/h5.py +59 -13
- masster/sample/lib.py +9 -3
- masster/sample/load.py +47 -120
- masster/sample/processing.py +3 -3
- masster/sample/sample.py +5 -3
- masster/sample/sciex.py +62 -648
- masster/sample/thermo.py +801 -0
- masster/study/id.py +3 -1
- masster/study/load.py +15 -792
- masster/study/study.py +1 -0
- masster/wizard/README.md +15 -15
- masster/wizard/wizard.py +82 -28
- {masster-0.5.17.dist-info → masster-0.5.19.dist-info}/METADATA +3 -2
- {masster-0.5.17.dist-info → masster-0.5.19.dist-info}/RECORD +21 -20
- {masster-0.5.17.dist-info → masster-0.5.19.dist-info}/WHEEL +0 -0
- {masster-0.5.17.dist-info → masster-0.5.19.dist-info}/entry_points.txt +0 -0
- {masster-0.5.17.dist-info → masster-0.5.19.dist-info}/licenses/LICENSE +0 -0
masster/sample/thermo.py
ADDED
|
@@ -0,0 +1,801 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Standalone Thermo RAW file reader module.
|
|
3
|
+
|
|
4
|
+
This module provides a standalone implementation for reading Thermo Fisher RAW files
|
|
5
|
+
using the Thermo Fisher .NET libraries directly. It offers functionality to extract
|
|
6
|
+
spectral data, retention times, MS levels, polarity information, and precursor details
|
|
7
|
+
from RAW files.
|
|
8
|
+
|
|
9
|
+
Key Features:
|
|
10
|
+
- Direct RAW file reading using Thermo Fisher DLLs
|
|
11
|
+
- Support for MS1 and MSn data extraction
|
|
12
|
+
- Optional naive peak centroiding
|
|
13
|
+
- Polarity detection from scan events
|
|
14
|
+
- Precursor information extraction for MS/MS spectra
|
|
15
|
+
- Context manager support for proper resource cleanup
|
|
16
|
+
|
|
17
|
+
Requirements:
|
|
18
|
+
- pythonnet (pip install pythonnet)
|
|
19
|
+
- Thermo Fisher DLLs available in alpharaw's ext/thermo_fisher directory
|
|
20
|
+
- On Linux/macOS: mono runtime must be installed
|
|
21
|
+
|
|
22
|
+
Classes:
|
|
23
|
+
ThermoRawFileReader: Low-level RAW file reader using .NET libraries
|
|
24
|
+
ThermoRawData: High-level interface providing pandas DataFrames
|
|
25
|
+
|
|
26
|
+
Example:
|
|
27
|
+
>>> from masster.sample.thermo import load_raw_file
|
|
28
|
+
>>> raw_data = load_raw_file("sample.raw")
|
|
29
|
+
>>> print(f"Found {len(raw_data.spectrum_df)} spectra")
|
|
30
|
+
>>> mz, intensity = raw_data.get_peaks(0) # Get first spectrum peaks
|
|
31
|
+
|
|
32
|
+
Note:
|
|
33
|
+
The .NET imports (System, ThermoFisher) will only work when pythonnet
|
|
34
|
+
is properly installed and configured. Without these dependencies, the
|
|
35
|
+
module will still import but Thermo RAW file reading will be disabled.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
# Standard library imports
|
|
39
|
+
import ctypes
|
|
40
|
+
import os
|
|
41
|
+
import site
|
|
42
|
+
import warnings
|
|
43
|
+
from typing import Any, ClassVar
|
|
44
|
+
|
|
45
|
+
# Third-party imports
|
|
46
|
+
import numpy as np
|
|
47
|
+
import pandas as pd
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def naive_centroid(
    peak_mzs: np.ndarray,
    peak_intensities: np.ndarray,
    centroiding_ppm: float = 20.0,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Merge neighbouring peaks into intensity-weighted centroids.

    Peaks are walked from left to right. Each not-yet-consumed peak opens a
    group; every following peak whose m/z lies within ``centroiding_ppm`` of
    the group's *first* peak is absorbed into it. The group is reported as a
    single peak at the intensity-weighted mean m/z carrying the summed
    intensity.

    Parameters
    ----------
    peak_mzs : np.ndarray
        m/z values, expected in ascending order (the grouping only ever looks
        forward, so unsorted input is not merged correctly).
    peak_intensities : np.ndarray
        Intensities, one per entry of ``peak_mzs``.
    centroiding_ppm : float, default 20.0
        Grouping tolerance in parts per million, relative to the first peak
        of each group.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        Centroided m/z values and their summed intensities. Groups whose
        summed intensity is not strictly positive are omitted.

    Raises
    ------
    ValueError
        If the two (non-empty) inputs differ in length.
    """
    n_peaks = len(peak_mzs)
    if n_peaks == 0:
        return np.array([]), np.array([])

    if n_peaks != len(peak_intensities):
        raise ValueError("peak_mzs and peak_intensities must have the same length")

    merged_mzs = []
    merged_intensities = []

    group_start = 0
    while group_start < n_peaks:
        anchor_mz = peak_mzs[group_start]
        # Absolute m/z window derived from the ppm tolerance at the anchor
        window = anchor_mz * centroiding_ppm * 1e-6

        intensity_sum = peak_intensities[group_start]
        mz_intensity_sum = anchor_mz * peak_intensities[group_start]

        # Absorb every following peak that is still inside the window
        group_end = group_start + 1
        while group_end < n_peaks and abs(peak_mzs[group_end] - anchor_mz) <= window:
            intensity_sum += peak_intensities[group_end]
            mz_intensity_sum += peak_mzs[group_end] * peak_intensities[group_end]
            group_end += 1

        # Zero/negative total intensity would make the weighted mean undefined
        if intensity_sum > 0:
            merged_mzs.append(mz_intensity_sum / intensity_sum)
            merged_intensities.append(intensity_sum)

        group_start = group_end

    return np.array(merged_mzs), np.array(merged_intensities)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# Optional .NET bootstrap.
# Everything in this try-block needs pythonnet (the ``clr`` module) plus the
# Thermo Fisher assemblies that ship inside alpharaw's ``ext/thermo_fisher``
# directory. Any failure is downgraded to a warning and recorded in
# HAS_DOTNET so the rest of the module can still be imported.
try:
    # pythonnet provides ``clr`` (pip install pythonnet)
    import clr

    clr.AddReference("System")

    import System  # noqa: F401
    from System.Globalization import CultureInfo
    from System.Runtime.InteropServices import GCHandle, GCHandleType
    from System.Threading import Thread

    de_fr = CultureInfo("fr-FR")
    other = CultureInfo("en-US")

    # Run the current .NET thread under the en-US culture
    Thread.CurrentThread.CurrentCulture = other
    Thread.CurrentThread.CurrentUICulture = other

    # First choice: an ``alpharaw/ext`` folder in one of the site-packages dirs
    ext_dir = next(
        (
            candidate
            for candidate in (
                os.path.join(packages_dir, "alpharaw", "ext")
                for packages_dir in site.getsitepackages()
            )
            if os.path.exists(candidate)
        ),
        None,
    )

    # Second choice: ask an importable alpharaw package where it lives
    if ext_dir is None:
        try:
            import alpharaw
        except ImportError:
            pass
        else:
            ext_dir = os.path.join(os.path.dirname(alpharaw.__file__), "ext")

    thermo_dll_dir = os.path.join(ext_dir, "thermo_fisher") if ext_dir else None
    if thermo_dll_dir is None or not os.path.exists(thermo_dll_dir):
        raise ImportError("Could not find alpharaw ext/thermo_fisher directory with DLLs")

    # Register the Thermo Fisher assemblies with the CLR
    clr.AddReference(
        os.path.join(ext_dir, "thermo_fisher", "ThermoFisher.CommonCore.Data.dll"),
    )
    clr.AddReference(
        os.path.join(ext_dir, "thermo_fisher", "ThermoFisher.CommonCore.RawFileReader.dll")
    )

    import ThermoFisher  # noqa: F401

    from ThermoFisher.CommonCore.Data.Business import Device
    from ThermoFisher.CommonCore.Data.Interfaces import IScanEvent, IScanEventBase  # noqa: F401
    from ThermoFisher.CommonCore.RawFileReader import RawFileReaderAdapter

    HAS_DOTNET = True
except ImportError as e:
    # Missing pythonnet / alpharaw / DLLs: keep the module importable
    HAS_DOTNET = False
    warnings.warn(
        f"Thermo RAW file support is disabled. Install pythonnet and ensure Thermo Fisher DLLs "
        f"are available to enable Thermo RAW file reading. Error: {e}",
        UserWarning,
        stacklevel=2,
    )
except Exception as e:
    # Any other failure while bringing up the .NET runtime
    HAS_DOTNET = False
    warnings.warn(
        f"Failed to initialize .NET components for Thermo support. Error: {e}",
        UserWarning,
        stacklevel=2,
    )
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def dot_net_array_to_np_array(src) -> np.ndarray:
    """
    Convert a .NET ``double[]`` into an independent NumPy ``float64`` array.

    The .NET array is pinned with ``GCHandle.Alloc`` so that its buffer
    address can be read, the buffer is viewed through ``ctypes`` and then
    copied in one block. Reading the memory directly avoids a per-element
    Python loop; the final ``.copy()`` means the returned array does not
    alias .NET memory and stays valid after the handle is released.

    Parameters
    ----------
    src : .NET array or None
        Source array. The buffer is interpreted as C doubles, so ``src`` is
        assumed to be a ``double[]`` (NOTE(review): other element types would
        be misread - confirm callers only pass double arrays).

    Returns
    -------
    np.ndarray
        One-dimensional ``float64`` array. Empty when ``src`` is ``None`` or
        has length zero.

    Notes
    -----
    Based on the approach from:
    https://mail.python.org/pipermail/pythondotnet/2014-May/001527.html

    Any exception raised while reading the buffer propagates to the caller;
    the pin handle is released in all cases.
    """
    if src is None:
        return np.array([], dtype=np.float64)

    n_items = len(src)
    if n_items == 0:
        # Nothing to copy; skip pinning altogether
        return np.array([], dtype=np.float64)

    # Pin the .NET array so the garbage collector cannot move it while we read
    src_hndl = GCHandle.Alloc(src, GCHandleType.Pinned)
    try:
        # Address of the first element of the pinned array
        src_ptr = src_hndl.AddrOfPinnedObject().ToInt64()

        # ctypes view over the same memory (no copy yet)
        buf_type = ctypes.c_double * n_items
        cbuf = buf_type.from_address(src_ptr)

        # Copy before the handle is freed. Returning from the try body (not
        # from ``finally``) lets errors above surface instead of being
        # swallowed.
        return np.frombuffer(cbuf, dtype=np.float64).copy()  # type: ignore[call-overload]
    finally:
        # Always release the pin to avoid leaking the handle
        if src_hndl.IsAllocated:
            src_hndl.Free()
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
class ThermoRawFileReader:
    """
    Low-level Thermo RAW file reader built on the Thermo Fisher .NET DLLs.

    The reader opens the file on construction, selects the first MS device
    and records the scan range. It can be used as a context manager so that
    the underlying .NET handle is disposed deterministically.

    Attributes set by ``__init__``: ``first_scan``, ``last_scan`` (taken from
    the run header) and ``num_scans`` (``last_scan - first_scan + 1``).
    """

    def __init__(self, filename: str):
        """
        Open ``filename`` and read its scan range.

        Raises
        ------
        ImportError
            If the .NET components could not be loaded (``HAS_DOTNET`` is False).
        FileNotFoundError
            If ``filename`` does not exist.
        ValueError
            If the reader cannot be created, the file cannot be opened, or the
            header information cannot be read.
        """
        if not HAS_DOTNET:
            raise ImportError(
                "Thermo RAW file support requires .NET components. "
                "Install pythonnet (pip install pythonnet) and ensure Thermo Fisher DLLs "
                "are available in alpharaw's ext/thermo_fisher directory."
            )

        if not os.path.exists(filename):
            raise FileNotFoundError(f"RAW file not found: {filename}")

        # Defined up front so close() is always safe to call
        self._raw_file = None
        try:
            self._raw_file = RawFileReaderAdapter.FileFactory(filename)
        except Exception as e:
            raise ValueError(f"Failed to create RAW file reader for '{filename}': {e}") from e

        if not self._raw_file.IsOpen:
            # Release the handle before reporting the failure
            self.close()
            raise ValueError(f"Could not open RAW file: {filename}")

        try:
            # Select the first MS device, then read the scan range
            self._raw_file.SelectInstrument(Device.MS, 1)  # MS instrument
            self.first_scan = self._raw_file.RunHeaderEx.FirstSpectrum
            self.last_scan = self._raw_file.RunHeaderEx.LastSpectrum
            self.num_scans = self.last_scan - self.first_scan + 1
        except Exception as e:
            self.close()
            raise ValueError(f"Failed to read RAW file header information: {e}") from e

    def close(self) -> None:
        """Dispose the underlying .NET reader. Safe to call more than once."""
        raw_file = getattr(self, "_raw_file", None)
        if raw_file is not None:
            # Clear the reference first so a second close() is a no-op
            self._raw_file = None
            raw_file.Dispose()

    def __enter__(self) -> 'ThermoRawFileReader':
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit: always closes the file."""
        self.close()

    def get_polarity_from_scan_event(self, scan_number: int) -> str:
        """
        Extract polarity information for a scan.

        Parameters
        ----------
        scan_number : int
            Scan number to extract polarity from

        Returns
        -------
        str
            'positive', 'negative', or '' if unknown. Lookup errors are
            treated as "unknown" rather than raised.
        """
        try:
            scan_event = self._raw_file.GetScanEventForScanNumber(scan_number)
        except Exception:
            # Best effort: an unreadable scan event means unknown polarity
            return ''
        return self._polarity_from_event(scan_event)

    @staticmethod
    def _polarity_from_event(scan_event) -> str:
        """Classify an already-fetched scan event as 'positive', 'negative' or ''."""
        if scan_event is None:
            return ''

        try:
            # Preferred source: the Polarity property of the scan event
            if hasattr(scan_event, 'Polarity'):
                polarity_str = str(scan_event.Polarity).lower()
                if 'positive' in polarity_str:
                    return 'positive'
                if 'negative' in polarity_str:
                    return 'negative'

            # Fallback: inspect the textual form of the scan event
            filter_string = str(scan_event.ToString()).lower()
        except Exception:
            # Best effort: never let polarity detection abort a load
            return ''

        if '+' in filter_string or 'positive' in filter_string:
            return 'positive'

        # Only look for '-' before the first '[': the bracketed part holds
        # "low-high" mass ranges whose hyphen is not a polarity marker.
        # NOTE(review): assumes the text follows the usual Thermo scan-filter
        # layout with ranges in trailing brackets - confirm.
        head = filter_string.split('[', 1)[0]
        if '-' in head or 'negative' in filter_string:
            return 'negative'

        return ''  # Unknown polarity

    def _extract_precursor_info(self, scan_event, ms_level: int) -> tuple[float, int, float, float, float]:
        """
        Extract precursor information from a scan event for MS2+ scans.

        Returns
        -------
        tuple
            ``(precursor_mz, precursor_charge, collision_energy,
            isolation_lower, isolation_upper)``. For MS1 scans, a missing
            scan event, or an unreadable precursor m/z, the m/z and both
            isolation bounds are ``-1.0``. Each value is read independently;
            a failing getter falls back to its default (charge 0, energy 0.0,
            isolation width 3.0).
        """
        if ms_level <= 1 or scan_event is None:
            return -1.0, 0, 0.0, -1.0, -1.0

        try:
            precursor_mz = float(scan_event.GetMass(0))
        except Exception:
            precursor_mz = -1.0

        try:
            precursor_charge = int(scan_event.GetChargeState(0)) if hasattr(scan_event, 'GetChargeState') else 0
        except Exception:
            precursor_charge = 0

        try:
            collision_energy = float(scan_event.GetEnergy(0)) if hasattr(scan_event, 'GetEnergy') else 0.0
        except Exception:
            collision_energy = 0.0

        try:
            isolation_window = float(scan_event.GetIsolationWidth(0)) if hasattr(scan_event, 'GetIsolationWidth') else 3.0
        except Exception:
            isolation_window = 3.0

        if precursor_mz < 0:
            # Unknown precursor: keep the sentinel instead of deriving
            # meaningless bounds from -1.0
            return precursor_mz, precursor_charge, collision_energy, -1.0, -1.0

        half_window = isolation_window / 2
        return (
            precursor_mz,
            precursor_charge,
            collision_energy,
            precursor_mz - half_window,
            precursor_mz + half_window,
        )

    def _process_scan_data(
        self,
        scan_data,
        centroid: bool,
        centroid_ppm: float,
        keep_k_peaks: int
    ) -> tuple[np.ndarray, np.ndarray]:
        """
        Convert one scan's peaks to NumPy arrays, optionally centroid and trim.

        Parameters
        ----------
        scan_data
            Scan object exposing ``Positions`` (m/z) and ``Intensities``.
        centroid : bool
            Apply ``naive_centroid`` to the peaks.
        centroid_ppm : float
            PPM tolerance forwarded to ``naive_centroid``.
        keep_k_peaks : int
            Keep at most this many peaks, chosen by highest intensity and
            returned in their original order. Values <= 0 disable the limit.

        Returns
        -------
        tuple[np.ndarray, np.ndarray]
            m/z (float64) and intensity (float32) arrays; both empty when the
            scan has no positions or intensities.
        """
        if scan_data.Positions is None or scan_data.Intensities is None:
            return np.array([], dtype=np.float64), np.array([], dtype=np.float32)

        mz_array = dot_net_array_to_np_array(scan_data.Positions)
        int_array = dot_net_array_to_np_array(scan_data.Intensities).astype(np.float32)

        if centroid and len(mz_array) > 0:
            mz_array, int_array = naive_centroid(
                mz_array,
                int_array,
                centroiding_ppm=centroid_ppm,
            )

        # Keep only the top K peaks by intensity. The explicit "> 0" guard
        # matters: a slice of [-0:] would select everything and a negative K
        # would silently drop the weakest peaks.
        if 0 < keep_k_peaks < len(mz_array):
            top_indices = np.sort(np.argsort(int_array)[-keep_k_peaks:])
            mz_array = mz_array[top_indices]
            int_array = int_array[top_indices]

        return mz_array, int_array

    def load_all_scans(
        self,
        centroid: bool = True,
        centroid_ppm: float = 20.0,
        ignore_empty_scans: bool = True,
        keep_k_peaks: int = 2000,
    ) -> dict[str, Any]:
        """
        Load all scans from the RAW file and extract spectral data.

        Parameters
        ----------
        centroid : bool
            Whether to centroid the data
        centroid_ppm : float
            PPM tolerance for centroiding
        ignore_empty_scans : bool
            Whether to skip scans whose ``Positions`` array is missing
        keep_k_peaks : int
            Maximum number of peaks to keep per spectrum

        Returns
        -------
        dict
            Dictionary containing spectral data with keys:
            peak_indices, peak_mz, peak_intensity, rt, ms_level, polarity,
            precursor_mz, precursor_charge, isolation_lower_mz, isolation_upper_mz, nce.
            ``peak_indices`` holds cumulative peak counts (length = number of
            spectra + 1), so spectrum ``i`` owns the peaks in
            ``peak_indices[i]:peak_indices[i + 1]``.

        Raises
        ------
        ValueError
            If no scan could be loaded.
        """
        # Per-spectrum collectors, appended in scan order
        peak_counts: list[int] = []
        peak_mz_arrays: list[np.ndarray] = []
        peak_intensity_arrays: list[np.ndarray] = []
        rt_list: list[float] = []
        ms_level_list: list[int] = []
        polarity_list: list[str] = []
        precursor_mz_list: list[float] = []
        precursor_charge_list: list[int] = []
        ce_list: list[float] = []
        isolation_lower_mz_list: list[float] = []
        isolation_upper_mz_list: list[float] = []

        for scan_num in range(self.first_scan, self.last_scan + 1):
            scan_stats = self._raw_file.GetScanStatsForScanNumber(scan_num)
            if scan_stats is None:
                continue

            scan_data = self._raw_file.GetSegmentedScanFromScanNumber(scan_num, scan_stats)
            if scan_data is None or (ignore_empty_scans and scan_data.Positions is None):
                continue

            # Fetched once and reused for MS level, polarity and precursor data
            scan_event = self._raw_file.GetScanEventForScanNumber(scan_num)

            rt = scan_stats.StartTime  # in minutes (per the original author's note)
            ms_level = int(scan_event.MSOrder) if scan_event is not None else 1
            polarity = self._polarity_from_event(scan_event)

            mz_array, int_array = self._process_scan_data(
                scan_data, centroid, centroid_ppm, keep_k_peaks
            )

            precursor_mz, precursor_charge, collision_energy, isolation_lower, isolation_upper = (
                self._extract_precursor_info(scan_event, ms_level)
            )

            peak_mz_arrays.append(mz_array)
            peak_intensity_arrays.append(int_array)
            peak_counts.append(len(mz_array))

            rt_list.append(rt)
            ms_level_list.append(ms_level)
            polarity_list.append(polarity)

            precursor_mz_list.append(precursor_mz)
            precursor_charge_list.append(precursor_charge)
            ce_list.append(collision_energy)
            isolation_lower_mz_list.append(isolation_lower)
            isolation_upper_mz_list.append(isolation_upper)

        if not rt_list:
            raise ValueError("No valid scans found in the RAW file")

        # Cumulative offsets: entry i is the index of spectrum i's first peak
        peak_indices = np.empty(len(rt_list) + 1, dtype=np.int64)
        peak_indices[0] = 0
        peak_indices[1:] = np.cumsum(peak_counts)

        return {
            "peak_indices": peak_indices,
            "peak_mz": np.concatenate(peak_mz_arrays) if peak_mz_arrays else np.array([]),
            "peak_intensity": np.concatenate(peak_intensity_arrays) if peak_intensity_arrays else np.array([]),
            "rt": np.array(rt_list, dtype=np.float64),
            "ms_level": np.array(ms_level_list, dtype=np.int8),
            "polarity": np.array(polarity_list, dtype="U8"),
            "precursor_mz": np.array(precursor_mz_list, dtype=np.float64),
            "precursor_charge": np.array(precursor_charge_list, dtype=np.int8),
            "isolation_lower_mz": np.array(isolation_lower_mz_list, dtype=np.float64),
            "isolation_upper_mz": np.array(isolation_upper_mz_list, dtype=np.float64),
            "nce": np.array(ce_list, dtype=np.float32),
        }
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
class ThermoRawData:
    """
    High-level Thermo RAW reader exposing the data as pandas DataFrames.

    After ``import_raw``:

    * ``spectrum_df`` has one row per loaded spectrum (``spec_idx``,
      ``peak_start_idx``, ``peak_stop_idx`` plus the per-spectrum columns
      listed in ``column_dtypes`` that the reader supplied).
    * ``peak_df`` has one row per peak (``mz``, ``intensity``); the peaks of
      spectrum ``i`` are rows ``peak_start_idx[i]:peak_stop_idx[i]``.
    """

    # dtype applied to each per-spectrum column copied into ``spectrum_df``
    column_dtypes: ClassVar[dict[str, Any]] = {
        "rt": np.float64,
        "ms_level": np.int8,
        "polarity": "U8",
        "precursor_mz": np.float64,
        "isolation_lower_mz": np.float64,
        "isolation_upper_mz": np.float64,
        "precursor_charge": np.int8,
        "nce": np.float32,
        "injection_time": np.float32,
        "activation": "U",
    }

    def __init__(self, centroided: bool = True) -> None:
        """
        Initialize ThermoRawData reader.

        Parameters
        ----------
        centroided : bool, optional
            If peaks will be centroided after loading, by default True.
            Note: centroiding is currently disabled; passing True emits a
            ``UserWarning`` and the flag is reset to False.
        """
        # Result tables, filled by import_raw()
        self.spectrum_df: pd.DataFrame = pd.DataFrame()
        self.peak_df: pd.DataFrame = pd.DataFrame()

        # File and instrument information
        self._raw_file_path = ""
        self.creation_time = ""
        self.type = "thermo"
        self.instrument = "thermo"

        # Processing parameters forwarded to ThermoRawFileReader.load_all_scans
        self.centroided = centroided
        self.centroid_ppm = 20.0
        self.ignore_empty_scans = True
        self.keep_k_peaks_per_spec = 2000

        # Disable centroiding for now
        if self.centroided:
            self.centroided = False
            warnings.warn(
                "Centroiding for Thermo data is not well implemented yet. "
                "Data will be processed in profile mode.",
                UserWarning,
                stacklevel=2,
            )

    @property
    def raw_file_path(self) -> str:
        """Get the raw file path."""
        return self._raw_file_path

    @raw_file_path.setter
    def raw_file_path(self, value: str):
        """Set the raw file path."""
        self._raw_file_path = value

    def import_raw(self, raw_file_path: str) -> None:
        """
        Import raw data from a RAW file and populate both dataframes.

        Parameters
        ----------
        raw_file_path : str
            Path to the RAW file
        """
        self.raw_file_path = raw_file_path
        data_dict = self._import(raw_file_path)
        self._set_dataframes(data_dict)

    def _import(self, raw_file_path: str) -> dict[str, Any]:
        """
        Read all scans from a Thermo RAW file.

        Also sets ``self.creation_time`` (empty string when it cannot be
        determined).

        Parameters
        ----------
        raw_file_path : str
            Absolute or relative path of the Thermo RAW file.

        Returns
        -------
        dict
            Dictionary containing spectrum information and peak data, as
            returned by ``ThermoRawFileReader.load_all_scans``.
        """
        with ThermoRawFileReader(raw_file_path) as raw_reader:
            data_dict = raw_reader.load_all_scans(
                centroid=self.centroided,
                centroid_ppm=self.centroid_ppm,
                ignore_empty_scans=self.ignore_empty_scans,
                keep_k_peaks=self.keep_k_peaks_per_spec,
            )

            # Creation time is optional metadata: failures must not abort the import
            try:
                creation_info = raw_reader._raw_file.GetCreationDate()
                self.creation_time = creation_info.ToString("O") if creation_info else ""
            except Exception:
                self.creation_time = ""

        return data_dict

    def _set_dataframes(self, raw_data: dict[str, Any]) -> None:
        """
        Set the spectrum and peak dataframes from raw data dictionary.

        Parameters
        ----------
        raw_data : dict
            Dictionary containing the raw spectral data with keys like 'rt', 'peak_mz', etc.
        """
        num_spectra = len(raw_data["rt"])

        # Create spectrum dataframe
        self.create_spectrum_df(num_spectra)

        # peak_indices holds cumulative offsets: all but the last entry are
        # start indices, all but the first are stop indices
        self.set_peak_df_by_indexed_array(
            raw_data["peak_mz"],
            raw_data["peak_intensity"],
            raw_data["peak_indices"][:-1],  # start indices
            raw_data["peak_indices"][1:],  # end indices
        )

        # Copy every known per-spectrum column, cast to its declared dtype
        for column_name, values in raw_data.items():
            if column_name in self.column_dtypes and column_name != "peak_mz" and column_name != "peak_intensity":
                dtype = self.column_dtypes[column_name]
                if dtype == "O":
                    self.spectrum_df[column_name] = list(values)
                else:
                    self.spectrum_df[column_name] = np.array(values, dtype=dtype)

    def create_spectrum_df(self, spectrum_num: int) -> None:
        """
        Create an empty spectrum dataframe from the number of spectra.

        Parameters
        ----------
        spectrum_num : int
            The number of spectra.
        """
        self.spectrum_df = pd.DataFrame(index=np.arange(spectrum_num, dtype=np.int64))
        self.spectrum_df["spec_idx"] = self.spectrum_df.index.values

    def set_peak_df_by_indexed_array(
        self,
        mz_array: np.ndarray,
        intensity_array: np.ndarray,
        peak_start_indices: np.ndarray,
        peak_stop_indices: np.ndarray,
    ) -> None:
        """
        Set peak dataframe using indexed arrays.

        Parameters
        ----------
        mz_array : np.ndarray
            Array of m/z values
        intensity_array : np.ndarray
            Array of intensity values
        peak_start_indices : np.ndarray
            Array of start indices for each spectrum
        peak_stop_indices : np.ndarray
            Array of stop indices for each spectrum
        """
        self.peak_df = pd.DataFrame()
        self.peak_df["mz"] = mz_array.astype(np.float64)
        self.peak_df["intensity"] = intensity_array.astype(np.float32)

        # Set peak start and stop indices in spectrum df
        self.spectrum_df["peak_start_idx"] = peak_start_indices
        self.spectrum_df["peak_stop_idx"] = peak_stop_indices

    def get_peaks(self, spec_idx: int) -> tuple[np.ndarray, np.ndarray]:
        """
        Get peaks for a specific spectrum.

        Parameters
        ----------
        spec_idx : int
            Positional spectrum index into ``spectrum_df``

        Returns
        -------
        tuple
            (mz_array, intensity_array)
        """
        # Read each index column on its own and cast to int. Pulling the
        # whole row first would coerce the indices to float whenever the
        # frame holds only numeric columns, and float slice bounds raise.
        start = int(self.spectrum_df["peak_start_idx"].iloc[spec_idx])
        end = int(self.spectrum_df["peak_stop_idx"].iloc[spec_idx])
        return (
            self.peak_df.mz.values[start:end],
            self.peak_df.intensity.values[start:end],
        )

    def __repr__(self) -> str:
        return f"ThermoRawData(file_path='{self.raw_file_path}', spectra={len(self.spectrum_df)})"
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
# Convenience functions to maintain compatibility with existing code
|
|
697
|
+
def load_raw_file(filename: str, **kwargs) -> ThermoRawData:
    """
    Build a ThermoRawData object and import ``filename`` into it.

    Parameters
    ----------
    filename : str
        Path to the RAW file
    **kwargs
        Forwarded unchanged to the ThermoRawData constructor

    Returns
    -------
    ThermoRawData
        The populated RAW data object
    """
    loader = ThermoRawData(**kwargs)
    loader.import_raw(filename)
    return loader
|
|
716
|
+
|
|
717
|
+
|
|
718
|
+
def get_file_info(filename: str) -> dict[str, Any]:
    """
    Summarise the scan range of a RAW file without loading its spectra.

    Parameters
    ----------
    filename : str
        Path to the RAW file

    Returns
    -------
    dict
        Keys ``first_scan``, ``last_scan``, ``num_scans`` and ``scan_range``
        (the latter formatted as ``"<first>-<last>"``).
    """
    with ThermoRawFileReader(filename) as reader:
        first, last = reader.first_scan, reader.last_scan
        info = {
            "first_scan": first,
            "last_scan": last,
            "num_scans": reader.num_scans,
            "scan_range": f"{first}-{last}",
        }
    return info
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def main() -> None:
    """
    Print a usage example and a short status report for this module.

    Intended for running the file directly: it shows example code, then
    checks that ThermoRawData can be instantiated and reports whether the
    .NET components were loaded.
    """
    section_rule = "-" * 20

    print("Standalone Thermo RAW Reader")
    print("=" * 40)

    # Usage example
    print("\nUsage Example:")
    print(section_rule)
    example_code = '''
from thermo import ThermoRawData, load_raw_file

# Method 1: Create reader instance
raw_data = ThermoRawData(centroided=False)
raw_data.import_raw("path/to/file.raw")

# Method 2: Use convenience function
raw_data = load_raw_file("path/to/file.raw")

# Access data
print(f"Spectra: {len(raw_data.spectrum_df)}")
print(f"Peaks: {len(raw_data.peak_df)}")

# Get peaks for first spectrum
mz, intensity = raw_data.get_peaks(0)

# Check available polarities
polarities = raw_data.spectrum_df['polarity'].unique()
print(f"Polarities: {polarities}")
'''
    print(example_code)

    # Status report
    print("\nModule Status:")
    print(section_rule)

    try:
        # Instantiation smoke test; the instance itself is not needed
        ThermoRawData()
        print("✓ ThermoRawData instantiated successfully")

        # Report whether the .NET bootstrap succeeded
        if HAS_DOTNET:
            status_lines = (
                "✓ .NET support available",
                " • Thermo Fisher DLLs loaded",
                " • RAW file reading enabled",
            )
        else:
            status_lines = (
                "⚠ .NET support not available",
                " • Install pythonnet to enable RAW file reading",
                " • Ensure Thermo Fisher DLLs are in alpharaw ext directory",
            )
        for status_line in status_lines:
            print(status_line)

    except Exception as e:
        print(f"✗ Error during module testing: {e}")


if __name__ == "__main__":
    main()
|