masster 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/__init__.py +27 -27
- masster/_version.py +17 -17
- masster/chromatogram.py +497 -503
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
- masster/logger.py +318 -244
- masster/sample/__init__.py +9 -9
- masster/sample/defaults/__init__.py +15 -15
- masster/sample/defaults/find_adducts_def.py +325 -325
- masster/sample/defaults/find_features_def.py +366 -366
- masster/sample/defaults/find_ms2_def.py +285 -285
- masster/sample/defaults/get_spectrum_def.py +314 -318
- masster/sample/defaults/sample_def.py +374 -378
- masster/sample/h5.py +1321 -1297
- masster/sample/helpers.py +833 -364
- masster/sample/lib.py +762 -0
- masster/sample/load.py +1220 -1187
- masster/sample/parameters.py +131 -131
- masster/sample/plot.py +1610 -1622
- masster/sample/processing.py +1402 -1416
- masster/sample/quant.py +209 -0
- masster/sample/sample.py +391 -387
- masster/sample/sample5_schema.json +181 -181
- masster/sample/save.py +737 -719
- masster/sample/sciex.py +1213 -0
- masster/spectrum.py +1287 -1319
- masster/study/__init__.py +9 -9
- masster/study/defaults/__init__.py +21 -19
- masster/study/defaults/align_def.py +267 -267
- masster/study/defaults/export_def.py +41 -40
- masster/study/defaults/fill_chrom_def.py +264 -264
- masster/study/defaults/fill_def.py +260 -0
- masster/study/defaults/find_consensus_def.py +256 -256
- masster/study/defaults/find_ms2_def.py +163 -163
- masster/study/defaults/integrate_chrom_def.py +225 -225
- masster/study/defaults/integrate_def.py +221 -0
- masster/study/defaults/merge_def.py +256 -0
- masster/study/defaults/study_def.py +272 -269
- masster/study/export.py +674 -287
- masster/study/h5.py +1398 -886
- masster/study/helpers.py +1650 -433
- masster/study/helpers_optimized.py +317 -0
- masster/study/load.py +1201 -1078
- masster/study/parameters.py +99 -99
- masster/study/plot.py +632 -645
- masster/study/processing.py +1057 -1046
- masster/study/save.py +149 -134
- masster/study/study.py +606 -522
- masster/study/study5_schema.json +247 -241
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
- masster-0.3.0.dist-info/RECORD +59 -0
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
- masster-0.2.4.dist-info/RECORD +0 -50
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
masster/sample/sciex.py
ADDED
|
@@ -0,0 +1,1213 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Standalone Sciex WIFF file reader module.
|
|
3
|
+
|
|
4
|
+
This module provides a standalone implementation of Sciex WIFF file reading
|
|
5
|
+
functionality that uses the DLLs from alpharaw's ext/sciex directory directly
|
|
6
|
+
without importing from the alpharaw package.
|
|
7
|
+
|
|
8
|
+
Requirements:
|
|
9
|
+
- pythonnet (pip install pythonnet)
|
|
10
|
+
- alpharaw package must be installed to access the DLLs in site-packages/alpharaw/ext/sciex/
|
|
11
|
+
- On Linux/macOS: mono runtime must be installed
|
|
12
|
+
|
|
13
|
+
The .NET imports (System, Clearcore2, WiffOps4Python) will only work when
|
|
14
|
+
pythonnet is properly installed and configured.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import os
|
|
18
|
+
import site
|
|
19
|
+
import warnings
|
|
20
|
+
|
|
21
|
+
from typing import Any, ClassVar
|
|
22
|
+
|
|
23
|
+
import numpy as np
|
|
24
|
+
import pandas as pd
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Import centroiding functionality (simplified naive centroid implementation)
|
|
28
|
+
def naive_centroid(
|
|
29
|
+
peak_mzs: np.ndarray,
|
|
30
|
+
peak_intensities: np.ndarray,
|
|
31
|
+
centroiding_ppm: float = 20.0,
|
|
32
|
+
) -> tuple[np.ndarray, np.ndarray]:
|
|
33
|
+
"""
|
|
34
|
+
Simplified naive centroiding implementation.
|
|
35
|
+
"""
|
|
36
|
+
if len(peak_mzs) == 0:
|
|
37
|
+
return np.array([]), np.array([])
|
|
38
|
+
|
|
39
|
+
# Simple centroiding: combine peaks within tolerance
|
|
40
|
+
centroided_mzs = []
|
|
41
|
+
centroided_intensities = []
|
|
42
|
+
|
|
43
|
+
i = 0
|
|
44
|
+
while i < len(peak_mzs):
|
|
45
|
+
current_mz = peak_mzs[i]
|
|
46
|
+
current_intensity = peak_intensities[i]
|
|
47
|
+
|
|
48
|
+
# Look for nearby peaks within tolerance
|
|
49
|
+
j = i + 1
|
|
50
|
+
total_intensity = current_intensity
|
|
51
|
+
weighted_mz_sum = current_mz * current_intensity
|
|
52
|
+
|
|
53
|
+
while j < len(peak_mzs):
|
|
54
|
+
tolerance = current_mz * centroiding_ppm * 1e-6
|
|
55
|
+
if abs(peak_mzs[j] - current_mz) <= tolerance:
|
|
56
|
+
total_intensity += peak_intensities[j]
|
|
57
|
+
weighted_mz_sum += peak_mzs[j] * peak_intensities[j]
|
|
58
|
+
j += 1
|
|
59
|
+
else:
|
|
60
|
+
break
|
|
61
|
+
|
|
62
|
+
# Calculate centroided m/z and intensity
|
|
63
|
+
if total_intensity > 0:
|
|
64
|
+
centroided_mz = weighted_mz_sum / total_intensity
|
|
65
|
+
centroided_mzs.append(centroided_mz)
|
|
66
|
+
centroided_intensities.append(total_intensity)
|
|
67
|
+
|
|
68
|
+
i = j
|
|
69
|
+
|
|
70
|
+
return np.array(centroided_mzs), np.array(centroided_intensities)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# CLR utilities implementation
|
|
74
|
+
# Module-level bootstrap of the .NET (CLR) bridge and the Sciex assemblies.
# Any failure in here is downgraded to a warning and recorded as
# HAS_DOTNET = False; the reader classes check that flag before doing work.
# Statement order matters: each clr.AddReference() must run before the
# matching .NET namespace import.
try:
    # require pythonnet, pip install pythonnet on Windows
    import clr

    clr.AddReference("System")

    import ctypes

    import System  # noqa: F401

    from System.Globalization import CultureInfo
    from System.Runtime.InteropServices import GCHandle
    from System.Runtime.InteropServices import GCHandleType
    from System.Threading import Thread

    # NOTE(review): `de_fr` is bound to the fr-FR culture and is not used
    # anywhere in the visible part of this file.
    de_fr = CultureInfo("fr-FR")
    other = CultureInfo("en-US")

    # Force en-US for the current .NET thread.
    # NOTE(review): presumably so the vendor DLLs parse and format numbers
    # independently of the OS locale -- confirm.
    Thread.CurrentThread.CurrentCulture = other
    Thread.CurrentThread.CurrentUICulture = other

    # Find the alpharaw ext/sciex directory in site-packages
    ext_dir = None
    for site_dir in site.getsitepackages():
        potential_ext_dir = os.path.join(site_dir, "alpharaw", "ext", "sciex")
        if os.path.exists(potential_ext_dir):
            ext_dir = potential_ext_dir
            break

    if ext_dir is None:
        # Try alternative locations: resolve the directory relative to the
        # installed alpharaw package (this imports alpharaw itself).
        import alpharaw

        alpharaw_dir = os.path.dirname(alpharaw.__file__)
        ext_dir = os.path.join(alpharaw_dir, "ext", "sciex")

        if not os.path.exists(ext_dir):
            raise ImportError("Could not find alpharaw ext/sciex directory with DLLs")

    # Add Sciex DLL references
    clr.AddReference(
        os.path.join(ext_dir, "Clearcore2.Data.AnalystDataProvider.dll"),
    )
    clr.AddReference(os.path.join(ext_dir, "Clearcore2.Data.dll"))
    clr.AddReference(os.path.join(ext_dir, "WiffOps4Python.dll"))

    # These namespaces only become importable after the AddReference calls above.
    import Clearcore2  # noqa: F401
    import WiffOps4Python  # noqa: F401

    from Clearcore2.Data.AnalystDataProvider import AnalystDataProviderFactory
    from Clearcore2.Data.AnalystDataProvider import AnalystWiffDataProvider
    from WiffOps4Python import WiffOps as DotNetWiffOps

    HAS_DOTNET = True
except Exception as e:
    # allows to use the rest of the code without clr
    # The broad catch is deliberate: a missing pythonnet, a missing runtime or
    # missing DLLs all end up here and only disable Sciex support.
    warnings.warn(
        f"Dotnet-based dependencies could not be loaded. Sciex support is disabled. Error: {e}",
        stacklevel=2,
    )
    HAS_DOTNET = False
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def dot_net_array_to_np_array(src):
    """
    Convert .NET array to NumPy array.
    See https://mail.python.org/pipermail/pythondotnet/2014-May/001527.html

    The .NET array is pinned, its memory is read as ``len(src)`` C doubles,
    and the values are copied into a new NumPy array before the pin is
    released.

    Parameters
    ----------
    src : .NET array or None
        Source array; its buffer is interpreted as 64-bit floats.

    Returns
    -------
    np.ndarray
        A float64 copy of the data; an empty float64 array when ``src`` is None.

    Raises
    ------
    Exception
        Any error raised while pinning or reading the buffer propagates
        unchanged, after the GC handle has been released.
    """
    if src is None:
        return np.array([], dtype=np.float64)

    # Pin the managed array so the garbage collector cannot move it while the
    # raw address is in use.
    src_hndl = GCHandle.Alloc(src, GCHandleType.Pinned)
    try:
        src_ptr = src_hndl.AddrOfPinnedObject().ToInt64()
        buf_type = ctypes.c_double * len(src)
        cbuf = buf_type.from_address(src_ptr)
        # .copy() detaches the result from the pinned memory before it is freed.
        dest = np.frombuffer(cbuf, dtype='float64').copy()  # type: ignore[call-overload]
    finally:
        if src_hndl.IsAllocated:
            src_hndl.Free()

    # Return after the ``finally`` block, not inside it: a ``return`` inside
    # ``finally`` would discard the real exception and fail on unbound ``dest``.
    return dest
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class SciexWiff2FileReader:
    """
    Specialized reader for Sciex WIFF2 files using optimal DLL combination.

    WIFF2 is a newer format from Sciex that may have enhanced capabilities
    compared to the original WIFF format. This reader is optimized specifically
    for WIFF2 files and uses the most appropriate DLLs for maximum information extraction.

    Based on comprehensive DLL analysis, WIFF2 files require specific handling and
    may use different underlying storage mechanisms than regular WIFF files.

    After construction the instance exposes ``filename``, ``ext_dir``,
    ``sample_names``, ``sample_count`` and ``initialization_method`` (the name
    of the strategy that managed to open the file). It can be used as a
    context manager; leaving the ``with`` block calls :meth:`close`.
    """

    # Values of ``initialization_method`` that mean the data is served by the
    # alpharaw reader. ``_initialize_wiff2_reader`` stores
    # "alpharaw_SciexWiffData"; the bare "alpharaw" spelling is accepted too.
    _ALPHARAW_METHODS: ClassVar[tuple[str, ...]] = ("alpharaw", "alpharaw_SciexWiffData")

    def __init__(self, filename: str):
        """
        Initialize WIFF2 reader with file path.

        Parameters
        ----------
        filename : str
            Path to the WIFF2 file

        Raises
        ------
        ValueError
            If the .NET dependencies could not be loaded at import time.
        ImportError
            If the alpharaw DLL directory cannot be located.
        RuntimeError
            If none of the initialization strategies can open the file.
        """
        if not HAS_DOTNET:
            raise ValueError(
                "Dotnet-based dependencies are required for reading Sciex WIFF2 files. "
                "Install pythonnet and ensure alpharaw DLLs are available."
            )

        self.filename = filename
        self.ext_dir = self._find_dll_directory()
        self._ensure_wiff2_dlls_loaded()

        # Try different initialization strategies for WIFF2
        self._initialize_wiff2_reader()

    def _uses_alpharaw(self) -> bool:
        """Return True when the file was opened through the alpharaw reader."""
        return self.initialization_method in self._ALPHARAW_METHODS

    @staticmethod
    def _column_or_default(frame, column: str, default):
        """Return ``frame[column].values``, or an array of ``default`` if the column is absent."""
        if column in frame.columns:
            return frame[column].values
        return np.full(len(frame), default)

    def _find_dll_directory(self):
        """Find the alpharaw DLL directory using the same discovery pattern."""
        for site_dir in site.getsitepackages():
            candidate = os.path.join(site_dir, "alpharaw", "ext", "sciex")
            if os.path.exists(candidate):
                return candidate

        # Fallback to alpharaw module location
        try:
            import alpharaw
        except ImportError as err:
            raise ImportError("Could not find alpharaw DLL directory") from err
        return os.path.join(os.path.dirname(alpharaw.__file__), "ext", "sciex")

    def _ensure_wiff2_dlls_loaded(self):
        """Ensure all necessary WIFF2 DLLs are loaded."""
        # Key DLLs identified through comprehensive analysis
        required_dlls = [
            "Clearcore2.Data.Wiff2.dll",  # Primary WIFF2 support
            "Clearcore2.Data.AnalystDataProvider.dll",
            "Clearcore2.Data.dll",
            "Clearcore2.Data.Common.dll",
            "Clearcore2.Data.Core.dll",
            "Clearcore2.StructuredStorage.dll",  # For WIFF2 storage format
            "WiffOps4Python.dll",
        ]

        for dll in required_dlls:
            dll_path = os.path.join(self.ext_dir, dll)
            if not os.path.exists(dll_path):
                warnings.warn(f"WIFF2 DLL not found: {dll}", stacklevel=2)
                continue
            try:
                clr.AddReference(dll_path)
            except Exception:
                # Best effort: may already be loaded. Not a bare ``except`` so
                # that KeyboardInterrupt/SystemExit still propagate.
                pass

    def _initialize_wiff2_reader(self):
        """
        Initialize WIFF2 reader with fallback strategies.

        WIFF2 files may require different initialization approaches than WIFF files.
        We try multiple strategies based on the comprehensive DLL analysis.

        The strategies are tried in order; the first to succeed sets
        ``sample_names``, ``sample_count`` and ``initialization_method``.

        Raises
        ------
        RuntimeError
            If every strategy fails; the message lists each strategy's error.
        """
        initialization_errors = []

        # Strategy 1: Try standard AnalystDataProvider (may work for some WIFF2)
        try:
            from Clearcore2.Data.AnalystDataProvider import AnalystDataProviderFactory
            from Clearcore2.Data.AnalystDataProvider import AnalystWiffDataProvider

            self._wiffDataProvider = AnalystWiffDataProvider()
            self._wiff_file = AnalystDataProviderFactory.CreateBatch(
                self.filename,
                self._wiffDataProvider,
            )

            self.sample_names = self._wiff_file.GetSampleNames()
            self.sample_count = len(self.sample_names)
            self.initialization_method = "AnalystDataProvider"
            return

        except Exception as e:
            initialization_errors.append(f"AnalystDataProvider: {e}")

        # Strategy 2: Try alpharaw's SciexWiffData (correct API)
        try:
            from alpharaw.sciex import SciexWiffData

            self._alpharaw_reader = SciexWiffData()
            self._alpharaw_reader.import_raw(self.filename)

            # Extract basic information (SciexWiffData doesn't have sample_names property)
            self.sample_names = ['Sample_0']  # Default since WIFF2 format needs investigation
            self.sample_count = 1
            self.initialization_method = "alpharaw_SciexWiffData"

            # Store the reader for later use
            self._wiff_data = self._alpharaw_reader
            return

        except Exception as e:
            initialization_errors.append(f"alpharaw_SciexWiffData: {e}")

        # Strategy 3: Try direct WIFF2 DLL approach
        try:
            # Check if file is recognized as WIFF2
            from Clearcore2.Data.AnalystDataProvider import DataProviderHelper

            is_wiff2 = DataProviderHelper.IsMdWiffFile(self.filename)
            if is_wiff2:
                # Try specialized WIFF2 handling
                warnings.warn(
                    "File detected as WIFF2 format but specialized reader not fully implemented. "
                    "Consider using alpharaw.ms_data_from_file() directly.",
                    stacklevel=2,
                )
                # For now, fall back to treating as regular WIFF with enhanced parameters
                self._initialize_as_enhanced_wiff()
                return

        except Exception as e:
            initialization_errors.append(f"WIFF2 detection: {e}")

        # If all strategies fail, provide comprehensive error information with helpful suggestions
        error_summary = "; ".join(initialization_errors)

        # Check if this is a WIFF2 format issue specifically
        if "could not be opened (result = -2147286960)" in error_summary:
            raise RuntimeError(
                "WIFF2 file format is not supported by the current DLL combination. "
                "Error code -2147286960 (0x80030050) indicates format incompatibility. "
                f"The file '{self.filename}' appears to be a valid WIFF2 file but requires "
                "newer or different DLLs than currently available. "
                "Try converting the WIFF2 file to WIFF format or use alternative tools. "
                f"Full error details: {error_summary}"
            )
        raise RuntimeError(
            "Failed to initialize WIFF2 reader with any strategy. "
            f"Errors: {error_summary}. "
            "The file may be corrupted, locked, or require different dependencies."
        )

    def _initialize_as_enhanced_wiff(self):
        """Fallback: Initialize as enhanced WIFF with WIFF2-optimized parameters.

        Raises
        ------
        RuntimeError
            If the AnalystDataProvider cannot open the file either.
        """
        # Use the same initialization as regular WIFF but with warnings
        try:
            from Clearcore2.Data.AnalystDataProvider import AnalystDataProviderFactory
            from Clearcore2.Data.AnalystDataProvider import AnalystWiffDataProvider

            self._wiffDataProvider = AnalystWiffDataProvider()
            self._wiff_file = AnalystDataProviderFactory.CreateBatch(
                self.filename,
                self._wiffDataProvider,
            )

            self.sample_names = self._wiff_file.GetSampleNames()
            self.sample_count = len(self.sample_names)
            self.initialization_method = "enhanced_wiff_fallback"

            warnings.warn(
                "WIFF2 file opened using WIFF reader fallback. "
                "Some WIFF2-specific features may not be available.",
                stacklevel=2,
            )

        except Exception as e:
            raise RuntimeError(f"Enhanced WIFF fallback also failed: {e}") from e

    def get_file_metadata(self) -> dict[str, Any]:
        """Get comprehensive file metadata for WIFF2 format.

        Returns
        -------
        dict
            Always contains ``format``, ``sample_count``, ``sample_names``,
            ``file_size``, ``file_path``, ``initialization_method`` and
            ``samples``. Reader-specific entries are added when available. If
            collecting them fails, the error text is stored under
            ``metadata_error`` instead of being raised.
        """
        metadata: dict[str, Any] = {
            'format': 'WIFF2',
            'sample_count': self.sample_count,
            'sample_names': list(self.sample_names),
            'file_size': os.path.getsize(self.filename),
            'file_path': self.filename,
            'initialization_method': self.initialization_method,
            'samples': [],  # Initialize samples list
        }

        if self._uses_alpharaw():
            # Get metadata from alpharaw reader
            try:
                if hasattr(self._alpharaw_reader, 'get_spectrum_count'):
                    metadata['total_spectra'] = self._alpharaw_reader.get_spectrum_count()

                # Add alpharaw-specific metadata
                for attr in ['creation_time', 'instrument_model', 'ms_levels']:
                    if not hasattr(self._alpharaw_reader, attr):
                        continue
                    try:
                        value = getattr(self._alpharaw_reader, attr)
                        metadata[attr] = value() if callable(value) else value
                    except Exception:
                        # Optional attribute; skip it if it cannot be read.
                        pass

            except Exception as e:
                metadata['metadata_error'] = str(e)

        elif hasattr(self, '_wiff_file'):
            # Get metadata from standard WIFF reader
            try:
                for i in range(self.sample_count):
                    sample = self._wiff_file.GetSample(i)
                    sample_info = {
                        'index': i,
                        'name': str(self.sample_names[i]),
                    }

                    if hasattr(sample, 'Details'):
                        details = sample.Details
                        if hasattr(details, 'AcquisitionDateTime'):
                            sample_info['acquisition_time'] = str(details.AcquisitionDateTime.ToString("O"))

                    if hasattr(sample, 'MassSpectrometerSample'):
                        ms_sample = sample.MassSpectrometerSample
                        sample_info['experiment_count'] = ms_sample.ExperimentCount

                    metadata['samples'].append(sample_info)

            except Exception as e:
                metadata['metadata_error'] = str(e)

        return metadata

    def load_sample(self, sample_id: int = 0, **kwargs):
        """
        Load sample data with WIFF2-optimized settings.

        Parameters
        ----------
        sample_id : int
            Sample index to load
        **kwargs
            Additional parameters for data loading

        Returns
        -------
        dict
            Comprehensive spectral data dictionary
        """
        # Dispatch on the backend that actually opened the file.
        if self._uses_alpharaw():
            return self._load_sample_alpharaw(sample_id, **kwargs)
        return self._load_sample_standard(sample_id, **kwargs)

    def _load_sample_alpharaw(self, sample_id: int, **kwargs):
        """Load sample using alpharaw reader.

        The ``centroid``, ``centroid_ppm`` and ``keep_k_peaks`` keyword
        arguments are only recorded in the returned metadata here; the arrays
        are taken as-is from the alpharaw reader's ``spectrum_df`` and
        ``peak_df``.

        Raises
        ------
        RuntimeError
            If the data cannot be extracted from the alpharaw reader.
        """
        # Enhanced parameters for WIFF2
        enhanced_params = {
            'centroid': kwargs.get('centroid', True),
            'centroid_ppm': kwargs.get('centroid_ppm', 15.0),
            'keep_k_peaks': kwargs.get('keep_k_peaks', 3000),
        }

        try:
            # Use alpharaw's data extraction
            spectrum_df = self._alpharaw_reader.spectrum_df
            peak_df = self._alpharaw_reader.peak_df

            if sample_id < len(self.sample_names):
                sample_name = str(self.sample_names[sample_id])
            else:
                sample_name = f'Sample_{sample_id}'

            if len(spectrum_df) > 0:
                rt_range = [float(spectrum_df['rt'].min()), float(spectrum_df['rt'].max())]
            else:
                rt_range = [0, 0]

            # Convert to the expected format. Optional columns fall back to
            # the same sentinel values used by the standard loader.
            # NOTE(review): 'peak_indices' here is the row-wise flattening of
            # (start, stop) pairs, while _load_sample_standard returns
            # cumulative offsets of length n_spectra + 1 -- confirm which
            # layout callers expect.
            return {
                'peak_indices': spectrum_df[['peak_start_idx', 'peak_stop_idx']].values.flatten(),
                'peak_mz': peak_df['mz'].values,
                'peak_intensity': peak_df['intensity'].values,
                'rt': spectrum_df['rt'].values,
                'ms_level': spectrum_df['ms_level'].values,
                'precursor_mz': self._column_or_default(spectrum_df, 'precursor_mz', -1.0),
                'precursor_charge': self._column_or_default(spectrum_df, 'precursor_charge', 0),
                'isolation_lower_mz': self._column_or_default(spectrum_df, 'isolation_lower_mz', -1.0),
                'isolation_upper_mz': self._column_or_default(spectrum_df, 'isolation_upper_mz', -1.0),
                'nce': self._column_or_default(spectrum_df, 'nce', 0.0),
                'metadata': {
                    'format': 'WIFF2',
                    'sample_id': sample_id,
                    'sample_name': sample_name,
                    'loading_params': enhanced_params,
                    'total_spectra': len(spectrum_df),
                    'total_peaks': len(peak_df),
                    'ms1_count': np.sum(spectrum_df['ms_level'] == 1),
                    'ms2_count': np.sum(spectrum_df['ms_level'] > 1),
                    'rt_range': rt_range,
                    'reader_method': 'alpharaw',
                },
            }

        except Exception as e:
            raise RuntimeError(f"Failed to load WIFF2 sample via alpharaw: {e}") from e

    def _load_sample_standard(self, sample_id: int, **kwargs):
        """Load sample using standard WIFF reader with WIFF2 enhancements.

        Recognised keyword arguments: ``centroid`` (default True),
        ``centroid_ppm`` (15.0), ``ignore_empty_scans`` (True) and
        ``keep_k_peaks`` (3000).

        Raises
        ------
        ValueError
            If ``sample_id`` is outside ``0 .. sample_count - 1``.
        """
        # Use enhanced parameters optimized for WIFF2
        enhanced_params = {
            'centroid': kwargs.get('centroid', True),
            'centroid_ppm': kwargs.get('centroid_ppm', 15.0),  # Tighter for WIFF2
            'ignore_empty_scans': kwargs.get('ignore_empty_scans', True),
            'keep_k_peaks': kwargs.get('keep_k_peaks', 3000),  # More peaks for WIFF2
        }

        if sample_id < 0 or sample_id >= self.sample_count:
            raise ValueError(f"Sample ID {sample_id} out of range (0-{self.sample_count-1})")

        # Use the same loading approach as SciexWiffFileReader but with enhancements
        sample = self._wiff_file.GetSample(sample_id)
        ms_sample = sample.MassSpectrometerSample

        # Per-spectrum accumulators; all lists stay parallel to rt_list.
        _peak_indices: list[int] = []
        peak_mz_list: list[np.ndarray] = []
        peak_intensity_list: list[np.ndarray] = []
        rt_list: list[float] = []
        ms_level_list: list[int] = []
        precursor_mz_list: list[float] = []
        precursor_charge_list: list[int] = []
        nce_list: list[float] = []
        isolation_lower_list: list[float] = []
        isolation_upper_list: list[float] = []

        exp_list = [ms_sample.GetMSExperiment(i) for i in range(ms_sample.ExperimentCount)]

        # The scan count is taken from the first experiment. A sample with no
        # experiments yields an empty result instead of an IndexError.
        num_scans = exp_list[0].Details.NumberOfScans if exp_list else 0
        keep_k = enhanced_params['keep_k_peaks']

        for j in range(num_scans):
            for exp in exp_list:
                mass_spectrum = exp.GetMassSpectrum(j)
                mass_spectrum_info = exp.GetMassSpectrumInfo(j)
                details = exp.Details
                ms_level = mass_spectrum_info.MSLevel

                # Skip empty non-SWATH fragment scans when requested.
                if (ms_level > 1 and not details.IsSwath and
                        mass_spectrum.NumDataPoints <= 0 and enhanced_params['ignore_empty_scans']):
                    continue

                mz_array = dot_net_array_to_np_array(mass_spectrum.GetActualXValues())
                int_array = dot_net_array_to_np_array(mass_spectrum.GetActualYValues()).astype(np.float32)

                if enhanced_params['centroid']:
                    mz_array, int_array = naive_centroid(
                        mz_array, int_array,
                        centroiding_ppm=enhanced_params['centroid_ppm'],
                    )

                if len(mz_array) > keep_k:
                    # Keep the keep_k most intense peaks, restored to their
                    # original order. Slicing from ``len - keep_k`` (rather
                    # than ``-keep_k``) keeps nothing when keep_k == 0; a
                    # ``[-0:]`` slice would keep every peak.
                    top_indices = np.sort(np.argsort(int_array)[len(int_array) - keep_k:])
                    mz_array = mz_array[top_indices]
                    int_array = int_array[top_indices]

                peak_mz_list.append(mz_array)
                peak_intensity_list.append(int_array)
                _peak_indices.append(len(mz_array))

                rt_list.append(exp.GetRTFromExperimentCycle(j))
                ms_level_list.append(ms_level)

                if ms_level > 1:
                    # Enhanced precursor handling for WIFF2
                    center_mz = -1.0
                    isolation_window = 0.0

                    if details.IsSwath and details.MassRangeInfo.Length > 0:
                        try:
                            from WiffOps4Python import WiffOps as DotNetWiffOps
                            center_mz = DotNetWiffOps.get_center_mz(details)
                            isolation_window = DotNetWiffOps.get_isolation_window(details)
                        except Exception:
                            # Fall back to the precursor reported for the scan.
                            center_mz = mass_spectrum_info.ParentMZ
                            isolation_window = 3.0

                    # Defaults when no usable SWATH window was obtained.
                    if isolation_window <= 0:
                        isolation_window = 3.0
                    if center_mz <= 0:
                        center_mz = mass_spectrum_info.ParentMZ

                    precursor_mz_list.append(center_mz)
                    precursor_charge_list.append(mass_spectrum_info.ParentChargeState)
                    nce_list.append(float(mass_spectrum_info.CollisionEnergy))
                    isolation_lower_list.append(center_mz - isolation_window / 2)
                    isolation_upper_list.append(center_mz + isolation_window / 2)
                else:
                    # MS1 scans carry sentinel precursor values.
                    precursor_mz_list.append(-1.0)
                    precursor_charge_list.append(0)
                    nce_list.append(0.0)
                    isolation_lower_list.append(-1.0)
                    isolation_upper_list.append(-1.0)

        # Finalize arrays: peak_indices[k]:peak_indices[k + 1] delimits the
        # peaks of spectrum k in the concatenated peak arrays.
        peak_indices = np.empty(len(rt_list) + 1, np.int64)
        peak_indices[0] = 0
        peak_indices[1:] = np.cumsum(_peak_indices)

        return {
            'peak_indices': peak_indices,
            'peak_mz': np.concatenate(peak_mz_list) if peak_mz_list else np.array([]),
            'peak_intensity': np.concatenate(peak_intensity_list) if peak_intensity_list else np.array([]),
            'rt': np.array(rt_list, dtype=np.float64),
            'ms_level': np.array(ms_level_list, dtype=np.int8),
            'precursor_mz': np.array(precursor_mz_list, dtype=np.float64),
            'precursor_charge': np.array(precursor_charge_list, dtype=np.int8),
            'isolation_lower_mz': np.array(isolation_lower_list, dtype=np.float64),
            'isolation_upper_mz': np.array(isolation_upper_list, dtype=np.float64),
            'nce': np.array(nce_list, dtype=np.float32),
            'metadata': {
                'format': 'WIFF2',
                'sample_id': sample_id,
                'sample_name': str(self.sample_names[sample_id]),
                'loading_params': enhanced_params,
                'total_spectra': len(rt_list),
                'total_peaks': sum(_peak_indices),
                'ms1_count': np.sum(np.array(ms_level_list) == 1),
                'ms2_count': np.sum(np.array(ms_level_list) > 1),
                'rt_range': [float(np.min(rt_list)), float(np.max(rt_list))] if rt_list else [0, 0],
                'creation_time': str(sample.Details.AcquisitionDateTime.ToString("O")) if hasattr(sample, 'Details') else '',
                'reader_method': 'standard_enhanced',
            },
        }

    def close(self):
        """Close the WIFF2 file and clean up resources.

        Cleanup is best effort: errors raised while closing either backend
        are ignored.
        """
        if hasattr(self, '_wiffDataProvider'):
            try:
                self._wiffDataProvider.Close()
            except Exception:
                pass

        if hasattr(self, '_alpharaw_reader'):
            try:
                self._alpharaw_reader.close()
            except Exception:
                pass

    def __enter__(self):
        """Return the reader itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the reader when the ``with`` block exits."""
        self.close()

    def __repr__(self):
        return f"SciexWiff2FileReader(file='{self.filename}', samples={self.sample_count}, method={self.initialization_method})"
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
class SciexWiffFileReader:
    """
    Direct implementation of Sciex WIFF file reader using the DLLs without alpharaw dependency.

    The reader keeps a data provider open until :meth:`close` is called.
    It can be used as a context manager, in which case it is closed when
    the ``with`` block ends.
    """

    def __init__(self, filename: str):
        """
        Open a WIFF file and read its sample names.

        Parameters
        ----------
        filename : str
            Path to the WIFF file

        Raises
        ------
        ValueError
            If the dotnet-based dependencies are not available
        """
        if not HAS_DOTNET:
            raise ValueError(
                "Dotnet-based dependencies are required for reading Sciex files. "
                "Do you have pythonnet and/or mono installed? "
                "See the alpharaw documentation for details.",
            )

        self._wiffDataProvider = AnalystWiffDataProvider()
        self._wiff_file = AnalystDataProviderFactory.CreateBatch(
            filename,
            self._wiffDataProvider,
        )
        self.sample_names = self._wiff_file.GetSampleNames()

    def close(self):
        """Close the file and clean up resources."""
        self._wiffDataProvider.Close()

    def __enter__(self):
        """Return the reader itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the reader; exceptions from the ``with`` block are not suppressed."""
        self.close()

    def load_sample(
        self,
        sample_id: int,
        centroid: bool = True,
        centroid_ppm: float = 20.0,
        ignore_empty_scans: bool = True,
        keep_k_peaks: int = 2000,
    ) -> dict[str, Any]:
        """
        Load a sample from the WIFF file and extract spectral data.

        Parameters
        ----------
        sample_id : int
            ID of the sample to load
        centroid : bool
            Whether to centroid the data
        centroid_ppm : float
            PPM tolerance for centroiding
        ignore_empty_scans : bool
            Whether to skip empty scans
        keep_k_peaks : int
            Maximum number of peaks to keep per spectrum

        Returns
        -------
        dict
            Dictionary containing spectral data with the keys
            ``peak_indices``, ``peak_mz``, ``peak_intensity``, ``rt``,
            ``ms_level``, ``precursor_mz``, ``precursor_charge``,
            ``isolation_lower_mz``, ``isolation_upper_mz`` and ``nce``.
            ``peak_indices`` has one more entry than there are spectra;
            consecutive entries delimit the peaks of one spectrum in
            ``peak_mz`` / ``peak_intensity``. When no spectrum is kept,
            ``peak_indices`` is ``[0]`` and every other array is empty.

        Raises
        ------
        ValueError
            If ``sample_id`` is outside the range of available samples
        """
        if sample_id < 0 or sample_id >= len(self.sample_names):
            raise ValueError("Incorrect sample number.")

        self.wiffSample = self._wiff_file.GetSample(sample_id)
        self.msSample = self.wiffSample.MassSpectrometerSample

        peak_counts: list[int] = []
        peak_mz_array_list: list[np.ndarray] = []
        peak_intensity_array_list: list[np.ndarray] = []
        rt_list: list[float] = []
        ms_level_list: list[int] = []
        precursor_mz_list: list[float] = []
        precursor_charge_list: list[int] = []
        ce_list: list[float] = []
        isolation_lower_mz_list: list[float] = []
        isolation_upper_mz_list: list[float] = []

        exp_list = [self.msSample.GetMSExperiment(i) for i in range(self.msSample.ExperimentCount)]

        # The cycle count is taken from the first experiment; a sample
        # without any experiment simply yields no spectra.
        cycle_count = exp_list[0].Details.NumberOfScans if exp_list else 0

        for j in range(cycle_count):
            for exp in exp_list:
                mass_spectrum = exp.GetMassSpectrum(j)
                mass_spectrum_info = exp.GetMassSpectrumInfo(j)
                details = exp.Details
                ms_level = mass_spectrum_info.MSLevel

                # Only empty non-SWATH MSn scans are dropped; MS1 and SWATH
                # scans are kept even when they contain no data points.
                if ms_level > 1 and not details.IsSwath and mass_spectrum.NumDataPoints <= 0 and ignore_empty_scans:
                    continue

                mz_array = dot_net_array_to_np_array(mass_spectrum.GetActualXValues())
                int_array = dot_net_array_to_np_array(
                    mass_spectrum.GetActualYValues(),
                ).astype(np.float32)

                if centroid:
                    (mz_array, int_array) = naive_centroid(
                        mz_array,
                        int_array,
                        centroiding_ppm=centroid_ppm,
                    )

                if len(mz_array) > keep_k_peaks:
                    # Keep the k most intense peaks, then restore the
                    # original (index) order of the survivors.
                    idxes = np.argsort(int_array)[-keep_k_peaks:]
                    idxes = np.sort(idxes)
                    mz_array = mz_array[idxes]
                    int_array = int_array[idxes]

                peak_mz_array_list.append(mz_array)
                peak_intensity_array_list.append(int_array)

                peak_counts.append(len(mz_array))
                rt_list.append(exp.GetRTFromExperimentCycle(j))

                ms_level_list.append(ms_level)

                center_mz = -1.0
                isolation_window = 0.0

                if ms_level > 1:
                    if details.IsSwath and details.MassRangeInfo.Length > 0:
                        center_mz = DotNetWiffOps.get_center_mz(details)
                        isolation_window = DotNetWiffOps.get_isolation_window(details)
                    # Fall back to a 3.0 wide window and the reported parent
                    # m/z when no usable SWATH information was found.
                    if isolation_window <= 0:
                        isolation_window = 3.0
                    if center_mz <= 0:
                        center_mz = mass_spectrum_info.ParentMZ
                    precursor_mz_list.append(center_mz)
                    precursor_charge_list.append(mass_spectrum_info.ParentChargeState)
                    ce_list.append(float(mass_spectrum_info.CollisionEnergy))
                    isolation_lower_mz_list.append(center_mz - isolation_window / 2)
                    isolation_upper_mz_list.append(center_mz + isolation_window / 2)
                else:
                    # MS1 spectra carry placeholder precursor information.
                    precursor_mz_list.append(-1.0)
                    precursor_charge_list.append(0)
                    ce_list.append(0.0)
                    isolation_lower_mz_list.append(-1.0)
                    isolation_upper_mz_list.append(-1.0)

        peak_indices = np.empty(len(rt_list) + 1, np.int64)
        peak_indices[0] = 0
        peak_indices[1:] = np.cumsum(peak_counts, dtype=np.int64)

        # np.concatenate refuses an empty sequence, so build the empty
        # arrays explicitly when no spectrum was collected.
        if peak_mz_array_list:
            peak_mz = np.concatenate(peak_mz_array_list)
            peak_intensity = np.concatenate(peak_intensity_array_list)
        else:
            peak_mz = np.empty(0, dtype=np.float64)
            peak_intensity = np.empty(0, dtype=np.float32)

        return {
            "peak_indices": peak_indices,
            "peak_mz": peak_mz,
            "peak_intensity": peak_intensity,
            "rt": np.array(rt_list, dtype=np.float64),
            "ms_level": np.array(ms_level_list, dtype=np.int8),
            "precursor_mz": np.array(precursor_mz_list, dtype=np.float64),
            "precursor_charge": np.array(precursor_charge_list, dtype=np.int8),
            "isolation_lower_mz": np.array(isolation_lower_mz_list),
            "isolation_upper_mz": np.array(isolation_upper_mz_list),
            "nce": np.array(ce_list, dtype=np.float32),
        }
|
|
765
|
+
|
|
766
|
+
|
|
767
|
+
class SciexWiffData:
    """
    Standalone Sciex WIFF data reader class that mimics alpharaw.sciex.SciexWiffData
    functionality but uses DLLs directly without importing from alpharaw.

    After :meth:`import_raw`, ``spectrum_df`` holds one row per spectrum and
    ``peak_df`` holds all peaks; the ``peak_start_idx`` / ``peak_stop_idx``
    columns of ``spectrum_df`` delimit each spectrum's rows in ``peak_df``.
    """

    # Column data types mapping
    column_dtypes: ClassVar[dict[str, Any]] = {
        "rt": np.float64,
        "ms_level": np.int8,
        "precursor_mz": np.float64,
        "isolation_lower_mz": np.float64,
        "isolation_upper_mz": np.float64,
        "precursor_charge": np.int8,
        "nce": np.float32,
        "injection_time": np.float32,
        "activation": "U",
    }

    def __init__(self, centroided: bool = True, save_as_hdf: bool = False, **kwargs):
        """
        Parameters
        ----------
        centroided : bool, optional
            If peaks will be centroided after loading, by default True.
            Requesting centroiding currently emits a warning and is
            switched off again.
        save_as_hdf : bool, optional
            Automatically save hdf after load raw data, by default False.
        **kwargs
            Accepted for call compatibility; not read by this constructor.
        """
        self.spectrum_df: pd.DataFrame = pd.DataFrame()
        self.peak_df: pd.DataFrame = pd.DataFrame()
        self._raw_file_path = ""
        self.centroided = centroided
        self._save_as_hdf = save_as_hdf
        self.creation_time = ""
        self.file_type = "sciex"
        self.instrument = "sciex"

        if self.centroided:
            self.centroided = False
            warnings.warn(
                "Centroiding for Sciex data is not well implemented yet",
                stacklevel=2,
            )

        # Loading options forwarded to SciexWiffFileReader.load_sample.
        self.centroid_ppm = 20.0
        self.ignore_empty_scans = True
        self.keep_k_peaks_per_spec = 2000
        self.sample_id = 0

    @property
    def raw_file_path(self) -> str:
        """Get the raw file path."""
        return self._raw_file_path

    @raw_file_path.setter
    def raw_file_path(self, value: str):
        """Set the raw file path."""
        self._raw_file_path = value

    def import_raw(self, wiff_file_path: str) -> None:
        """
        Import raw data from a WIFF file.

        Parameters
        ----------
        wiff_file_path : str
            Path to the WIFF file
        """
        self.raw_file_path = wiff_file_path
        data_dict = self._import(wiff_file_path)
        self._set_dataframes(data_dict)

    def _import(self, _wiff_file_path: str) -> dict[str, Any]:
        """
        Implementation of data import interface.

        The reader is always closed, including when loading the sample or
        reading the acquisition time fails.

        Parameters
        ----------
        _wiff_file_path : str
            Absolute or relative path of the sciex wiff file.

        Returns
        -------
        dict
            Spectrum information dict.
        """
        wiff_reader = SciexWiffFileReader(_wiff_file_path)
        try:
            data_dict = wiff_reader.load_sample(
                self.sample_id,
                centroid=self.centroided,
                centroid_ppm=self.centroid_ppm,
                ignore_empty_scans=self.ignore_empty_scans,
                keep_k_peaks=self.keep_k_peaks_per_spec,
            )
            self.creation_time = wiff_reader.wiffSample.Details.AcquisitionDateTime.ToString("O")
        finally:
            # Release the underlying file handle even on failure.
            wiff_reader.close()
        return data_dict

    def _set_dataframes(self, raw_data: dict[str, Any]) -> None:
        """
        Set the spectrum and peak dataframes from raw data dictionary.

        Parameters
        ----------
        raw_data : dict
            Dictionary containing the raw spectral data
        """
        self.create_spectrum_df(len(raw_data["rt"]))
        self.set_peak_df_by_indexed_array(
            raw_data["peak_mz"],
            raw_data["peak_intensity"],
            raw_data["peak_indices"][:-1],
            raw_data["peak_indices"][1:],
        )

        # Copy every known per-spectrum column, cast to its declared dtype;
        # keys without an entry in column_dtypes are ignored here.
        for col, val in raw_data.items():
            if col in self.column_dtypes:
                if self.column_dtypes[col] == "O":
                    self.spectrum_df[col] = list(val)
                else:
                    self.spectrum_df[col] = np.array(val, dtype=self.column_dtypes[col])

    def create_spectrum_df(self, spectrum_num: int) -> None:
        """
        Create an empty spectrum dataframe from the number of spectra.

        Parameters
        ----------
        spectrum_num : int
            The number of spectra.
        """
        self.spectrum_df = pd.DataFrame(index=np.arange(spectrum_num, dtype=np.int64))
        self.spectrum_df["spec_idx"] = self.spectrum_df.index.values

    def set_peak_df_by_indexed_array(
        self,
        mz_array: np.ndarray,
        intensity_array: np.ndarray,
        peak_start_indices: np.ndarray,
        peak_stop_indices: np.ndarray,
    ) -> None:
        """
        Set peak dataframe using indexed arrays.

        Parameters
        ----------
        mz_array : np.ndarray
            Array of m/z values
        intensity_array : np.ndarray
            Array of intensity values
        peak_start_indices : np.ndarray
            Array of start indices for each spectrum
        peak_stop_indices : np.ndarray
            Array of stop indices for each spectrum
        """
        self.peak_df = pd.DataFrame()
        self.peak_df["mz"] = mz_array.astype(np.float64)
        self.peak_df["intensity"] = intensity_array.astype(np.float32)

        # Set peak start and stop indices in spectrum df
        self.spectrum_df["peak_start_idx"] = peak_start_indices
        self.spectrum_df["peak_stop_idx"] = peak_stop_indices

    def get_peaks(self, spec_idx: int) -> tuple[np.ndarray, np.ndarray]:
        """
        Get peaks for a specific spectrum.

        Parameters
        ----------
        spec_idx : int
            Spectrum index

        Returns
        -------
        tuple
            (mz_array, intensity_array)
        """
        # Read the two bounds directly instead of materialising a
        # two-column array of all spectra on every call.
        start = self.spectrum_df["peak_start_idx"].values[spec_idx]
        end = self.spectrum_df["peak_stop_idx"].values[spec_idx]
        return (
            self.peak_df["mz"].values[start:end],
            self.peak_df["intensity"].values[start:end],
        )

    def save_hdf(self, hdf_file_path: str) -> None:
        """
        Save data to HDF5 file (placeholder implementation).

        The data is currently written as a pickle, not as HDF5. The target
        path is ``hdf_file_path`` with every ``".hdf"`` occurrence replaced
        by ``".pkl"``.

        Parameters
        ----------
        hdf_file_path : str
            Path to save the HDF5 file
        """
        # This would require implementing HDF5 saving functionality
        # For now, just save as pickle or implement as needed
        import pickle

        with open(hdf_file_path.replace(".hdf", ".pkl"), "wb") as f:
            pickle.dump(
                {
                    "spectrum_df": self.spectrum_df,
                    "peak_df": self.peak_df,
                    "creation_time": self.creation_time,
                    "raw_file_path": self.raw_file_path,
                    "file_type": self.file_type,
                    "centroided": self.centroided,
                    "instrument": self.instrument,
                },
                f,
            )

    def __repr__(self) -> str:
        """Describe the object by its raw file path and number of spectra."""
        return f"SciexWiffData(file_path='{self.raw_file_path}', spectra={len(self.spectrum_df)})"
|
|
982
|
+
|
|
983
|
+
|
|
984
|
+
# Convenience functions to maintain compatibility with existing code
|
|
985
|
+
def load_wiff_file(filename: str, **kwargs) -> SciexWiffData:
    """
    Build a SciexWiffData object and import a WIFF file into it.

    Parameters
    ----------
    filename : str
        Path to the WIFF file
    **kwargs
        Passed unchanged to the SciexWiffData constructor

    Returns
    -------
    SciexWiffData
        The object after ``import_raw(filename)`` has been called on it
    """
    loaded = SciexWiffData(**kwargs)
    loaded.import_raw(filename)
    return loaded
|
|
1004
|
+
|
|
1005
|
+
|
|
1006
|
+
def load_wiff2_file(filename: str, **kwargs) -> dict[str, Any]:
    """
    Load a WIFF2 file and return spectral data.

    Note: WIFF2 format support is limited with current DLL versions.
    If you encounter format incompatibility errors, try using the regular
    WIFF file instead or convert WIFF2 to WIFF format.

    Parameters
    ----------
    filename : str
        Path to the WIFF2 file
    **kwargs
        Additional arguments for WIFF2 loading (sample_id, centroid, etc.).
        ``sample_id`` (default 0) is extracted here; the rest is passed on
        to the reader's ``load_sample``.

    Returns
    -------
    dict
        Spectral data dictionary with enhanced WIFF2 information

    Raises
    ------
    RuntimeError
        If WIFF2 format is not supported by current DLL combination. The
        message says whether a sibling ``.wiff`` file (same path with the
        ``.wiff2`` extension swapped, matched case-insensitively) exists.
    """
    sample_id = kwargs.pop('sample_id', 0)

    try:
        with SciexWiff2FileReader(filename) as reader:
            return reader.load_sample(sample_id, **kwargs)  # type: ignore[no-any-return]
    except RuntimeError as e:
        if "format is not supported" not in str(e):
            raise

        # Derive the sibling WIFF path from the extension only. A plain
        # str.replace is case-sensitive and would also rewrite directory
        # names, so "X.WIFF2" would be reported as its own WIFF sibling.
        wiff2_suffix = ".wiff2"
        if filename.lower().endswith(wiff2_suffix):
            wiff_file = filename[: -len(wiff2_suffix)] + ".wiff"
        else:
            wiff_file = None

        if wiff_file is not None and os.path.exists(wiff_file):
            # Suggest using regular WIFF file if available
            raise RuntimeError(
                f"WIFF2 format not supported. However, a regular WIFF file was found: "
                f"'{wiff_file}'. Try using load_wiff_file('{wiff_file}') instead."
            ) from e
        raise RuntimeError(
            f"WIFF2 format not supported and no corresponding WIFF file found. "
            f"Original error: {e}"
        ) from e
|
|
1052
|
+
|
|
1053
|
+
|
|
1054
|
+
def load_wiff_file_smart(filename: str, **kwargs) -> dict[str, Any] | SciexWiffData:
    """
    Smart WIFF file loader that automatically handles WIFF and WIFF2 formats.

    Files whose name ends in ``.wiff2`` (case-insensitive) are loaded with
    ``load_wiff2_file``. If that fails because the WIFF2 format is not
    supported and a sibling ``.wiff`` file exists, a warning is emitted and
    the sibling is loaded with ``load_wiff_file`` instead. Every other file
    is loaded with ``load_wiff_file`` directly.

    Parameters
    ----------
    filename : str
        Path to the WIFF or WIFF2 file
    **kwargs
        Additional arguments for loading (sample_id, centroid, etc.)

    Returns
    -------
    dict or SciexWiffData
        Spectral data dictionary when the WIFF2 reader succeeds, otherwise
        the object returned by ``load_wiff_file``

    Raises
    ------
    RuntimeError
        Re-raised from ``load_wiff2_file`` when no fallback is possible
    """
    wiff2_suffix = ".wiff2"
    if not filename.lower().endswith(wiff2_suffix):
        return load_wiff_file(filename, **kwargs)

    try:
        return load_wiff2_file(filename, **kwargs)
    except RuntimeError as e:
        # load_wiff2_file re-raises with the wording "format not supported"
        # and chains the original error, so inspect both messages and both
        # wordings; checking only str(e) for "format is not supported"
        # misses exactly the case where a sibling WIFF file exists.
        messages = [str(e)]
        if e.__cause__ is not None:
            messages.append(str(e.__cause__))
        unsupported = any(
            "format is not supported" in text or "format not supported" in text
            for text in messages
        )

        if unsupported:
            # Check if regular WIFF file exists (swap the extension only,
            # independent of its letter case).
            wiff_file = filename[: -len(wiff2_suffix)] + ".wiff"
            if os.path.exists(wiff_file):
                warnings.warn(
                    f"WIFF2 format not supported, falling back to WIFF file: {wiff_file}",
                    stacklevel=2
                )
                return load_wiff_file(wiff_file, **kwargs)
        raise
|
|
1089
|
+
|
|
1090
|
+
|
|
1091
|
+
def get_sample_names(filename: str) -> list:
    """
    Read the sample names stored in a WIFF file.

    The reader is closed again before returning, also when reading the
    names fails.

    Parameters
    ----------
    filename : str
        Path to the WIFF file

    Returns
    -------
    list
        List of sample names
    """
    wiff_reader = SciexWiffFileReader(filename)
    try:
        names = list(wiff_reader.sample_names)
    finally:
        wiff_reader.close()
    return names
|
|
1110
|
+
|
|
1111
|
+
|
|
1112
|
+
def get_wiff2_sample_names(filename: str) -> list:
    """
    Read the sample names stored in a WIFF2 file.

    Parameters
    ----------
    filename : str
        Path to the WIFF2 file

    Returns
    -------
    list
        List of sample names
    """
    with SciexWiff2FileReader(filename) as wiff2_reader:
        names = list(wiff2_reader.sample_names)
    return names
|
|
1128
|
+
|
|
1129
|
+
|
|
1130
|
+
def get_wiff2_metadata(filename: str) -> dict[str, Any]:
    """
    Open a WIFF2 file and return the reader's file metadata.

    Parameters
    ----------
    filename : str
        Path to the WIFF2 file

    Returns
    -------
    dict
        Comprehensive WIFF2 file metadata, as produced by the reader's
        ``get_file_metadata``
    """
    with SciexWiff2FileReader(filename) as wiff2_reader:
        metadata = wiff2_reader.get_file_metadata()
    return metadata  # type: ignore[no-any-return]
|
|
1146
|
+
|
|
1147
|
+
|
|
1148
|
+
# Example usage and testing
|
|
1149
|
+
if __name__ == "__main__":
    # Manual smoke test: print a usage example, instantiate the data class
    # and, when the bundled example file can be found, load it.
    print("Standalone Sciex WIFF reader implementation")
    print("Usage example:")
    print("""
from sciex import SciexWiffData, load_wiff_file

# Create reader instance
wiff_data = SciexWiffData(centroided=False)
wiff_data.import_raw("path/to/file.wiff")

# Or use convenience function
wiff_data = load_wiff_file("path/to/file.wiff")

# Access spectrum and peak data
print(f"Number of spectra: {len(wiff_data.spectrum_df)}")
print(f"Number of peaks: {len(wiff_data.peak_df)}")

# Get peaks for first spectrum
mz, intensity = wiff_data.get_peaks(0)
""")

    # Test that the module can be imported and classes instantiated
    try:
        test_data = SciexWiffData()
        print(f"✓ SciexWiffData class instantiated successfully: {test_data}")
        print(f"✓ Has dotnet support: {HAS_DOTNET}")

        # Test with example WIFF file if available
        example_name = "2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff"
        module_dir = os.path.dirname(__file__)
        # Look next to this module first (original location), then one
        # directory up.
        # NOTE(review): the second location assumes the example data lives
        # in a package-level "data/examples" directory - confirm the layout.
        candidates = [
            os.path.join(module_dir, "data", "examples", example_name),
            os.path.join(os.path.dirname(os.path.abspath(module_dir)), "data", "examples", example_name),
        ]
        example_file = next(
            (path for path in candidates if os.path.exists(path)),
            candidates[0],
        )

        if os.path.exists(example_file):
            print(f"\n✓ Found example WIFF file: {example_file}")
            print("Testing WIFF file loading...")

            # Test loading the example file
            wiff_data = load_wiff_file(example_file)
            print("✓ Successfully loaded WIFF file")
            print(f" - Number of spectra: {len(wiff_data.spectrum_df)}")
            print(f" - Number of peaks: {len(wiff_data.peak_df)}")
            print(f" - Creation time: {wiff_data.creation_time}")
            print(f" - File type: {wiff_data.file_type}")
            print(f" - Instrument: {wiff_data.instrument}")

            # Test getting peaks from first spectrum
            if len(wiff_data.spectrum_df) > 0:
                mz, intensity = wiff_data.get_peaks(0)
                print(f" - First spectrum has {len(mz)} peaks")
                if len(mz) > 0:
                    print(f" - m/z range: {mz.min():.2f} - {mz.max():.2f}")
                    print(
                        f" - Intensity range: {intensity.min():.0f} - {intensity.max():.0f}",
                    )
        else:
            print(f"\n⚠ Example WIFF file not found at: {example_file}")

    except Exception as e:
        # Top-level boundary of the smoke test: report and show the trace.
        print(f"✗ Error during testing: {e}")
        import traceback

        traceback.print_exc()
|