masster 0.5.17-py3-none-any.whl → 0.5.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -0,0 +1,801 @@
1
+ """
2
+ Standalone Thermo RAW file reader module.
3
+
4
+ This module provides a standalone implementation for reading Thermo Fisher RAW files
5
+ using the Thermo Fisher .NET libraries directly. It offers functionality to extract
6
+ spectral data, retention times, MS levels, polarity information, and precursor details
7
+ from RAW files.
8
+
9
+ Key Features:
10
+ - Direct RAW file reading using Thermo Fisher DLLs
11
+ - Support for MS1 and MSn data extraction
12
+ - Optional naive peak centroiding
13
+ - Polarity detection from scan events
14
+ - Precursor information extraction for MS/MS spectra
15
+ - Context manager support for proper resource cleanup
16
+
17
+ Requirements:
18
+ - pythonnet (pip install pythonnet)
19
+ - Thermo Fisher DLLs available in alpharaw's ext/thermo_fisher directory
20
+ - On Linux/macOS: mono runtime must be installed
21
+
22
+ Classes:
23
+ ThermoRawFileReader: Low-level RAW file reader using .NET libraries
24
+ ThermoRawData: High-level interface providing pandas DataFrames
25
+
26
+ Example:
27
+ >>> from thermo import load_raw_file
28
+ >>> raw_data = load_raw_file("sample.raw")
29
+ >>> print(f"Found {len(raw_data.spectrum_df)} spectra")
30
+ >>> mz, intensity = raw_data.get_peaks(0) # Get first spectrum peaks
31
+
32
+ Note:
33
+ The .NET imports (System, ThermoFisher) will only work when pythonnet
34
+ is properly installed and configured. Without these dependencies, the
35
+ module will still import but Thermo RAW file reading will be disabled.
36
+ """
37
+
38
+ # Standard library imports
39
+ import ctypes
40
+ import os
41
+ import site
42
+ import warnings
43
+ from typing import Any, ClassVar
44
+
45
+ # Third-party imports
46
+ import numpy as np
47
+ import pandas as pd
48
+
49
+
50
+ def naive_centroid(
51
+ peak_mzs: np.ndarray,
52
+ peak_intensities: np.ndarray,
53
+ centroiding_ppm: float = 20.0,
54
+ ) -> tuple[np.ndarray, np.ndarray]:
55
+ """
56
+ Simplified naive centroiding implementation.
57
+
58
+ Combines nearby peaks within a PPM tolerance using intensity-weighted averaging.
59
+
60
+ Parameters
61
+ ----------
62
+ peak_mzs : np.ndarray
63
+ Array of m/z values (must be sorted)
64
+ peak_intensities : np.ndarray
65
+ Array of intensity values corresponding to peak_mzs
66
+ centroiding_ppm : float, default 20.0
67
+ PPM tolerance for combining peaks
68
+
69
+ Returns
70
+ -------
71
+ tuple[np.ndarray, np.ndarray]
72
+ Centroided m/z and intensity arrays
73
+
74
+ Notes
75
+ -----
76
+ This is a simple implementation that assumes input peaks are sorted by m/z.
77
+ For production use, consider more sophisticated centroiding algorithms.
78
+ """
79
+ if len(peak_mzs) == 0:
80
+ return np.array([]), np.array([])
81
+
82
+ if len(peak_mzs) != len(peak_intensities):
83
+ raise ValueError("peak_mzs and peak_intensities must have the same length")
84
+
85
+ centroided_mzs = []
86
+ centroided_intensities = []
87
+
88
+ i = 0
89
+ while i < len(peak_mzs):
90
+ current_mz = peak_mzs[i]
91
+ current_intensity = peak_intensities[i]
92
+
93
+ # Calculate tolerance for current m/z
94
+ tolerance = current_mz * centroiding_ppm * 1e-6
95
+
96
+ # Find all peaks within tolerance
97
+ total_intensity = current_intensity
98
+ weighted_mz_sum = current_mz * current_intensity
99
+ j = i + 1
100
+
101
+ while j < len(peak_mzs) and abs(peak_mzs[j] - current_mz) <= tolerance:
102
+ total_intensity += peak_intensities[j]
103
+ weighted_mz_sum += peak_mzs[j] * peak_intensities[j]
104
+ j += 1
105
+
106
+ # Calculate intensity-weighted centroided m/z
107
+ if total_intensity > 0:
108
+ centroided_mz = weighted_mz_sum / total_intensity
109
+ centroided_mzs.append(centroided_mz)
110
+ centroided_intensities.append(total_intensity)
111
+
112
+ i = j
113
+
114
+ return np.array(centroided_mzs), np.array(centroided_intensities)
115
+
116
+
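To make the PPM window arithmetic concrete, here is a small worked example (editor-added sketch, assuming this module is importable as thermo, as in the docstring example): at m/z 500 a 20 ppm tolerance is 500 * 20e-6 = 0.01, so two points 0.005 apart are merged into one intensity-weighted centroid.

from thermo import naive_centroid
import numpy as np

# Two points within 20 ppm of m/z 500, plus one isolated point at m/z 600.
mzs = np.array([500.000, 500.005, 600.000])
ints = np.array([100.0, 300.0, 50.0])

mz_c, int_c = naive_centroid(mzs, ints, centroiding_ppm=20.0)
# The first two points collapse to (500.000*100 + 500.005*300) / 400 = 500.00375
# with summed intensity 400; the point at m/z 600 is unchanged.
print(mz_c)   # [500.00375 600.     ]
print(int_c)  # [400.  50.]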
117
+ # CLR utilities implementation
118
+ try:
119
+ # requires pythonnet: pip install pythonnet (plus the mono runtime on Linux/macOS)
120
+ import clr
121
+
122
+ clr.AddReference("System")
123
+
124
+ import System # noqa: F401
125
+ from System.Globalization import CultureInfo
126
+ from System.Runtime.InteropServices import GCHandle, GCHandleType
127
+ from System.Threading import Thread
128
+
129
+ de_fr = CultureInfo("fr-FR")
130
+ other = CultureInfo("en-US")
131
+
132
+ Thread.CurrentThread.CurrentCulture = other
133
+ Thread.CurrentThread.CurrentUICulture = other
134
+
135
+ # Find the alpharaw ext/thermo_fisher directory in site-packages
136
+ ext_dir = None
137
+ for site_dir in site.getsitepackages():
138
+ potential_ext_dir = os.path.join(site_dir, "alpharaw", "ext")
139
+ if os.path.exists(potential_ext_dir):
140
+ ext_dir = potential_ext_dir
141
+ break
142
+
143
+ if ext_dir is None:
144
+ # Try alternative locations
145
+ try:
146
+ import alpharaw
147
+ alpharaw_dir = os.path.dirname(alpharaw.__file__)
148
+ ext_dir = os.path.join(alpharaw_dir, "ext")
149
+ except ImportError:
150
+ pass
151
+
152
+ if not ext_dir or not os.path.exists(os.path.join(ext_dir, "thermo_fisher")):
153
+ raise ImportError("Could not find alpharaw ext/thermo_fisher directory with DLLs")
154
+
155
+ # Add Thermo Fisher DLL references
156
+ clr.AddReference(
157
+ os.path.join(ext_dir, "thermo_fisher", "ThermoFisher.CommonCore.Data.dll"),
158
+ )
159
+ clr.AddReference(
160
+ os.path.join(ext_dir, "thermo_fisher", "ThermoFisher.CommonCore.RawFileReader.dll")
161
+ )
162
+
163
+ import ThermoFisher # noqa: F401
164
+
165
+ from ThermoFisher.CommonCore.Data.Business import Device
166
+ from ThermoFisher.CommonCore.Data.Interfaces import IScanEvent, IScanEventBase # noqa: F401
167
+ from ThermoFisher.CommonCore.RawFileReader import RawFileReaderAdapter
168
+
169
+ HAS_DOTNET = True
170
+ except ImportError as e:
171
+ # Allow the rest of the code to work without .NET support
172
+ warnings.warn(
173
+ f"Thermo RAW file support is disabled. Install pythonnet and ensure Thermo Fisher DLLs "
174
+ f"are available to enable Thermo RAW file reading. Error: {e}",
175
+ UserWarning,
176
+ stacklevel=2,
177
+ )
178
+ HAS_DOTNET = False
179
+ except Exception as e:
180
+ # Catch any other .NET related errors
181
+ warnings.warn(
182
+ f"Failed to initialize .NET components for Thermo support. Error: {e}",
183
+ UserWarning,
184
+ stacklevel=2,
185
+ )
186
+ HAS_DOTNET = False
187
+
188
+
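Because the block above falls back to HAS_DOTNET = False instead of failing the import, callers can guard Thermo-specific code paths explicitly. A minimal sketch (editor-added; the module name thermo follows the docstring example and sample.raw is a placeholder path):

from thermo import HAS_DOTNET, ThermoRawData

if HAS_DOTNET:
    raw_data = ThermoRawData(centroided=False)
    raw_data.import_raw("sample.raw")
else:
    # pythonnet or the Thermo Fisher DLLs are missing; fail loudly rather than mid-analysis.
    raise RuntimeError("Thermo RAW support unavailable: install pythonnet and the Thermo DLLs")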
189
+ def dot_net_array_to_np_array(src) -> np.ndarray:
190
+ """
191
+ Convert .NET array to NumPy array with efficient memory handling.
192
+
193
+ This function performs a zero-copy conversion from .NET arrays to NumPy arrays
194
+ by directly accessing the underlying memory buffer. This is much faster than
195
+ iterating through elements.
196
+
197
+ Parameters
198
+ ----------
199
+ src : .NET array or None
200
+ Source .NET array to convert (typically double[])
201
+
202
+ Returns
203
+ -------
204
+ np.ndarray
205
+ Converted NumPy array with dtype float64. Returns empty array if src is None.
206
+
207
+ Notes
208
+ -----
209
+ Based on the approach from:
210
+ https://mail.python.org/pipermail/pythondotnet/2014-May/001527.html
211
+
212
+ The function uses GCHandle.Alloc to pin the .NET array in memory, allowing
213
+ direct access to its underlying buffer via ctypes. The buffer is then
214
+ wrapped as a NumPy array and copied to ensure memory safety.
215
+ """
216
+ if src is None:
217
+ return np.array([], dtype=np.float64)
218
+
219
+ # Pin the .NET array in memory to prevent garbage collection
220
+ src_hndl = GCHandle.Alloc(src, GCHandleType.Pinned)
221
+ try:
222
+ # Get pointer to the pinned memory
223
+ src_ptr = src_hndl.AddrOfPinnedObject().ToInt64()
224
+
225
+ # Create ctypes buffer pointing to the same memory
226
+ buf_type = ctypes.c_double * len(src)
227
+ cbuf = buf_type.from_address(src_ptr)
228
+
229
+ # Convert to NumPy array and make a copy for safety
230
+ dest = np.frombuffer(cbuf, dtype="float64").copy() # type: ignore[call-overload]
231
+ finally:
232
+ # Always free the GC handle to prevent memory leaks
233
+ if src_hndl.IsAllocated:
234
+ src_hndl.Free()
235
+ return dest # noqa: B012
236
+
237
+
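The pin-then-wrap idea above can be illustrated without .NET at all: wrap an existing buffer with np.frombuffer (zero-copy), then .copy() so the NumPy array owns its data once the original buffer is released. A minimal sketch using a plain ctypes array as a stand-in for the pinned double[] (editor-added):

import ctypes
import numpy as np

# Stand-in for the pinned .NET double[]: a ctypes array of four doubles.
src = (ctypes.c_double * 4)(1.0, 2.0, 3.0, 4.0)

view = np.frombuffer(src, dtype=np.float64)  # zero-copy view of the same memory
dest = view.copy()                           # detach from the buffer, as after GCHandle.Free()

print(dest)  # [1. 2. 3. 4.]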
238
+ class ThermoRawFileReader:
239
+ """
240
+ Direct implementation of Thermo RAW file reader using the Thermo Fisher DLLs.
241
+ """
242
+
243
+ def __init__(self, filename: str):
244
+ if not HAS_DOTNET:
245
+ raise ImportError(
246
+ "Thermo RAW file support requires .NET components. "
247
+ "Install pythonnet (pip install pythonnet) and ensure Thermo Fisher DLLs "
248
+ "are available in alpharaw's ext/thermo_fisher directory."
249
+ )
250
+
251
+ if not os.path.exists(filename):
252
+ raise FileNotFoundError(f"RAW file not found: {filename}")
253
+
254
+ try:
255
+ self._raw_file = RawFileReaderAdapter.FileFactory(filename)
256
+ except Exception as e:
257
+ raise ValueError(f"Failed to create RAW file reader for '{filename}': {e}") from e
258
+
259
+ if not self._raw_file.IsOpen:
260
+ raise ValueError(f"Could not open RAW file: {filename}")
261
+
262
+ try:
263
+ # Get basic file information
264
+ self._raw_file.SelectInstrument(Device.MS, 1) # MS instrument
265
+ self.first_scan = self._raw_file.RunHeaderEx.FirstSpectrum
266
+ self.last_scan = self._raw_file.RunHeaderEx.LastSpectrum
267
+ self.num_scans = self.last_scan - self.first_scan + 1
268
+ except Exception as e:
269
+ self.close()
270
+ raise ValueError(f"Failed to read RAW file header information: {e}") from e
271
+
272
+ def close(self) -> None:
273
+ """Close the file and clean up resources."""
274
+ if hasattr(self, '_raw_file') and self._raw_file is not None:
275
+ self._raw_file.Dispose()
276
+
277
+ def __enter__(self) -> 'ThermoRawFileReader':
278
+ """Context manager entry."""
279
+ return self
280
+
281
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
282
+ """Context manager exit."""
283
+ self.close()
284
+
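Since __exit__ simply calls close(), the low-level reader is best used in a with block so the underlying RawFileReaderAdapter handle is disposed even if an exception is raised. A short usage sketch (editor-added; sample.raw is a placeholder path):

with ThermoRawFileReader("sample.raw") as reader:
    print(reader.first_scan, reader.last_scan, reader.num_scans)
    data = reader.load_all_scans(centroid=False)
# On leaving the block, __exit__ has called close() and released the file handle.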
285
+ def get_polarity_from_scan_event(self, scan_number: int) -> str:
286
+ """
287
+ Extract polarity information from scan event.
288
+
289
+ Parameters
290
+ ----------
291
+ scan_number : int
292
+ Scan number to extract polarity from
293
+
294
+ Returns
295
+ -------
296
+ str
297
+ 'positive', 'negative', or '' if unknown
298
+ """
299
+ try:
300
+ scan_event = self._raw_file.GetScanEventForScanNumber(scan_number)
301
+ if scan_event is None:
302
+ return ''
303
+
304
+ # Try the direct Polarity property first (most reliable)
305
+ if hasattr(scan_event, 'Polarity'):
306
+ polarity_str = str(scan_event.Polarity).lower()
307
+ if 'positive' in polarity_str:
308
+ return 'positive'
309
+ elif 'negative' in polarity_str:
310
+ return 'negative'
311
+
312
+ # Fallback: parse the scan filter string
313
+ filter_string = str(scan_event.ToString()).lower()
314
+ if '+' in filter_string or 'positive' in filter_string:
315
+ return 'positive'
316
+ elif '-' in filter_string or 'negative' in filter_string:
317
+ return 'negative'
318
+
319
+ except Exception:
320
+ # Log the exception if needed, but don't raise
321
+ pass
322
+
323
+ return '' # Unknown polarity
324
+
325
+ def _extract_precursor_info(self, scan_event, ms_level: int) -> tuple[float, int, float, float, float]:
326
+ """Extract precursor information from scan event for MS2+ scans."""
327
+ if ms_level <= 1 or scan_event is None:
328
+ return -1.0, 0, 0.0, -1.0, -1.0
329
+
330
+ try:
331
+ precursor_mz = float(scan_event.GetMass(0))
332
+ except Exception:
333
+ precursor_mz = -1.0
334
+
335
+ try:
336
+ precursor_charge = int(scan_event.GetChargeState(0)) if hasattr(scan_event, 'GetChargeState') else 0
337
+ except Exception:
338
+ precursor_charge = 0
339
+
340
+ try:
341
+ collision_energy = float(scan_event.GetEnergy(0)) if hasattr(scan_event, 'GetEnergy') else 0.0
342
+ except Exception:
343
+ collision_energy = 0.0
344
+
345
+ try:
346
+ isolation_window = float(scan_event.GetIsolationWidth(0)) if hasattr(scan_event, 'GetIsolationWidth') else 3.0
347
+ except Exception:
348
+ isolation_window = 3.0
349
+
350
+ isolation_lower = precursor_mz - isolation_window / 2
351
+ isolation_upper = precursor_mz + isolation_window / 2
352
+
353
+ return precursor_mz, precursor_charge, collision_energy, isolation_lower, isolation_upper
354
+
355
+ def _process_scan_data(
356
+ self,
357
+ scan_data,
358
+ centroid: bool,
359
+ centroid_ppm: float,
360
+ keep_k_peaks: int
361
+ ) -> tuple[np.ndarray, np.ndarray]:
362
+ """Process scan data to extract and optionally centroid peaks."""
363
+ if scan_data.Positions is not None and scan_data.Intensities is not None:
364
+ mz_array = dot_net_array_to_np_array(scan_data.Positions)
365
+ int_array = dot_net_array_to_np_array(scan_data.Intensities).astype(np.float32)
366
+ else:
367
+ return np.array([]), np.array([])
368
+
369
+ if centroid and len(mz_array) > 0:
370
+ mz_array, int_array = naive_centroid(
371
+ mz_array,
372
+ int_array,
373
+ centroiding_ppm=centroid_ppm,
374
+ )
375
+
376
+ # Keep only top K peaks by intensity
377
+ if len(mz_array) > keep_k_peaks:
378
+ top_indices = np.argsort(int_array)[-keep_k_peaks:]
379
+ top_indices = np.sort(top_indices)
380
+ mz_array = mz_array[top_indices]
381
+ int_array = int_array[top_indices]
382
+
383
+ return mz_array, int_array
384
+
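The top-K filter above selects the keep_k_peaks most intense points with argsort and then re-sorts the surviving indices so the retained peaks stay in ascending m/z order. In miniature (editor-added):

import numpy as np

mz = np.array([100.0, 200.0, 300.0, 400.0])
intensity = np.array([5.0, 50.0, 1.0, 20.0])

keep_k_peaks = 2
top = np.argsort(intensity)[-keep_k_peaks:]  # indices of the two most intense peaks: [3, 1]
top = np.sort(top)                           # restore ascending m/z order: [1, 3]
print(mz[top], intensity[top])               # [200. 400.] [50. 20.]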
385
+ def load_all_scans(
386
+ self,
387
+ centroid: bool = True,
388
+ centroid_ppm: float = 20.0,
389
+ ignore_empty_scans: bool = True,
390
+ keep_k_peaks: int = 2000,
391
+ ) -> dict[str, Any]:
392
+ """
393
+ Load all scans from the RAW file and extract spectral data.
394
+
395
+ Parameters
396
+ ----------
397
+ centroid : bool
398
+ Whether to centroid the data
399
+ centroid_ppm : float
400
+ PPM tolerance for centroiding
401
+ ignore_empty_scans : bool
402
+ Whether to skip empty scans
403
+ keep_k_peaks : int
404
+ Maximum number of peaks to keep per spectrum
405
+
406
+ Returns
407
+ -------
408
+ dict
409
+ Dictionary containing spectral data with keys:
410
+ peak_indices, peak_mz, peak_intensity, rt, ms_level, polarity,
411
+ precursor_mz, precursor_charge, isolation_lower_mz, isolation_upper_mz, nce
412
+ """
413
+ # Initialize data collection lists
414
+ peak_indices_list: list[int] = []
415
+ peak_mz_arrays: list[np.ndarray] = []
416
+ peak_intensity_arrays: list[np.ndarray] = []
417
+ rt_list: list[float] = []
418
+ ms_level_list: list[int] = []
419
+ polarity_list: list[str] = []
420
+ precursor_mz_list: list[float] = []
421
+ precursor_charge_list: list[int] = []
422
+ ce_list: list[float] = []
423
+ isolation_lower_mz_list: list[float] = []
424
+ isolation_upper_mz_list: list[float] = []
425
+
426
+ for scan_num in range(self.first_scan, self.last_scan + 1):
427
+ # Get scan statistics and data
428
+ scan_stats = self._raw_file.GetScanStatsForScanNumber(scan_num)
429
+ if scan_stats is None:
430
+ continue
431
+
432
+ scan_data = self._raw_file.GetSegmentedScanFromScanNumber(scan_num, scan_stats)
433
+ if scan_data is None or (ignore_empty_scans and scan_data.Positions is None):
434
+ continue
435
+
436
+ scan_event = self._raw_file.GetScanEventForScanNumber(scan_num)
437
+
438
+ # Extract basic scan information
439
+ rt = scan_stats.StartTime # in minutes
440
+ ms_level = int(scan_event.MSOrder) if scan_event else 1
441
+ polarity = self.get_polarity_from_scan_event(scan_num)
442
+
443
+ # Process peak data
444
+ mz_array, int_array = self._process_scan_data(
445
+ scan_data, centroid, centroid_ppm, keep_k_peaks
446
+ )
447
+
448
+ # Store scan data
449
+ peak_mz_arrays.append(mz_array)
450
+ peak_intensity_arrays.append(int_array)
451
+ peak_indices_list.append(len(mz_array))
452
+
453
+ rt_list.append(rt)
454
+ ms_level_list.append(ms_level)
455
+ polarity_list.append(polarity)
456
+
457
+ # Extract precursor information
458
+ precursor_mz, precursor_charge, collision_energy, isolation_lower, isolation_upper = \
459
+ self._extract_precursor_info(scan_event, ms_level)
460
+
461
+ precursor_mz_list.append(precursor_mz)
462
+ precursor_charge_list.append(precursor_charge)
463
+ ce_list.append(collision_energy)
464
+ isolation_lower_mz_list.append(isolation_lower)
465
+ isolation_upper_mz_list.append(isolation_upper)
466
+
467
+ if not rt_list:
468
+ raise ValueError("No valid scans found in the RAW file")
469
+
470
+ # Create cumulative peak indices array
471
+ peak_indices = np.empty(len(rt_list) + 1, dtype=np.int64)
472
+ peak_indices[0] = 0
473
+ peak_indices[1:] = np.cumsum(peak_indices_list)
474
+
475
+ return {
476
+ "peak_indices": peak_indices,
477
+ "peak_mz": np.concatenate(peak_mz_arrays) if peak_mz_arrays else np.array([]),
478
+ "peak_intensity": np.concatenate(peak_intensity_arrays) if peak_intensity_arrays else np.array([]),
479
+ "rt": np.array(rt_list, dtype=np.float64),
480
+ "ms_level": np.array(ms_level_list, dtype=np.int8),
481
+ "polarity": np.array(polarity_list, dtype="U8"),
482
+ "precursor_mz": np.array(precursor_mz_list, dtype=np.float64),
483
+ "precursor_charge": np.array(precursor_charge_list, dtype=np.int8),
484
+ "isolation_lower_mz": np.array(isolation_lower_mz_list, dtype=np.float64),
485
+ "isolation_upper_mz": np.array(isolation_upper_mz_list, dtype=np.float64),
486
+ "nce": np.array(ce_list, dtype=np.float32),
487
+ }
488
+
489
+
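The peak_indices array returned above is a cumulative offset table with one more entry than there are spectra, so the peaks of spectrum i are the half-open slice peak_indices[i]:peak_indices[i+1] of the flat peak arrays. A sketch of how a caller might walk the returned dictionary (editor-added; sample.raw is a placeholder path):

with ThermoRawFileReader("sample.raw") as reader:
    data = reader.load_all_scans(centroid=False)

for i, rt in enumerate(data["rt"]):
    start, stop = data["peak_indices"][i], data["peak_indices"][i + 1]
    mz = data["peak_mz"][start:stop]
    intensity = data["peak_intensity"][start:stop]
    print(f"scan {i}: rt={rt:.2f} min, ms{data['ms_level'][i]}, {len(mz)} peaks")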
490
+ class ThermoRawData:
491
+ """
492
+ Standalone Thermo RAW data reader that exposes spectrum and peak data as pandas
493
+ DataFrames, reading RAW files via the Thermo Fisher DLLs directly.
494
+ """
495
+
496
+ # Column data types mapping
497
+ column_dtypes: ClassVar[dict[str, Any]] = {
498
+ "rt": np.float64,
499
+ "ms_level": np.int8,
500
+ "polarity": "U8",
501
+ "precursor_mz": np.float64,
502
+ "isolation_lower_mz": np.float64,
503
+ "isolation_upper_mz": np.float64,
504
+ "precursor_charge": np.int8,
505
+ "nce": np.float32,
506
+ "injection_time": np.float32,
507
+ "activation": "U",
508
+ }
509
+
510
+ def __init__(self, centroided: bool = True) -> None:
511
+ """
512
+ Initialize ThermoRawData reader.
513
+
514
+ Parameters
515
+ ----------
516
+ centroided : bool, optional
517
+ Whether peaks should be centroided after loading, by default True.
518
+ Note: Centroiding is currently disabled due to implementation limitations.
519
+ """
520
+ # Initialize dataframes
521
+ self.spectrum_df: pd.DataFrame = pd.DataFrame()
522
+ self.peak_df: pd.DataFrame = pd.DataFrame()
523
+
524
+ # File and instrument information
525
+ self._raw_file_path = ""
526
+ self.creation_time = ""
527
+ self.file_type = "thermo"
528
+ self.instrument = "thermo"
529
+
530
+ # Processing parameters
531
+ self.centroided = centroided
532
+ self.centroid_ppm = 20.0
533
+ self.ignore_empty_scans = True
534
+ self.keep_k_peaks_per_spec = 2000
535
+
536
+ # Disable centroiding for now
537
+ if self.centroided:
538
+ self.centroided = False
539
+ warnings.warn(
540
+ "Centroiding for Thermo data is not well implemented yet. "
541
+ "Data will be processed in profile mode.",
542
+ UserWarning,
543
+ stacklevel=2,
544
+ )
545
+
546
+ @property
547
+ def raw_file_path(self) -> str:
548
+ """Get the raw file path."""
549
+ return self._raw_file_path
550
+
551
+ @raw_file_path.setter
552
+ def raw_file_path(self, value: str):
553
+ """Set the raw file path."""
554
+ self._raw_file_path = value
555
+
556
+ def import_raw(self, raw_file_path: str) -> None:
557
+ """
558
+ Import raw data from a RAW file.
559
+
560
+ Parameters
561
+ ----------
562
+ raw_file_path : str
563
+ Path to the RAW file
564
+ """
565
+ self.raw_file_path = raw_file_path
566
+ data_dict = self._import(raw_file_path)
567
+ self._set_dataframes(data_dict)
568
+
569
+ def _import(self, raw_file_path: str) -> dict[str, Any]:
570
+ """
571
+ Import data from a Thermo RAW file.
572
+
573
+ Parameters
574
+ ----------
575
+ raw_file_path : str
576
+ Absolute or relative path of the Thermo RAW file.
577
+
578
+ Returns
579
+ -------
580
+ dict
581
+ Dictionary containing spectrum information and peak data.
582
+ """
583
+ with ThermoRawFileReader(raw_file_path) as raw_reader:
584
+ data_dict = raw_reader.load_all_scans(
585
+ centroid=self.centroided,
586
+ centroid_ppm=self.centroid_ppm,
587
+ ignore_empty_scans=self.ignore_empty_scans,
588
+ keep_k_peaks=self.keep_k_peaks_per_spec,
589
+ )
590
+
591
+ # Try to get file creation time
592
+ try:
593
+ creation_info = raw_reader._raw_file.GetCreationDate()
594
+ self.creation_time = creation_info.ToString("O") if creation_info else ""
595
+ except Exception:
596
+ self.creation_time = ""
597
+
598
+ return data_dict
599
+
600
+ def _set_dataframes(self, raw_data: dict[str, Any]) -> None:
601
+ """
602
+ Set the spectrum and peak dataframes from raw data dictionary.
603
+
604
+ Parameters
605
+ ----------
606
+ raw_data : dict
607
+ Dictionary containing the raw spectral data with keys like 'rt', 'peak_mz', etc.
608
+ """
609
+ num_spectra = len(raw_data["rt"])
610
+
611
+ # Create spectrum dataframe
612
+ self.create_spectrum_df(num_spectra)
613
+
614
+ # Create peak dataframe with indexed arrays
615
+ self.set_peak_df_by_indexed_array(
616
+ raw_data["peak_mz"],
617
+ raw_data["peak_intensity"],
618
+ raw_data["peak_indices"][:-1], # start indices
619
+ raw_data["peak_indices"][1:], # end indices
620
+ )
621
+
622
+ # Add spectrum-level data to spectrum dataframe
623
+ for column_name, values in raw_data.items():
624
+ if column_name in self.column_dtypes and column_name != "peak_mz" and column_name != "peak_intensity":
625
+ dtype = self.column_dtypes[column_name]
626
+ if dtype == "O":
627
+ self.spectrum_df[column_name] = list(values)
628
+ else:
629
+ self.spectrum_df[column_name] = np.array(values, dtype=dtype)
630
+
631
+ def create_spectrum_df(self, spectrum_num: int) -> None:
632
+ """
633
+ Create an empty spectrum dataframe from the number of spectra.
634
+
635
+ Parameters
636
+ ----------
637
+ spectrum_num : int
638
+ The number of spectra.
639
+ """
640
+ self.spectrum_df = pd.DataFrame(index=np.arange(spectrum_num, dtype=np.int64))
641
+ self.spectrum_df["spec_idx"] = self.spectrum_df.index.values
642
+
643
+ def set_peak_df_by_indexed_array(
644
+ self,
645
+ mz_array: np.ndarray,
646
+ intensity_array: np.ndarray,
647
+ peak_start_indices: np.ndarray,
648
+ peak_stop_indices: np.ndarray,
649
+ ) -> None:
650
+ """
651
+ Set peak dataframe using indexed arrays.
652
+
653
+ Parameters
654
+ ----------
655
+ mz_array : np.ndarray
656
+ Array of m/z values
657
+ intensity_array : np.ndarray
658
+ Array of intensity values
659
+ peak_start_indices : np.ndarray
660
+ Array of start indices for each spectrum
661
+ peak_stop_indices : np.ndarray
662
+ Array of stop indices for each spectrum
663
+ """
664
+ self.peak_df = pd.DataFrame()
665
+ self.peak_df["mz"] = mz_array.astype(np.float64)
666
+ self.peak_df["intensity"] = intensity_array.astype(np.float32)
667
+
668
+ # Set peak start and stop indices in spectrum df
669
+ self.spectrum_df["peak_start_idx"] = peak_start_indices
670
+ self.spectrum_df["peak_stop_idx"] = peak_stop_indices
671
+
672
+ def get_peaks(self, spec_idx: int) -> tuple[np.ndarray, np.ndarray]:
673
+ """
674
+ Get peaks for a specific spectrum.
675
+
676
+ Parameters
677
+ ----------
678
+ spec_idx : int
679
+ Spectrum index
680
+
681
+ Returns
682
+ -------
683
+ tuple
684
+ (mz_array, intensity_array)
685
+ """
686
+ start, end = self.spectrum_df.iloc[spec_idx][["peak_start_idx", "peak_stop_idx"]].values
687
+ return (
688
+ self.peak_df.mz.values[start:end],
689
+ self.peak_df.intensity.values[start:end],
690
+ )
691
+
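spectrum_df and peak_df form an indexed layout: each spectrum row carries peak_start_idx and peak_stop_idx into the flat peak_df, which is exactly what get_peaks() slices. A short usage sketch (editor-added; sample.raw is a placeholder path, and it assumes the file contains MS2 scans):

raw_data = load_raw_file("sample.raw")

# Per-spectrum metadata lives in spectrum_df; the flat peak lists live in peak_df.
ms2 = raw_data.spectrum_df[raw_data.spectrum_df["ms_level"] == 2]
for spec_idx in ms2["spec_idx"].head(5):
    mz, intensity = raw_data.get_peaks(int(spec_idx))
    prec = raw_data.spectrum_df.loc[spec_idx, "precursor_mz"]
    print(f"spectrum {spec_idx}: precursor m/z {prec:.4f}, {len(mz)} peaks")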
692
+ def __repr__(self) -> str:
693
+ return f"ThermoRawData(file_path='{self.raw_file_path}', spectra={len(self.spectrum_df)})"
694
+
695
+
696
+ # Convenience functions to maintain compatibility with existing code
697
+ def load_raw_file(filename: str, **kwargs) -> ThermoRawData:
698
+ """
699
+ Load a RAW file and return a ThermoRawData object.
700
+
701
+ Parameters
702
+ ----------
703
+ filename : str
704
+ Path to the RAW file
705
+ **kwargs
706
+ Additional arguments to pass to ThermoRawData constructor
707
+
708
+ Returns
709
+ -------
710
+ ThermoRawData
711
+ Loaded RAW data object
712
+ """
713
+ raw_data = ThermoRawData(**kwargs)
714
+ raw_data.import_raw(filename)
715
+ return raw_data
716
+
717
+
718
+ def get_file_info(filename: str) -> dict[str, Any]:
719
+ """
720
+ Get basic information about a RAW file.
721
+
722
+ Parameters
723
+ ----------
724
+ filename : str
725
+ Path to the RAW file
726
+
727
+ Returns
728
+ -------
729
+ dict
730
+ Dictionary with file information including scan count, scan range, etc.
731
+ """
732
+ with ThermoRawFileReader(filename) as reader:
733
+ return {
734
+ "first_scan": reader.first_scan,
735
+ "last_scan": reader.last_scan,
736
+ "num_scans": reader.num_scans,
737
+ "scan_range": f"{reader.first_scan}-{reader.last_scan}",
738
+ }
739
+
740
+
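get_file_info() opens the file only long enough to read the run header, so it is a cheap sanity check before a full load. A usage sketch (editor-added; sample.raw is a placeholder path):

info = get_file_info("sample.raw")
print(f"Scans {info['scan_range']} ({info['num_scans']} total)")

# Load everything only once the file looks sane.
raw_data = load_raw_file("sample.raw")
print(f"{len(raw_data.spectrum_df)} spectra, {len(raw_data.peak_df)} peaks")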
741
+ def main() -> None:
742
+ """
743
+ Main function for testing and demonstrating the module functionality.
744
+
745
+ This function provides usage examples and tests basic module functionality
746
+ when the script is run directly.
747
+ """
748
+ print("Standalone Thermo RAW Reader")
749
+ print("=" * 40)
750
+
751
+ # Display usage example
752
+ print("\nUsage Example:")
753
+ print("-" * 20)
754
+ example_code = '''
755
+ from thermo import ThermoRawData, load_raw_file
756
+
757
+ # Method 1: Create reader instance
758
+ raw_data = ThermoRawData(centroided=False)
759
+ raw_data.import_raw("path/to/file.raw")
760
+
761
+ # Method 2: Use convenience function
762
+ raw_data = load_raw_file("path/to/file.raw")
763
+
764
+ # Access data
765
+ print(f"Spectra: {len(raw_data.spectrum_df)}")
766
+ print(f"Peaks: {len(raw_data.peak_df)}")
767
+
768
+ # Get peaks for first spectrum
769
+ mz, intensity = raw_data.get_peaks(0)
770
+
771
+ # Check available polarities
772
+ polarities = raw_data.spectrum_df['polarity'].unique()
773
+ print(f"Polarities: {polarities}")
774
+ '''
775
+ print(example_code)
776
+
777
+ # Test module functionality
778
+ print("\nModule Status:")
779
+ print("-" * 20)
780
+
781
+ try:
782
+ # Test class instantiation
783
+ test_data = ThermoRawData()
784
+ print("✓ ThermoRawData instantiated successfully")
785
+
786
+ # Check .NET support
787
+ if HAS_DOTNET:
788
+ print("✓ .NET support available")
789
+ print(" • Thermo Fisher DLLs loaded")
790
+ print(" • RAW file reading enabled")
791
+ else:
792
+ print("⚠ .NET support not available")
793
+ print(" • Install pythonnet to enable RAW file reading")
794
+ print(" • Ensure Thermo Fisher DLLs are in alpharaw ext directory")
795
+
796
+ except Exception as e:
797
+ print(f"✗ Error during module testing: {e}")
798
+
799
+
800
+ if __name__ == "__main__":
801
+ main()