masster-0.2.5-py3-none-any.whl → masster-0.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (55)
  1. masster/__init__.py +27 -27
  2. masster/_version.py +17 -17
  3. masster/chromatogram.py +497 -503
  4. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
  5. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
  6. masster/logger.py +318 -244
  7. masster/sample/__init__.py +9 -9
  8. masster/sample/defaults/__init__.py +15 -15
  9. masster/sample/defaults/find_adducts_def.py +325 -325
  10. masster/sample/defaults/find_features_def.py +366 -366
  11. masster/sample/defaults/find_ms2_def.py +285 -285
  12. masster/sample/defaults/get_spectrum_def.py +314 -318
  13. masster/sample/defaults/sample_def.py +374 -378
  14. masster/sample/h5.py +1321 -1297
  15. masster/sample/helpers.py +833 -364
  16. masster/sample/lib.py +762 -0
  17. masster/sample/load.py +1220 -1187
  18. masster/sample/parameters.py +131 -131
  19. masster/sample/plot.py +1685 -1622
  20. masster/sample/processing.py +1402 -1416
  21. masster/sample/quant.py +209 -0
  22. masster/sample/sample.py +393 -387
  23. masster/sample/sample5_schema.json +181 -181
  24. masster/sample/save.py +737 -736
  25. masster/sample/sciex.py +1213 -0
  26. masster/spectrum.py +1287 -1319
  27. masster/study/__init__.py +9 -9
  28. masster/study/defaults/__init__.py +21 -19
  29. masster/study/defaults/align_def.py +267 -267
  30. masster/study/defaults/export_def.py +41 -40
  31. masster/study/defaults/fill_chrom_def.py +264 -264
  32. masster/study/defaults/fill_def.py +260 -0
  33. masster/study/defaults/find_consensus_def.py +256 -256
  34. masster/study/defaults/find_ms2_def.py +163 -163
  35. masster/study/defaults/integrate_chrom_def.py +225 -225
  36. masster/study/defaults/integrate_def.py +221 -0
  37. masster/study/defaults/merge_def.py +256 -0
  38. masster/study/defaults/study_def.py +272 -269
  39. masster/study/export.py +674 -287
  40. masster/study/h5.py +1406 -886
  41. masster/study/helpers.py +1713 -433
  42. masster/study/helpers_optimized.py +317 -0
  43. masster/study/load.py +1231 -1078
  44. masster/study/parameters.py +99 -99
  45. masster/study/plot.py +632 -645
  46. masster/study/processing.py +1057 -1046
  47. masster/study/save.py +161 -134
  48. masster/study/study.py +612 -522
  49. masster/study/study5_schema.json +253 -241
  50. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/METADATA +15 -10
  51. masster-0.3.1.dist-info/RECORD +59 -0
  52. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/licenses/LICENSE +661 -661
  53. masster-0.2.5.dist-info/RECORD +0 -50
  54. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/WHEEL +0 -0
  55. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/entry_points.txt +0 -0
masster/chromatogram.py CHANGED
@@ -251,11 +251,7 @@ def copy(self):
             inty=self.inty.copy(),
             label=self.label,
             rt_unit=self.rt_unit,
-            **{
-                k: v.copy()
-                for k, v in self.__dict__.items()
-                if isinstance(v, np.ndarray)
-            },
+            **{k: v.copy() for k, v in self.__dict__.items() if isinstance(v, np.ndarray)},
         )
 
     def pandalize(self):
@@ -271,9 +267,7 @@ def to_df(self):
         find all attributes that are numpy arrays and have the same length as mz
         """
         data = {
-            key: val
-            for key, val in self.__dict__.items()
-            if isinstance(val, np.ndarray) and val.size == self.rt.size
+            key: val for key, val in self.__dict__.items() if isinstance(val, np.ndarray) and val.size == self.rt.size
         }
         return polars.DataFrame(data)
 
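For orientation, here is a minimal usage sketch of the `Chromatogram` class defined in `masster/chromatogram.py`. The import path, the synthetic data, and the step of setting `feature_start`/`feature_end` before calling `find_peaks()` are assumptions read from the diffed source (the peak filter compares against those attributes), not documented API; treat it as illustrative only.

```python
import numpy as np

from masster.chromatogram import Chromatogram  # import path assumed from this diff

# Synthetic EIC: Gaussian peak at 150 s over a noisy baseline
# (mirrors the example in the module docstring).
rt = np.linspace(0, 300, 1000)
inty = np.random.normal(1000, 100, 1000) + 10000 * np.exp(-((rt - 150) ** 2) / (2 * 10**2))

c = Chromatogram(rt=rt, inty=inty, label="EIC m/z 150", rt_unit="sec")

# find_peaks() filters candidate apexes against feature_start/feature_end,
# so define the retention-time window of interest before calling it.
c.feature_start, c.feature_end = 120.0, 180.0
c.find_peaks(order_by="prominences")

print(c.feature_apex)  # retention time of the most prominent peak in the window
print(c.get_area())    # trapezoidal area between feature_start and feature_end
```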