masster 0.4.19__py3-none-any.whl → 0.4.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/__init__.py +6 -1
- masster/_version.py +1 -1
- masster/logger.py +42 -0
- masster/sample/h5.py +58 -1
- masster/sample/load.py +12 -5
- masster/sample/plot.py +56 -65
- masster/sample/processing.py +158 -0
- masster/sample/sample.py +2 -9
- masster/sample/sample5_schema.json +3 -0
- masster/sample/save.py +137 -59
- masster/spectrum.py +58 -9
- masster/study/export.py +238 -152
- masster/study/h5.py +65 -1
- masster/study/helpers.py +55 -14
- masster/study/merge.py +910 -67
- masster/study/plot.py +50 -7
- masster/study/processing.py +257 -1
- masster/study/save.py +48 -5
- masster/study/study.py +34 -3
- masster/study/study5_schema.json +3 -0
- masster/wizard/__init__.py +8 -2
- masster/wizard/wizard.py +612 -876
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/METADATA +1 -1
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/RECORD +27 -30
- masster/wizard/test_structure.py +0 -49
- masster/wizard/test_wizard.py +0 -285
- masster/wizard.py +0 -1175
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/WHEEL +0 -0
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/entry_points.txt +0 -0
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/licenses/LICENSE +0 -0
masster/__init__.py
CHANGED
|
@@ -8,6 +8,12 @@ mass spectrometry workflows.
|
|
|
8
8
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
|
+
import warnings
|
|
12
|
+
|
|
13
|
+
# Suppress pyOpenMS environment variable warnings globally
|
|
14
|
+
warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH.*", category=UserWarning)
|
|
15
|
+
warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH.*", category=UserWarning)
|
|
16
|
+
|
|
11
17
|
from masster._version import __version__
|
|
12
18
|
|
|
13
19
|
# from masster._version import get_version
|
|
@@ -27,5 +33,4 @@ __all__ = [
|
|
|
27
33
|
"Study",
|
|
28
34
|
"Wizard",
|
|
29
35
|
"__version__",
|
|
30
|
-
# "get_version",
|
|
31
36
|
]
|
masster/_version.py
CHANGED
masster/logger.py
CHANGED
|
@@ -55,6 +55,9 @@ class MassterLogger:
|
|
|
55
55
|
# Convert string sink to actual object
|
|
56
56
|
if sink == "sys.stdout" or sink is None:
|
|
57
57
|
self.sink = sys.stdout
|
|
58
|
+
elif isinstance(sink, str) and sink != "sys.stdout":
|
|
59
|
+
# If it's a file path string, open the file for writing
|
|
60
|
+
self.sink = open(sink, "a", encoding="utf-8")
|
|
58
61
|
else:
|
|
59
62
|
self.sink = sink
|
|
60
63
|
|
|
@@ -67,6 +70,21 @@ class MassterLogger:
|
|
|
67
70
|
# Remove any existing handlers to prevent duplicates
|
|
68
71
|
if self.logger_instance.hasHandlers():
|
|
69
72
|
self.logger_instance.handlers.clear()
|
|
73
|
+
|
|
74
|
+
# Also ensure no duplicate handlers on parent loggers
|
|
75
|
+
parent = self.logger_instance.parent
|
|
76
|
+
while parent:
|
|
77
|
+
if parent.name == "masster" and parent.hasHandlers():
|
|
78
|
+
# Keep only the first handler of each handler type on the masster parent logger
|
|
79
|
+
unique_handlers = []
|
|
80
|
+
handler_types = set()
|
|
81
|
+
for handler in parent.handlers:
|
|
82
|
+
handler_type = type(handler)
|
|
83
|
+
if handler_type not in handler_types:
|
|
84
|
+
unique_handlers.append(handler)
|
|
85
|
+
handler_types.add(handler_type)
|
|
86
|
+
parent.handlers = unique_handlers
|
|
87
|
+
parent = parent.parent
|
|
70
88
|
|
|
71
89
|
self.logger_instance.setLevel(getattr(logging, self.level))
|
|
72
90
|
|
|
@@ -129,6 +147,17 @@ class MassterLogger:
|
|
|
129
147
|
|
|
130
148
|
# Prevent propagation to avoid duplicate messages
|
|
131
149
|
self.logger_instance.propagate = False
|
|
150
|
+
|
|
151
|
+
# Additional fix: ensure no duplicate handlers in the entire logging hierarchy
|
|
152
|
+
masster_logger = logging.getLogger("masster")
|
|
153
|
+
if masster_logger.hasHandlers():
|
|
154
|
+
# Keep only one handler per (handler type name, stream) pair
|
|
155
|
+
unique_handlers = {}
|
|
156
|
+
for handler in masster_logger.handlers:
|
|
157
|
+
handler_key = (type(handler).__name__, getattr(handler, 'stream', None))
|
|
158
|
+
if handler_key not in unique_handlers:
|
|
159
|
+
unique_handlers[handler_key] = handler
|
|
160
|
+
masster_logger.handlers = list(unique_handlers.values())
|
|
132
161
|
|
|
133
162
|
def update_level(self, level: str):
|
|
134
163
|
"""Update the logging level."""
|
|
@@ -326,7 +355,20 @@ class MassterLogger:
|
|
|
326
355
|
"""Remove this logger's handler."""
|
|
327
356
|
if self.handler:
|
|
328
357
|
self.logger_instance.removeHandler(self.handler)
|
|
358
|
+
# Close the file handle if it's not stdout
|
|
359
|
+
if hasattr(self.sink, 'close') and self.sink != sys.stdout:
|
|
360
|
+
try:
|
|
361
|
+
self.sink.close()
|
|
362
|
+
except Exception:
|
|
363
|
+
pass # Ignore close errors
|
|
329
364
|
self.handler = None
|
|
330
365
|
|
|
366
|
+
def __del__(self):
|
|
367
|
+
"""Cleanup when the logger is destroyed."""
|
|
368
|
+
try:
|
|
369
|
+
self.remove()
|
|
370
|
+
except Exception:
|
|
371
|
+
pass # Ignore cleanup errors during destruction
|
|
372
|
+
|
|
331
373
|
def __repr__(self):
|
|
332
374
|
return f"MassterLogger(type={self.instance_type}, id={self.instance_id}, level={self.level})"
|
masster/sample/h5.py
CHANGED
|
@@ -235,6 +235,22 @@ def _save_sample5(
|
|
|
235
235
|
data=serialized_data,
|
|
236
236
|
compression="gzip",
|
|
237
237
|
)
|
|
238
|
+
elif col == "ms1_spec":
|
|
239
|
+
# this column contains either None or numpy arrays with isotope pattern data
|
|
240
|
+
# serialize numpy arrays to JSON strings for storage
|
|
241
|
+
data = features[col]
|
|
242
|
+
data_as_json_strings = []
|
|
243
|
+
for i in range(len(data)):
|
|
244
|
+
if data[i] is not None:
|
|
245
|
+
# Convert numpy array to list and then to JSON
|
|
246
|
+
data_as_json_strings.append(json.dumps(data[i].tolist()))
|
|
247
|
+
else:
|
|
248
|
+
data_as_json_strings.append("None")
|
|
249
|
+
features_group.create_dataset(
|
|
250
|
+
col,
|
|
251
|
+
data=data_as_json_strings,
|
|
252
|
+
compression="gzip",
|
|
253
|
+
)
|
|
238
254
|
|
|
239
255
|
else:
|
|
240
256
|
self.logger.warning(
|
|
@@ -630,6 +646,25 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
630
646
|
)
|
|
631
647
|
reconstructed_data.append(spectrum_list)
|
|
632
648
|
|
|
649
|
+
data[col] = reconstructed_data
|
|
650
|
+
case "ms1_spec":
|
|
651
|
+
data_col = features_group[col][:]
|
|
652
|
+
# Convert JSON strings back to numpy arrays
|
|
653
|
+
reconstructed_data = []
|
|
654
|
+
for item in data_col:
|
|
655
|
+
if isinstance(item, bytes):
|
|
656
|
+
item = item.decode("utf-8")
|
|
657
|
+
|
|
658
|
+
if item == "None" or item == "":
|
|
659
|
+
reconstructed_data.append(None)
|
|
660
|
+
else:
|
|
661
|
+
try:
|
|
662
|
+
# Parse JSON string to get list and convert to numpy array
|
|
663
|
+
array_data = json.loads(item)
|
|
664
|
+
reconstructed_data.append(np.array(array_data, dtype=np.float64))
|
|
665
|
+
except (json.JSONDecodeError, ValueError, TypeError):
|
|
666
|
+
reconstructed_data.append(None)
|
|
667
|
+
|
|
633
668
|
data[col] = reconstructed_data
|
|
634
669
|
case _:
|
|
635
670
|
self.logger.debug(f"Unexpected Object column '{col}'")
|
|
@@ -1371,6 +1406,25 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1371
1406
|
):
|
|
1372
1407
|
reconstructed_data.append(None)
|
|
1373
1408
|
|
|
1409
|
+
data[col] = reconstructed_data
|
|
1410
|
+
case "ms1_spec":
|
|
1411
|
+
data_col = features_group[col][:]
|
|
1412
|
+
# Convert JSON strings back to numpy arrays
|
|
1413
|
+
reconstructed_data = []
|
|
1414
|
+
for item in data_col:
|
|
1415
|
+
if isinstance(item, bytes):
|
|
1416
|
+
item = item.decode("utf-8")
|
|
1417
|
+
|
|
1418
|
+
if item == "None" or item == "":
|
|
1419
|
+
reconstructed_data.append(None)
|
|
1420
|
+
else:
|
|
1421
|
+
try:
|
|
1422
|
+
# Parse JSON string to get list and convert to numpy array
|
|
1423
|
+
array_data = json.loads(item)
|
|
1424
|
+
reconstructed_data.append(np.array(array_data, dtype=np.float64))
|
|
1425
|
+
except (json.JSONDecodeError, ValueError, TypeError):
|
|
1426
|
+
reconstructed_data.append(None)
|
|
1427
|
+
|
|
1374
1428
|
data[col] = reconstructed_data
|
|
1375
1429
|
case _:
|
|
1376
1430
|
# Handle other Object columns as raw data
|
|
@@ -1407,6 +1461,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
|
|
|
1407
1461
|
# Add Object columns one by one
|
|
1408
1462
|
for col, values in object_columns.items():
|
|
1409
1463
|
if not self.features_df.is_empty():
|
|
1464
|
+
# Fix for missing columns: if values is None, create list of None with correct length
|
|
1465
|
+
if values is None:
|
|
1466
|
+
values = [None] * len(self.features_df)
|
|
1410
1467
|
self.features_df = self.features_df.with_columns(
|
|
1411
1468
|
pl.Series(col, values, dtype=pl.Object).alias(col),
|
|
1412
1469
|
)
|
|
@@ -2027,7 +2084,7 @@ def load_dataframe_from_h5_group(
|
|
|
2027
2084
|
for col in schema_columns:
|
|
2028
2085
|
if col not in group:
|
|
2029
2086
|
if logger:
|
|
2030
|
-
logger.
|
|
2087
|
+
logger.info(f"Column '{col}' not found in {df_name}.")
|
|
2031
2088
|
data[col] = None
|
|
2032
2089
|
missing_columns.append(col)
|
|
2033
2090
|
continue
|
masster/sample/load.py
CHANGED
|
@@ -37,21 +37,27 @@ See Also:
|
|
|
37
37
|
"""
|
|
38
38
|
|
|
39
39
|
import os
|
|
40
|
-
|
|
40
|
+
import warnings
|
|
41
41
|
from datetime import datetime
|
|
42
42
|
|
|
43
43
|
import numpy as np
|
|
44
44
|
import pandas as pd
|
|
45
45
|
import polars as pl
|
|
46
|
-
import pyopenms as oms
|
|
47
|
-
|
|
48
46
|
from tqdm import tqdm
|
|
49
47
|
|
|
50
48
|
from masster.chromatogram import Chromatogram
|
|
51
|
-
|
|
52
|
-
# Parameters removed - using hardcoded defaults
|
|
53
49
|
from masster.spectrum import Spectrum
|
|
54
50
|
|
|
51
|
+
# Suppress pyOpenMS warnings globally
|
|
52
|
+
warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH.*", category=UserWarning)
|
|
53
|
+
warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH.*", category=UserWarning)
|
|
54
|
+
|
|
55
|
+
# Import pyopenms with suppressed warnings
|
|
56
|
+
with warnings.catch_warnings():
|
|
57
|
+
warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH environment variable already exists.*", category=UserWarning)
|
|
58
|
+
warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH.*", category=UserWarning)
|
|
59
|
+
import pyopenms as oms
|
|
60
|
+
|
|
55
61
|
|
|
56
62
|
def load(
|
|
57
63
|
self,
|
|
@@ -632,6 +638,7 @@ def _load_wiff(
|
|
|
632
638
|
mz=peaks.mz.values,
|
|
633
639
|
inty=peaks.intensity.values,
|
|
634
640
|
ms_level=ms_level,
|
|
641
|
+
centroided=False, # WIFF files always contain profile data
|
|
635
642
|
)
|
|
636
643
|
bl = spect.baseline()
|
|
637
644
|
spect = spect.denoise(threshold=bl)
|
masster/sample/plot.py
CHANGED
|
@@ -387,18 +387,19 @@ def plot_2d(
|
|
|
387
387
|
show_only_features_with_ms2=False,
|
|
388
388
|
show_isotopes=False,
|
|
389
389
|
show_ms2=False,
|
|
390
|
+
show_in_browser=False,
|
|
390
391
|
title=None,
|
|
391
392
|
cmap=None,
|
|
392
393
|
marker="circle",
|
|
393
394
|
markersize=10,
|
|
394
|
-
size="
|
|
395
|
+
size="static",
|
|
395
396
|
raster_dynamic=True,
|
|
396
397
|
raster_max_px=8,
|
|
397
398
|
raster_threshold=0.8,
|
|
398
399
|
height=600,
|
|
399
400
|
width=800,
|
|
400
401
|
mz_range=None,
|
|
401
|
-
rt_range=None
|
|
402
|
+
rt_range=None
|
|
402
403
|
):
|
|
403
404
|
"""
|
|
404
405
|
Plot a two-dimensional visualization of MS1 survey scan data with optional overlays
|
|
@@ -634,8 +635,10 @@ def plot_2d(
|
|
|
634
635
|
("m/z", "@mz{0.0000}"),
|
|
635
636
|
("feature_uid", "@feature_uid"),
|
|
636
637
|
("inty", "@inty"),
|
|
637
|
-
("
|
|
638
|
-
("
|
|
638
|
+
("iso", "@iso"),
|
|
639
|
+
("adduct", "@adduct"),
|
|
640
|
+
("chrom_coherence", "@chrom_coherence"),
|
|
641
|
+
("chrom_prominence_scaled", "@chrom_prominence_scaled"),
|
|
639
642
|
],
|
|
640
643
|
)
|
|
641
644
|
feature_points_1 = hv.Points(
|
|
@@ -644,8 +647,8 @@ def plot_2d(
|
|
|
644
647
|
vdims=[
|
|
645
648
|
"feature_uid",
|
|
646
649
|
"inty",
|
|
647
|
-
"
|
|
648
|
-
"
|
|
650
|
+
"iso",
|
|
651
|
+
"adduct",
|
|
649
652
|
"ms2_scans",
|
|
650
653
|
"chrom_coherence",
|
|
651
654
|
"chrom_prominence_scaled",
|
|
@@ -666,8 +669,10 @@ def plot_2d(
|
|
|
666
669
|
("m/z", "@mz{0.0000}"),
|
|
667
670
|
("feature_uid", "@feature_uid"),
|
|
668
671
|
("inty", "@inty"),
|
|
669
|
-
("
|
|
670
|
-
("
|
|
672
|
+
("iso", "@iso"),
|
|
673
|
+
("adduct", "@adduct"),
|
|
674
|
+
("chrom_coherence", "@chrom_coherence"),
|
|
675
|
+
("chrom_prominence_scaled", "@chrom_prominence_scaled"),
|
|
671
676
|
],
|
|
672
677
|
)
|
|
673
678
|
feature_points_2 = hv.Points(
|
|
@@ -676,8 +681,8 @@ def plot_2d(
|
|
|
676
681
|
vdims=[
|
|
677
682
|
"feature_uid",
|
|
678
683
|
"inty",
|
|
679
|
-
"
|
|
680
|
-
"
|
|
684
|
+
"iso",
|
|
685
|
+
"adduct",
|
|
681
686
|
"chrom_coherence",
|
|
682
687
|
"chrom_prominence_scaled",
|
|
683
688
|
],
|
|
@@ -702,10 +707,11 @@ def plot_2d(
|
|
|
702
707
|
("m/z", "@mz{0.0000}"),
|
|
703
708
|
("feature_uid", "@feature_uid"),
|
|
704
709
|
("inty", "@inty"),
|
|
705
|
-
("quality", "@quality"),
|
|
706
|
-
("rt_delta", "@rt_delta"),
|
|
707
710
|
("iso", "@iso"),
|
|
708
711
|
("iso_of", "@iso_of"),
|
|
712
|
+
("adduct", "@adduct"),
|
|
713
|
+
("chrom_coherence", "@chrom_coherence"),
|
|
714
|
+
("chrom_prominence_scaled", "@chrom_prominence_scaled"),
|
|
709
715
|
],
|
|
710
716
|
)
|
|
711
717
|
feature_points_iso = hv.Points(
|
|
@@ -714,10 +720,9 @@ def plot_2d(
|
|
|
714
720
|
vdims=[
|
|
715
721
|
"feature_uid",
|
|
716
722
|
"inty",
|
|
717
|
-
"quality",
|
|
718
|
-
"rt_delta",
|
|
719
723
|
"iso",
|
|
720
724
|
"iso_of",
|
|
725
|
+
"adduct",
|
|
721
726
|
"chrom_coherence",
|
|
722
727
|
"chrom_prominence_scaled",
|
|
723
728
|
],
|
|
@@ -918,21 +923,24 @@ def plot_2d(
|
|
|
918
923
|
else:
|
|
919
924
|
# For slider plots, save the current state
|
|
920
925
|
hv.save(create_feature_overlay(markersize), filename, fmt="png")
|
|
921
|
-
return None
|
|
922
926
|
else:
|
|
923
|
-
#
|
|
924
|
-
|
|
927
|
+
# Use show() to serve the layout and open it in a browser tab
|
|
928
|
+
layout.show()
|
|
925
929
|
else:
|
|
926
930
|
# Create a panel layout without slider
|
|
927
931
|
layout = panel.Column(overlay)
|
|
928
932
|
|
|
933
|
+
# Handle display logic based on show_in_browser and raster_dynamic
|
|
929
934
|
if filename is not None:
|
|
930
935
|
# Use consistent save/display behavior
|
|
931
936
|
self._handle_sample_plot_output(layout, filename, "panel")
|
|
932
|
-
return None
|
|
933
937
|
else:
|
|
934
|
-
#
|
|
935
|
-
|
|
938
|
+
# Show in browser if both show_in_browser and raster_dynamic are True
|
|
939
|
+
if show_in_browser and raster_dynamic:
|
|
940
|
+
layout.show()
|
|
941
|
+
else:
|
|
942
|
+
# Return to notebook for inline display
|
|
943
|
+
return layout
|
|
936
944
|
|
|
937
945
|
|
|
938
946
|
def plot_2d_oracle(
|
|
@@ -1952,11 +1960,10 @@ def plot_feature_stats(
|
|
|
1952
1960
|
filename=None,
|
|
1953
1961
|
):
|
|
1954
1962
|
"""
|
|
1955
|
-
Generates
|
|
1963
|
+
Generates a grid of overlaid density plots for selected feature metrics.
|
|
1956
1964
|
The distributions are created separately for features with and without MS2 data.
|
|
1957
|
-
Metrics include
|
|
1958
|
-
|
|
1959
|
-
differences between features that are linked to MS2 spectra and those that are not.
|
|
1965
|
+
Metrics include mz, rt, log10(inty), rt_delta, log10(quality), chrom_coherence, chrom_prominence, chrom_prominence_scaled, and chrom_height_scaled.
|
|
1966
|
+
The plots help to visualize the distribution differences between features that are linked to MS2 spectra and those that are not.
|
|
1960
1967
|
|
|
1961
1968
|
Parameters:
|
|
1962
1969
|
filename (str, optional): The output filename. If the filename ends with ".html",
|
|
@@ -1972,54 +1979,28 @@ def plot_feature_stats(
|
|
|
1972
1979
|
# Convert to pandas for operations that require pandas functionality
|
|
1973
1980
|
if hasattr(feats, "to_pandas"):
|
|
1974
1981
|
feats = feats.to_pandas()
|
|
1975
|
-
# Compute m/z delta for each feature
|
|
1976
|
-
feats["mz_delta"] = feats["mz_end"] - feats["mz_start"]
|
|
1977
|
-
# Add a column with the number of peaks in the MS2 spectrum
|
|
1978
|
-
feats["MS2peaks"] = feats["ms2_specs"].apply(
|
|
1979
|
-
lambda x: len(x[0]) if x is not None else 0,
|
|
1980
|
-
)
|
|
1981
|
-
# Add a column with the sum of intensities in the MS2 spectrum
|
|
1982
|
-
feats["MS2int"] = feats["ms2_specs"].apply(
|
|
1983
|
-
lambda x: sum(x[0].inty) if x is not None else 0,
|
|
1984
|
-
)
|
|
1985
1982
|
|
|
1986
|
-
#
|
|
1987
|
-
feats["MS2toMS1"] = feats["MS2int"] / feats["inty"]
|
|
1988
|
-
# Apply log10 transformation to intensity, quality, and MS2int columns (handling non-positive values)
|
|
1983
|
+
# Apply log10 transformation to intensity (handling non-positive values)
|
|
1989
1984
|
feats["inty"] = np.where(feats["inty"] <= 0, np.nan, np.log10(feats["inty"]))
|
|
1990
|
-
|
|
1991
|
-
#
|
|
1992
|
-
|
|
1993
|
-
# )
|
|
1994
|
-
|
|
1995
|
-
feats["quality"] = np.where(
|
|
1996
|
-
feats["quality"] <= 0,
|
|
1997
|
-
np.nan,
|
|
1998
|
-
np.log10(feats["quality"]),
|
|
1999
|
-
)
|
|
2000
|
-
feats["MS2int"] = np.where(feats["MS2int"] <= 0, np.nan, np.log10(feats["MS2int"]))
|
|
1985
|
+
|
|
1986
|
+
# Apply log10 transformation to quality (handling non-positive values)
|
|
1987
|
+
feats["quality"] = np.where(feats["quality"] <= 0, np.nan, np.log10(feats["quality"]))
|
|
2001
1988
|
|
|
2002
1989
|
# Separate features based on presence of MS2 data
|
|
2003
1990
|
feats_with_MS2 = feats[feats["ms2_scans"].notnull()]
|
|
2004
1991
|
feats_without_MS2 = feats[feats["ms2_scans"].isnull()]
|
|
2005
1992
|
|
|
2006
|
-
# Define the metrics to plot
|
|
1993
|
+
# Define the specific metrics to plot
|
|
2007
1994
|
cols_to_plot = [
|
|
2008
1995
|
"mz",
|
|
2009
|
-
"
|
|
2010
|
-
"inty",
|
|
2011
|
-
"quality",
|
|
2012
|
-
"rt",
|
|
1996
|
+
"rt",
|
|
1997
|
+
"inty", # Already log10 transformed above
|
|
2013
1998
|
"rt_delta",
|
|
1999
|
+
"quality", # Already log10 transformed above
|
|
2014
2000
|
"chrom_coherence",
|
|
2015
2001
|
"chrom_prominence",
|
|
2016
2002
|
"chrom_prominence_scaled",
|
|
2017
|
-
|
|
2018
|
-
# "chrom_heights",
|
|
2019
|
-
# "chrom_heights_scaled",
|
|
2020
|
-
"MS2peaks",
|
|
2021
|
-
"MS2int",
|
|
2022
|
-
"MS2toMS1",
|
|
2003
|
+
"chrom_height_scaled",
|
|
2023
2004
|
]
|
|
2024
2005
|
|
|
2025
2006
|
# Ensure an index column is available for plotting
|
|
@@ -2032,29 +2013,39 @@ def plot_feature_stats(
|
|
|
2032
2013
|
data_with = feats_with_MS2[col].dropna().values
|
|
2033
2014
|
data_without = feats_without_MS2[col].dropna().values
|
|
2034
2015
|
|
|
2035
|
-
# Create distribution elements for
|
|
2016
|
+
# Create distribution elements - Green for WITH MS2, Red for WITHOUT MS2
|
|
2036
2017
|
dist_with = hv.Distribution(data_with, label="With MS2").opts(
|
|
2037
|
-
color="
|
|
2018
|
+
color="green",
|
|
2038
2019
|
alpha=0.6,
|
|
2039
2020
|
)
|
|
2040
2021
|
dist_without = hv.Distribution(data_without, label="Without MS2").opts(
|
|
2041
|
-
color="
|
|
2022
|
+
color="red",
|
|
2042
2023
|
alpha=0.6,
|
|
2043
2024
|
)
|
|
2044
2025
|
|
|
2045
2026
|
# Overlay the distributions with a legend and hover tool enabled
|
|
2027
|
+
title = col
|
|
2028
|
+
if col == "inty":
|
|
2029
|
+
title = "log10(inty)"
|
|
2030
|
+
elif col == "quality":
|
|
2031
|
+
title = "log10(quality)"
|
|
2032
|
+
|
|
2046
2033
|
overlay = (dist_with * dist_without).opts(
|
|
2047
|
-
title=
|
|
2034
|
+
title=title,
|
|
2048
2035
|
show_legend=True,
|
|
2049
2036
|
tools=["hover"],
|
|
2050
2037
|
)
|
|
2051
2038
|
density_plots.append(overlay)
|
|
2052
2039
|
|
|
2053
|
-
# Arrange the plots in a layout
|
|
2040
|
+
# Arrange the plots in a grid layout with 3 columns
|
|
2054
2041
|
layout = hv.Layout(density_plots).cols(3).opts(shared_axes=False)
|
|
2055
2042
|
|
|
2056
2043
|
# Use consistent save/display behavior
|
|
2057
|
-
|
|
2044
|
+
if filename is not None:
|
|
2045
|
+
self._handle_sample_plot_output(layout, filename, "holoviews")
|
|
2046
|
+
else:
|
|
2047
|
+
# Return the layout directly for notebook display
|
|
2048
|
+
return layout
|
|
2058
2049
|
|
|
2059
2050
|
|
|
2060
2051
|
def plot_tic(
|
masster/sample/processing.py
CHANGED
|
@@ -1273,3 +1273,161 @@ def find_ms2(self, **kwargs):
|
|
|
1273
1273
|
self.logger.debug(
|
|
1274
1274
|
"Parameters stored to find_ms2",
|
|
1275
1275
|
)
|
|
1276
|
+
|
|
1277
|
+
|
|
1278
|
+
def find_iso(self, rt_tolerance: float = 0.1, **kwargs):
|
|
1279
|
+
"""Extract isotopic distributions from MS1 data and add to features_df.
|
|
1280
|
+
|
|
1281
|
+
This method processes each feature to find isotopic distributions from MS1 data,
|
|
1282
|
+
similar to the study.find_iso() method but for individual samples. The method
|
|
1283
|
+
adds a new 'ms1_spec' column to features_df containing numpy arrays with
|
|
1284
|
+
isotopic distribution data.
|
|
1285
|
+
|
|
1286
|
+
Args:
|
|
1287
|
+
rt_tolerance (float): RT tolerance in minutes for matching MS1 scans. Default 0.1.
|
|
1288
|
+
**kwargs: Additional parameters
|
|
1289
|
+
|
|
1290
|
+
Notes:
|
|
1291
|
+
- Adds a new 'ms1_spec' column to features_df containing numpy arrays
|
|
1292
|
+
- Each array contains [mz, intensity] pairs for the isotopic distribution
|
|
1293
|
+
- Uses the same isotope shift pattern as study.find_iso()
|
|
1294
|
+
- Only processes features that don't already have ms1_spec data
|
|
1295
|
+
"""
|
|
1296
|
+
if self.features_df is None or self.features_df.is_empty():
|
|
1297
|
+
self.logger.warning("No features found. Run find_features() first.")
|
|
1298
|
+
return
|
|
1299
|
+
|
|
1300
|
+
if self.ms1_df is None or self.ms1_df.is_empty():
|
|
1301
|
+
self.logger.warning("No MS1 data found.")
|
|
1302
|
+
return
|
|
1303
|
+
|
|
1304
|
+
# Check if ms1_spec column already exists
|
|
1305
|
+
if "ms1_spec" in self.features_df.columns:
|
|
1306
|
+
features_without_spec = self.features_df.filter(pl.col("ms1_spec").is_null())
|
|
1307
|
+
if features_without_spec.is_empty():
|
|
1308
|
+
self.logger.info("All features already have isotopic distributions.")
|
|
1309
|
+
return
|
|
1310
|
+
self.logger.info(f"Processing {len(features_without_spec)} features without isotopic distributions.")
|
|
1311
|
+
else:
|
|
1312
|
+
# Add the ms1_spec column with None values
|
|
1313
|
+
self.features_df = self.features_df.with_columns(
|
|
1314
|
+
pl.lit(None, dtype=pl.Object).alias("ms1_spec")
|
|
1315
|
+
)
|
|
1316
|
+
features_without_spec = self.features_df
|
|
1317
|
+
self.logger.info(f"Processing {len(features_without_spec)} features for isotopic distributions.")
|
|
1318
|
+
|
|
1319
|
+
# Define isotope shifts (same as study.find_iso)
|
|
1320
|
+
isotope_shifts = np.array([
|
|
1321
|
+
0.33,
|
|
1322
|
+
0.50,
|
|
1323
|
+
0.66,
|
|
1324
|
+
1.00335,
|
|
1325
|
+
1.50502,
|
|
1326
|
+
2.00670,
|
|
1327
|
+
3.01005,
|
|
1328
|
+
4.01340,
|
|
1329
|
+
5.01675,
|
|
1330
|
+
6.02010,
|
|
1331
|
+
7.02345,
|
|
1332
|
+
])
|
|
1333
|
+
|
|
1334
|
+
# Convert rt_tolerance from minutes to seconds
|
|
1335
|
+
rt_tolerance_s = rt_tolerance * 60
|
|
1336
|
+
|
|
1337
|
+
# Process each feature
|
|
1338
|
+
ms1_specs = []
|
|
1339
|
+
feature_indices = []
|
|
1340
|
+
|
|
1341
|
+
for i, row in enumerate(tqdm(
|
|
1342
|
+
features_without_spec.rows(named=True),
|
|
1343
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extracting isotope patterns"
|
|
1344
|
+
)):
|
|
1345
|
+
feature_rt = row["rt"]
|
|
1346
|
+
feature_mz = row["mz"]
|
|
1347
|
+
|
|
1348
|
+
# Find MS1 scans within RT tolerance
|
|
1349
|
+
rt_mask = (
|
|
1350
|
+
(self.ms1_df["rt"] >= (feature_rt - rt_tolerance_s)) &
|
|
1351
|
+
(self.ms1_df["rt"] <= (feature_rt + rt_tolerance_s))
|
|
1352
|
+
)
|
|
1353
|
+
ms1_in_range = self.ms1_df.filter(rt_mask)
|
|
1354
|
+
|
|
1355
|
+
if ms1_in_range.is_empty():
|
|
1356
|
+
ms1_specs.append(None)
|
|
1357
|
+
feature_indices.append(row["feature_uid"])
|
|
1358
|
+
continue
|
|
1359
|
+
|
|
1360
|
+
# Extract isotopic pattern
|
|
1361
|
+
isotope_pattern = []
|
|
1362
|
+
|
|
1363
|
+
# Start with the monoisotopic peak (M+0)
|
|
1364
|
+
base_intensity = 0
|
|
1365
|
+
mz_tolerance = 0.01 # 10 ppm at 1000 Da
|
|
1366
|
+
|
|
1367
|
+
# Find the base peak intensity
|
|
1368
|
+
base_mask = (
|
|
1369
|
+
(ms1_in_range["mz"] >= (feature_mz - mz_tolerance)) &
|
|
1370
|
+
(ms1_in_range["mz"] <= (feature_mz + mz_tolerance))
|
|
1371
|
+
)
|
|
1372
|
+
base_peaks = ms1_in_range.filter(base_mask)
|
|
1373
|
+
|
|
1374
|
+
if not base_peaks.is_empty():
|
|
1375
|
+
base_intensity = base_peaks["inty"].max()
|
|
1376
|
+
isotope_pattern.append([feature_mz, base_intensity])
|
|
1377
|
+
|
|
1378
|
+
# Look for isotope peaks
|
|
1379
|
+
for shift in isotope_shifts:
|
|
1380
|
+
isotope_mz = feature_mz + shift
|
|
1381
|
+
isotope_mask = (
|
|
1382
|
+
(ms1_in_range["mz"] >= (isotope_mz - mz_tolerance)) &
|
|
1383
|
+
(ms1_in_range["mz"] <= (isotope_mz + mz_tolerance))
|
|
1384
|
+
)
|
|
1385
|
+
isotope_peaks = ms1_in_range.filter(isotope_mask)
|
|
1386
|
+
|
|
1387
|
+
if not isotope_peaks.is_empty():
|
|
1388
|
+
max_intensity = isotope_peaks["inty"].max()
|
|
1389
|
+
# Only keep isotope peaks that are at least 1% of base peak
|
|
1390
|
+
if base_intensity > 0 and max_intensity >= 0.01 * base_intensity:
|
|
1391
|
+
# Get the mz of the most intense peak
|
|
1392
|
+
max_peak = isotope_peaks.filter(pl.col("inty") == max_intensity).row(0, named=True)
|
|
1393
|
+
isotope_pattern.append([max_peak["mz"], max_intensity])
|
|
1394
|
+
|
|
1395
|
+
# Convert to numpy array or None if empty
|
|
1396
|
+
if len(isotope_pattern) > 1: # Need at least 2 points (monoisotopic + 1 isotope)
|
|
1397
|
+
ms1_spec = np.array(isotope_pattern, dtype=np.float64)
|
|
1398
|
+
else:
|
|
1399
|
+
ms1_spec = None
|
|
1400
|
+
|
|
1401
|
+
ms1_specs.append(ms1_spec)
|
|
1402
|
+
feature_indices.append(row["feature_uid"])
|
|
1403
|
+
|
|
1404
|
+
# Update the features_df with the isotopic spectra
|
|
1405
|
+
update_df = pl.DataFrame({
|
|
1406
|
+
"feature_uid": feature_indices,
|
|
1407
|
+
"ms1_spec_new": pl.Series("ms1_spec_new", ms1_specs, dtype=pl.Object)
|
|
1408
|
+
})
|
|
1409
|
+
|
|
1410
|
+
# Join and update
|
|
1411
|
+
self.features_df = (
|
|
1412
|
+
self.features_df.join(
|
|
1413
|
+
update_df,
|
|
1414
|
+
on="feature_uid",
|
|
1415
|
+
how="left"
|
|
1416
|
+
)
|
|
1417
|
+
.with_columns([
|
|
1418
|
+
pl.when(pl.col("ms1_spec_new").is_not_null())
|
|
1419
|
+
.then(pl.col("ms1_spec_new"))
|
|
1420
|
+
.otherwise(pl.col("ms1_spec"))
|
|
1421
|
+
.alias("ms1_spec")
|
|
1422
|
+
])
|
|
1423
|
+
.drop("ms1_spec_new")
|
|
1424
|
+
)
|
|
1425
|
+
|
|
1426
|
+
# Log results
|
|
1427
|
+
non_null_count = len([spec for spec in ms1_specs if spec is not None])
|
|
1428
|
+
self.logger.info(f"Extracted isotopic distributions for {non_null_count}/{len(ms1_specs)} features.")
|
|
1429
|
+
|
|
1430
|
+
# Store parameters in history
|
|
1431
|
+
params_dict = {"rt_tolerance": rt_tolerance}
|
|
1432
|
+
params_dict.update(kwargs)
|
|
1433
|
+
self.store_history(["find_iso"], params_dict)
|
masster/sample/sample.py
CHANGED
|
@@ -56,15 +56,6 @@ from masster.sample.helpers import _estimate_memory_usage
|
|
|
56
56
|
from masster.sample.helpers import _get_scan_uids
|
|
57
57
|
from masster.sample.helpers import _get_feature_uids
|
|
58
58
|
from masster.sample.helpers import _features_sync
|
|
59
|
-
|
|
60
|
-
# from masster.sample.helpers import _parse_adduct_specs
|
|
61
|
-
# from masster.sample.helpers import _calculate_adduct_mass_shift
|
|
62
|
-
# from masster.sample.helpers import _parse_formula_expression
|
|
63
|
-
# from masster.sample.helpers import _calculate_molecular_mass
|
|
64
|
-
# from masster.sample.helpers import _parse_legacy_adduct_format
|
|
65
|
-
# from masster.sample.helpers import _extract_adduct_probability
|
|
66
|
-
# from masster.sample.helpers import _detect_adduct_groups_direct
|
|
67
|
-
# from masster.sample.helpers import _check_adduct_relationship
|
|
68
59
|
from masster.sample.adducts import _get_adducts
|
|
69
60
|
from masster.sample.adducts import find_adducts
|
|
70
61
|
from masster.sample.helpers import features_delete
|
|
@@ -106,6 +97,7 @@ from masster.sample.processing import _get_ztscan_stats
|
|
|
106
97
|
from masster.sample.processing import _spec_to_mat
|
|
107
98
|
from masster.sample.processing import analyze_dda
|
|
108
99
|
from masster.sample.processing import find_features
|
|
100
|
+
from masster.sample.processing import find_iso
|
|
109
101
|
from masster.sample.processing import find_ms2
|
|
110
102
|
from masster.sample.processing import get_spectrum
|
|
111
103
|
from masster.sample.parameters import store_history
|
|
@@ -227,6 +219,7 @@ class Sample:
|
|
|
227
219
|
save = save
|
|
228
220
|
find_features = find_features
|
|
229
221
|
find_adducts = find_adducts
|
|
222
|
+
find_iso = find_iso
|
|
230
223
|
find_ms2 = find_ms2
|
|
231
224
|
get_spectrum = get_spectrum
|
|
232
225
|
filter = features_filter
|