masster 0.4.20__py3-none-any.whl → 0.4.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/__init__.py +6 -0
- masster/_version.py +1 -1
- masster/sample/h5.py +58 -1
- masster/sample/load.py +7 -1
- masster/sample/plot.py +56 -65
- masster/sample/processing.py +158 -0
- masster/sample/sample.py +2 -0
- masster/sample/sample5_schema.json +3 -0
- masster/sample/save.py +137 -59
- masster/spectrum.py +58 -9
- masster/study/export.py +238 -152
- masster/study/h5.py +65 -1
- masster/study/helpers.py +3 -3
- masster/study/merge.py +25 -10
- masster/study/plot.py +39 -2
- masster/study/processing.py +257 -1
- masster/study/save.py +48 -5
- masster/study/study.py +16 -3
- masster/study/study5_schema.json +3 -0
- masster/wizard/__init__.py +5 -2
- masster/wizard/wizard.py +430 -1866
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/METADATA +1 -1
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/RECORD +26 -28
- masster/wizard/test_structure.py +0 -49
- masster/wizard/test_wizard.py +0 -285
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/WHEEL +0 -0
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/entry_points.txt +0 -0
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/licenses/LICENSE +0 -0
masster/__init__.py
CHANGED
@@ -8,6 +8,12 @@ mass spectrometry workflows.
 
 from __future__ import annotations
 
+import warnings
+
+# Suppress pyOpenMS environment variable warnings globally
+warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH.*", category=UserWarning)
+warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH.*", category=UserWarning)
+
 from masster._version import __version__
 
 # from masster._version import get_version
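The same filter pattern reappears in masster/sample/load.py below. A minimal standalone sketch (standard library only, not masster code): the message argument is a regular expression matched against the start of the warning text, and the catch_warnings() form used around the pyopenms import keeps extra filters scoped to that block.

```python
import warnings

# Module-level filters, as in masster/__init__.py: matched against the start of
# the warning message, so the OPENMS_DATA_PATH notices are silenced process-wide.
warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH.*", category=UserWarning)

# Scoped variant, as used around the pyopenms import in load.py: filters added
# inside the context manager are discarded when the block exits.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH.*", category=UserWarning)
    warnings.warn("OPENMS_DATA_PATH environment variable already exists", UserWarning)  # suppressed

warnings.warn("some other warning", UserWarning)  # still shown, does not match the pattern
```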
masster/_version.py
CHANGED
masster/sample/h5.py
CHANGED
@@ -235,6 +235,22 @@ def _save_sample5(
                         data=serialized_data,
                         compression="gzip",
                     )
+                elif col == "ms1_spec":
+                    # this column contains either None or numpy arrays with isotope pattern data
+                    # serialize numpy arrays to JSON strings for storage
+                    data = features[col]
+                    data_as_json_strings = []
+                    for i in range(len(data)):
+                        if data[i] is not None:
+                            # Convert numpy array to list and then to JSON
+                            data_as_json_strings.append(json.dumps(data[i].tolist()))
+                        else:
+                            data_as_json_strings.append("None")
+                    features_group.create_dataset(
+                        col,
+                        data=data_as_json_strings,
+                        compression="gzip",
+                    )
 
                 else:
                     self.logger.warning(
@@ -630,6 +646,25 @@ def _load_sample5(self, filename: str, map: bool = False):
                                 )
                                 reconstructed_data.append(spectrum_list)
 
+                        data[col] = reconstructed_data
+                    case "ms1_spec":
+                        data_col = features_group[col][:]
+                        # Convert JSON strings back to numpy arrays
+                        reconstructed_data = []
+                        for item in data_col:
+                            if isinstance(item, bytes):
+                                item = item.decode("utf-8")
+
+                            if item == "None" or item == "":
+                                reconstructed_data.append(None)
+                            else:
+                                try:
+                                    # Parse JSON string to get list and convert to numpy array
+                                    array_data = json.loads(item)
+                                    reconstructed_data.append(np.array(array_data, dtype=np.float64))
+                                except (json.JSONDecodeError, ValueError, TypeError):
+                                    reconstructed_data.append(None)
+
                         data[col] = reconstructed_data
                     case _:
                         self.logger.debug(f"Unexpected Object column '{col}'")
@@ -1371,6 +1406,25 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                             ):
                                 reconstructed_data.append(None)
 
+                        data[col] = reconstructed_data
+                    case "ms1_spec":
+                        data_col = features_group[col][:]
+                        # Convert JSON strings back to numpy arrays
+                        reconstructed_data = []
+                        for item in data_col:
+                            if isinstance(item, bytes):
+                                item = item.decode("utf-8")
+
+                            if item == "None" or item == "":
+                                reconstructed_data.append(None)
+                            else:
+                                try:
+                                    # Parse JSON string to get list and convert to numpy array
+                                    array_data = json.loads(item)
+                                    reconstructed_data.append(np.array(array_data, dtype=np.float64))
+                                except (json.JSONDecodeError, ValueError, TypeError):
+                                    reconstructed_data.append(None)
+
                         data[col] = reconstructed_data
                     case _:
                         # Handle other Object columns as raw data
@@ -1407,6 +1461,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
         # Add Object columns one by one
         for col, values in object_columns.items():
             if not self.features_df.is_empty():
+                # Fix for missing columns: if values is None, create list of None with correct length
+                if values is None:
+                    values = [None] * len(self.features_df)
                 self.features_df = self.features_df.with_columns(
                     pl.Series(col, values, dtype=pl.Object).alias(col),
                 )
@@ -2027,7 +2084,7 @@ def load_dataframe_from_h5_group(
     for col in schema_columns:
         if col not in group:
             if logger:
-                logger.
+                logger.info(f"Column '{col}' not found in {df_name}.")
             data[col] = None
             missing_columns.append(col)
             continue
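Taken together, the save and load hunks above implement a straightforward round trip for the new ms1_spec column: optional numpy arrays are stored as JSON strings (with the literal string "None" standing in for missing values) and parsed back into float64 arrays on load. A self-contained sketch of that round trip; the helper names here are illustrative and not part of masster.

```python
import json
import numpy as np

def serialize_ms1_spec(column):
    """Turn a list of Optional[np.ndarray] into JSON strings for HDF5 storage."""
    return ["None" if arr is None else json.dumps(arr.tolist()) for arr in column]

def deserialize_ms1_spec(column):
    """Invert serialize_ms1_spec, tolerating bytes and malformed entries."""
    out = []
    for item in column:
        if isinstance(item, bytes):
            item = item.decode("utf-8")
        if item in ("None", ""):
            out.append(None)
            continue
        try:
            out.append(np.array(json.loads(item), dtype=np.float64))
        except (json.JSONDecodeError, ValueError, TypeError):
            out.append(None)
    return out

specs = [np.array([[100.05, 1.0e6], [101.05, 2.0e5]]), None]
restored = deserialize_ms1_spec(serialize_ms1_spec(specs))
assert restored[1] is None and np.allclose(restored[0], specs[0])
```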
masster/sample/load.py
CHANGED
@@ -48,9 +48,14 @@ from tqdm import tqdm
 from masster.chromatogram import Chromatogram
 from masster.spectrum import Spectrum
 
+# Suppress pyOpenMS warnings globally
+warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH.*", category=UserWarning)
+warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH.*", category=UserWarning)
+
 # Import pyopenms with suppressed warnings
 with warnings.catch_warnings():
-    warnings.filterwarnings("ignore", message="
+    warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH environment variable already exists.*", category=UserWarning)
+    warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH.*", category=UserWarning)
     import pyopenms as oms
 
 
@@ -633,6 +638,7 @@ def _load_wiff(
                     mz=peaks.mz.values,
                     inty=peaks.intensity.values,
                     ms_level=ms_level,
+                    centroided=False,  # WIFF files always contain profile data
                 )
                 bl = spect.baseline()
                 spect = spect.denoise(threshold=bl)
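The _load_wiff change simply flags WIFF scans as profile data when each Spectrum is built, before baseline removal and denoising. A hedged sketch of that call sequence, using only the keyword names and method calls visible in the hunk above; any additional required Spectrum arguments are omitted, and the values are made up.

```python
import numpy as np
from masster.spectrum import Spectrum

# Profile-mode points around a single peak (synthetic values).
mz = np.array([100.00, 100.01, 100.02, 100.03])
inty = np.array([5.0, 80.0, 120.0, 7.0])

spect = Spectrum(
    mz=mz,
    inty=inty,
    ms_level=1,
    centroided=False,  # WIFF files always contain profile data
)
bl = spect.baseline()                # baseline estimate, as in _load_wiff
spect = spect.denoise(threshold=bl)  # denoise relative to that baseline
```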
masster/sample/plot.py
CHANGED
@@ -387,18 +387,19 @@ def plot_2d(
     show_only_features_with_ms2=False,
     show_isotopes=False,
     show_ms2=False,
+    show_in_browser=False,
     title=None,
     cmap=None,
     marker="circle",
     markersize=10,
-    size="
+    size="static",
     raster_dynamic=True,
     raster_max_px=8,
     raster_threshold=0.8,
     height=600,
     width=800,
     mz_range=None,
-    rt_range=None
+    rt_range=None
 ):
     """
     Plot a two-dimensional visualization of MS1 survey scan data with optional overlays
@@ -634,8 +635,10 @@ def plot_2d(
             ("m/z", "@mz{0.0000}"),
             ("feature_uid", "@feature_uid"),
             ("inty", "@inty"),
-            ("
-            ("
+            ("iso", "@iso"),
+            ("adduct", "@adduct"),
+            ("chrom_coherence", "@chrom_coherence"),
+            ("chrom_prominence_scaled", "@chrom_prominence_scaled"),
         ],
     )
     feature_points_1 = hv.Points(
@@ -644,8 +647,8 @@ def plot_2d(
         vdims=[
             "feature_uid",
             "inty",
-            "
-            "
+            "iso",
+            "adduct",
             "ms2_scans",
             "chrom_coherence",
             "chrom_prominence_scaled",
@@ -666,8 +669,10 @@ def plot_2d(
             ("m/z", "@mz{0.0000}"),
             ("feature_uid", "@feature_uid"),
             ("inty", "@inty"),
-            ("
-            ("
+            ("iso", "@iso"),
+            ("adduct", "@adduct"),
+            ("chrom_coherence", "@chrom_coherence"),
+            ("chrom_prominence_scaled", "@chrom_prominence_scaled"),
         ],
     )
     feature_points_2 = hv.Points(
@@ -676,8 +681,8 @@ def plot_2d(
         vdims=[
             "feature_uid",
             "inty",
-            "
-            "
+            "iso",
+            "adduct",
             "chrom_coherence",
             "chrom_prominence_scaled",
         ],
@@ -702,10 +707,11 @@ def plot_2d(
             ("m/z", "@mz{0.0000}"),
             ("feature_uid", "@feature_uid"),
             ("inty", "@inty"),
-            ("quality", "@quality"),
-            ("rt_delta", "@rt_delta"),
             ("iso", "@iso"),
             ("iso_of", "@iso_of"),
+            ("adduct", "@adduct"),
+            ("chrom_coherence", "@chrom_coherence"),
+            ("chrom_prominence_scaled", "@chrom_prominence_scaled"),
         ],
     )
     feature_points_iso = hv.Points(
@@ -714,10 +720,9 @@ def plot_2d(
         vdims=[
             "feature_uid",
             "inty",
-            "quality",
-            "rt_delta",
             "iso",
             "iso_of",
+            "adduct",
             "chrom_coherence",
             "chrom_prominence_scaled",
         ],
@@ -918,21 +923,24 @@ def plot_2d(
             else:
                 # For slider plots, save the current state
                 hv.save(create_feature_overlay(markersize), filename, fmt="png")
-                return None
         else:
-            #
-
+            # Use show() for display in notebook
+            layout.show()
     else:
         # Create a panel layout without slider
         layout = panel.Column(overlay)
 
+        # Handle display logic based on show_in_browser and raster_dynamic
         if filename is not None:
             # Use consistent save/display behavior
             self._handle_sample_plot_output(layout, filename, "panel")
-            return None
         else:
-            #
-
+            # Show in browser if both show_in_browser and raster_dynamic are True
+            if show_in_browser and raster_dynamic:
+                layout.show()
+            else:
+                # Return to notebook for inline display
+                return layout
 
 
 def plot_2d_oracle(
@@ -1952,11 +1960,10 @@ def plot_feature_stats(
     filename=None,
 ):
     """
-    Generates
+    Generates vertically stacked density plots for selected feature metrics.
     The distributions are created separately for features with and without MS2 data.
-    Metrics include
-
-    differences between features that are linked to MS2 spectra and those that are not.
+    Metrics include mz, rt, log10(inty), chrom_coherence, chrom_prominence, and chrom_prominence_scaled.
+    The plots help to visualize the distribution differences between features that are linked to MS2 spectra and those that are not.
 
     Parameters:
         filename (str, optional): The output filename. If the filename ends with ".html",
@@ -1972,54 +1979,28 @@ def plot_feature_stats(
     # Convert to pandas for operations that require pandas functionality
     if hasattr(feats, "to_pandas"):
         feats = feats.to_pandas()
-    # Compute m/z delta for each feature
-    feats["mz_delta"] = feats["mz_end"] - feats["mz_start"]
-    # Add a column with the number of peaks in the MS2 spectrum
-    feats["MS2peaks"] = feats["ms2_specs"].apply(
-        lambda x: len(x[0]) if x is not None else 0,
-    )
-    # Add a column with the sum of intensities in the MS2 spectrum
-    feats["MS2int"] = feats["ms2_specs"].apply(
-        lambda x: sum(x[0].inty) if x is not None else 0,
-    )
 
-    #
-    feats["MS2toMS1"] = feats["MS2int"] / feats["inty"]
-    # Apply log10 transformation to intensity, quality, and MS2int columns (handling non-positive values)
+    # Apply log10 transformation to intensity (handling non-positive values)
     feats["inty"] = np.where(feats["inty"] <= 0, np.nan, np.log10(feats["inty"]))
-
-    #
-
-    # )
-
-    feats["quality"] = np.where(
-        feats["quality"] <= 0,
-        np.nan,
-        np.log10(feats["quality"]),
-    )
-    feats["MS2int"] = np.where(feats["MS2int"] <= 0, np.nan, np.log10(feats["MS2int"]))
+
+    # Apply log10 transformation to quality (handling non-positive values)
+    feats["quality"] = np.where(feats["quality"] <= 0, np.nan, np.log10(feats["quality"]))
 
     # Separate features based on presence of MS2 data
     feats_with_MS2 = feats[feats["ms2_scans"].notnull()]
     feats_without_MS2 = feats[feats["ms2_scans"].isnull()]
 
-    # Define the metrics to plot
+    # Define the specific metrics to plot
     cols_to_plot = [
         "mz",
-        "
-        "inty",
-        "quality",
-        "rt",
+        "rt",
+        "inty",  # Already log10 transformed above
         "rt_delta",
+        "quality",  # Already log10 transformed above
         "chrom_coherence",
         "chrom_prominence",
         "chrom_prominence_scaled",
-
-        # "chrom_heights",
-        # "chrom_heights_scaled",
-        "MS2peaks",
-        "MS2int",
-        "MS2toMS1",
+        "chrom_height_scaled",
     ]
 
     # Ensure an index column is available for plotting
@@ -2032,29 +2013,39 @@ def plot_feature_stats(
         data_with = feats_with_MS2[col].dropna().values
         data_without = feats_without_MS2[col].dropna().values
 
-        # Create distribution elements for
+        # Create distribution elements - Green for WITH MS2, Red for WITHOUT MS2
        dist_with = hv.Distribution(data_with, label="With MS2").opts(
-            color="
+            color="green",
             alpha=0.6,
         )
         dist_without = hv.Distribution(data_without, label="Without MS2").opts(
-            color="
+            color="red",
             alpha=0.6,
         )
 
         # Overlay the distributions with a legend and hover tool enabled
+        title = col
+        if col == "inty":
+            title = "log10(inty)"
+        elif col == "quality":
+            title = "log10(quality)"
+
         overlay = (dist_with * dist_without).opts(
-            title=
+            title=title,
             show_legend=True,
             tools=["hover"],
         )
         density_plots.append(overlay)
 
-    # Arrange the plots in a layout
+    # Arrange the plots in a grid layout (3 columns for 7 plots)
     layout = hv.Layout(density_plots).cols(3).opts(shared_axes=False)
 
     # Use consistent save/display behavior
-
+    if filename is not None:
+        self._handle_sample_plot_output(layout, filename, "holoviews")
+    else:
+        # Return the layout directly for notebook display
+        return layout
 
 
 def plot_tic(
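The reworked plot_feature_stats reduces to one pattern per metric: overlay two hv.Distribution elements (green for features with MS2, red for those without), give the overlay a per-metric title, and tile the overlays in a three-column layout. A standalone sketch of that pattern with synthetic data; the values and filename here are placeholders.

```python
import numpy as np
import holoviews as hv

hv.extension("bokeh")

# Synthetic intensity distributions for the two feature groups.
with_ms2 = np.log10(np.random.lognormal(mean=6.0, sigma=0.5, size=500))
without_ms2 = np.log10(np.random.lognormal(mean=5.0, sigma=0.7, size=500))

dist_with = hv.Distribution(with_ms2, label="With MS2").opts(color="green", alpha=0.6)
dist_without = hv.Distribution(without_ms2, label="Without MS2").opts(color="red", alpha=0.6)

overlay = (dist_with * dist_without).opts(title="log10(inty)", show_legend=True, tools=["hover"])
layout = hv.Layout([overlay]).cols(3).opts(shared_axes=False)

# hv.save(layout, "feature_stats.html")  # or return/display the layout in a notebook
```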
masster/sample/processing.py
CHANGED
@@ -1273,3 +1273,161 @@ def find_ms2(self, **kwargs):
     self.logger.debug(
         "Parameters stored to find_ms2",
     )
+
+
+def find_iso(self, rt_tolerance: float = 0.1, **kwargs):
+    """Extract isotopic distributions from MS1 data and add to features_df.
+
+    This method processes each feature to find isotopic distributions from MS1 data,
+    similar to the study.find_iso() method but for individual samples. The method
+    adds a new 'ms1_spec' column to features_df containing numpy arrays with
+    isotopic distribution data.
+
+    Args:
+        rt_tolerance (float): RT tolerance in minutes for matching MS1 scans. Default 0.1.
+        **kwargs: Additional parameters
+
+    Notes:
+        - Adds a new 'ms1_spec' column to features_df containing numpy arrays
+        - Each array contains [mz, intensity] pairs for the isotopic distribution
+        - Uses the same isotope shift pattern as study.find_iso()
+        - Only processes features that don't already have ms1_spec data
+    """
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.warning("No features found. Run find_features() first.")
+        return
+
+    if self.ms1_df is None or self.ms1_df.is_empty():
+        self.logger.warning("No MS1 data found.")
+        return
+
+    # Check if ms1_spec column already exists
+    if "ms1_spec" in self.features_df.columns:
+        features_without_spec = self.features_df.filter(pl.col("ms1_spec").is_null())
+        if features_without_spec.is_empty():
+            self.logger.info("All features already have isotopic distributions.")
+            return
+        self.logger.info(f"Processing {len(features_without_spec)} features without isotopic distributions.")
+    else:
+        # Add the ms1_spec column with None values
+        self.features_df = self.features_df.with_columns(
+            pl.lit(None, dtype=pl.Object).alias("ms1_spec")
+        )
+        features_without_spec = self.features_df
+        self.logger.info(f"Processing {len(features_without_spec)} features for isotopic distributions.")
+
+    # Define isotope shifts (same as study.find_iso)
+    isotope_shifts = np.array([
+        0.33,
+        0.50,
+        0.66,
+        1.00335,
+        1.50502,
+        2.00670,
+        3.01005,
+        4.01340,
+        5.01675,
+        6.02010,
+        7.02345,
+    ])
+
+    # Convert rt_tolerance from minutes to seconds
+    rt_tolerance_s = rt_tolerance * 60
+
+    # Process each feature
+    ms1_specs = []
+    feature_indices = []
+
+    for i, row in enumerate(tqdm(
+        features_without_spec.rows(named=True),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extracting isotope patterns"
+    )):
+        feature_rt = row["rt"]
+        feature_mz = row["mz"]
+
+        # Find MS1 scans within RT tolerance
+        rt_mask = (
+            (self.ms1_df["rt"] >= (feature_rt - rt_tolerance_s)) &
+            (self.ms1_df["rt"] <= (feature_rt + rt_tolerance_s))
+        )
+        ms1_in_range = self.ms1_df.filter(rt_mask)
+
+        if ms1_in_range.is_empty():
+            ms1_specs.append(None)
+            feature_indices.append(row["feature_uid"])
+            continue
+
+        # Extract isotopic pattern
+        isotope_pattern = []
+
+        # Start with the monoisotopic peak (M+0)
+        base_intensity = 0
+        mz_tolerance = 0.01  # 10 ppm at 1000 Da
+
+        # Find the base peak intensity
+        base_mask = (
+            (ms1_in_range["mz"] >= (feature_mz - mz_tolerance)) &
+            (ms1_in_range["mz"] <= (feature_mz + mz_tolerance))
+        )
+        base_peaks = ms1_in_range.filter(base_mask)
+
+        if not base_peaks.is_empty():
+            base_intensity = base_peaks["inty"].max()
+            isotope_pattern.append([feature_mz, base_intensity])
+
+        # Look for isotope peaks
+        for shift in isotope_shifts:
+            isotope_mz = feature_mz + shift
+            isotope_mask = (
+                (ms1_in_range["mz"] >= (isotope_mz - mz_tolerance)) &
+                (ms1_in_range["mz"] <= (isotope_mz + mz_tolerance))
+            )
+            isotope_peaks = ms1_in_range.filter(isotope_mask)
+
+            if not isotope_peaks.is_empty():
+                max_intensity = isotope_peaks["inty"].max()
+                # Only keep isotope peaks that are at least 1% of base peak
+                if base_intensity > 0 and max_intensity >= 0.01 * base_intensity:
+                    # Get the mz of the most intense peak
+                    max_peak = isotope_peaks.filter(pl.col("inty") == max_intensity).row(0, named=True)
+                    isotope_pattern.append([max_peak["mz"], max_intensity])
+
+        # Convert to numpy array or None if empty
+        if len(isotope_pattern) > 1:  # Need at least 2 points (monoisotopic + 1 isotope)
+            ms1_spec = np.array(isotope_pattern, dtype=np.float64)
+        else:
+            ms1_spec = None
+
+        ms1_specs.append(ms1_spec)
+        feature_indices.append(row["feature_uid"])
+
+    # Update the features_df with the isotopic spectra
+    update_df = pl.DataFrame({
+        "feature_uid": feature_indices,
+        "ms1_spec_new": pl.Series("ms1_spec_new", ms1_specs, dtype=pl.Object)
+    })
+
+    # Join and update
+    self.features_df = (
+        self.features_df.join(
+            update_df,
+            on="feature_uid",
+            how="left"
+        )
+        .with_columns([
+            pl.when(pl.col("ms1_spec_new").is_not_null())
+            .then(pl.col("ms1_spec_new"))
+            .otherwise(pl.col("ms1_spec"))
+            .alias("ms1_spec")
+        ])
+        .drop("ms1_spec_new")
+    )
+
+    # Log results
+    non_null_count = len([spec for spec in ms1_specs if spec is not None])
+    self.logger.info(f"Extracted isotopic distributions for {non_null_count}/{len(ms1_specs)} features.")
+
+    # Store parameters in history
+    params_dict = {"rt_tolerance": rt_tolerance}
+    params_dict.update(kwargs)
+    self.store_history(["find_iso"], params_dict)
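The core of the new find_iso is a windowed lookup per feature: restrict MS1 peaks to the feature's RT window, take the most intense peak near the feature m/z as the monoisotopic reference, then test each isotope shift within a fixed m/z tolerance and keep hits that reach at least 1% of the base intensity. A reduced, single-feature sketch of that logic on toy data; the column names follow the hunk above, the DataFrame itself is fabricated.

```python
import numpy as np
import polars as pl

# Toy MS1 peak table with the columns used by find_iso.
ms1_df = pl.DataFrame({
    "rt":   [120.0, 120.1, 120.2, 120.1, 120.1],
    "mz":   [300.100, 301.103, 302.107, 450.200, 300.101],
    "inty": [1.0e6, 2.2e5, 3.0e4, 5.0e5, 9.5e5],
})

feature_rt, feature_mz = 120.1, 300.100
rt_tol_s, mz_tol = 0.1 * 60, 0.01

in_range = ms1_df.filter(
    (pl.col("rt") >= feature_rt - rt_tol_s) & (pl.col("rt") <= feature_rt + rt_tol_s)
)

pattern = []
base = in_range.filter((pl.col("mz") - feature_mz).abs() <= mz_tol)
if not base.is_empty():
    base_inty = base["inty"].max()
    pattern.append([feature_mz, base_inty])
    for shift in (1.00335, 2.00670, 3.01005):  # subset of the shifts listed above
        hits = in_range.filter((pl.col("mz") - (feature_mz + shift)).abs() <= mz_tol)
        if not hits.is_empty() and hits["inty"].max() >= 0.01 * base_inty:
            best = hits.filter(pl.col("inty") == hits["inty"].max())
            pattern.append([best["mz"][0], best["inty"][0]])

ms1_spec = np.array(pattern, dtype=np.float64) if len(pattern) > 1 else None
print(ms1_spec)
```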
masster/sample/sample.py
CHANGED
@@ -97,6 +97,7 @@ from masster.sample.processing import _get_ztscan_stats
 from masster.sample.processing import _spec_to_mat
 from masster.sample.processing import analyze_dda
 from masster.sample.processing import find_features
+from masster.sample.processing import find_iso
 from masster.sample.processing import find_ms2
 from masster.sample.processing import get_spectrum
 from masster.sample.parameters import store_history
@@ -218,6 +219,7 @@ class Sample:
     save = save
     find_features = find_features
     find_adducts = find_adducts
+    find_iso = find_iso
     find_ms2 = find_ms2
     get_spectrum = get_spectrum
     filter = features_filter