MaldiAMRKit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maldiamrkit/__init__.py +19 -0
- maldiamrkit/config.py +14 -0
- maldiamrkit/dataset.py +185 -0
- maldiamrkit/io.py +31 -0
- maldiamrkit/peak_detector.py +57 -0
- maldiamrkit/preprocessing.py +67 -0
- maldiamrkit/spectrum.py +105 -0
- maldiamrkit-0.1.0.dist-info/METADATA +84 -0
- maldiamrkit-0.1.0.dist-info/RECORD +12 -0
- maldiamrkit-0.1.0.dist-info/WHEEL +5 -0
- maldiamrkit-0.1.0.dist-info/licenses/LICENSE +21 -0
- maldiamrkit-0.1.0.dist-info/top_level.txt +1 -0
maldiamrkit/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from .config import PreprocessingSettings
|
|
2
|
+
from .preprocessing import preprocess, bin_spectrum
|
|
3
|
+
from .io import read_spectrum
|
|
4
|
+
from .dataset import MaldiSet
|
|
5
|
+
from .spectrum import MaldiSpectrum
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
__author__ = "Ettore Rocchi"
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"MaldiSpectrum",
|
|
12
|
+
"MaldiSet",
|
|
13
|
+
"PreprocessingSettings",
|
|
14
|
+
"preprocess",
|
|
15
|
+
"bin_spectrum",
|
|
16
|
+
"read_spectrum",
|
|
17
|
+
"__version__",
|
|
18
|
+
"__author__",
|
|
19
|
+
]
|
maldiamrkit/config.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
@dataclass
class PreprocessingSettings:
    """Tunable parameters shared by the preprocessing and binning steps."""

    # Lower / upper m/z bounds used when trimming and when building bin edges.
    trim_from: int = 2_000
    trim_to: int = 20_000

    # Passed to the Savitzky-Golay filter as ``window_length`` / ``polyorder``.
    savgol_window: int = 20
    savgol_poly: int = 2
    # Passed to the SNIP baseline estimator as ``max_half_window``.
    baseline_half_window: int = 40

    def as_dict(self) -> dict:
        """Return a shallow copy of the instance attributes as a plain dict."""
        return dict(vars(self))
|
maldiamrkit/dataset.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import matplotlib.pyplot as plt
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from .spectrum import MaldiSpectrum
|
|
8
|
+
from .config import PreprocessingSettings
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MaldiSet:
    """
    A collection of spectra with metadata.

    Parameters
    ----------
    spectra : list[MaldiSpectrum]
        Spectra making up the dataset; each one is matched to the metadata
        through its ``id`` attribute.
    meta : pd.DataFrame
        Metadata table. It must contain an ``"ID"`` column, which becomes
        the index of ``self.meta``.
    aggregate_by : dict[str, str]
        Optional keys: ``"antibiotic"`` (name of the metadata column holding
        the labels) and ``"species"`` (value that the ``"Species"`` metadata
        column must equal for a sample to be kept).
    bin_width : int
        Bin width used for spectra that have not been binned yet.
    verbose : bool
        If True, print progress and warning messages.

    Example
    -------
    >>> ds = MaldiSet.from_directory(  # doctest: +SKIP
    ...     "spectra/", "meta.csv",
    ...     aggregate_by=dict(antibiotic="Ceftriaxone")
    ... )
    >>> ds.X.shape, ds.y.value_counts()  # doctest: +SKIP
    """

    def __init__(
        self,
        spectra: list[MaldiSpectrum],
        meta: pd.DataFrame,
        *,
        aggregate_by: dict[str, str],
        bin_width: int = 3,
        verbose: bool = False,
    ) -> None:
        self.spectra = spectra
        self.meta = meta.set_index("ID")

        self.antibiotic = aggregate_by.get("antibiotic")
        self.species = aggregate_by.get("species")
        self.bin_width = bin_width

        self.verbose = verbose
        if verbose:
            print(f"INFO: Dataset created: {len(self.spectra)} spectra")

    @classmethod
    def from_directory(
        cls,
        spectra_dir: str | Path,
        meta_file: str | Path,
        *,
        aggregate_by: dict[str, str],
        cfg: PreprocessingSettings | None = None,
        bin_width: int = 3,
        verbose: bool = False,
    ) -> MaldiSet:
        """
        Build a dataset from every ``*.txt`` file in `spectra_dir` plus a
        metadata CSV.

        Each file is loaded as a ``MaldiSpectrum`` with `cfg` and binned
        with `bin_width`; `meta_file` is read with ``pandas.read_csv``.
        """
        spectra_dir = Path(spectra_dir)
        # Sort the paths: glob order is filesystem dependent, and the row
        # order of X should be reproducible across machines.
        paths = sorted(spectra_dir.glob("*.txt"))
        specs = [MaldiSpectrum(p, cfg=cfg).bin(bin_width) for p in paths]
        meta = pd.read_csv(meta_file)
        return cls(
            specs,
            meta,
            aggregate_by=aggregate_by,
            bin_width=bin_width,
            verbose=verbose,
        )

    @property
    def X(self) -> pd.DataFrame:
        """
        Return matrix (n_samples, n_features) limited to the configured subset.

        Rows are spectra whose ID is present in the metadata (and that pass
        the antibiotic / species filters); columns are the binned m/z
        labels only - metadata columns are never returned as features.

        Raises
        ------
        ValueError
            If no spectrum has a matching ID in the metadata.
        """
        rows = []
        for s in self.spectra:
            sid = s.id
            if sid not in self.meta.index:
                # Always skip unmatched spectra; `verbose` only controls
                # whether the skip is reported.
                if self.verbose:
                    print(f"WARNING: ID {sid} missing in metadata - skipped.")
                continue
            binned = s.binned if s._binned is not None else s.bin(self.bin_width).binned
            rows.append(binned.set_index("mass")["intensity"].rename(sid))

        if not rows:
            raise ValueError(
                "No spectra with an ID present in the metadata: "
                "cannot build the feature matrix."
            )

        features = pd.concat(rows, axis=1).T

        # The metadata is joined only to apply the filters below.
        df = features.join(self.meta, how="left")
        if self.antibiotic:
            df = df[df[self.antibiotic].notna()]
        if self.species:
            df = df[df["Species"] == self.species]

        # Restrict to the spectral columns so numeric metadata (possibly
        # the label itself) cannot leak into the feature matrix.
        return df[features.columns].select_dtypes("number")

    @property
    def y(self) -> pd.Series:
        """Return the classification/label vector (antibiotic resistance)."""
        return self.meta.loc[self.X.index, self.antibiotic]

    def plot_pseudogel(
        self,
        *,
        antibiotic: str | None = None,
        cmap: str = "inferno",
        vmin: float | None = None,
        vmax: float | None = None,
        figsize: tuple[float, float] | None = None,
        log_scale: bool = True,
        sort_by_intensity: bool = True,
        title: str | None = None,
        show: bool = True,
    ):
        """
        Displays a pseudogel heatmap of the spectra, with one subplot
        for each unique value of the antibiotic column.

        Parameters
        ----------
        antibiotic : str | None
            Name of the target column to use (default: self.antibiotic).
        cmap : str
            Matplotlib colormap to use (default: "inferno").
        vmin, vmax : float | None
            Color scale limits. Use None for automatic scaling.
        figsize : (float, float) | None
            Figure size. If None, it is automatically set based on the number of subplots.
        log_scale : bool
            Apply log1p to intensity values to emphasize weaker signals.
        sort_by_intensity : bool
            Sort samples by average intensity before plotting.
        title : str | None
            Title of the overall figure.
        show : bool
            If True, calls plt.show() at the end of the method.

        Returns
        -------
        fig, axes : matplotlib.figure.Figure, ndarray[Axes]
            Matplotlib figure and axes objects, useful for further customization.

        Raises
        ------
        ValueError
            If no antibiotic column is defined, or if no sample carries a
            label in that column.
        """
        if antibiotic is None:
            antibiotic = self.antibiotic
        if antibiotic is None:
            raise ValueError(
                "Antibiotic column not defined. "
            )

        # Build X once and take the labels from the requested column, so an
        # explicit `antibiotic` argument is honoured.
        X = self.X
        y = self.meta.loc[X.index, antibiotic]

        groups = y.groupby(y).groups
        n_groups = len(groups)
        if n_groups == 0:
            raise ValueError(
                f"No labelled spectra found for column '{antibiotic}'."
            )
        if figsize is None:
            figsize = (6.0, 2.5 * n_groups)

        fig, axes = plt.subplots(
            n_groups, 1, figsize=figsize, sharex=True, constrained_layout=True
        )
        if n_groups == 1:
            # plt.subplots returns a bare Axes for a single subplot.
            axes = np.asarray([axes])

        ordered_groups = sorted(groups.items(), key=lambda t: str(t[0]))
        for ax, (label, idx) in zip(axes, ordered_groups):
            M = X.loc[idx].to_numpy()
            if sort_by_intensity:
                order = np.argsort(M.mean(axis=1))[::-1]
                M = M[order]
            if log_scale:
                M = np.log1p(M)

            im = ax.imshow(
                M,
                aspect="auto",
                interpolation="nearest",
                cmap=cmap,
                vmin=vmin,
                vmax=vmax,
            )
            ax.set_ylabel(f"{label}\n(n={M.shape[0]})", rotation=0, ha="right", va="center")
            ax.set_yticks([])

        # Six evenly spaced tick positions labelled with the bin names.
        xticks = np.linspace(0, X.shape[1] - 1, 6, dtype=int)
        axes[-1].set_xticks(xticks)
        axes[-1].set_xticklabels([f"{m}" for m in X.columns[xticks]])
        axes[-1].set_xlabel("m/z (binned)")

        cbar = fig.colorbar(im, ax=axes, orientation="vertical", pad=0.01)
        cbar.set_label("Log(intensity + 1)" if log_scale else "intensity")

        if title:
            fig.suptitle(title, y=1.02)

        if show:
            plt.show()

        return fig, axes
|
maldiamrkit/io.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
def sniff_delimiter(path: "str | Path", sample_lines: int = 10) -> str:
    """
    Guess the column delimiter of a text file.

    Up to `sample_lines` lines from the start of the file are passed to
    ``csv.Sniffer``, restricted to the candidates comma, semicolon, tab
    and space. Files shorter than `sample_lines` are sniffed on whatever
    lines they contain.

    Raises
    ------
    csv.Error
        If the sniffer cannot determine a delimiter.
    """
    # The annotation is quoted because this module has no
    # ``from __future__ import annotations`` and ``str | Path`` cannot be
    # evaluated at import time on Python 3.9.
    with open(path, "r", newline="") as f:
        # zip() stops at the shorter input, so a short file no longer
        # raises StopIteration as repeated next(f) calls did.
        sample = "".join(line for _, line in zip(range(sample_lines), f))
    return csv.Sniffer().sniff(sample, delimiters=",;\t ").delimiter
|
|
12
|
+
|
|
13
|
+
def read_spectrum(path: "str | Path") -> pd.DataFrame:
    """
    Read raw txt/csv file with two unnamed columns into a DataFrame
    with columns ['mass', 'intensity'].

    The delimiter is detected with ``sniff_delimiter``; when detection
    fails the file is parsed as whitespace separated. Lines starting with
    ``#`` are treated as comments.
    """
    try:
        delim = sniff_delimiter(path)
    except Exception:
        # Deliberate best-effort fallback: any sniffing failure means
        # "treat as whitespace separated". Unlike a bare ``except`` this
        # lets KeyboardInterrupt / SystemExit propagate.
        delim = r"\s+"

    if delim == " ":
        # A literal single-space separator turns runs of spaces (padding,
        # trailing blanks) into empty columns; the whitespace regex reads
        # single-space files identically and copes with padding too.
        delim = r"\s+"

    return pd.read_csv(
        path,
        sep=delim,
        comment="#",
        header=None,
        names=["mass", "intensity"],
    )
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from sklearn.base import BaseEstimator, TransformerMixin
|
|
5
|
+
from scipy.signal import find_peaks
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class MaldiPeakDetector(BaseEstimator, TransformerMixin):
    """
    Peak detector for MALDI-TOF spectra.

    The transformer keeps the original feature dimension; all non-peak
    positions are set to 0. Peaks can be returned as **binary flags**
    or with their original intensities.

    Parameters
    ----------
    binary : bool, default=True
        If *True* every peak is marked with 1; otherwise its original
        intensity is kept.
    **kwargs :
        Any keyword accepted by :func:`scipy.signal.find_peaks`
        (e.g. `prominence`, `height`, `distance`, ...).

    Notes
    -----
    ``get_params`` / ``set_params`` are overridden so the ``find_peaks``
    keywords are reported as parameters. The default scikit-learn
    implementation only looks at named ``__init__`` arguments, so cloning
    the estimator would otherwise drop them.
    """

    def __init__(
        self,
        binary: bool = True,
        **kwargs
    ) -> None:
        self.binary = binary
        self.kwargs = kwargs

    def get_params(self, deep: bool = True) -> dict:
        """Return ``binary`` plus every stored ``find_peaks`` keyword."""
        # A fresh dict is returned on every call, so callers may modify it
        # without touching the estimator's own state.
        params = {"binary": self.binary}
        params.update(self.kwargs)
        return params

    def set_params(self, **params):
        """Update ``binary`` and/or the ``find_peaks`` keywords; return *self*."""
        if "binary" in params:
            self.binary = params.pop("binary")
        # Rebind instead of mutating in place, in case the dict is shared.
        self.kwargs = {**self.kwargs, **params}
        return self

    def fit(self, X: pd.DataFrame, y=None):
        """No learning required, just return *self*."""
        return self

    def transform(self, X: pd.DataFrame):
        """
        Detect peaks in *each* sample independently and mask everything else.

        Parameters
        ----------
        X : pd.DataFrame or array-like
            One spectrum per row (2-D).

        Returns
        -------
        pd.DataFrame or np.ndarray
            Same shape as `X`. A DataFrame input gives a DataFrame with the
            same index and columns; any other input gives a NumPy array.
        """
        values = np.asarray(X)
        masked = np.zeros_like(values)

        for i, row in enumerate(values):
            peaks, _ = find_peaks(row, **self.kwargs)
            masked[i, peaks] = 1 if self.binary else row[peaks]

        if isinstance(X, pd.DataFrame):
            return pd.DataFrame(masked, index=X.index, columns=X.columns)
        return masked

    def fit_transform(self, X: pd.DataFrame, y=None, **fit_params):
        """Convenience shortcut."""
        return self.fit(X, y).transform(X)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from pybaselines import Baseline
|
|
4
|
+
from scipy.signal import savgol_filter
|
|
5
|
+
|
|
6
|
+
from .config import PreprocessingSettings
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def preprocess(
    df: pd.DataFrame,
    cfg: "PreprocessingSettings | None" = None
) -> pd.DataFrame:
    """
    Return intensity-normalised, baseline-corrected and trimmed spectrum.

    Steps, in order: clip negative intensities to 0, square-root
    transform, Savitzky-Golay smoothing, SNIP baseline subtraction
    (negative residuals set to 0), trimming to
    ``[cfg.trim_from, cfg.trim_to]`` (both ends inclusive) and division by
    the total intensity when that total is positive.

    Parameters
    ----------
    df : pd.DataFrame
        Spectrum with ``"mass"`` and ``"intensity"`` columns. It is not
        modified.
    cfg : PreprocessingSettings | None
        Settings to use; ``None`` means a fresh default
        ``PreprocessingSettings()``.

    Returns
    -------
    pd.DataFrame
        Columns ``"mass"`` and ``"intensity"``, re-indexed from 0.
    """
    # Create the default per call: a ``PreprocessingSettings()`` default in
    # the signature would be built once at import time and shared.
    if cfg is None:
        cfg = PreprocessingSettings()

    df = df.copy()
    df["intensity"] = df["intensity"].clip(lower=0)

    # smooth+sqrt
    intensity = np.sqrt(df["intensity"])
    intensity = savgol_filter(
        intensity,
        window_length=cfg.savgol_window,
        polyorder=cfg.savgol_poly,
    )

    # baseline correction
    bkg = Baseline(x_data=df["mass"]).snip(
        intensity,
        max_half_window=cfg.baseline_half_window,
        decreasing=True,
        smooth_half_window=0,
    )[0]
    intensity -= bkg
    intensity[intensity < 0] = 0  # remove any small negative values post-baseline

    out = pd.DataFrame({"mass": df["mass"], "intensity": intensity})

    mmin, mmax = cfg.trim_from, cfg.trim_to
    out = out[out.mass.between(mmin, mmax)].reset_index(drop=True)

    total = out["intensity"].sum()
    if total > 0:
        out["intensity"] /= total
    return out
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def bin_spectrum(
|
|
45
|
+
df: pd.DataFrame,
|
|
46
|
+
cfg: PreprocessingSettings,
|
|
47
|
+
bin_width: int | float | None = None,
|
|
48
|
+
) -> pd.DataFrame:
|
|
49
|
+
"""
|
|
50
|
+
Bin intensities using *inclusive left* intervals
|
|
51
|
+
[start, start+bin_width). Returns DataFrame («mass», «intensity»).
|
|
52
|
+
"""
|
|
53
|
+
if bin_width is None:
|
|
54
|
+
raise ValueError(" 'bin_width=None': no binning requested.")
|
|
55
|
+
|
|
56
|
+
edges = np.arange(cfg.trim_from, cfg.trim_to + bin_width, bin_width)
|
|
57
|
+
labels = edges[:-1].astype(str)
|
|
58
|
+
binned = (
|
|
59
|
+
df
|
|
60
|
+
.assign(bins=pd.cut(df.mass, edges, labels=labels, include_lowest=True))
|
|
61
|
+
.groupby("bins", observed=True)["intensity"]
|
|
62
|
+
.sum()
|
|
63
|
+
.reindex(labels, fill_value=0.0)
|
|
64
|
+
.reset_index()
|
|
65
|
+
.rename(columns={"bins": "mass"})
|
|
66
|
+
)
|
|
67
|
+
return binned
|
maldiamrkit/spectrum.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from .config import PreprocessingSettings
|
|
6
|
+
from .preprocessing import preprocess, bin_spectrum
|
|
7
|
+
from .io import read_spectrum
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class MaldiSpectrum:
    """
    A single MALDI-TOF spectrum.

    Parameters
    ----------
    source : str | Path | pd.DataFrame
        Path of a raw spectrum file (read with ``read_spectrum``; the file
        stem becomes ``id``) or an in-memory DataFrame (copied; ``id`` is
        ``"in-memory"`` and ``path`` is None).
    cfg : PreprocessingSettings | None
        Preprocessing settings; defaults to ``PreprocessingSettings()``.
    verbose : bool
        If True, print a message after preprocessing and binning.

    Workflow
    --------
    >>> spec = MaldiSpectrum("raw/abc.txt")  # doctest: +SKIP
    >>> spec.preprocess()  # doctest: +SKIP
    >>> spec.bin(3)  # doctest: +SKIP
    """

    def __init__(
        self,
        source: str | Path | pd.DataFrame,
        *,
        cfg: PreprocessingSettings | None = None,
        verbose: bool = False,
    ) -> None:
        self.cfg = cfg if cfg is not None else PreprocessingSettings()
        self._raw: pd.DataFrame
        self._preprocessed: pd.DataFrame | None = None
        self._binned: pd.DataFrame | None = None
        # Initialised here so the `bin_width` property is readable (as
        # None) before `.bin()` has been called.
        self._bin_width: int | float | None = None
        self.verbose = verbose

        if isinstance(source, (str, Path)):
            self.path = Path(source)
            self._raw = read_spectrum(self.path)
            self.id = self.path.stem
        elif isinstance(source, pd.DataFrame):
            self.path = None
            self._raw = source.copy()
            self.id = "in-memory"
        else:
            raise TypeError("Unsupported source type for MaldiSpectrum")

    @property
    def raw(self) -> pd.DataFrame:
        """Copy of the raw spectrum."""
        return self._raw.copy()

    @property
    def bin_width(self) -> int | float | None:
        """Bin width passed to the last `.bin()` call, or None if never binned."""
        return self._bin_width

    @property
    def preprocessed(self) -> pd.DataFrame:
        """Copy of the preprocessed spectrum; requires a prior `.preprocess()`."""
        if self._preprocessed is None:
            raise RuntimeError("Call .preprocess() before accessing this property.")
        return self._preprocessed.copy()

    @property
    def binned(self) -> pd.DataFrame:
        """Copy of the binned spectrum; requires a prior `.bin()`."""
        if self._binned is None:
            raise RuntimeError("Call .bin() before accessing this property.")
        return self._binned.copy()

    def preprocess(self, **override) -> MaldiSpectrum:
        """
        Run baseline correction, smoothing, normalisation, trimming.
        Optionally override parameters from the current `PreprocessingSettings`
        with `**override` *kwargs*.

        The overrides apply to this call only; ``self.cfg`` is left
        unchanged. Returns *self* so calls can be chained.
        """
        if override:
            cfg = self.cfg.__class__(**{**self.cfg.as_dict(), **override})
        else:
            cfg = self.cfg
        self._preprocessed = preprocess(self._raw, cfg)
        if self.verbose:
            print(f"INFO: Preprocessed spectrum {self.id}")
        return self

    def bin(self, bin_width: int | float) -> MaldiSpectrum:
        """
        Bin the preprocessed spectrum with the given `bin_width`.

        The spectrum is preprocessed first if that has not happened yet.
        `bin_width` must not be None: ``bin_spectrum`` raises ValueError
        in that case. Returns *self* so calls can be chained.
        """
        self._bin_width = bin_width

        if self._preprocessed is None:
            self.preprocess()

        self._binned = bin_spectrum(self._preprocessed, self.cfg, self._bin_width)
        if self.verbose:
            print(f"INFO: Binned spectrum {self.id} (w={self._bin_width})")
        return self

    def plot(self, binned: bool = True, ax=None, **kwargs):
        """
        Plot the spectrum and return the matplotlib Axes used.

        With ``binned=True`` the binned spectrum is drawn as a seaborn bar
        plot (`.bin()` must have been called). Otherwise the preprocessed
        spectrum - or the raw one if no preprocessing was run - is drawn
        as a line. Extra `kwargs` are forwarded to the plotting call.
        """
        # Imported lazily so the plotting libraries are only needed here.
        import matplotlib.pyplot as plt
        import seaborn as sns

        _ax = ax if ax is not None else plt.subplots(figsize=(10, 4))[1]
        if binned:
            data = self.binned
        elif self._preprocessed is not None:
            data = self.preprocessed
        else:
            data = self.raw

        if binned:
            sns.barplot(data=data, x="mass", y="intensity", ax=_ax, **kwargs)
        else:
            _ax.plot(data.mass, data.intensity, **kwargs)
        _ax.set(
            title=f"{self.id}{' (binned)' if binned else ''}",
            xlabel="m/z", ylabel="intensity", xticks=[],
            ylim=[0, (data.intensity.max()) * 1.05],
        )
        return _ax
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: MaldiAMRKit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Toolkit to read and preprocess MALDI-TOF mass-spectra for AMR analyses.
|
|
5
|
+
Author-email: Ettore Rocchi <ettoreroc@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/EttoreRocchi/MaldiAMRKit
|
|
8
|
+
Project-URL: Documentation, https://github.com/EttoreRocchi/MaldiAMRKit#readme
|
|
9
|
+
Project-URL: Source, https://github.com/EttoreRocchi/MaldiAMRKit
|
|
10
|
+
Project-URL: Issues, https://github.com/EttoreRocchi/MaldiAMRKit/issues
|
|
11
|
+
Keywords: MALDI,mass-spectrometry,machine-learning,scikit-learn
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Chemistry
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: pandas
|
|
21
|
+
Requires-Dist: numpy
|
|
22
|
+
Requires-Dist: scipy
|
|
23
|
+
Requires-Dist: scikit-learn
|
|
24
|
+
Requires-Dist: pybaselines
|
|
25
|
+
Requires-Dist: matplotlib
|
|
26
|
+
Requires-Dist: seaborn
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# MaldiAMRKit
|
|
30
|
+
|
|
31
|
+
<p align="center">
|
|
32
|
+
<img src="docs/maldiamrkit.png" alt="MaldiAMRKit" width="250"/>
|
|
33
|
+
</p>
|
|
34
|
+
<p align="center">
|
|
35
|
+
<strong>Toolkit to read and preprocess MALDI-TOF mass-spectra for AMR analyses</strong>
|
|
36
|
+
</p>
|
|
37
|
+
|
|
38
|
+
## 🚀 Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install maldiamrkit
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## 🏃 Quick Start
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from maldiamrkit.spectrum import MaldiSpectrum
|
|
48
|
+
from maldiamrkit.dataset import MaldiSet
|
|
49
|
+
from maldiamrkit.peak_detector import MaldiPeakDetector
|
|
50
|
+
|
|
51
|
+
# Load and preprocess a single spectrum
|
|
52
|
+
spec = MaldiSpectrum("data/1s.txt").preprocess() # smoothing, baseline removal, normalisation
|
|
53
|
+
spec.bin(3) # [optional] bin width 3 Da
|
|
54
|
+
spec.plot(binned=True) # plot
|
|
55
|
+
|
|
56
|
+
# Build a dataset from a directory of spectra + metadata CSV
|
|
57
|
+
data = MaldiSet.from_directory(
|
|
58
|
+
"data/", "data/metadata/metadata.csv",
|
|
59
|
+
aggregate_by=dict(antibiotic="Drug"),
|
|
60
|
+
bin_width=3
|
|
61
|
+
)
|
|
62
|
+
X, y = data.X, data.y
|
|
63
|
+
|
|
64
|
+
# Machine learning pipeline
|
|
65
|
+
from sklearn.pipeline import Pipeline
|
|
66
|
+
from sklearn.preprocessing import StandardScaler
|
|
67
|
+
from sklearn.linear_model import LogisticRegression
|
|
68
|
+
|
|
69
|
+
pipe = Pipeline([
|
|
70
|
+
("peaks", MaldiPeakDetector(binary=False, prominence=0.05)),
|
|
71
|
+
("scaler", StandardScaler()),
|
|
72
|
+
("clf", LogisticRegression(max_iter=500))
|
|
73
|
+
])
|
|
74
|
+
pipe.fit(X, y)
|
|
75
|
+
```
|
|
76
|
+
For further details please see the [quick guide](docs/quick_guide.ipynb).
|
|
77
|
+
|
|
78
|
+
## 🤝 Contributing
|
|
79
|
+
|
|
80
|
+
Pull requests, bug reports, and feature ideas are welcome: feel free to open a PR!
|
|
81
|
+
|
|
82
|
+
## 📝 License
|
|
83
|
+
|
|
84
|
+
This project is licensed under the **MIT License**. See the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
maldiamrkit/__init__.py,sha256=PWufJsuW3fEoO-eXr5TvnttWdmPPKdMZjuBqUJEwho8,418
|
|
2
|
+
maldiamrkit/config.py,sha256=BkfMteV_N7-W1etqi_Jm1wupa9QcJafp3nSp5spl_SI,292
|
|
3
|
+
maldiamrkit/dataset.py,sha256=bH7f7FQV5xuIF6jcktgjna7gpJlXR0tVTrFUmS_7k6o,5939
|
|
4
|
+
maldiamrkit/io.py,sha256=ENWzDFKxC6-3wzWOOyoV52tmk2CkWS8Xk0icOP8sp9o,762
|
|
5
|
+
maldiamrkit/peak_detector.py,sha256=emlf_-KaWMnHocvhJw3leTKXerygtzjQuZBqDauKPJs,1736
|
|
6
|
+
maldiamrkit/preprocessing.py,sha256=wec26M0BWXlGM-LIx7rbjLOrw60-5w8cIfj6k-4JZZ0,2053
|
|
7
|
+
maldiamrkit/spectrum.py,sha256=A_MYkTdVxjW2MaZ3s87czaEmy-_dSuhgEA5aXHhYZ_M,3490
|
|
8
|
+
maldiamrkit-0.1.0.dist-info/licenses/LICENSE,sha256=O34CBRTmdL59PxDYOa6nq1N0-2A9xyXGkBXKbsL1NeY,1070
|
|
9
|
+
maldiamrkit-0.1.0.dist-info/METADATA,sha256=zKLqMaWa0fWIA5D4VES5uWaxttxFk4DrWjqOinaZ_uA,2620
|
|
10
|
+
maldiamrkit-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
11
|
+
maldiamrkit-0.1.0.dist-info/top_level.txt,sha256=Xws_Zvs9hgSfp2yNawSCSC84sYt0_AJmyJzYEjEMFZA,12
|
|
12
|
+
maldiamrkit-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Ettore Rocchi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
maldiamrkit
|