MaldiAMRKit 0.6.2__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maldiamrkit-0.8.0/MaldiAMRKit.egg-info/PKG-INFO +537 -0
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/MaldiAMRKit.egg-info/SOURCES.txt +19 -17
- maldiamrkit-0.8.0/MaldiAMRKit.egg-info/entry_points.txt +2 -0
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/MaldiAMRKit.egg-info/requires.txt +7 -11
- maldiamrkit-0.8.0/PKG-INFO +537 -0
- maldiamrkit-0.8.0/README.md +487 -0
- maldiamrkit-0.8.0/maldiamrkit/__init__.py +37 -0
- maldiamrkit-0.8.0/maldiamrkit/alignment/__init__.py +12 -0
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/maldiamrkit/alignment/raw_warping.py +71 -165
- maldiamrkit-0.8.0/maldiamrkit/alignment/strategies.py +326 -0
- maldiamrkit-0.8.0/maldiamrkit/alignment/warping.py +327 -0
- maldiamrkit-0.8.0/maldiamrkit/builder.py +618 -0
- maldiamrkit-0.8.0/maldiamrkit/cli.py +333 -0
- {maldiamrkit-0.6.2/maldiamrkit/core → maldiamrkit-0.8.0/maldiamrkit}/dataset.py +185 -228
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/maldiamrkit/detection/peak_detector.py +25 -108
- maldiamrkit-0.8.0/maldiamrkit/evaluation/__init__.py +40 -0
- maldiamrkit-0.8.0/maldiamrkit/evaluation/label_encoder.py +138 -0
- maldiamrkit-0.8.0/maldiamrkit/evaluation/metrics.py +327 -0
- maldiamrkit-0.8.0/maldiamrkit/evaluation/splitting.py +279 -0
- maldiamrkit-0.8.0/maldiamrkit/filters.py +265 -0
- maldiamrkit-0.8.0/maldiamrkit/io/__init__.py +5 -0
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/maldiamrkit/io/readers.py +5 -0
- maldiamrkit-0.8.0/maldiamrkit/preprocessing/__init__.py +46 -0
- maldiamrkit-0.8.0/maldiamrkit/preprocessing/binning.py +465 -0
- maldiamrkit-0.8.0/maldiamrkit/preprocessing/merging.py +170 -0
- maldiamrkit-0.8.0/maldiamrkit/preprocessing/pipeline.py +45 -0
- maldiamrkit-0.8.0/maldiamrkit/preprocessing/preprocessing_pipeline.py +272 -0
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/maldiamrkit/preprocessing/quality.py +109 -29
- maldiamrkit-0.8.0/maldiamrkit/preprocessing/transformers.py +352 -0
- {maldiamrkit-0.6.2/maldiamrkit/core → maldiamrkit-0.8.0/maldiamrkit}/spectrum.py +125 -62
- maldiamrkit-0.8.0/maldiamrkit/visualization/__init__.py +12 -0
- maldiamrkit-0.8.0/maldiamrkit/visualization/alignment_plots.py +244 -0
- maldiamrkit-0.8.0/maldiamrkit/visualization/peak_plots.py +131 -0
- maldiamrkit-0.8.0/maldiamrkit/visualization/spectrum_plots.py +229 -0
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/pyproject.toml +14 -4
- maldiamrkit-0.8.0/requirements-docs.txt +11 -0
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/requirements.txt +11 -1
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/tests/test_peak_detector.py +37 -6
- maldiamrkit-0.6.2/MaldiAMRKit.egg-info/PKG-INFO +0 -320
- maldiamrkit-0.6.2/PKG-INFO +0 -320
- maldiamrkit-0.6.2/README.md +0 -267
- maldiamrkit-0.6.2/maldiamrkit/__init__.py +0 -64
- maldiamrkit-0.6.2/maldiamrkit/alignment/__init__.py +0 -6
- maldiamrkit-0.6.2/maldiamrkit/alignment/warping.py +0 -608
- maldiamrkit-0.6.2/maldiamrkit/core/__init__.py +0 -7
- maldiamrkit-0.6.2/maldiamrkit/core/config.py +0 -34
- maldiamrkit-0.6.2/maldiamrkit/io/__init__.py +0 -5
- maldiamrkit-0.6.2/maldiamrkit/preprocessing/__init__.py +0 -14
- maldiamrkit-0.6.2/maldiamrkit/preprocessing/binning.py +0 -336
- maldiamrkit-0.6.2/maldiamrkit/preprocessing/pipeline.py +0 -79
- maldiamrkit-0.6.2/maldiamrkit/utils/__init__.py +0 -6
- maldiamrkit-0.6.2/maldiamrkit/utils/plotting.py +0 -109
- maldiamrkit-0.6.2/maldiamrkit/utils/validation.py +0 -70
- maldiamrkit-0.6.2/requirements-docs.txt +0 -23
- maldiamrkit-0.6.2/tests/test_binning.py +0 -185
- maldiamrkit-0.6.2/tests/test_dataset.py +0 -415
- maldiamrkit-0.6.2/tests/test_pipeline.py +0 -161
- maldiamrkit-0.6.2/tests/test_quality.py +0 -183
- maldiamrkit-0.6.2/tests/test_raw_warping.py +0 -307
- maldiamrkit-0.6.2/tests/test_readers.py +0 -103
- maldiamrkit-0.6.2/tests/test_spectrum.py +0 -222
- maldiamrkit-0.6.2/tests/test_utils.py +0 -131
- maldiamrkit-0.6.2/tests/test_warping.py +0 -261
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/LICENSE +0 -0
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/MaldiAMRKit.egg-info/dependency_links.txt +0 -0
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/MaldiAMRKit.egg-info/top_level.txt +0 -0
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/maldiamrkit/detection/__init__.py +0 -0
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/maldiamrkit/py.typed +0 -0
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/requirements-dev.txt +0 -0
- {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/setup.cfg +0 -0
|
@@ -0,0 +1,537 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: MaldiAMRKit
|
|
3
|
+
Version: 0.8.0
|
|
4
|
+
Summary: A comprehensive toolkit for MALDI-TOF mass spectrometry data preprocessing for antimicrobial resistance (AMR) prediction purposes
|
|
5
|
+
Author-email: Ettore Rocchi <ettore.rocchi3@unibo.it>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/EttoreRocchi/MaldiAMRKit
|
|
8
|
+
Project-URL: Documentation, https://maldiamrkit.readthedocs.io/
|
|
9
|
+
Project-URL: Source, https://github.com/EttoreRocchi/MaldiAMRKit
|
|
10
|
+
Project-URL: Issues, https://github.com/EttoreRocchi/MaldiAMRKit/issues
|
|
11
|
+
Keywords: MALDI,MALDI-TOF,mass-spectrometry,machine-learning,AMR,antimicrobial-resistance
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Intended Audience :: Science/Research
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Chemistry
|
|
21
|
+
Classifier: Operating System :: OS Independent
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: numpy>=1.21.0
|
|
26
|
+
Requires-Dist: pandas>=1.3.0
|
|
27
|
+
Requires-Dist: scipy>=1.13.0
|
|
28
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
29
|
+
Requires-Dist: matplotlib>=3.4.0
|
|
30
|
+
Requires-Dist: seaborn>=0.11.0
|
|
31
|
+
Requires-Dist: pybaselines>=1.0.0
|
|
32
|
+
Requires-Dist: gudhi>=3.5.0
|
|
33
|
+
Requires-Dist: tslearn>=0.6.0
|
|
34
|
+
Requires-Dist: pyyaml>=6.0
|
|
35
|
+
Requires-Dist: pyarrow>=14.0.0
|
|
36
|
+
Requires-Dist: typer>=0.9.0
|
|
37
|
+
Requires-Dist: rich>=13.0.0
|
|
38
|
+
Provides-Extra: docs
|
|
39
|
+
Requires-Dist: sphinx>=7.0.0; extra == "docs"
|
|
40
|
+
Requires-Dist: pydata-sphinx-theme>=0.15.0; extra == "docs"
|
|
41
|
+
Requires-Dist: sphinx-autodoc-typehints>=2.0.0; extra == "docs"
|
|
42
|
+
Requires-Dist: sphinx-design>=0.6.0; extra == "docs"
|
|
43
|
+
Requires-Dist: nbsphinx>=0.9.0; extra == "docs"
|
|
44
|
+
Requires-Dist: ipykernel>=6.0.0; extra == "docs"
|
|
45
|
+
Provides-Extra: dev
|
|
46
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
47
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
48
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
49
|
+
Dynamic: license-file
|
|
50
|
+
|
|
51
|
+
# MaldiAMRKit
|
|
52
|
+
|
|
53
|
+
[CI](https://github.com/EttoreRocchi/MaldiAMRKit/actions/workflows/ci.yml)
|
|
54
|
+
[Codecov](https://codecov.io/github/EttoreRocchi/MaldiAMRKit)
|
|
55
|
+
[Documentation](https://maldiamrkit.readthedocs.io/)
|
|
56
|
+
|
|
57
|
+
[PyPI](https://pypi.org/project/maldiamrkit/)
|
|
58
|
+
[PyPI project page](https://pypi.org/project/maldiamrkit/)
|
|
59
|
+
[License](https://github.com/EttoreRocchi/MaldiAMRKit/blob/main/LICENSE)
|
|
60
|
+
|
|
61
|
+
<p align="center">
|
|
62
|
+
<img src="docs/maldiamrkit.png" alt="MaldiAMRKit" width="320"/>
|
|
63
|
+
</p>
|
|
64
|
+
|
|
65
|
+
<p align="center">
|
|
66
|
+
<strong>A comprehensive toolkit for MALDI-TOF mass spectrometry data preprocessing for antimicrobial resistance (AMR) prediction purposes</strong>
|
|
67
|
+
</p>
|
|
68
|
+
|
|
69
|
+
<p align="center">
|
|
70
|
+
<a href="#installation">Installation</a> •
|
|
71
|
+
<a href="#features">Features</a> •
|
|
72
|
+
<a href="https://maldiamrkit.readthedocs.io/">Documentation</a> •
|
|
73
|
+
<a href="#license">License</a>
|
|
74
|
+
</p>
|
|
75
|
+
|
|
76
|
+
## Installation
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install maldiamrkit
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Development Installation
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
git clone https://github.com/EttoreRocchi/MaldiAMRKit.git
|
|
86
|
+
cd MaldiAMRKit
|
|
87
|
+
pip install -e .[dev]
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Features
|
|
91
|
+
|
|
92
|
+
- **Spectrum Processing**: Load, smooth, baseline correct, and normalize MALDI-TOF spectra
|
|
93
|
+
- **Dataset Management**: Process multiple spectra with metadata integration
|
|
94
|
+
- **Peak Detection**: Local maxima and persistent homology methods
|
|
95
|
+
- **Spectral Alignment (Warping)**: Multiple alignment methods (shift, linear, piecewise, DTW)
|
|
96
|
+
- **Raw Spectra Warping**: Full m/z resolution alignment before binning
|
|
97
|
+
- **Quality Metrics**: SNR estimation, comprehensive quality reports, and alignment assessment
|
|
98
|
+
- **Replicate Merging**: Mean/median/weighted merging of spectral replicates with correlation-based outlier detection
|
|
99
|
+
- **Composable Preprocessing Pipeline**: Build custom `PreprocessingPipeline` from individual transformers, serializable to JSON/YAML
|
|
100
|
+
- **Composable Filter System**: `SpeciesFilter`, `DrugFilter`, `QualityFilter`, `MetadataFilter` with `&`/`|`/`~` operators for flexible dataset filtering
|
|
101
|
+
- **Evaluation Metrics**: VME, ME, sensitivity, specificity, categorical agreement, and `amr_classification_report`
|
|
102
|
+
- **Stratified Splitting**: Species-drug stratified and case-based (patient-grouped) splitting to prevent data leakage
|
|
103
|
+
- **Label Encoding**: `LabelEncoder` for mapping R/I/S to binary with configurable intermediate handling
|
|
104
|
+
- **DRIAMS Dataset Building**: Build DRIAMS-like dataset directories from raw spectra and metadata via `build_driams_dataset()`, with year-based subfolders and custom processing handlers
|
|
105
|
+
- **Spectrum Export**: Save individual spectra (raw, preprocessed, or binned) to CSV or TXT via `MaldiSet.save_spectra()`
|
|
106
|
+
- **CLI**: `maldiamrkit preprocess`, `maldiamrkit quality`, and `maldiamrkit build-driams` commands for batch processing
|
|
107
|
+
- **Parallel Processing**: Multi-core support via the `n_jobs` parameter for faster processing
|
|
108
|
+
- **ML-Ready**: Direct integration with scikit-learn pipelines
|
|
109
|
+
|
|
110
|
+
## Quick Start
|
|
111
|
+
|
|
112
|
+
### Load and Preprocess a Single Spectrum
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
from maldiamrkit import MaldiSpectrum
|
|
116
|
+
|
|
117
|
+
# Load spectrum from file
|
|
118
|
+
spec = MaldiSpectrum("data/spectrum.txt")
|
|
119
|
+
|
|
120
|
+
# Preprocess: smoothing, baseline removal, normalization
|
|
121
|
+
spec.preprocess()
|
|
122
|
+
|
|
123
|
+
# Optional: bin to reduce dimensions
|
|
124
|
+
spec.bin(bin_width=3) # 3 Da bins
|
|
125
|
+
|
|
126
|
+
# Visualize
|
|
127
|
+
from maldiamrkit.visualization import plot_spectrum
|
|
128
|
+
plot_spectrum(spec, binned=True)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Build a Dataset from Multiple Spectra
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from maldiamrkit import MaldiSet
|
|
135
|
+
|
|
136
|
+
# Load multiple spectra with metadata
|
|
137
|
+
data = MaldiSet.from_directory(
|
|
138
|
+
spectra_dir="data/spectra/",
|
|
139
|
+
meta_file="data/metadata.csv",
|
|
140
|
+
aggregate_by=dict(antibiotics="Drug", species="Escherichia coli"),
|
|
141
|
+
bin_width=3
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
# Access features and labels
|
|
145
|
+
X = data.X # Feature matrix
|
|
146
|
+
y = data.get_y_single("Drug") # Target labels
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Build a DRIAMS-like Dataset
|
|
150
|
+
|
|
151
|
+
Create a standardised dataset directory from raw spectra and a metadata CSV:
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
from maldiamrkit import build_driams_dataset, ProcessingHandler
|
|
155
|
+
|
|
156
|
+
# Basic: produces raw/, preprocessed/, binned_6000/, id/
|
|
157
|
+
report = build_driams_dataset(
|
|
158
|
+
spectra_dir="data/spectra/",
|
|
159
|
+
metadata_csv="data/metadata.csv",
|
|
160
|
+
output_dir="output/my_dataset",
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
# With year-based subfolders and extra processing variants
|
|
164
|
+
report = build_driams_dataset(
|
|
165
|
+
"data/spectra/", "data/metadata.csv", "output/my_dataset",
|
|
166
|
+
year_column="acquisition_date",
|
|
167
|
+
extra_handlers=[
|
|
168
|
+
ProcessingHandler("preprocessed_sqrt", "preprocessed",
|
|
169
|
+
pipeline=sqrt_pipeline),
|
|
170
|
+
ProcessingHandler("binned_3000", "binned", bin_width=6),
|
|
171
|
+
],
|
|
172
|
+
)
|
|
173
|
+
print(f"Processed {report.succeeded}/{report.total} spectra")
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Output structure:
|
|
177
|
+
```
|
|
178
|
+
my_dataset/
|
|
179
|
+
├── raw/{year}/ # Raw spectra
|
|
180
|
+
├── preprocessed/{year}/ # Default preprocessing
|
|
181
|
+
├── preprocessed_sqrt/{year}/ # Extra handler output
|
|
182
|
+
├── binned_6000/{year}/ # Default binning (3 Da)
|
|
183
|
+
├── binned_3000/{year}/ # Extra handler output (6 Da)
|
|
184
|
+
└── id/{year}/ # Metadata CSVs
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Binning Methods
|
|
188
|
+
|
|
189
|
+
MaldiAMRKit supports multiple binning strategies:
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from maldiamrkit import MaldiSpectrum
|
|
193
|
+
|
|
194
|
+
spec = MaldiSpectrum("data/spectrum.txt").preprocess()
|
|
195
|
+
|
|
196
|
+
# Uniform binning (default)
|
|
197
|
+
spec.bin(bin_width=3)
|
|
198
|
+
|
|
199
|
+
# Logarithmic binning (width scales with m/z)
|
|
200
|
+
spec.bin(bin_width=3, method="logarithmic")
|
|
201
|
+
|
|
202
|
+
# Adaptive binning (smaller bins in peak-dense regions)
|
|
203
|
+
spec.bin(method="adaptive", adaptive_min_width=1.0, adaptive_max_width=10.0)
|
|
204
|
+
|
|
205
|
+
# Custom binning (user-defined edges)
|
|
206
|
+
spec.bin(method="custom", custom_edges=[2000, 5000, 10000, 15000, 20000])
|
|
207
|
+
|
|
208
|
+
# Access bin metadata
|
|
209
|
+
print(spec.bin_metadata.head())
|
|
210
|
+
# bin_index bin_start bin_end bin_width
|
|
211
|
+
# 0 0 2000.0 2003.0 3.0
|
|
212
|
+
# 1 1 2003.0 2006.0 3.0
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
**Binning Methods:**
|
|
216
|
+
- `uniform`: Fixed width bins (default)
|
|
217
|
+
- `logarithmic`: Bin width scales with m/z (matches instrument resolution)
|
|
218
|
+
- `adaptive`: Smaller bins where peaks are dense, larger bins elsewhere
|
|
219
|
+
- `custom`: User-defined bin edges for domain-specific analysis
|
|
220
|
+
|
|
221
|
+
### Machine Learning Pipeline
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
from sklearn.pipeline import Pipeline
|
|
225
|
+
from sklearn.preprocessing import StandardScaler
|
|
226
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
227
|
+
from sklearn.model_selection import cross_val_score
|
|
228
|
+
from maldiamrkit.alignment import Warping
|
|
229
|
+
from maldiamrkit.detection import MaldiPeakDetector
|
|
230
|
+
|
|
231
|
+
# Create ML pipeline
|
|
232
|
+
pipe = Pipeline([
|
|
233
|
+
("peaks", MaldiPeakDetector(binary=False, prominence=0.05)),
|
|
234
|
+
("warp", Warping(method="shift")),
|
|
235
|
+
("scaler", StandardScaler()),
|
|
236
|
+
("clf", RandomForestClassifier(n_estimators=100, random_state=42))
|
|
237
|
+
])
|
|
238
|
+
|
|
239
|
+
# Cross-validation (recommended over train accuracy)
|
|
240
|
+
scores = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
|
|
241
|
+
print(f"CV Accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
### Spectral Alignment
|
|
245
|
+
|
|
246
|
+
Align spectra to correct for mass calibration drift:
|
|
247
|
+
|
|
248
|
+
```python
|
|
249
|
+
from maldiamrkit.alignment import Warping
|
|
250
|
+
|
|
251
|
+
# Create warping transformer
|
|
252
|
+
warper = Warping(
|
|
253
|
+
method='piecewise', # or 'shift', 'linear', 'dtw'
|
|
254
|
+
reference='median',
|
|
255
|
+
n_segments=5
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
# Fit on training data and transform
|
|
259
|
+
warper.fit(X_train)
|
|
260
|
+
X_aligned = warper.transform(X_test)
|
|
261
|
+
|
|
262
|
+
# Check alignment quality
|
|
263
|
+
quality = warper.get_alignment_quality(X_test, X_aligned)
|
|
264
|
+
print(f"Mean improvement: {quality['improvement'].mean():.4f}")
|
|
265
|
+
|
|
266
|
+
# Visualize
|
|
267
|
+
from maldiamrkit.visualization import plot_alignment
|
|
268
|
+
plot_alignment(warper, X_test, X_aligned, indices=[0], show_peaks=True)
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
### Raw Spectra Warping
|
|
272
|
+
|
|
273
|
+
For higher precision, use `RawWarping`, which operates at full m/z resolution:
|
|
274
|
+
|
|
275
|
+
```python
|
|
276
|
+
from maldiamrkit.alignment import RawWarping, create_raw_input
|
|
277
|
+
|
|
278
|
+
# Create input DataFrame from spectrum files
|
|
279
|
+
X_raw = create_raw_input("data/spectra/")
|
|
280
|
+
|
|
281
|
+
# RawWarping loads the original spectrum files and warps them
|
|
282
|
+
warper = RawWarping(
|
|
283
|
+
method="piecewise",
|
|
284
|
+
bin_width=3,
|
|
285
|
+
max_shift_da=10.0,
|
|
286
|
+
n_jobs=-1 # Parallel processing
|
|
287
|
+
)
|
|
288
|
+
|
|
289
|
+
# Outputs binned data for pipeline compatibility
|
|
290
|
+
warper.fit(X_raw)
|
|
291
|
+
X_aligned = warper.transform(X_raw)
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
**Alignment Methods:**
|
|
295
|
+
- `shift`: Global median shift (fast, simple)
|
|
296
|
+
- `linear`: Least-squares linear transformation
|
|
297
|
+
- `piecewise`: Local shifts across spectrum segments (most flexible)
|
|
298
|
+
- `dtw`: Dynamic Time Warping (best for non-linear drift)
|
|
299
|
+
|
|
300
|
+
### Quality Assessment
|
|
301
|
+
|
|
302
|
+
```python
|
|
303
|
+
from maldiamrkit import MaldiSpectrum
|
|
304
|
+
from maldiamrkit.preprocessing import estimate_snr, SpectrumQuality
|
|
305
|
+
|
|
306
|
+
# Estimate signal-to-noise ratio
|
|
307
|
+
spec = MaldiSpectrum("spectrum.txt").preprocess()
|
|
308
|
+
snr = estimate_snr(spec)
|
|
309
|
+
print(f"SNR: {snr:.1f}")
|
|
310
|
+
|
|
311
|
+
# Comprehensive quality report
|
|
312
|
+
qc = SpectrumQuality() # Uses high m/z region (19500-20000) by default
|
|
313
|
+
report = qc.assess(spec)
|
|
314
|
+
print(f"SNR: {report.snr:.1f}")
|
|
315
|
+
print(f"Peak count: {report.peak_count}")
|
|
316
|
+
print(f"Dynamic range: {report.dynamic_range:.2f}")
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
### Replicate Merging
|
|
320
|
+
|
|
321
|
+
Merge multiple spectral replicates per isolate into a single consensus spectrum:
|
|
322
|
+
|
|
323
|
+
```python
|
|
324
|
+
from maldiamrkit import MaldiSpectrum
|
|
325
|
+
from maldiamrkit.preprocessing import merge_replicates, detect_outlier_replicates
|
|
326
|
+
|
|
327
|
+
# Load replicates as MaldiSpectrum objects
|
|
328
|
+
spectra = [MaldiSpectrum(f"data/isolate_rep{i}.txt") for i in range(1, 4)]
|
|
329
|
+
|
|
330
|
+
# Detect and remove outlier replicates
|
|
331
|
+
keep = detect_outlier_replicates(spectra)
|
|
332
|
+
clean = [s for s, k in zip(spectra, keep) if k]
|
|
333
|
+
|
|
334
|
+
# Merge into a single consensus spectrum
|
|
335
|
+
merged = merge_replicates(clean, method="mean")
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### Composable Preprocessing Pipeline
|
|
339
|
+
|
|
340
|
+
Build a composable, serializable preprocessing pipeline:
|
|
341
|
+
|
|
342
|
+
```python
|
|
343
|
+
from maldiamrkit.preprocessing import (
|
|
344
|
+
PreprocessingPipeline,
|
|
345
|
+
ClipNegatives, SqrtTransform, SavitzkyGolaySmooth,
|
|
346
|
+
SNIPBaseline, MzTrimmer, TICNormalizer,
|
|
347
|
+
)
|
|
348
|
+
|
|
349
|
+
# Use the default pipeline
|
|
350
|
+
pipe = PreprocessingPipeline.default()
|
|
351
|
+
|
|
352
|
+
# Or build a custom pipeline
|
|
353
|
+
pipe = PreprocessingPipeline([
|
|
354
|
+
("clip", ClipNegatives()),
|
|
355
|
+
("sqrt", SqrtTransform()),
|
|
356
|
+
("smooth", SavitzkyGolaySmooth(window_length=15, polyorder=2)),
|
|
357
|
+
("baseline", SNIPBaseline(half_window=30)),
|
|
358
|
+
("trim", MzTrimmer(mz_min=2000, mz_max=20000)),
|
|
359
|
+
("norm", TICNormalizer()),
|
|
360
|
+
])
|
|
361
|
+
|
|
362
|
+
# Serialize to JSON/YAML for reproducibility
|
|
363
|
+
pipe.to_json("my_pipeline.json")
|
|
364
|
+
pipe = PreprocessingPipeline.from_json("my_pipeline.json")
|
|
365
|
+
|
|
366
|
+
# Apply to a spectrum
|
|
367
|
+
spec = MaldiSpectrum("data/spectrum.txt", pipeline=pipe)
|
|
368
|
+
spec.preprocess().bin(3)
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
### Dataset Filtering
|
|
372
|
+
|
|
373
|
+
Use composable filters to select subsets of a `MaldiSet`:
|
|
374
|
+
|
|
375
|
+
```python
|
|
376
|
+
from maldiamrkit import MaldiSet
|
|
377
|
+
from maldiamrkit.filters import SpeciesFilter, DrugFilter, QualityFilter, MetadataFilter
|
|
378
|
+
|
|
379
|
+
data = MaldiSet.from_directory("spectra/", "metadata.csv",
|
|
380
|
+
aggregate_by=dict(antibiotics="Drug"))
|
|
381
|
+
|
|
382
|
+
# Filter by species
|
|
383
|
+
ecoli = data.filter(SpeciesFilter("Escherichia coli"))
|
|
384
|
+
|
|
385
|
+
# Combine filters with & (and), | (or), ~ (not)
|
|
386
|
+
f = SpeciesFilter("Escherichia coli") & QualityFilter(min_snr=5.0)
|
|
387
|
+
high_quality_ecoli = data.filter(f)
|
|
388
|
+
|
|
389
|
+
# Filter by antibiotic resistance status
|
|
390
|
+
f = SpeciesFilter("Escherichia coli") & DrugFilter("Ceftriaxone", status="R")
|
|
391
|
+
resistant_ecoli = data.filter(f)
|
|
392
|
+
|
|
393
|
+
# Custom metadata filter
|
|
394
|
+
f = MetadataFilter("batch_id", lambda v: v == "batch_1")
|
|
395
|
+
batch1 = data.filter(f)
|
|
396
|
+
```
|
|
397
|
+
|
|
398
|
+
### Evaluation Metrics
|
|
399
|
+
|
|
400
|
+
AMR-specific evaluation following EUCAST/CLSI conventions:
|
|
401
|
+
|
|
402
|
+
```python
|
|
403
|
+
from maldiamrkit.evaluation import (
|
|
404
|
+
very_major_error_rate, major_error_rate,
|
|
405
|
+
amr_classification_report, vme_scorer, me_scorer,
|
|
406
|
+
LabelEncoder,
|
|
407
|
+
)
|
|
408
|
+
|
|
409
|
+
# Encode R/I/S labels to binary
|
|
410
|
+
enc = LabelEncoder(intermediate="susceptible")
|
|
411
|
+
y_binary = enc.fit_transform(y_raw)
|
|
412
|
+
|
|
413
|
+
# Compute individual metrics
|
|
414
|
+
vme = very_major_error_rate(y_true, y_pred)
|
|
415
|
+
me = major_error_rate(y_true, y_pred)
|
|
416
|
+
|
|
417
|
+
# Full classification report
|
|
418
|
+
report = amr_classification_report(y_true, y_pred)
|
|
419
|
+
# {'vme': 0.1, 'me': 0.05, 'sensitivity': 0.9, 'specificity': 0.95, ...}
|
|
420
|
+
|
|
421
|
+
# Use as sklearn scorers in cross-validation
|
|
422
|
+
from sklearn.model_selection import cross_val_score
|
|
423
|
+
scores = cross_val_score(pipe, X, y, cv=5, scoring=vme_scorer)
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
### Stratified Splitting
|
|
427
|
+
|
|
428
|
+
Prevent data leakage with species-aware and patient-grouped splits:
|
|
429
|
+
|
|
430
|
+
```python
|
|
431
|
+
from maldiamrkit.evaluation import (
|
|
432
|
+
stratified_species_drug_split,
|
|
433
|
+
case_based_split,
|
|
434
|
+
SpeciesDrugStratifiedKFold,
|
|
435
|
+
CaseGroupedKFold,
|
|
436
|
+
)
|
|
437
|
+
|
|
438
|
+
# Single split stratified by species + drug label
|
|
439
|
+
X_train, X_test, y_train, y_test = stratified_species_drug_split(
|
|
440
|
+
X, y, species=species_labels, test_size=0.2, random_state=42
|
|
441
|
+
)
|
|
442
|
+
|
|
443
|
+
# Patient-grouped split (no patient in both train and test)
|
|
444
|
+
X_train, X_test, y_train, y_test = case_based_split(
|
|
445
|
+
X, y, case_ids=patient_ids, test_size=0.2
|
|
446
|
+
)
|
|
447
|
+
|
|
448
|
+
# Cross-validation splitters (sklearn-compatible)
|
|
449
|
+
cv = SpeciesDrugStratifiedKFold(n_splits=5)
|
|
450
|
+
for train_idx, test_idx in cv.split(X, y, species=species_labels):
|
|
451
|
+
...
|
|
452
|
+
|
|
453
|
+
cv = CaseGroupedKFold(n_splits=5)
|
|
454
|
+
for train_idx, test_idx in cv.split(X, y, groups=patient_ids):
|
|
455
|
+
...
|
|
456
|
+
```
|
|
457
|
+
|
|
458
|
+
### Command-Line Interface
|
|
459
|
+
|
|
460
|
+
Batch preprocess spectra or generate quality reports from the terminal:
|
|
461
|
+
|
|
462
|
+
```bash
|
|
463
|
+
# Preprocess and bin to a CSV feature matrix
|
|
464
|
+
maldiamrkit preprocess --input-dir data/ --output processed.csv --bin-width 3
|
|
465
|
+
|
|
466
|
+
# Also save individual preprocessed spectra as TXT files
|
|
467
|
+
maldiamrkit preprocess --input-dir data/ --output processed.csv --save-spectra-dir processed/
|
|
468
|
+
|
|
469
|
+
# Use a custom pipeline config
|
|
470
|
+
maldiamrkit preprocess --input-dir data/ --output processed.csv --pipeline config.yaml
|
|
471
|
+
|
|
472
|
+
# Generate quality report
|
|
473
|
+
maldiamrkit quality --input-dir data/ --output report.csv
|
|
474
|
+
|
|
475
|
+
# Build a DRIAMS-like dataset
|
|
476
|
+
maldiamrkit build-driams --spectra-dir data/ --metadata meta.csv --output-dir output/
|
|
477
|
+
|
|
478
|
+
# With year-based subfolders
|
|
479
|
+
maldiamrkit build-driams --spectra-dir data/ --metadata meta.csv --output-dir output/ \
|
|
480
|
+
--year-column acquisition_date
|
|
481
|
+
|
|
482
|
+
# With extra processing handlers (JSON/YAML config)
|
|
483
|
+
maldiamrkit build-driams --spectra-dir data/ --metadata meta.csv --output-dir output/ \
|
|
484
|
+
--extra-handlers handlers.yaml
|
|
485
|
+
```
|
|
486
|
+
|
|
487
|
+
### Parallel Processing
|
|
488
|
+
|
|
489
|
+
Use the `n_jobs` parameter for multi-core processing:
|
|
490
|
+
|
|
491
|
+
```python
|
|
492
|
+
from maldiamrkit import MaldiSet
|
|
493
|
+
from maldiamrkit.alignment import Warping
|
|
494
|
+
from maldiamrkit.detection import MaldiPeakDetector
|
|
495
|
+
|
|
496
|
+
# Parallel dataset loading
|
|
497
|
+
data = MaldiSet.from_directory("spectra/", "meta.csv", n_jobs=-1)
|
|
498
|
+
|
|
499
|
+
# Parallel peak detection
|
|
500
|
+
detector = MaldiPeakDetector(prominence=0.01, n_jobs=-1)
|
|
501
|
+
peaks = detector.fit_transform(X)
|
|
502
|
+
|
|
503
|
+
# Parallel alignment
|
|
504
|
+
warper = Warping(method="piecewise", n_jobs=-1)
|
|
505
|
+
X_aligned = warper.fit_transform(X)
|
|
506
|
+
```
|
|
507
|
+
|
|
508
|
+
## Tutorials
|
|
509
|
+
|
|
510
|
+
For more detailed examples, see the notebooks:
|
|
511
|
+
|
|
512
|
+
- [Quick Start](notebooks/01_quick_start.ipynb) - Loading, preprocessing, binning, and quality assessment
|
|
513
|
+
- [Peak Detection](notebooks/02_peak_detection.ipynb) - Local maxima and persistent homology methods
|
|
514
|
+
- [Alignment](notebooks/03_alignment.ipynb) - Warping methods and alignment quality
|
|
515
|
+
- [Evaluation](notebooks/04_evaluation.ipynb) - AMR metrics, label encoding, and stratified splitting
|
|
516
|
+
|
|
517
|
+
## Contributing
|
|
518
|
+
|
|
519
|
+
Pull requests, bug reports, and feature ideas are welcome: feel free to open a PR!
|
|
520
|
+
|
|
521
|
+
## License
|
|
522
|
+
|
|
523
|
+
This project is licensed under the **MIT License**. See the [LICENSE](LICENSE) file for details.
|
|
524
|
+
|
|
525
|
+
## Papers
|
|
526
|
+
|
|
527
|
+
Publications using `MaldiAMRKit`:
|
|
528
|
+
|
|
529
|
+
> Rocchi, E., Nicitra, E., Calvo, M. et al. *Combining mass spectrometry and machine learning models for predicting Klebsiella pneumoniae antimicrobial resistance: a multicenter experience from clinical isolates in Italy*. **BMC Microbiol** (2026). https://doi.org/10.1186/s12866-025-04657-2
|
|
530
|
+
|
|
531
|
+
## Acknowledgements
|
|
532
|
+
|
|
533
|
+
This toolkit is inspired by:
|
|
534
|
+
|
|
535
|
+
> **Weis, C., Cuénod, A., Rieck, B., et al.** (2022). *Direct antimicrobial resistance prediction from clinical MALDI-TOF mass spectra using machine learning*. **Nature Medicine**, 28, 164–174. [https://doi.org/10.1038/s41591-021-01619-9](https://doi.org/10.1038/s41591-021-01619-9)
|
|
536
|
+
|
|
537
|
+
Please consider citing this work if you find `MaldiAMRKit` useful.
|
|
@@ -7,35 +7,37 @@ requirements.txt
|
|
|
7
7
|
MaldiAMRKit.egg-info/PKG-INFO
|
|
8
8
|
MaldiAMRKit.egg-info/SOURCES.txt
|
|
9
9
|
MaldiAMRKit.egg-info/dependency_links.txt
|
|
10
|
+
MaldiAMRKit.egg-info/entry_points.txt
|
|
10
11
|
MaldiAMRKit.egg-info/requires.txt
|
|
11
12
|
MaldiAMRKit.egg-info/top_level.txt
|
|
12
13
|
maldiamrkit/__init__.py
|
|
14
|
+
maldiamrkit/builder.py
|
|
15
|
+
maldiamrkit/cli.py
|
|
16
|
+
maldiamrkit/dataset.py
|
|
17
|
+
maldiamrkit/filters.py
|
|
13
18
|
maldiamrkit/py.typed
|
|
19
|
+
maldiamrkit/spectrum.py
|
|
14
20
|
maldiamrkit/alignment/__init__.py
|
|
15
21
|
maldiamrkit/alignment/raw_warping.py
|
|
22
|
+
maldiamrkit/alignment/strategies.py
|
|
16
23
|
maldiamrkit/alignment/warping.py
|
|
17
|
-
maldiamrkit/core/__init__.py
|
|
18
|
-
maldiamrkit/core/config.py
|
|
19
|
-
maldiamrkit/core/dataset.py
|
|
20
|
-
maldiamrkit/core/spectrum.py
|
|
21
24
|
maldiamrkit/detection/__init__.py
|
|
22
25
|
maldiamrkit/detection/peak_detector.py
|
|
26
|
+
maldiamrkit/evaluation/__init__.py
|
|
27
|
+
maldiamrkit/evaluation/label_encoder.py
|
|
28
|
+
maldiamrkit/evaluation/metrics.py
|
|
29
|
+
maldiamrkit/evaluation/splitting.py
|
|
23
30
|
maldiamrkit/io/__init__.py
|
|
24
31
|
maldiamrkit/io/readers.py
|
|
25
32
|
maldiamrkit/preprocessing/__init__.py
|
|
26
33
|
maldiamrkit/preprocessing/binning.py
|
|
34
|
+
maldiamrkit/preprocessing/merging.py
|
|
27
35
|
maldiamrkit/preprocessing/pipeline.py
|
|
36
|
+
maldiamrkit/preprocessing/preprocessing_pipeline.py
|
|
28
37
|
maldiamrkit/preprocessing/quality.py
|
|
29
|
-
maldiamrkit/
|
|
30
|
-
maldiamrkit/
|
|
31
|
-
maldiamrkit/
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
tests/test_peak_detector.py
|
|
35
|
-
tests/test_pipeline.py
|
|
36
|
-
tests/test_quality.py
|
|
37
|
-
tests/test_raw_warping.py
|
|
38
|
-
tests/test_readers.py
|
|
39
|
-
tests/test_spectrum.py
|
|
40
|
-
tests/test_utils.py
|
|
41
|
-
tests/test_warping.py
|
|
38
|
+
maldiamrkit/preprocessing/transformers.py
|
|
39
|
+
maldiamrkit/visualization/__init__.py
|
|
40
|
+
maldiamrkit/visualization/alignment_plots.py
|
|
41
|
+
maldiamrkit/visualization/peak_plots.py
|
|
42
|
+
maldiamrkit/visualization/spectrum_plots.py
|
|
43
|
+
tests/test_peak_detector.py
|
|
@@ -6,7 +6,11 @@ matplotlib>=3.4.0
|
|
|
6
6
|
seaborn>=0.11.0
|
|
7
7
|
pybaselines>=1.0.0
|
|
8
8
|
gudhi>=3.5.0
|
|
9
|
-
|
|
9
|
+
tslearn>=0.6.0
|
|
10
|
+
pyyaml>=6.0
|
|
11
|
+
pyarrow>=14.0.0
|
|
12
|
+
typer>=0.9.0
|
|
13
|
+
rich>=13.0.0
|
|
10
14
|
|
|
11
15
|
[dev]
|
|
12
16
|
pytest>=7.0.0
|
|
@@ -14,17 +18,9 @@ pytest-cov>=4.0.0
|
|
|
14
18
|
ruff>=0.1.0
|
|
15
19
|
|
|
16
20
|
[docs]
|
|
17
|
-
numpy>=1.21.0
|
|
18
|
-
pandas>=1.3.0
|
|
19
|
-
scipy>=1.7.0
|
|
20
|
-
scikit-learn>=1.0.0
|
|
21
|
-
matplotlib>=3.4.0
|
|
22
|
-
seaborn>=0.11.0
|
|
23
|
-
pybaselines>=1.0.0
|
|
24
|
-
gudhi>=3.5.0
|
|
25
|
-
fastdtw>=0.3.4
|
|
26
21
|
sphinx>=7.0.0
|
|
27
|
-
sphinx-
|
|
22
|
+
pydata-sphinx-theme>=0.15.0
|
|
28
23
|
sphinx-autodoc-typehints>=2.0.0
|
|
24
|
+
sphinx-design>=0.6.0
|
|
29
25
|
nbsphinx>=0.9.0
|
|
30
26
|
ipykernel>=6.0.0
|