MaldiAMRKit 0.6.2__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. maldiamrkit-0.8.0/MaldiAMRKit.egg-info/PKG-INFO +537 -0
  2. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/MaldiAMRKit.egg-info/SOURCES.txt +19 -17
  3. maldiamrkit-0.8.0/MaldiAMRKit.egg-info/entry_points.txt +2 -0
  4. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/MaldiAMRKit.egg-info/requires.txt +7 -11
  5. maldiamrkit-0.8.0/PKG-INFO +537 -0
  6. maldiamrkit-0.8.0/README.md +487 -0
  7. maldiamrkit-0.8.0/maldiamrkit/__init__.py +37 -0
  8. maldiamrkit-0.8.0/maldiamrkit/alignment/__init__.py +12 -0
  9. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/maldiamrkit/alignment/raw_warping.py +71 -165
  10. maldiamrkit-0.8.0/maldiamrkit/alignment/strategies.py +326 -0
  11. maldiamrkit-0.8.0/maldiamrkit/alignment/warping.py +327 -0
  12. maldiamrkit-0.8.0/maldiamrkit/builder.py +618 -0
  13. maldiamrkit-0.8.0/maldiamrkit/cli.py +333 -0
  14. {maldiamrkit-0.6.2/maldiamrkit/core → maldiamrkit-0.8.0/maldiamrkit}/dataset.py +185 -228
  15. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/maldiamrkit/detection/peak_detector.py +25 -108
  16. maldiamrkit-0.8.0/maldiamrkit/evaluation/__init__.py +40 -0
  17. maldiamrkit-0.8.0/maldiamrkit/evaluation/label_encoder.py +138 -0
  18. maldiamrkit-0.8.0/maldiamrkit/evaluation/metrics.py +327 -0
  19. maldiamrkit-0.8.0/maldiamrkit/evaluation/splitting.py +279 -0
  20. maldiamrkit-0.8.0/maldiamrkit/filters.py +265 -0
  21. maldiamrkit-0.8.0/maldiamrkit/io/__init__.py +5 -0
  22. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/maldiamrkit/io/readers.py +5 -0
  23. maldiamrkit-0.8.0/maldiamrkit/preprocessing/__init__.py +46 -0
  24. maldiamrkit-0.8.0/maldiamrkit/preprocessing/binning.py +465 -0
  25. maldiamrkit-0.8.0/maldiamrkit/preprocessing/merging.py +170 -0
  26. maldiamrkit-0.8.0/maldiamrkit/preprocessing/pipeline.py +45 -0
  27. maldiamrkit-0.8.0/maldiamrkit/preprocessing/preprocessing_pipeline.py +272 -0
  28. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/maldiamrkit/preprocessing/quality.py +109 -29
  29. maldiamrkit-0.8.0/maldiamrkit/preprocessing/transformers.py +352 -0
  30. {maldiamrkit-0.6.2/maldiamrkit/core → maldiamrkit-0.8.0/maldiamrkit}/spectrum.py +125 -62
  31. maldiamrkit-0.8.0/maldiamrkit/visualization/__init__.py +12 -0
  32. maldiamrkit-0.8.0/maldiamrkit/visualization/alignment_plots.py +244 -0
  33. maldiamrkit-0.8.0/maldiamrkit/visualization/peak_plots.py +131 -0
  34. maldiamrkit-0.8.0/maldiamrkit/visualization/spectrum_plots.py +229 -0
  35. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/pyproject.toml +14 -4
  36. maldiamrkit-0.8.0/requirements-docs.txt +11 -0
  37. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/requirements.txt +11 -1
  38. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/tests/test_peak_detector.py +37 -6
  39. maldiamrkit-0.6.2/MaldiAMRKit.egg-info/PKG-INFO +0 -320
  40. maldiamrkit-0.6.2/PKG-INFO +0 -320
  41. maldiamrkit-0.6.2/README.md +0 -267
  42. maldiamrkit-0.6.2/maldiamrkit/__init__.py +0 -64
  43. maldiamrkit-0.6.2/maldiamrkit/alignment/__init__.py +0 -6
  44. maldiamrkit-0.6.2/maldiamrkit/alignment/warping.py +0 -608
  45. maldiamrkit-0.6.2/maldiamrkit/core/__init__.py +0 -7
  46. maldiamrkit-0.6.2/maldiamrkit/core/config.py +0 -34
  47. maldiamrkit-0.6.2/maldiamrkit/io/__init__.py +0 -5
  48. maldiamrkit-0.6.2/maldiamrkit/preprocessing/__init__.py +0 -14
  49. maldiamrkit-0.6.2/maldiamrkit/preprocessing/binning.py +0 -336
  50. maldiamrkit-0.6.2/maldiamrkit/preprocessing/pipeline.py +0 -79
  51. maldiamrkit-0.6.2/maldiamrkit/utils/__init__.py +0 -6
  52. maldiamrkit-0.6.2/maldiamrkit/utils/plotting.py +0 -109
  53. maldiamrkit-0.6.2/maldiamrkit/utils/validation.py +0 -70
  54. maldiamrkit-0.6.2/requirements-docs.txt +0 -23
  55. maldiamrkit-0.6.2/tests/test_binning.py +0 -185
  56. maldiamrkit-0.6.2/tests/test_dataset.py +0 -415
  57. maldiamrkit-0.6.2/tests/test_pipeline.py +0 -161
  58. maldiamrkit-0.6.2/tests/test_quality.py +0 -183
  59. maldiamrkit-0.6.2/tests/test_raw_warping.py +0 -307
  60. maldiamrkit-0.6.2/tests/test_readers.py +0 -103
  61. maldiamrkit-0.6.2/tests/test_spectrum.py +0 -222
  62. maldiamrkit-0.6.2/tests/test_utils.py +0 -131
  63. maldiamrkit-0.6.2/tests/test_warping.py +0 -261
  64. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/LICENSE +0 -0
  65. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/MaldiAMRKit.egg-info/dependency_links.txt +0 -0
  66. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/MaldiAMRKit.egg-info/top_level.txt +0 -0
  67. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/maldiamrkit/detection/__init__.py +0 -0
  68. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/maldiamrkit/py.typed +0 -0
  69. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/requirements-dev.txt +0 -0
  70. {maldiamrkit-0.6.2 → maldiamrkit-0.8.0}/setup.cfg +0 -0
@@ -0,0 +1,537 @@
1
+ Metadata-Version: 2.4
2
+ Name: MaldiAMRKit
3
+ Version: 0.8.0
4
+ Summary: A comprehensive toolkit for MALDI-TOF mass spectrometry data preprocessing for antimicrobial resistance (AMR) prediction purposes
5
+ Author-email: Ettore Rocchi <ettore.rocchi3@unibo.it>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/EttoreRocchi/MaldiAMRKit
8
+ Project-URL: Documentation, https://maldiamrkit.readthedocs.io/
9
+ Project-URL: Source, https://github.com/EttoreRocchi/MaldiAMRKit
10
+ Project-URL: Issues, https://github.com/EttoreRocchi/MaldiAMRKit/issues
11
+ Keywords: MALDI,MALDI-TOF,mass-spectrometry,machine-learning,AMR,antimicrobial-resistance
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Intended Audience :: Science/Research
19
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
20
+ Classifier: Topic :: Scientific/Engineering :: Chemistry
21
+ Classifier: Operating System :: OS Independent
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: numpy>=1.21.0
26
+ Requires-Dist: pandas>=1.3.0
27
+ Requires-Dist: scipy>=1.13.0
28
+ Requires-Dist: scikit-learn>=1.0.0
29
+ Requires-Dist: matplotlib>=3.4.0
30
+ Requires-Dist: seaborn>=0.11.0
31
+ Requires-Dist: pybaselines>=1.0.0
32
+ Requires-Dist: gudhi>=3.5.0
33
+ Requires-Dist: tslearn>=0.6.0
34
+ Requires-Dist: pyyaml>=6.0
35
+ Requires-Dist: pyarrow>=14.0.0
36
+ Requires-Dist: typer>=0.9.0
37
+ Requires-Dist: rich>=13.0.0
38
+ Provides-Extra: docs
39
+ Requires-Dist: sphinx>=7.0.0; extra == "docs"
40
+ Requires-Dist: pydata-sphinx-theme>=0.15.0; extra == "docs"
41
+ Requires-Dist: sphinx-autodoc-typehints>=2.0.0; extra == "docs"
42
+ Requires-Dist: sphinx-design>=0.6.0; extra == "docs"
43
+ Requires-Dist: nbsphinx>=0.9.0; extra == "docs"
44
+ Requires-Dist: ipykernel>=6.0.0; extra == "docs"
45
+ Provides-Extra: dev
46
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
47
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
48
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
49
+ Dynamic: license-file
50
+
51
+ # MaldiAMRKit
52
+
53
+ [![CI](https://github.com/EttoreRocchi/MaldiAMRKit/actions/workflows/ci.yml/badge.svg)](https://github.com/EttoreRocchi/MaldiAMRKit/actions/workflows/ci.yml)
54
+ [![Coverage](https://codecov.io/github/EttoreRocchi/MaldiAMRKit/branch/main/graph/badge.svg)](https://codecov.io/github/EttoreRocchi/MaldiAMRKit)
55
+ [![Documentation](https://img.shields.io/badge/docs-online-blue)](https://maldiamrkit.readthedocs.io/)
56
+
57
+ [![PyPI Version](https://img.shields.io/pypi/v/maldiamrkit)](https://pypi.org/project/maldiamrkit/)
58
+ [![Python](https://img.shields.io/pypi/pyversions/maldiamrkit)](https://pypi.org/project/maldiamrkit/)
59
+ [![License](https://img.shields.io/github/license/EttoreRocchi/MaldiAMRKit)](https://github.com/EttoreRocchi/MaldiAMRKit/blob/main/LICENSE)
60
+
61
+ <p align="center">
62
+ <img src="docs/maldiamrkit.png" alt="MaldiAMRKit" width="320"/>
63
+ </p>
64
+
65
+ <p align="center">
66
+ <strong>A comprehensive toolkit for MALDI-TOF mass spectrometry data preprocessing for antimicrobial resistance (AMR) prediction purposes</strong>
67
+ </p>
68
+
69
+ <p align="center">
70
+ <a href="#installation">Installation</a> •
71
+ <a href="#features">Features</a> •
72
+ <a href="https://maldiamrkit.readthedocs.io/">Documentation</a> •
73
+ <a href="#license">License</a>
74
+ </p>
75
+
76
+ ## Installation
77
+
78
+ ```bash
79
+ pip install maldiamrkit
80
+ ```
81
+
82
+ ### Development Installation
83
+
84
+ ```bash
85
+ git clone https://github.com/EttoreRocchi/MaldiAMRKit.git
86
+ cd MaldiAMRKit
87
+ pip install -e ".[dev]"
88
+ ```
89
+
90
+ ## Features
91
+
92
+ - **Spectrum Processing**: Load, smooth, baseline-correct, and normalize MALDI-TOF spectra
93
+ - **Dataset Management**: Process multiple spectra with metadata integration
94
+ - **Peak Detection**: Local maxima and persistent homology methods
95
+ - **Spectral Alignment (Warping)**: Multiple alignment methods (shift, linear, piecewise, DTW)
96
+ - **Raw Spectra Warping**: Full m/z resolution alignment before binning
97
+ - **Quality Metrics**: SNR estimation, comprehensive quality reports, and alignment assessment
98
+ - **Replicate Merging**: Mean/median/weighted merging of spectral replicates with correlation-based outlier detection
99
+ - **Composable Preprocessing Pipeline**: Build custom `PreprocessingPipeline` from individual transformers, serializable to JSON/YAML
100
+ - **Composable Filter System**: `SpeciesFilter`, `DrugFilter`, `QualityFilter`, `MetadataFilter` with `&`/`|`/`~` operators for flexible dataset filtering
101
+ - **Evaluation Metrics**: VME, ME, sensitivity, specificity, categorical agreement, and `amr_classification_report`
102
+ - **Stratified Splitting**: Species-drug stratified and case-based (patient-grouped) splitting to prevent data leakage
103
+ - **Label Encoding**: `LabelEncoder` for mapping R/I/S to binary with configurable intermediate handling
104
+ - **DRIAMS Dataset Building**: Build DRIAMS-like dataset directories from raw spectra and metadata via `build_driams_dataset()`, with year-based subfolders and custom processing handlers
105
+ - **Spectrum Export**: Save individual spectra (raw, preprocessed, or binned) to CSV or TXT via `MaldiSet.save_spectra()`
106
+ - **CLI**: `maldiamrkit preprocess`, `maldiamrkit quality`, and `maldiamrkit build-driams` commands for batch processing
107
+ - **Parallel Processing**: Multi-core support via the `n_jobs` parameter for faster processing
108
+ - **ML-Ready**: Direct integration with scikit-learn pipelines
109
+
110
+ ## Quick Start
111
+
112
+ ### Load and Preprocess a Single Spectrum
113
+
114
+ ```python
115
+ from maldiamrkit import MaldiSpectrum
116
+
117
+ # Load spectrum from file
118
+ spec = MaldiSpectrum("data/spectrum.txt")
119
+
120
+ # Preprocess: smoothing, baseline removal, normalization
121
+ spec.preprocess()
122
+
123
+ # Optional: bin to reduce dimensions
124
+ spec.bin(bin_width=3) # 3 Da bins
125
+
126
+ # Visualize
127
+ from maldiamrkit.visualization import plot_spectrum
128
+ plot_spectrum(spec, binned=True)
129
+ ```
130
+
131
+ ### Build a Dataset from Multiple Spectra
132
+
133
+ ```python
134
+ from maldiamrkit import MaldiSet
135
+
136
+ # Load multiple spectra with metadata
137
+ data = MaldiSet.from_directory(
138
+ spectra_dir="data/spectra/",
139
+ meta_file="data/metadata.csv",
140
+ aggregate_by=dict(antibiotics="Drug", species="Escherichia coli"),
141
+ bin_width=3
142
+ )
143
+
144
+ # Access features and labels
145
+ X = data.X # Feature matrix
146
+ y = data.get_y_single("Drug") # Target labels
147
+ ```
148
+
149
+ ### Build a DRIAMS-like Dataset
150
+
151
+ Create a standardised dataset directory from raw spectra and a metadata CSV:
152
+
153
+ ```python
154
+ from maldiamrkit import build_driams_dataset, ProcessingHandler
155
+
156
+ # Basic: produces raw/, preprocessed/, binned_6000/, id/
157
+ report = build_driams_dataset(
158
+ spectra_dir="data/spectra/",
159
+ metadata_csv="data/metadata.csv",
160
+ output_dir="output/my_dataset",
161
+ )
162
+
163
+ # With year-based subfolders and extra processing variants
164
+ report = build_driams_dataset(
165
+ "data/spectra/", "data/metadata.csv", "output/my_dataset",
166
+ year_column="acquisition_date",
167
+ extra_handlers=[
168
+ ProcessingHandler("preprocessed_sqrt", "preprocessed",
169
+ pipeline=sqrt_pipeline),  # sqrt_pipeline must be defined beforehand (e.g. a custom PreprocessingPipeline)
170
+ ProcessingHandler("binned_3000", "binned", bin_width=6),
171
+ ],
172
+ )
173
+ print(f"Processed {report.succeeded}/{report.total} spectra")
174
+ ```
175
+
176
+ Output structure:
177
+ ```
178
+ my_dataset/
179
+ ├── raw/{year}/ # Raw spectra
180
+ ├── preprocessed/{year}/ # Default preprocessing
181
+ ├── preprocessed_sqrt/{year}/ # Extra handler output
182
+ ├── binned_6000/{year}/ # Default binning (3 Da)
183
+ ├── binned_3000/{year}/ # Extra handler output (6 Da)
184
+ └── id/{year}/ # Metadata CSVs
185
+ ```
186
+
187
+ ### Binning Methods
188
+
189
+ MaldiAMRKit supports multiple binning strategies:
190
+
191
+ ```python
192
+ from maldiamrkit import MaldiSpectrum
193
+
194
+ spec = MaldiSpectrum("data/spectrum.txt").preprocess()
195
+
196
+ # Uniform binning (default)
197
+ spec.bin(bin_width=3)
198
+
199
+ # Logarithmic binning (width scales with m/z)
200
+ spec.bin(bin_width=3, method="logarithmic")
201
+
202
+ # Adaptive binning (smaller bins in peak-dense regions)
203
+ spec.bin(method="adaptive", adaptive_min_width=1.0, adaptive_max_width=10.0)
204
+
205
+ # Custom binning (user-defined edges)
206
+ spec.bin(method="custom", custom_edges=[2000, 5000, 10000, 15000, 20000])
207
+
208
+ # Access bin metadata
209
+ print(spec.bin_metadata.head())
210
+ # bin_index bin_start bin_end bin_width
211
+ # 0 0 2000.0 2003.0 3.0
212
+ # 1 1 2003.0 2006.0 3.0
213
+ ```
214
+
215
+ **Binning Methods:**
216
+ - `uniform`: Fixed width bins (default)
217
+ - `logarithmic`: Bin width scales with m/z (matches instrument resolution)
218
+ - `adaptive`: Smaller bins where peaks are dense, larger bins elsewhere
219
+ - `custom`: User-defined bin edges for domain-specific analysis
220
+
221
+ ### Machine Learning Pipeline
222
+
223
+ ```python
224
+ from sklearn.pipeline import Pipeline
225
+ from sklearn.preprocessing import StandardScaler
226
+ from sklearn.ensemble import RandomForestClassifier
227
+ from sklearn.model_selection import cross_val_score
228
+ from maldiamrkit.alignment import Warping
229
+ from maldiamrkit.detection import MaldiPeakDetector
230
+
231
+ # Create ML pipeline
232
+ pipe = Pipeline([
233
+ ("peaks", MaldiPeakDetector(binary=False, prominence=0.05)),
234
+ ("warp", Warping(method="shift")),
235
+ ("scaler", StandardScaler()),
236
+ ("clf", RandomForestClassifier(n_estimators=100, random_state=42))
237
+ ])
238
+
239
+ # Cross-validation (more reliable than training-set accuracy)
240
+ scores = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
241
+ print(f"CV Accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")
242
+ ```
243
+
244
+ ### Spectral Alignment
245
+
246
+ Align spectra to correct for mass calibration drift:
247
+
248
+ ```python
249
+ from maldiamrkit.alignment import Warping
250
+
251
+ # Create warping transformer
252
+ warper = Warping(
253
+ method='piecewise', # or 'shift', 'linear', 'dtw'
254
+ reference='median',
255
+ n_segments=5
256
+ )
257
+
258
+ # Fit on training data and transform
259
+ warper.fit(X_train)
260
+ X_aligned = warper.transform(X_test)
261
+
262
+ # Check alignment quality
263
+ quality = warper.get_alignment_quality(X_test, X_aligned)
264
+ print(f"Mean improvement: {quality['improvement'].mean():.4f}")
265
+
266
+ # Visualize
267
+ from maldiamrkit.visualization import plot_alignment
268
+ plot_alignment(warper, X_test, X_aligned, indices=[0], show_peaks=True)
269
+ ```
270
+
271
+ ### Raw Spectra Warping
272
+
273
+ For higher precision, use `RawWarping`, which operates at full m/z resolution:
274
+
275
+ ```python
276
+ from maldiamrkit.alignment import RawWarping, create_raw_input
277
+
278
+ # Create input DataFrame from spectrum files
279
+ X_raw = create_raw_input("data/spectra/")
280
+
281
+ # RawWarping loads the original spectrum files and aligns them at full m/z resolution
282
+ warper = RawWarping(
283
+ method="piecewise",
284
+ bin_width=3,
285
+ max_shift_da=10.0,
286
+ n_jobs=-1 # Parallel processing
287
+ )
288
+
289
+ # Outputs binned data for pipeline compatibility
290
+ warper.fit(X_raw)
291
+ X_aligned = warper.transform(X_raw)
292
+ ```
293
+
294
+ **Alignment Methods:**
295
+ - `shift`: Global median shift (fast, simple)
296
+ - `linear`: Least-squares linear transformation
297
+ - `piecewise`: Local shifts across spectrum segments (most flexible)
298
+ - `dtw`: Dynamic Time Warping (best for non-linear drift)
299
+
300
+ ### Quality Assessment
301
+
302
+ ```python
303
+ from maldiamrkit import MaldiSpectrum
304
+ from maldiamrkit.preprocessing import estimate_snr, SpectrumQuality
305
+
306
+ # Estimate signal-to-noise ratio
307
+ spec = MaldiSpectrum("spectrum.txt").preprocess()
308
+ snr = estimate_snr(spec)
309
+ print(f"SNR: {snr:.1f}")
310
+
311
+ # Comprehensive quality report
312
+ qc = SpectrumQuality() # Uses high m/z region (19500-20000) by default
313
+ report = qc.assess(spec)
314
+ print(f"SNR: {report.snr:.1f}")
315
+ print(f"Peak count: {report.peak_count}")
316
+ print(f"Dynamic range: {report.dynamic_range:.2f}")
317
+ ```
318
+
319
+ ### Replicate Merging
320
+
321
+ Merge multiple spectral replicates per isolate into a single consensus spectrum:
322
+
323
+ ```python
324
+ from maldiamrkit import MaldiSpectrum
325
+ from maldiamrkit.preprocessing import merge_replicates, detect_outlier_replicates
326
+
327
+ # Load replicates as MaldiSpectrum objects
328
+ spectra = [MaldiSpectrum(f"data/isolate_rep{i}.txt") for i in range(1, 4)]
329
+
330
+ # Detect and remove outlier replicates
331
+ keep = detect_outlier_replicates(spectra)
332
+ clean = [s for s, k in zip(spectra, keep) if k]
333
+
334
+ # Merge into a single consensus spectrum
335
+ merged = merge_replicates(clean, method="mean")
336
+ ```
337
+
338
+ ### Composable Preprocessing Pipeline
339
+
340
+ Build a composable, serializable preprocessing pipeline:
341
+
342
+ ```python
343
+ from maldiamrkit.preprocessing import (
344
+ PreprocessingPipeline,
345
+ ClipNegatives, SqrtTransform, SavitzkyGolaySmooth,
346
+ SNIPBaseline, MzTrimmer, TICNormalizer,
347
+ )
348
+
349
+ # Use the default pipeline
350
+ pipe = PreprocessingPipeline.default()
351
+
352
+ # Or build a custom pipeline
353
+ pipe = PreprocessingPipeline([
354
+ ("clip", ClipNegatives()),
355
+ ("sqrt", SqrtTransform()),
356
+ ("smooth", SavitzkyGolaySmooth(window_length=15, polyorder=2)),
357
+ ("baseline", SNIPBaseline(half_window=30)),
358
+ ("trim", MzTrimmer(mz_min=2000, mz_max=20000)),
359
+ ("norm", TICNormalizer()),
360
+ ])
361
+
362
+ # Serialize to JSON/YAML for reproducibility
363
+ pipe.to_json("my_pipeline.json")
364
+ pipe = PreprocessingPipeline.from_json("my_pipeline.json")
365
+
366
+ # Apply to a spectrum
367
+ spec = MaldiSpectrum("data/spectrum.txt", pipeline=pipe)
368
+ spec.preprocess().bin(3)
369
+ ```
370
+
371
+ ### Dataset Filtering
372
+
373
+ Use composable filters to select subsets of a `MaldiSet`:
374
+
375
+ ```python
376
+ from maldiamrkit import MaldiSet
377
+ from maldiamrkit.filters import SpeciesFilter, DrugFilter, QualityFilter, MetadataFilter
378
+
379
+ data = MaldiSet.from_directory("spectra/", "metadata.csv",
380
+ aggregate_by=dict(antibiotics="Drug"))
381
+
382
+ # Filter by species
383
+ ecoli = data.filter(SpeciesFilter("Escherichia coli"))
384
+
385
+ # Combine filters with & (and), | (or), ~ (not)
386
+ f = SpeciesFilter("Escherichia coli") & QualityFilter(min_snr=5.0)
387
+ high_quality_ecoli = data.filter(f)
388
+
389
+ # Filter by antibiotic resistance status
390
+ f = SpeciesFilter("Escherichia coli") & DrugFilter("Ceftriaxone", status="R")
391
+ resistant_ecoli = data.filter(f)
392
+
393
+ # Custom metadata filter
394
+ f = MetadataFilter("batch_id", lambda v: v == "batch_1")
395
+ batch1 = data.filter(f)
396
+ ```
397
+
398
+ ### Evaluation Metrics
399
+
400
+ AMR-specific evaluation following EUCAST/CLSI conventions:
401
+
402
+ ```python
403
+ from maldiamrkit.evaluation import (
404
+ very_major_error_rate, major_error_rate,
405
+ amr_classification_report, vme_scorer, me_scorer,
406
+ LabelEncoder,
407
+ )
408
+
409
+ # Encode R/I/S labels to binary
410
+ enc = LabelEncoder(intermediate="susceptible")
411
+ y_binary = enc.fit_transform(y_raw)
412
+
413
+ # Compute individual metrics
414
+ vme = very_major_error_rate(y_true, y_pred)
415
+ me = major_error_rate(y_true, y_pred)
416
+
417
+ # Full classification report
418
+ report = amr_classification_report(y_true, y_pred)
419
+ # {'vme': 0.1, 'me': 0.05, 'sensitivity': 0.9, 'specificity': 0.95, ...}
420
+
421
+ # Use as sklearn scorers in cross-validation
422
+ from sklearn.model_selection import cross_val_score
423
+ scores = cross_val_score(pipe, X, y, cv=5, scoring=vme_scorer)
424
+ ```
425
+
426
+ ### Stratified Splitting
427
+
428
+ Prevent data leakage with species-aware and patient-grouped splits:
429
+
430
+ ```python
431
+ from maldiamrkit.evaluation import (
432
+ stratified_species_drug_split,
433
+ case_based_split,
434
+ SpeciesDrugStratifiedKFold,
435
+ CaseGroupedKFold,
436
+ )
437
+
438
+ # Single split stratified by species + drug label
439
+ X_train, X_test, y_train, y_test = stratified_species_drug_split(
440
+ X, y, species=species_labels, test_size=0.2, random_state=42
441
+ )
442
+
443
+ # Patient-grouped split (no patient in both train and test)
444
+ X_train, X_test, y_train, y_test = case_based_split(
445
+ X, y, case_ids=patient_ids, test_size=0.2
446
+ )
447
+
448
+ # Cross-validation splitters (sklearn-compatible)
449
+ cv = SpeciesDrugStratifiedKFold(n_splits=5)
450
+ for train_idx, test_idx in cv.split(X, y, species=species_labels):
451
+ ...
452
+
453
+ cv = CaseGroupedKFold(n_splits=5)
454
+ for train_idx, test_idx in cv.split(X, y, groups=patient_ids):
455
+ ...
456
+ ```
457
+
458
+ ### Command-Line Interface
459
+
460
+ Batch preprocess spectra or generate quality reports from the terminal:
461
+
462
+ ```bash
463
+ # Preprocess and bin to a CSV feature matrix
464
+ maldiamrkit preprocess --input-dir data/ --output processed.csv --bin-width 3
465
+
466
+ # Also save individual preprocessed spectra as TXT files
467
+ maldiamrkit preprocess --input-dir data/ --output processed.csv --save-spectra-dir processed/
468
+
469
+ # Use a custom pipeline config
470
+ maldiamrkit preprocess --input-dir data/ --output processed.csv --pipeline config.yaml
471
+
472
+ # Generate quality report
473
+ maldiamrkit quality --input-dir data/ --output report.csv
474
+
475
+ # Build a DRIAMS-like dataset
476
+ maldiamrkit build-driams --spectra-dir data/ --metadata meta.csv --output-dir output/
477
+
478
+ # With year-based subfolders
479
+ maldiamrkit build-driams --spectra-dir data/ --metadata meta.csv --output-dir output/ \
480
+ --year-column acquisition_date
481
+
482
+ # With extra processing handlers (JSON/YAML config)
483
+ maldiamrkit build-driams --spectra-dir data/ --metadata meta.csv --output-dir output/ \
484
+ --extra-handlers handlers.yaml
485
+ ```
486
+
487
+ ### Parallel Processing
488
+
489
+ Use the `n_jobs` parameter for multi-core processing:
490
+
491
+ ```python
492
+ from maldiamrkit import MaldiSet
493
+ from maldiamrkit.alignment import Warping
494
+ from maldiamrkit.detection import MaldiPeakDetector
495
+
496
+ # Parallel dataset loading
497
+ data = MaldiSet.from_directory("spectra/", "meta.csv", n_jobs=-1)
498
+
499
+ # Parallel peak detection
500
+ detector = MaldiPeakDetector(prominence=0.01, n_jobs=-1)
501
+ peaks = detector.fit_transform(X)
502
+
503
+ # Parallel alignment
504
+ warper = Warping(method="piecewise", n_jobs=-1)
505
+ X_aligned = warper.fit_transform(X)
506
+ ```
507
+
508
+ ## Tutorials
509
+
510
+ For more detailed examples, see the notebooks:
511
+
512
+ - [Quick Start](notebooks/01_quick_start.ipynb) - Loading, preprocessing, binning, and quality assessment
513
+ - [Peak Detection](notebooks/02_peak_detection.ipynb) - Local maxima and persistent homology methods
514
+ - [Alignment](notebooks/03_alignment.ipynb) - Warping methods and alignment quality
515
+ - [Evaluation](notebooks/04_evaluation.ipynb) - AMR metrics, label encoding, and stratified splitting
516
+
517
+ ## Contributing
518
+
519
+ Pull requests, bug reports, and feature ideas are welcome: feel free to open an issue or a PR!
520
+
521
+ ## License
522
+
523
+ This project is licensed under the **MIT License**. See the [LICENSE](LICENSE) file for details.
524
+
525
+ ## Papers
526
+
527
+ Publications using `MaldiAMRKit`:
528
+
529
+ > Rocchi, E., Nicitra, E., Calvo, M. et al. *Combining mass spectrometry and machine learning models for predicting Klebsiella pneumoniae antimicrobial resistance: a multicenter experience from clinical isolates in Italy*. **BMC Microbiol** (2026). https://doi.org/10.1186/s12866-025-04657-2
530
+
531
+ ## Acknowledgements
532
+
533
+ This toolkit is inspired by:
534
+
535
+ > **Weis, C., Cuénod, A., Rieck, B., et al.** (2022). *Direct antimicrobial resistance prediction from clinical MALDI-TOF mass spectra using machine learning*. **Nature Medicine**, 28, 164–174. [https://doi.org/10.1038/s41591-021-01619-9](https://doi.org/10.1038/s41591-021-01619-9)
536
+
537
+ Please consider citing this work if you find `MaldiAMRKit` useful.
@@ -7,35 +7,37 @@ requirements.txt
7
7
  MaldiAMRKit.egg-info/PKG-INFO
8
8
  MaldiAMRKit.egg-info/SOURCES.txt
9
9
  MaldiAMRKit.egg-info/dependency_links.txt
10
+ MaldiAMRKit.egg-info/entry_points.txt
10
11
  MaldiAMRKit.egg-info/requires.txt
11
12
  MaldiAMRKit.egg-info/top_level.txt
12
13
  maldiamrkit/__init__.py
14
+ maldiamrkit/builder.py
15
+ maldiamrkit/cli.py
16
+ maldiamrkit/dataset.py
17
+ maldiamrkit/filters.py
13
18
  maldiamrkit/py.typed
19
+ maldiamrkit/spectrum.py
14
20
  maldiamrkit/alignment/__init__.py
15
21
  maldiamrkit/alignment/raw_warping.py
22
+ maldiamrkit/alignment/strategies.py
16
23
  maldiamrkit/alignment/warping.py
17
- maldiamrkit/core/__init__.py
18
- maldiamrkit/core/config.py
19
- maldiamrkit/core/dataset.py
20
- maldiamrkit/core/spectrum.py
21
24
  maldiamrkit/detection/__init__.py
22
25
  maldiamrkit/detection/peak_detector.py
26
+ maldiamrkit/evaluation/__init__.py
27
+ maldiamrkit/evaluation/label_encoder.py
28
+ maldiamrkit/evaluation/metrics.py
29
+ maldiamrkit/evaluation/splitting.py
23
30
  maldiamrkit/io/__init__.py
24
31
  maldiamrkit/io/readers.py
25
32
  maldiamrkit/preprocessing/__init__.py
26
33
  maldiamrkit/preprocessing/binning.py
34
+ maldiamrkit/preprocessing/merging.py
27
35
  maldiamrkit/preprocessing/pipeline.py
36
+ maldiamrkit/preprocessing/preprocessing_pipeline.py
28
37
  maldiamrkit/preprocessing/quality.py
29
- maldiamrkit/utils/__init__.py
30
- maldiamrkit/utils/plotting.py
31
- maldiamrkit/utils/validation.py
32
- tests/test_binning.py
33
- tests/test_dataset.py
34
- tests/test_peak_detector.py
35
- tests/test_pipeline.py
36
- tests/test_quality.py
37
- tests/test_raw_warping.py
38
- tests/test_readers.py
39
- tests/test_spectrum.py
40
- tests/test_utils.py
41
- tests/test_warping.py
38
+ maldiamrkit/preprocessing/transformers.py
39
+ maldiamrkit/visualization/__init__.py
40
+ maldiamrkit/visualization/alignment_plots.py
41
+ maldiamrkit/visualization/peak_plots.py
42
+ maldiamrkit/visualization/spectrum_plots.py
43
+ tests/test_peak_detector.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ maldiamrkit = maldiamrkit.cli:app
@@ -6,7 +6,11 @@ matplotlib>=3.4.0
6
6
  seaborn>=0.11.0
7
7
  pybaselines>=1.0.0
8
8
  gudhi>=3.5.0
9
- fastdtw>=0.3.4
9
+ tslearn>=0.6.0
10
+ pyyaml>=6.0
11
+ pyarrow>=14.0.0
12
+ typer>=0.9.0
13
+ rich>=13.0.0
10
14
 
11
15
  [dev]
12
16
  pytest>=7.0.0
@@ -14,17 +18,9 @@ pytest-cov>=4.0.0
14
18
  ruff>=0.1.0
15
19
 
16
20
  [docs]
17
- numpy>=1.21.0
18
- pandas>=1.3.0
19
- scipy>=1.7.0
20
- scikit-learn>=1.0.0
21
- matplotlib>=3.4.0
22
- seaborn>=0.11.0
23
- pybaselines>=1.0.0
24
- gudhi>=3.5.0
25
- fastdtw>=0.3.4
26
21
  sphinx>=7.0.0
27
- sphinx-rtd-theme>=2.0.0
22
+ pydata-sphinx-theme>=0.15.0
28
23
  sphinx-autodoc-typehints>=2.0.0
24
+ sphinx-design>=0.6.0
29
25
  nbsphinx>=0.9.0
30
26
  ipykernel>=6.0.0