msiverse 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msiverse-0.0.1/PKG-INFO +188 -0
- msiverse-0.0.1/README.md +135 -0
- msiverse-0.0.1/msiverse/__init__.py +66 -0
- msiverse-0.0.1/msiverse/annotate/__init__.py +255 -0
- msiverse-0.0.1/msiverse/core.py +321 -0
- msiverse-0.0.1/msiverse/deep/__init__.py +270 -0
- msiverse-0.0.1/msiverse/diagnostics.py +868 -0
- msiverse-0.0.1/msiverse/gui/__init__.py +155 -0
- msiverse-0.0.1/msiverse/io/__init__.py +454 -0
- msiverse-0.0.1/msiverse/multimodal/__init__.py +290 -0
- msiverse-0.0.1/msiverse/preprocess/__init__.py +1168 -0
- msiverse-0.0.1/msiverse/register/__init__.py +267 -0
- msiverse-0.0.1/msiverse/segment/__init__.py +423 -0
- msiverse-0.0.1/msiverse/visualize/__init__.py +263 -0
- msiverse-0.0.1/msiverse/workflow/__init__.py +315 -0
- msiverse-0.0.1/msiverse.egg-info/PKG-INFO +188 -0
- msiverse-0.0.1/msiverse.egg-info/SOURCES.txt +22 -0
- msiverse-0.0.1/msiverse.egg-info/dependency_links.txt +1 -0
- msiverse-0.0.1/msiverse.egg-info/requires.txt +40 -0
- msiverse-0.0.1/msiverse.egg-info/top_level.txt +1 -0
- msiverse-0.0.1/pyproject.toml +74 -0
- msiverse-0.0.1/setup.cfg +4 -0
- msiverse-0.0.1/tests/test_diagnostics.py +59 -0
- msiverse-0.0.1/tests/test_msiverse.py +631 -0
msiverse-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: msiverse
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Python-first, biologist-friendly toolkit for MALDI-MSI analysis
|
|
5
|
+
Author: msiverse contributors
|
|
6
|
+
License: BSD-3-Clause
|
|
7
|
+
Project-URL: Repository, https://github.com/aqgy2749/msiverse
|
|
8
|
+
Project-URL: Documentation, https://msiverse.readthedocs.io
|
|
9
|
+
Keywords: mass-spectrometry-imaging,MALDI,MSI,spatial-metabolomics,bioinformatics,scverse
|
|
10
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
Requires-Dist: numpy>=1.22
|
|
22
|
+
Requires-Dist: scipy>=1.9
|
|
23
|
+
Requires-Dist: pandas>=1.5
|
|
24
|
+
Requires-Dist: scikit-learn>=1.1
|
|
25
|
+
Requires-Dist: matplotlib>=3.5
|
|
26
|
+
Provides-Extra: imzml
|
|
27
|
+
Requires-Dist: pyimzML>=1.5; extra == "imzml"
|
|
28
|
+
Provides-Extra: scverse
|
|
29
|
+
Requires-Dist: anndata>=0.9; extra == "scverse"
|
|
30
|
+
Requires-Dist: scanpy>=1.9; extra == "scverse"
|
|
31
|
+
Provides-Extra: deep
|
|
32
|
+
Requires-Dist: torch>=2.0; extra == "deep"
|
|
33
|
+
Provides-Extra: gui
|
|
34
|
+
Requires-Dist: napari[all]>=0.4.18; extra == "gui"
|
|
35
|
+
Requires-Dist: magicgui>=0.7; extra == "gui"
|
|
36
|
+
Provides-Extra: metaspace
|
|
37
|
+
Requires-Dist: metaspace2020>=2.0; extra == "metaspace"
|
|
38
|
+
Provides-Extra: workflow
|
|
39
|
+
Requires-Dist: pyyaml>=6.0; extra == "workflow"
|
|
40
|
+
Provides-Extra: all
|
|
41
|
+
Requires-Dist: pyimzML>=1.5; extra == "all"
|
|
42
|
+
Requires-Dist: anndata>=0.9; extra == "all"
|
|
43
|
+
Requires-Dist: scanpy>=1.9; extra == "all"
|
|
44
|
+
Requires-Dist: torch>=2.0; extra == "all"
|
|
45
|
+
Requires-Dist: napari[all]>=0.4.18; extra == "all"
|
|
46
|
+
Requires-Dist: magicgui>=0.7; extra == "all"
|
|
47
|
+
Requires-Dist: metaspace2020>=2.0; extra == "all"
|
|
48
|
+
Requires-Dist: pyyaml>=6.0; extra == "all"
|
|
49
|
+
Provides-Extra: dev
|
|
50
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
51
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
52
|
+
Requires-Dist: ruff>=0.1; extra == "dev"
|
|
53
|
+
|
|
54
|
+
# msiverse
|
|
55
|
+
|
|
56
|
+
> A Python-first, biologist-friendly toolkit for MALDI-MSI analysis.
|
|
57
|
+
|
|
58
|
+
> [!WARNING]
|
|
59
|
+
> **Early development release (`0.0.1`).** `msiverse` is under active
|
|
60
|
+
> development. APIs, outputs, and behavior may change without notice, and this
|
|
61
|
+
> release is not recommended for production use.
|
|
62
|
+
|
|
63
|
+
`msiverse` is a reference implementation of the architecture recommended in the
|
|
64
|
+
*2026 MALDI-MSI Software Landscape* report. It addresses the seven critical
|
|
65
|
+
gaps identified in the open-source ecosystem and demonstrates a viable path to
|
|
66
|
+
a "Scanpy moment" for mass spectrometry imaging.
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## What it does
|
|
71
|
+
|
|
72
|
+
| Module | Purpose | Report recommendation |
|
|
73
|
+
|---|---|---|
|
|
74
|
+
| `msiverse.core` | `MSIData` container with AnnData/SpatialData interop | Rec 1 |
|
|
75
|
+
| `msiverse.io` | imzML reader + synthetic data generator | Rec 3 |
|
|
76
|
+
| `msiverse.preprocess` | TIC / RMS norm, TopHat baseline, hotspot clip, log1p | parity w/ MALDIquant/rMSIproc |
|
|
77
|
+
| `msiverse.segment` | k-means, spatial k-means, **Spatial Shrunken Centroids** (Cardinal port) | Rec 1 |
|
|
78
|
+
| `msiverse.annotate` | local DB matcher + METASPACE adapter stub | Rec 4 |
|
|
79
|
+
| `msiverse.register` | landmark affine + thin-plate-spline; image warping | Rec 5 |
|
|
80
|
+
| `msiverse.multimodal` | MSI ↔ Visium/Xenium spot aggregation, MSI ↔ IF/IHC fusion | Rec 5 + scientific frontier |
|
|
81
|
+
| `msiverse.visualize` | ion images, segmentation maps, overview panels | core UX |
|
|
82
|
+
| `msiverse.deep` | PyTorch `Dataset` + `VAEEmbedding` (pyM²aia / msiPL style) | Rec 6 |
|
|
83
|
+
| `msiverse.workflow` | hashed, reproducible `Pipeline` + Snakemake config export | Rec 7 |
|
|
84
|
+
| `msiverse.gui` | napari plugin with ion-image browser widget | Rec 2 |
|
|
85
|
+
|
|
86
|
+
## Installation
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# Minimal install
|
|
90
|
+
pip install -e .
|
|
91
|
+
|
|
92
|
+
# With scverse / DL / GUI extras
|
|
93
|
+
pip install -e ".[scverse,deep,gui,imzml,workflow]"
|
|
94
|
+
|
|
95
|
+
# Everything
|
|
96
|
+
pip install -e ".[all]"
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Quick start
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from msiverse import io, preprocess, segment, annotate, visualize
|
|
103
|
+
|
|
104
|
+
# Synthetic MSI for tutorials/tests — no data download required
|
|
105
|
+
data = io.simulate_msi(height=80, width=80, n_features=200, n_regions=4)
|
|
106
|
+
|
|
107
|
+
# One-line preprocessing (baseline → TIC norm → hotspot → log1p)
|
|
108
|
+
data = preprocess.standard_pipeline(data)
|
|
109
|
+
|
|
110
|
+
# Cardinal-style Spatial Shrunken Centroids — first Python port
|
|
111
|
+
segment.spatial_shrunken_centroids(data, n_clusters=4, shrinkage=1.5)
|
|
112
|
+
|
|
113
|
+
# Local annotation against built-in lipid/metabolite DB
|
|
114
|
+
hits = annotate.annotate_local(data, polarity="positive", tol_ppm=5)
|
|
115
|
+
|
|
116
|
+
# Overview panel (TIC, mean spectrum, top features, segmentation, ...)
|
|
117
|
+
fig = visualize.overview(data, label_key="ssc")
|
|
118
|
+
fig.savefig("overview.png")
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Reproducible pipelines
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from msiverse.workflow import Pipeline
|
|
125
|
+
|
|
126
|
+
p = (Pipeline("my_run")
|
|
127
|
+
.add("baseline", preprocess.baseline_correct, window=51)
|
|
128
|
+
.add("normalize", preprocess.normalize, method="tic")
|
|
129
|
+
.add("ssc", segment.spatial_shrunken_centroids, n_clusters=5))
|
|
130
|
+
|
|
131
|
+
result = p.run(data)
|
|
132
|
+
p.save_provenance("run.json") # JSON record with input/output hashes
|
|
133
|
+
p.to_snakemake_config("Snakefile.yaml") # HPC handoff
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## scverse interop
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
adata = data.to_anndata() # → Scanpy / Squidpy / SpatialData
|
|
140
|
+
data2 = MSIData.from_anndata(adata)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Same-section MSI + spatial transcriptomics
|
|
144
|
+
|
|
145
|
+
The scientific frontier identified in the report:
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from msiverse.multimodal import integrate_with_visium
|
|
149
|
+
|
|
150
|
+
# Provide fiducial landmarks from both modalities
|
|
151
|
+
joint = integrate_with_visium(
|
|
152
|
+
msi=msi_data,
|
|
153
|
+
visium_adata=visium_adata,
|
|
154
|
+
msi_landmarks=msi_pts,
|
|
155
|
+
visium_landmarks=visium_pts,
|
|
156
|
+
aggregation="mean",
|
|
157
|
+
)
|
|
158
|
+
# joint.obsm['msi'] now contains MSI intensities per Visium spot
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## GUI (napari)
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
import napari
|
|
165
|
+
from msiverse.gui import view_msi
|
|
166
|
+
|
|
167
|
+
viewer = view_msi(data, label_key="ssc")
|
|
168
|
+
napari.run()
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Tests
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
pytest tests/ -v
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## License
|
|
178
|
+
|
|
179
|
+
BSD-3-Clause.
|
|
180
|
+
|
|
181
|
+
## Citation
|
|
182
|
+
|
|
183
|
+
If you use `msiverse` in your work, please cite the underlying methods:
|
|
184
|
+
|
|
185
|
+
- Cardinal v3: Bemis et al., *Nat. Methods* 20:1883 (2023)
|
|
186
|
+
- METASPACE-ML: Wadie et al., *Nat. Commun.* 15:9110 (2024)
|
|
187
|
+
- pyM²aia: Cordes et al., *Bioinformatics* 40:btae133 (2024)
|
|
188
|
+
- SMA: Vicari et al., *Nat. Biotechnol.* 42:1046 (2024)
|
msiverse-0.0.1/README.md
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# msiverse
|
|
2
|
+
|
|
3
|
+
> A Python-first, biologist-friendly toolkit for MALDI-MSI analysis.
|
|
4
|
+
|
|
5
|
+
> [!WARNING]
|
|
6
|
+
> **Early development release (`0.0.1`).** `msiverse` is under active
|
|
7
|
+
> development. APIs, outputs, and behavior may change without notice, and this
|
|
8
|
+
> release is not recommended for production use.
|
|
9
|
+
|
|
10
|
+
`msiverse` is a reference implementation of the architecture recommended in the
|
|
11
|
+
*2026 MALDI-MSI Software Landscape* report. It addresses the seven critical
|
|
12
|
+
gaps identified in the open-source ecosystem and demonstrates a viable path to
|
|
13
|
+
a "Scanpy moment" for mass spectrometry imaging.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## What it does
|
|
18
|
+
|
|
19
|
+
| Module | Purpose | Report recommendation |
|
|
20
|
+
|---|---|---|
|
|
21
|
+
| `msiverse.core` | `MSIData` container with AnnData/SpatialData interop | Rec 1 |
|
|
22
|
+
| `msiverse.io` | imzML reader + synthetic data generator | Rec 3 |
|
|
23
|
+
| `msiverse.preprocess` | TIC / RMS norm, TopHat baseline, hotspot clip, log1p | parity w/ MALDIquant/rMSIproc |
|
|
24
|
+
| `msiverse.segment` | k-means, spatial k-means, **Spatial Shrunken Centroids** (Cardinal port) | Rec 1 |
|
|
25
|
+
| `msiverse.annotate` | local DB matcher + METASPACE adapter stub | Rec 4 |
|
|
26
|
+
| `msiverse.register` | landmark affine + thin-plate-spline; image warping | Rec 5 |
|
|
27
|
+
| `msiverse.multimodal` | MSI ↔ Visium/Xenium spot aggregation, MSI ↔ IF/IHC fusion | Rec 5 + scientific frontier |
|
|
28
|
+
| `msiverse.visualize` | ion images, segmentation maps, overview panels | core UX |
|
|
29
|
+
| `msiverse.deep` | PyTorch `Dataset` + `VAEEmbedding` (pyM²aia / msiPL style) | Rec 6 |
|
|
30
|
+
| `msiverse.workflow` | hashed, reproducible `Pipeline` + Snakemake config export | Rec 7 |
|
|
31
|
+
| `msiverse.gui` | napari plugin with ion-image browser widget | Rec 2 |
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
# Minimal install
|
|
37
|
+
pip install -e .
|
|
38
|
+
|
|
39
|
+
# With scverse / DL / GUI extras
|
|
40
|
+
pip install -e ".[scverse,deep,gui,imzml,workflow]"
|
|
41
|
+
|
|
42
|
+
# Everything
|
|
43
|
+
pip install -e ".[all]"
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Quick start
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from msiverse import io, preprocess, segment, annotate, visualize
|
|
50
|
+
|
|
51
|
+
# Synthetic MSI for tutorials/tests — no data download required
|
|
52
|
+
data = io.simulate_msi(height=80, width=80, n_features=200, n_regions=4)
|
|
53
|
+
|
|
54
|
+
# One-line preprocessing (baseline → TIC norm → hotspot → log1p)
|
|
55
|
+
data = preprocess.standard_pipeline(data)
|
|
56
|
+
|
|
57
|
+
# Cardinal-style Spatial Shrunken Centroids — first Python port
|
|
58
|
+
segment.spatial_shrunken_centroids(data, n_clusters=4, shrinkage=1.5)
|
|
59
|
+
|
|
60
|
+
# Local annotation against built-in lipid/metabolite DB
|
|
61
|
+
hits = annotate.annotate_local(data, polarity="positive", tol_ppm=5)
|
|
62
|
+
|
|
63
|
+
# Overview panel (TIC, mean spectrum, top features, segmentation, ...)
|
|
64
|
+
fig = visualize.overview(data, label_key="ssc")
|
|
65
|
+
fig.savefig("overview.png")
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Reproducible pipelines
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from msiverse.workflow import Pipeline
|
|
72
|
+
|
|
73
|
+
p = (Pipeline("my_run")
|
|
74
|
+
.add("baseline", preprocess.baseline_correct, window=51)
|
|
75
|
+
.add("normalize", preprocess.normalize, method="tic")
|
|
76
|
+
.add("ssc", segment.spatial_shrunken_centroids, n_clusters=5))
|
|
77
|
+
|
|
78
|
+
result = p.run(data)
|
|
79
|
+
p.save_provenance("run.json") # JSON record with input/output hashes
|
|
80
|
+
p.to_snakemake_config("Snakefile.yaml") # HPC handoff
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## scverse interop
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
adata = data.to_anndata() # → Scanpy / Squidpy / SpatialData
|
|
87
|
+
data2 = MSIData.from_anndata(adata)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Same-section MSI + spatial transcriptomics
|
|
91
|
+
|
|
92
|
+
The scientific frontier identified in the report:
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
from msiverse.multimodal import integrate_with_visium
|
|
96
|
+
|
|
97
|
+
# Provide fiducial landmarks from both modalities
|
|
98
|
+
joint = integrate_with_visium(
|
|
99
|
+
msi=msi_data,
|
|
100
|
+
visium_adata=visium_adata,
|
|
101
|
+
msi_landmarks=msi_pts,
|
|
102
|
+
visium_landmarks=visium_pts,
|
|
103
|
+
aggregation="mean",
|
|
104
|
+
)
|
|
105
|
+
# joint.obsm['msi'] now contains MSI intensities per Visium spot
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## GUI (napari)
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
import napari
|
|
112
|
+
from msiverse.gui import view_msi
|
|
113
|
+
|
|
114
|
+
viewer = view_msi(data, label_key="ssc")
|
|
115
|
+
napari.run()
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Tests
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
pytest tests/ -v
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## License
|
|
125
|
+
|
|
126
|
+
BSD-3-Clause.
|
|
127
|
+
|
|
128
|
+
## Citation
|
|
129
|
+
|
|
130
|
+
If you use `msiverse` in your work, please cite the underlying methods:
|
|
131
|
+
|
|
132
|
+
- Cardinal v3: Bemis et al., *Nat. Methods* 20:1883 (2023)
|
|
133
|
+
- METASPACE-ML: Wadie et al., *Nat. Commun.* 15:9110 (2024)
|
|
134
|
+
- pyM²aia: Cordes et al., *Bioinformatics* 40:btae133 (2024)
|
|
135
|
+
- SMA: Vicari et al., *Nat. Biotechnol.* 42:1046 (2024)
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
msiverse: a Python-first, biologist-friendly toolkit for MALDI-MSI.
|
|
3
|
+
|
|
4
|
+
Designed to address the seven critical gaps in the MALDI-MSI software
|
|
5
|
+
ecosystem identified in our 2026 landscape review:
|
|
6
|
+
|
|
7
|
+
1. End-to-end open-source Python platform
|
|
8
|
+
2. (lobby) imzML 1.2 standardization — interim native handling here
|
|
9
|
+
3. Vendor I/O coverage (via imzy/MSIGen adapters)
|
|
10
|
+
4. Accessible deep learning (msiverse.deep)
|
|
11
|
+
5. Better protein annotation (HIT-MAP-style; planned)
|
|
12
|
+
6. FAIR / reproducibility (msiverse.workflow)
|
|
13
|
+
7. Same-section ST + MSI integration (msiverse.multimodal)
|
|
14
|
+
|
|
15
|
+
Quick start
|
|
16
|
+
-----------
|
|
17
|
+
>>> from msiverse import io, preprocess, segment, annotate, visualize
|
|
18
|
+
>>> data = io.simulate_msi() # synthetic demo data
|
|
19
|
+
>>> data = preprocess.standard_pipeline(data) # baseline → norm → log
|
|
20
|
+
>>> segment.spatial_shrunken_centroids(data, n_clusters=5)
|
|
21
|
+
>>> annotate.annotate_local(data, tol_ppm=5)
|
|
22
|
+
>>> visualize.overview(data, label_key="ssc")
|
|
23
|
+
|
|
24
|
+
End-to-end (one-call)
|
|
25
|
+
---------------------
|
|
26
|
+
>>> from msiverse.workflow import run_standard_workflow
|
|
27
|
+
>>> data, pipeline = run_standard_workflow(
|
|
28
|
+
... data, n_clusters=5, output_dir="./results"
|
|
29
|
+
... )
|
|
30
|
+
|
|
31
|
+
scverse interop
|
|
32
|
+
---------------
|
|
33
|
+
>>> adata = data.to_anndata() # → Scanpy / Squidpy / SpatialData
|
|
34
|
+
>>> data2 = MSIData.from_anndata(adata)
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
__version__ = "0.0.1"
|
|
38
|
+
|
|
39
|
+
from .core import MSIData
|
|
40
|
+
|
|
41
|
+
# Submodule aliases for the canonical workflow
|
|
42
|
+
from . import (
|
|
43
|
+
io,
|
|
44
|
+
preprocess,
|
|
45
|
+
segment,
|
|
46
|
+
annotate,
|
|
47
|
+
register,
|
|
48
|
+
multimodal,
|
|
49
|
+
visualize,
|
|
50
|
+
workflow,
|
|
51
|
+
diagnostics,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
__all__ = [
|
|
55
|
+
"MSIData",
|
|
56
|
+
"io",
|
|
57
|
+
"preprocess",
|
|
58
|
+
"segment",
|
|
59
|
+
"annotate",
|
|
60
|
+
"register",
|
|
61
|
+
"multimodal",
|
|
62
|
+
"visualize",
|
|
63
|
+
"workflow",
|
|
64
|
+
"diagnostics",
|
|
65
|
+
"__version__",
|
|
66
|
+
]
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
"""
|
|
2
|
+
msiverse.annotate
|
|
3
|
+
=================
|
|
4
|
+
|
|
5
|
+
Metabolite annotation for MSI data.
|
|
6
|
+
|
|
7
|
+
Strategy: provide a *thin local matcher* for offline workflows + an
|
|
8
|
+
optional adapter to METASPACE / METASPACE-ML for FDR-controlled cloud
|
|
9
|
+
annotation. Lipid in-source fragmentation (rMSIfragment-style) and
|
|
10
|
+
spatial coherence scoring (METASPACE's MSM) are stubbed for future work.
|
|
11
|
+
|
|
12
|
+
What works today:
|
|
13
|
+
- Local mass-only matching against a small built-in lipid/metabolite
|
|
14
|
+
reference list with common adducts.
|
|
15
|
+
- Adduct enumeration ([M+H]+, [M+Na]+, [M-H]-, [M+K]+, ...).
|
|
16
|
+
- Returns per-feature ranked candidates with mass error in ppm.
|
|
17
|
+
|
|
18
|
+
What's pluggable:
|
|
19
|
+
- The METASPACEClient stub mirrors the metaspace2020/python-client
|
|
20
|
+
API so it can be swapped in once a network is available.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
|
|
27
|
+
import numpy as np
|
|
28
|
+
import pandas as pd
|
|
29
|
+
|
|
30
|
+
from ..core import MSIData
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Proton mass and common adduct deltas.
#
# Each ADDUCTS entry maps an adduct label to ``(mass_delta, charge)`` where
# ``mass_delta`` is added to a compound's neutral monoisotopic mass to get the
# singly charged ion m/z. The deltas are *ion* masses, i.e. they already
# include the electron mass (0.00055 Da): cations have it subtracted from the
# attached species, anions have it added. For example, [M+Na]+ uses
# 22.98977 - 0.00055 and [M+Cl]- uses 34.96885 + 0.00055.
PROTON = 1.00728
ADDUCTS = {
    "[M+H]+": (+PROTON, +1),
    "[M+Na]+": (+22.98922, +1),
    "[M+K]+": (+38.96316, +1),
    "[M+NH4]+": (+18.03383, +1),
    "[M-H]-": (-PROTON, -1),
    "[M+Cl]-": (+34.96940, -1),  # chloride anion (Cl atom + electron)
    "[M+FA-H]-": (+44.99820, -1),  # formate anion HCOO- (formic acid - proton)
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# =============================================================================
|
|
47
|
+
# Tiny built-in database for demos / offline use
|
|
48
|
+
# =============================================================================
|
|
49
|
+
def builtin_db() -> pd.DataFrame:
    """
    Return the small demo reference panel of metabolites and lipids.

    The table has one row per compound and the columns ``name``,
    ``formula``, ``neutral_mass`` and ``class``. It is meant for tutorials
    and offline tests only, not for production annotation.
    """
    column_names = ["name", "formula", "neutral_mass", "class"]
    # One tuple per compound, in the same order as ``column_names``.
    compounds = [
        ("Glucose", "C6H12O6", 180.0634, "Sugar"),
        ("Cholesterol", "C27H46O", 386.3549, "Sterol"),
        ("Phosphatidylcholine 34:1", "C42H82NO8P", 759.5778, "Lipid/PC"),
        ("Phosphatidylcholine 36:2", "C44H84NO8P", 785.5935, "Lipid/PC"),
        ("Sphingomyelin d18:1/16:0", "C39H79N2O6P", 702.5676, "Lipid/SM"),
        ("Glutamic acid", "C5H9NO4", 147.0532, "Amino acid"),
        ("ATP", "C10H16N5O13P3", 506.9957, "Nucleotide"),
        ("Heme B", "C34H32FeN4O4", 616.1773, "Cofactor"),
        ("Taurine", "C2H7NO3S", 125.0147, "Amino acid"),
        ("Creatine", "C4H9N3O2", 131.0695, "Amino acid"),
        ("Dopamine", "C8H11NO2", 153.0790, "Neurotransmitter"),
        ("Acetylcholine", "C7H16NO2", 146.1181, "Neurotransmitter"),
        ("PE 36:2", "C41H78NO8P", 743.5465, "Lipid/PE"),
        ("LysoPC 16:0", "C24H50NO7P", 495.3325, "Lipid/LPC"),
    ]
    panel = pd.DataFrame(compounds, columns=column_names)
    return panel
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# =============================================================================
|
|
77
|
+
# Local annotation
|
|
78
|
+
# =============================================================================
|
|
79
|
+
@dataclass
class AnnotationHit:
    """One candidate match between an observed feature and a database entry."""

    feature_idx: int  # position of the feature in the m/z axis
    observed_mz: float  # measured m/z of that feature
    name: str  # compound name from the database
    formula: str  # compound formula from the database
    adduct: str  # adduct label, e.g. "[M+H]+"
    theoretical_mz: float  # neutral mass plus adduct delta
    ppm_error: float  # signed mass error in ppm
    db_class: str  # compound class from the database

    def to_dict(self) -> dict:
        """
        Return the hit as a plain dict, in field order.

        Keys equal the field names, except that ``db_class`` is exported
        under the key ``"class"``.
        """
        exported = {}
        for field_name in self.__dataclass_fields__:
            key = "class" if field_name == "db_class" else field_name
            exported[key] = getattr(self, field_name)
        return exported
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def annotate_local(
    data: MSIData,
    db: pd.DataFrame | None = None,
    adducts: list[str] | None = None,
    polarity: str = "positive",
    tol_ppm: float = 5.0,
    inplace: bool = True,
) -> pd.DataFrame:
    """
    Annotate features against a local DataFrame database (mass-only).

    Each feature in ``data.mz`` is compared with the theoretical m/z of
    every (compound, adduct) pair. A pair is a hit when the observed m/z
    lies within ±tol_ppm of the theoretical m/z, so every returned
    ``ppm_error`` satisfies ``abs(ppm_error) <= tol_ppm``.

    Parameters
    ----------
    data : MSIData
        Dataset whose ``mz`` axis is annotated.
    db : DataFrame, optional
        Columns must include {name, formula, neutral_mass, class}.
        Defaults to the built-in demo database.
    adducts : list of str, optional
        Adduct labels (keys of ``ADDUCTS``) to consider. Defaults to all
        adducts whose charge matches ``polarity``.
    polarity : 'positive' or 'negative'
        Selects adducts by charge. Only used when ``adducts`` is None.
    tol_ppm : float
        Match tolerance in ppm, relative to the theoretical m/z.
    inplace : bool
        If True, write the best hit per feature to ``data.var`` (columns
        ``annotation``, ``adduct``, ``ppm_error``, ``compound_class``) and
        record ``annotation_db_size`` / ``annotation_tol_ppm`` in
        ``data.uns``. Features without a hit get "" / NaN.

    Returns
    -------
    DataFrame
        All candidate hits, sorted by ``feature_idx`` and then by
        absolute ``ppm_error``. Empty (with the same columns) if nothing
        matched.

    Raises
    ------
    ValueError
        If ``adducts`` is None and ``polarity`` is not 'positive' or
        'negative'.
    KeyError
        If ``adducts`` contains a label that is not in ``ADDUCTS``.
    """
    if db is None:
        db = builtin_db()
    if adducts is None:
        # Reject unknown polarities instead of silently treating them as
        # negative mode (which would return plausible but wrong hits).
        charge_for = {"positive": +1, "negative": -1}
        if polarity not in charge_for:
            raise ValueError(
                f"polarity must be 'positive' or 'negative', got {polarity!r}"
            )
        sign = charge_for[polarity]
        adducts = [a for a, (_, q) in ADDUCTS.items() if q == sign]

    # The theoretical m/z table does not depend on the feature, so build it
    # once instead of re-walking the database for every m/z value. Order is
    # compound-major, adduct-minor (same as a nested row/adduct loop).
    adduct_deltas = [(ad, ADDUCTS[ad][0]) for ad in adducts]
    candidates = [
        (row["name"], row["formula"], row["class"], ad, row["neutral_mass"] + delta)
        for _, row in db.iterrows()
        for ad, delta in adduct_deltas
    ]

    hits: list[AnnotationHit] = []
    for j, observed_mz in enumerate(data.mz):
        for name, formula, db_class, ad, theo in candidates:
            # Window is scaled by the theoretical m/z, the same reference
            # used for ppm_error below, so the two can never disagree.
            if abs(observed_mz - theo) > abs(theo) * tol_ppm * 1e-6:
                continue
            ppm = (observed_mz - theo) / theo * 1e6
            hits.append(
                AnnotationHit(
                    feature_idx=j,
                    observed_mz=float(observed_mz),
                    name=name,
                    formula=formula,
                    adduct=ad,
                    theoretical_mz=theo,
                    ppm_error=float(ppm),
                    db_class=db_class,
                )
            )

    if not hits:
        result = pd.DataFrame(
            columns=[
                "feature_idx", "observed_mz", "name", "formula", "adduct",
                "theoretical_mz", "ppm_error", "class",
            ]
        )
    else:
        # Sort on |ppm_error| (not the signed value) within each feature.
        result = pd.DataFrame([h.to_dict() for h in hits]).sort_values(
            ["feature_idx", "ppm_error"],
            key=lambda s: s.abs() if s.name == "ppm_error" else s,
        )

    if inplace:
        # Reset the annotation columns so stale values from a previous run
        # never survive on features that no longer match.
        data.var["annotation"] = ""
        data.var["adduct"] = ""
        data.var["ppm_error"] = np.nan
        data.var["compound_class"] = ""
        if not result.empty:
            # ``result`` is already ordered by |ppm_error| within each
            # feature, so the first row per feature is its best hit.
            best = result.drop_duplicates("feature_idx", keep="first")
            col_annotation = data.var.columns.get_loc("annotation")
            col_adduct = data.var.columns.get_loc("adduct")
            col_ppm = data.var.columns.get_loc("ppm_error")
            col_class = data.var.columns.get_loc("compound_class")
            for _, h in best.iterrows():
                idx = int(h["feature_idx"])
                data.var.iloc[idx, col_annotation] = h["name"]
                data.var.iloc[idx, col_adduct] = h["adduct"]
                data.var.iloc[idx, col_ppm] = h["ppm_error"]
                data.var.iloc[idx, col_class] = h["class"]
        data.uns["annotation_db_size"] = len(db)
        data.uns["annotation_tol_ppm"] = tol_ppm

    return result
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
# =============================================================================
|
|
200
|
+
# METASPACE cloud adapter (stub; real call requires network + token)
|
|
201
|
+
# =============================================================================
|
|
202
|
+
class METASPACEClient:
|
|
203
|
+
"""
|
|
204
|
+
Thin adapter to the METASPACE platform.
|
|
205
|
+
|
|
206
|
+
On a connected machine, install `metaspace2020` and pass an API token:
|
|
207
|
+
|
|
208
|
+
>>> client = METASPACEClient(api_key="your_token")
|
|
209
|
+
>>> ds_id = client.submit(imzml_path, metadata)
|
|
210
|
+
>>> hits = client.get_annotations(ds_id, fdr=0.1)
|
|
211
|
+
|
|
212
|
+
This class deliberately wraps but does not reimplement the
|
|
213
|
+
metaspace2020 client, to stay in sync with their API.
|
|
214
|
+
"""
|
|
215
|
+
|
|
216
|
+
def __init__(self, api_key: str | None = None, host: str | None = None) -> None:
|
|
217
|
+
try:
|
|
218
|
+
from metaspace import SMInstance
|
|
219
|
+
except ImportError:
|
|
220
|
+
self._sm = None
|
|
221
|
+
self._unavailable_reason = (
|
|
222
|
+
"metaspace2020 package not installed. "
|
|
223
|
+
"Run `pip install metaspace2020`."
|
|
224
|
+
)
|
|
225
|
+
return
|
|
226
|
+
kwargs = {}
|
|
227
|
+
if api_key:
|
|
228
|
+
kwargs["api_key"] = api_key
|
|
229
|
+
if host:
|
|
230
|
+
kwargs["host"] = host
|
|
231
|
+
self._sm = SMInstance(**kwargs)
|
|
232
|
+
self._unavailable_reason = None
|
|
233
|
+
|
|
234
|
+
@property
|
|
235
|
+
def available(self) -> bool:
|
|
236
|
+
return self._sm is not None
|
|
237
|
+
|
|
238
|
+
def get_annotations(
|
|
239
|
+
self,
|
|
240
|
+
dataset_id: str,
|
|
241
|
+
fdr: float = 0.1,
|
|
242
|
+
database: str = "HMDB-v4",
|
|
243
|
+
) -> pd.DataFrame:
|
|
244
|
+
"""Fetch FDR-controlled annotations for an existing dataset."""
|
|
245
|
+
if not self.available:
|
|
246
|
+
raise RuntimeError(self._unavailable_reason)
|
|
247
|
+
ds = self._sm.dataset(id=dataset_id)
|
|
248
|
+
ann = ds.annotations(fdr=fdr, database=database)
|
|
249
|
+
return pd.DataFrame(ann)
|
|
250
|
+
|
|
251
|
+
def submit(self, imzml_path: str, metadata: dict) -> str:
|
|
252
|
+
"""Submit a new dataset to METASPACE for annotation."""
|
|
253
|
+
if not self.available:
|
|
254
|
+
raise RuntimeError(self._unavailable_reason)
|
|
255
|
+
return self._sm.submit_dataset(imzml_path, metadata=metadata)
|