wnetdeconv 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wnetdeconv-0.8.0/CMakeLists.txt +32 -0
- wnetdeconv-0.8.0/LICENCE +7 -0
- wnetdeconv-0.8.0/PKG-INFO +185 -0
- wnetdeconv-0.8.0/README.md +158 -0
- wnetdeconv-0.8.0/pyproject.toml +42 -0
- wnetdeconv-0.8.0/src/wnetdeconv/__init__.py +11 -0
- wnetdeconv-0.8.0/src/wnetdeconv/__main__.py +25 -0
- wnetdeconv-0.8.0/src/wnetdeconv/cpp/wnetdeconv/wnetdeconv.cpp +12 -0
- wnetdeconv-0.8.0/src/wnetdeconv/solver.py +741 -0
- wnetdeconv-0.8.0/src/wnetdeconv/spectrum.py +161 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
cmake_minimum_required(VERSION 3.15...3.27)

project(wnetdeconv LANGUAGES CXX)

# Locate the interpreter first: the header-directory queries below must run
# under the same Python the extension is built for, not under whatever
# `python` happens to be first on PATH (which may lack pylmcf/wnet, or may not
# exist at all on systems that only ship `python3`).
find_package(Python 3.8
  REQUIRED COMPONENTS Interpreter Development.Module
  OPTIONAL_COMPONENTS Development.SABIModule)
find_package(nanobind REQUIRED CONFIG)

# wnetdeconv_query_include_dirs(<package> <out_var>)
#
# Runs `<Python_EXECUTABLE> -m <package> --include` and stores its stripped
# standard output in <out_var> in the caller's scope. Configuration stops with
# a fatal error when the command cannot be run or exits with a non-zero status,
# instead of silently continuing with an empty include path.
#
# Example:
#   wnetdeconv_query_include_dirs(wnet WNET_INCLUDE_DIRS)
function(wnetdeconv_query_include_dirs package out_var)
  execute_process(
    COMMAND "${Python_EXECUTABLE}" -m "${package}" --include
    RESULT_VARIABLE query_status
    OUTPUT_VARIABLE query_output
    ERROR_VARIABLE query_error
    OUTPUT_STRIP_TRAILING_WHITESPACE
  )
  if(NOT query_status EQUAL 0)
    message(FATAL_ERROR
      "Could not obtain the C++ include directories of '${package}': "
      "'${Python_EXECUTABLE} -m ${package} --include' failed "
      "(status: ${query_status}).\n${query_error}")
  endif()
  set(${out_var} "${query_output}" PARENT_SCOPE)
endfunction()

wnetdeconv_query_include_dirs(pylmcf PYLMCF_INCLUDE_DIRS)
wnetdeconv_query_include_dirs(wnet WNET_INCLUDE_DIRS)

set(CMAKE_CXX_STANDARD 20)

nanobind_add_module(wnetdeconv_cpp
  NB_STATIC NOMINSIZE
  src/wnetdeconv/cpp/wnetdeconv/wnetdeconv.cpp)

target_include_directories(wnetdeconv_cpp PRIVATE ${PYLMCF_INCLUDE_DIRS} ${WNET_INCLUDE_DIRS})
target_compile_definitions(wnetdeconv_cpp PRIVATE INCLUDE_NANOBIND_STUFF)

install(TARGETS wnetdeconv_cpp LIBRARY DESTINATION wnetdeconv)
|
wnetdeconv-0.8.0/LICENCE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
This software is Copyright 2026 Michał Startek, and provided under terms of MIT licence, below.
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: wnetdeconv
|
|
3
|
+
Version: 0.8.0
|
|
4
|
+
Summary: Python implementation of spectral deconvolution using Wasserstein metric
|
|
5
|
+
Author-Email: =?utf-8?q?Micha=C5=82_Startek?= <michal.startek@mimuw.edu.pl>
|
|
6
|
+
Maintainer-Email: =?utf-8?q?Micha=C5=82_Startek?= <michal.startek@mimuw.edu.pl>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENCE
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
11
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Project-URL: Homepage, https://github.com/michalsta/wnetdeconv
|
|
14
|
+
Project-URL: Repository, https://github.com/michalsta/wnetdeconv.git
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Requires-Dist: pylmcf
|
|
17
|
+
Requires-Dist: wnet>=0.9.16
|
|
18
|
+
Requires-Dist: numpy
|
|
19
|
+
Provides-Extra: extras
|
|
20
|
+
Requires-Dist: pyopenms; extra == "extras"
|
|
21
|
+
Provides-Extra: pytest
|
|
22
|
+
Requires-Dist: pytest; extra == "pytest"
|
|
23
|
+
Requires-Dist: pandas; extra == "pytest"
|
|
24
|
+
Requires-Dist: numpy; extra == "pytest"
|
|
25
|
+
Requires-Dist: scipy; extra == "pytest"
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# wnetdeconv
|
|
29
|
+
|
|
30
|
+
Spectral deconvolution via Wasserstein optimal transport.
|
|
31
|
+
|
|
32
|
+
Given an empirical spectrum and a library of theoretical component spectra,
|
|
33
|
+
`wnetdeconv` finds the mixture proportions that minimise the total Wasserstein
|
|
34
|
+
transport cost between the empirical signal and the weighted sum of components.
|
|
35
|
+
The inner problem at each set of proportions is solved exactly as a min-cost
|
|
36
|
+
flow (via [pylmcf](https://github.com/michalsta/pylmcf) / LEMON), giving an
|
|
37
|
+
exact piecewise-linear objective with exact gradients — suitable for gradient-
|
|
38
|
+
based outer optimisation with scipy.
|
|
39
|
+
|
|
40
|
+
Supports 1-D spectra (NMR chemical shift, m/z) and higher-dimensional data
|
|
41
|
+
(e.g. m/z + retention time).
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install wnetdeconv
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Dependencies: `pylmcf`, `wnet`, `numpy`, `scipy`.
|
|
50
|
+
Optional: `pyopenms` for loading featureXML files.
|
|
51
|
+
|
|
52
|
+
## Concepts
|
|
53
|
+
|
|
54
|
+
### Spectra as distributions
|
|
55
|
+
|
|
56
|
+
A spectrum is a set of *(position, intensity)* pairs. In 1-D (NMR chemical
|
|
57
|
+
shift, m/z) use `Spectrum_1D`; for higher-dimensional data (m/z + retention
|
|
58
|
+
time) use `Spectrum` with a `(d, n)` positions array.
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from wnetdeconv import Spectrum_1D
|
|
62
|
+
|
|
63
|
+
empirical = Spectrum_1D([1.0, 2.0, 3.0], [10.0, 25.0, 15.0])
|
|
64
|
+
component = Spectrum_1D([1.0, 2.0, 3.0], [1.0, 2.0, 1.0])
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Transport cost
|
|
68
|
+
|
|
69
|
+
Matching a unit of intensity from an empirical peak at position *p* to a
|
|
70
|
+
theoretical peak at position *q* costs `distance(p, q)`. Peaks that cannot
|
|
71
|
+
be matched cheaply are instead routed to a *trash node* at a fixed penalty.
|
|
72
|
+
|
|
73
|
+
`max_distance` caps the farthest match considered; anything farther is cheaper
|
|
74
|
+
to trash. `trash_cost` (or the asymmetric pair
|
|
75
|
+
`experimental_trash_cost` / `theoretical_trash_cost`) sets that penalty.
|
|
76
|
+
|
|
77
|
+
### Precision and scaling
|
|
78
|
+
|
|
79
|
+
Internally all intensities and costs are scaled to integers for the MCF solver.
|
|
80
|
+
The `precision` parameter (default `1e-3`) sets the desired relative accuracy
|
|
81
|
+
of the cost output: `precision=1e-3` gives ≈ 3 significant figures. The same
|
|
82
|
+
value becomes the `ftol` stop criterion for scipy optimisers, so the outer loop
|
|
83
|
+
stops as soon as further improvement is below the resolution the integer network
|
|
84
|
+
can deliver.
|
|
85
|
+
|
|
86
|
+
## Solvers
|
|
87
|
+
|
|
88
|
+
### `DeconvSolver` — unconstrained baseline
|
|
89
|
+
|
|
90
|
+
Solves the network at a given point and exposes `total_cost()` and
|
|
91
|
+
`gradient()`. Optimisation (via `optimize()`, L-BFGS-B) minimises cost with
|
|
92
|
+
only non-negativity bounds.
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
from wnetdeconv import DeconvSolver, Spectrum_1D
|
|
96
|
+
from wnet.distances import DistanceMetric
|
|
97
|
+
|
|
98
|
+
emp = Spectrum_1D([1.0, 100.0], [10.0, 30.0])
|
|
99
|
+
t1 = Spectrum_1D([1.0], [2.0]) # optimal proportion: 5
|
|
100
|
+
t2 = Spectrum_1D([100.0], [3.0]) # optimal proportion: 10
|
|
101
|
+
|
|
102
|
+
solver = DeconvSolver(
|
|
103
|
+
empirical_spectrum=emp,
|
|
104
|
+
theoretical_spectra=[t1, t2],
|
|
105
|
+
distance=DistanceMetric.LINF,
|
|
106
|
+
max_distance=10.0,
|
|
107
|
+
trash_cost=100.0,
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
result = solver.optimize()
|
|
111
|
+
print(result.x) # [5. 10.]
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
You can also drive the solver manually — useful when embedding it in your own
|
|
115
|
+
optimisation loop:
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
solver.set_point([5.0, 10.0])
|
|
119
|
+
print(solver.total_cost()) # 0.0
|
|
120
|
+
print(solver.gradient()) # [0. 0.] (at the optimum)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### `ConstrainedSolver` — total-mass equality
|
|
124
|
+
|
|
125
|
+
Adds the constraint `Σ wₛ · Iₛ = I_emp` so that the mixture exactly accounts
|
|
126
|
+
for all empirical intensity. Uses SLSQP. Drop-in replacement for
|
|
127
|
+
`DeconvSolver`; call `optimize()` the same way.
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from wnetdeconv import ConstrainedSolver
|
|
131
|
+
|
|
132
|
+
solver = ConstrainedSolver(
|
|
133
|
+
empirical_spectrum=emp,
|
|
134
|
+
theoretical_spectra=[t1, t2],
|
|
135
|
+
distance=DistanceMetric.LINF,
|
|
136
|
+
max_distance=10.0,
|
|
137
|
+
trash_cost=100.0,
|
|
138
|
+
)
|
|
139
|
+
result = solver.optimize()
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Key parameters
|
|
143
|
+
|
|
144
|
+
| Parameter | Applies to | Description |
|
|
145
|
+
|---|---|---|
|
|
146
|
+
| `max_distance` | all | Maximum peak-to-peak match distance. Also sets the sparsity of the internal network in 1-D. |
|
|
147
|
+
| `trash_cost` | all | Symmetric penalty for unmatched peaks. |
|
|
148
|
+
| `experimental_trash_cost` | `DeconvSolver` | Per-unit penalty for discarding empirical mass. |
|
|
149
|
+
| `theoretical_trash_cost` | `DeconvSolver` | Per-unit penalty for discarding theoretical mass. |
|
|
150
|
+
| `precision` | all | Desired relative cost accuracy; drives `scale_factor` and `ftol` (default `1e-3`). |
|
|
151
|
+
| `scale_factor` | all | Override automatic scaling (bypasses `precision`). |
|
|
152
|
+
|
|
153
|
+
## Distance metrics
|
|
154
|
+
|
|
155
|
+
From `wnet.distances.DistanceMetric`:
|
|
156
|
+
|
|
157
|
+
- `L1` — sum of absolute coordinate differences (Manhattan / taxicab)
|
|
158
|
+
- `L2` — Euclidean distance
|
|
159
|
+
- `LINF` — maximum absolute coordinate difference (Chebyshev); dual of the W₁ earth-mover distance used by masserstein
|
|
160
|
+
|
|
161
|
+
## Loading MS data (featureXML)
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from wnetdeconv import Spectrum
|
|
165
|
+
|
|
166
|
+
emp = Spectrum.FromFeatureXML("sample.featureXML") # requires pyopenms
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## Architecture
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
wnetdeconv
|
|
173
|
+
├── Spectrum / Spectrum_1D — data containers (extend wnet.Distribution)
|
|
174
|
+
├── DeconvSolver — core: builds WassersteinNetwork, exposes cost + gradient
|
|
175
|
+
└── ConstrainedSolver — adds total-mass equality, uses SLSQP
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
The underlying min-cost flow is provided by
|
|
179
|
+
[wnet](https://github.com/michalsta/wnet) (network construction) and
|
|
180
|
+
[pylmcf](https://github.com/michalsta/pylmcf) (LEMON-based MCF algorithms,
|
|
181
|
+
including warm-restart Network Simplex).
|
|
182
|
+
|
|
183
|
+
## License
|
|
184
|
+
|
|
185
|
+
MIT
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# wnetdeconv
|
|
2
|
+
|
|
3
|
+
Spectral deconvolution via Wasserstein optimal transport.
|
|
4
|
+
|
|
5
|
+
Given an empirical spectrum and a library of theoretical component spectra,
|
|
6
|
+
`wnetdeconv` finds the mixture proportions that minimise the total Wasserstein
|
|
7
|
+
transport cost between the empirical signal and the weighted sum of components.
|
|
8
|
+
The inner problem at each set of proportions is solved exactly as a min-cost
|
|
9
|
+
flow (via [pylmcf](https://github.com/michalsta/pylmcf) / LEMON), giving an
|
|
10
|
+
exact piecewise-linear objective with exact gradients — suitable for gradient-
|
|
11
|
+
based outer optimisation with scipy.
|
|
12
|
+
|
|
13
|
+
Supports 1-D spectra (NMR chemical shift, m/z) and higher-dimensional data
|
|
14
|
+
(e.g. m/z + retention time).
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install wnetdeconv
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Dependencies: `pylmcf`, `wnet`, `numpy`, `scipy`.
|
|
23
|
+
Optional: `pyopenms` for loading featureXML files (`pip install "wnetdeconv[extras]"`).
|
|
24
|
+
|
|
25
|
+
## Concepts
|
|
26
|
+
|
|
27
|
+
### Spectra as distributions
|
|
28
|
+
|
|
29
|
+
A spectrum is a set of *(position, intensity)* pairs. In 1-D (NMR chemical
|
|
30
|
+
shift, m/z) use `Spectrum_1D`; for higher-dimensional data (m/z + retention
|
|
31
|
+
time) use `Spectrum` with a `(d, n)` positions array.
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from wnetdeconv import Spectrum_1D
|
|
35
|
+
|
|
36
|
+
empirical = Spectrum_1D([1.0, 2.0, 3.0], [10.0, 25.0, 15.0])
|
|
37
|
+
component = Spectrum_1D([1.0, 2.0, 3.0], [1.0, 2.0, 1.0])
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Transport cost
|
|
41
|
+
|
|
42
|
+
Matching a unit of intensity from an empirical peak at position *p* to a
|
|
43
|
+
theoretical peak at position *q* costs `distance(p, q)`. Peaks that cannot
|
|
44
|
+
be matched cheaply are instead routed to a *trash node* at a fixed penalty.
|
|
45
|
+
|
|
46
|
+
`max_distance` caps the farthest match considered; anything farther is cheaper
|
|
47
|
+
to trash. `trash_cost` (or the asymmetric pair
|
|
48
|
+
`experimental_trash_cost` / `theoretical_trash_cost`) sets that penalty.
|
|
49
|
+
|
|
50
|
+
### Precision and scaling
|
|
51
|
+
|
|
52
|
+
Internally all intensities and costs are scaled to integers for the MCF solver.
|
|
53
|
+
The `precision` parameter (default `1e-3`) sets the desired relative accuracy
|
|
54
|
+
of the cost output: `precision=1e-3` gives ≈ 3 significant figures. The same
|
|
55
|
+
value becomes the `ftol` stop criterion for scipy optimisers, so the outer loop
|
|
56
|
+
stops as soon as further improvement is below the resolution the integer network
|
|
57
|
+
can deliver.
|
|
58
|
+
|
|
59
|
+
## Solvers
|
|
60
|
+
|
|
61
|
+
### `DeconvSolver` — unconstrained baseline
|
|
62
|
+
|
|
63
|
+
Solves the network at a given point and exposes `total_cost()` and
|
|
64
|
+
`gradient()`. Optimisation (via `optimize()`, L-BFGS-B) minimises cost with
|
|
65
|
+
only non-negativity bounds.
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from wnetdeconv import DeconvSolver, Spectrum_1D
|
|
69
|
+
from wnet.distances import DistanceMetric
|
|
70
|
+
|
|
71
|
+
emp = Spectrum_1D([1.0, 100.0], [10.0, 30.0])
|
|
72
|
+
t1 = Spectrum_1D([1.0], [2.0]) # optimal proportion: 5
|
|
73
|
+
t2 = Spectrum_1D([100.0], [3.0]) # optimal proportion: 10
|
|
74
|
+
|
|
75
|
+
solver = DeconvSolver(
|
|
76
|
+
empirical_spectrum=emp,
|
|
77
|
+
theoretical_spectra=[t1, t2],
|
|
78
|
+
distance=DistanceMetric.LINF,
|
|
79
|
+
max_distance=10.0,
|
|
80
|
+
trash_cost=100.0,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
result = solver.optimize()
|
|
84
|
+
print(result.x) # [5. 10.]
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
You can also drive the solver manually — useful when embedding it in your own
|
|
88
|
+
optimisation loop:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
solver.set_point([5.0, 10.0])
|
|
92
|
+
print(solver.total_cost()) # 0.0
|
|
93
|
+
print(solver.gradient()) # [0. 0.] (at the optimum)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### `ConstrainedSolver` — total-mass equality
|
|
97
|
+
|
|
98
|
+
Adds the constraint `Σ wₛ · Iₛ = I_emp` so that the mixture exactly accounts
|
|
99
|
+
for all empirical intensity. Uses SLSQP. Drop-in replacement for
|
|
100
|
+
`DeconvSolver`; call `optimize()` the same way.
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
from wnetdeconv import ConstrainedSolver
|
|
104
|
+
|
|
105
|
+
solver = ConstrainedSolver(
|
|
106
|
+
empirical_spectrum=emp,
|
|
107
|
+
theoretical_spectra=[t1, t2],
|
|
108
|
+
distance=DistanceMetric.LINF,
|
|
109
|
+
max_distance=10.0,
|
|
110
|
+
trash_cost=100.0,
|
|
111
|
+
)
|
|
112
|
+
result = solver.optimize()
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Key parameters
|
|
116
|
+
|
|
117
|
+
| Parameter | Applies to | Description |
|
|
118
|
+
|---|---|---|
|
|
119
|
+
| `max_distance` | all | Maximum peak-to-peak match distance. Also sets the sparsity of the internal network in 1-D. |
|
|
120
|
+
| `trash_cost` | all | Symmetric penalty for unmatched peaks. |
|
|
121
|
+
| `experimental_trash_cost` | `DeconvSolver` | Per-unit penalty for discarding empirical mass. |
|
|
122
|
+
| `theoretical_trash_cost` | `DeconvSolver` | Per-unit penalty for discarding theoretical mass. |
|
|
123
|
+
| `precision` | all | Desired relative cost accuracy; drives `scale_factor` and `ftol` (default `1e-3`). |
|
|
124
|
+
| `scale_factor` | all | Override automatic scaling (bypasses `precision`). |
|
|
125
|
+
|
|
126
|
+
## Distance metrics
|
|
127
|
+
|
|
128
|
+
From `wnet.distances.DistanceMetric`:
|
|
129
|
+
|
|
130
|
+
- `L1` — sum of absolute coordinate differences (Manhattan / taxicab)
|
|
131
|
+
- `L2` — Euclidean distance
|
|
132
|
+
- `LINF` — maximum absolute coordinate difference (Chebyshev); dual of the W₁ earth-mover distance used by masserstein
|
|
133
|
+
|
|
134
|
+
## Loading MS data (featureXML)
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from wnetdeconv import Spectrum
|
|
138
|
+
|
|
139
|
+
emp = Spectrum.FromFeatureXML("sample.featureXML") # requires pyopenms
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Architecture
|
|
143
|
+
|
|
144
|
+
```
|
|
145
|
+
wnetdeconv
|
|
146
|
+
├── Spectrum / Spectrum_1D — data containers (extend wnet.Distribution)
|
|
147
|
+
├── DeconvSolver — core: builds WassersteinNetwork, exposes cost + gradient
|
|
148
|
+
└── ConstrainedSolver — adds total-mass equality, uses SLSQP
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
The underlying min-cost flow is provided by
|
|
152
|
+
[wnet](https://github.com/michalsta/wnet) (network construction) and
|
|
153
|
+
[pylmcf](https://github.com/michalsta/pylmcf) (LEMON-based MCF algorithms,
|
|
154
|
+
including warm-restart Network Simplex).
|
|
155
|
+
|
|
156
|
+
## License
|
|
157
|
+
|
|
158
|
+
MIT
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["scikit-build-core", "nanobind", "pylmcf", "wnet>=0.9.16"]
|
|
3
|
+
build-backend = "scikit_build_core.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "wnetdeconv"
|
|
7
|
+
version = "0.8.0"
|
|
8
|
+
description = "Python implementation of spectral deconvolution using Wasserstein metric"
|
|
9
|
+
requires-python = ">=3.9"
|
|
10
|
+
# scipy is a hard runtime dependency: wnetdeconv.solver imports scipy.optimize at module import time.
dependencies = ["pylmcf", "wnet>=0.9.16", "numpy", "scipy"]
|
|
11
|
+
authors = [{ name="Michał Startek", email="michal.startek@mimuw.edu.pl" }]
|
|
12
|
+
maintainers = [{ name="Michał Startek", email="michal.startek@mimuw.edu.pl" }]
|
|
13
|
+
readme = "README.md"
|
|
14
|
+
license = "MIT"
|
|
15
|
+
license-files = ["LICENCE"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Topic :: Scientific/Engineering :: Mathematics",
|
|
19
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
20
|
+
"Development Status :: 4 - Beta",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.urls]
|
|
24
|
+
"Homepage" = "https://github.com/michalsta/wnetdeconv"
|
|
25
|
+
"Repository" = "https://github.com/michalsta/wnetdeconv.git"
|
|
26
|
+
|
|
27
|
+
[project.optional-dependencies]
|
|
28
|
+
extras = ["pyopenms"]
|
|
29
|
+
pytest = ["pytest", "pandas", "numpy", "scipy"]
|
|
30
|
+
|
|
31
|
+
[tool.pytest.ini_options]
|
|
32
|
+
testpaths = ["pytest"]
|
|
33
|
+
|
|
34
|
+
[tool.scikit-build.sdist]
|
|
35
|
+
exclude = [
|
|
36
|
+
".github/",
|
|
37
|
+
"experiments/",
|
|
38
|
+
"pytest/",
|
|
39
|
+
"optimization_example.py",
|
|
40
|
+
"reinstall.sh",
|
|
41
|
+
".gitignore",
|
|
42
|
+
]
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
#! /usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
from . import wnetdeconv_cpp
|
|
6
|
+
from .solver import DeconvSolver, ConstrainedSolver, MagnetsteinSolver, MassersteinSolver
|
|
7
|
+
from .spectrum import Spectrum, Spectrum_1D
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def hello():
    """Print a fixed greeting from the pure-Python side of the package."""
    greeting = "Hello, world from wnetdeconv!"
    print(greeting)
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import importlib.metadata
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
__version__ = importlib.metadata.version("wnetdeconv")
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def module_main():
    """
    Command-line entry point for ``python -m wnetdeconv``.

    Parses the command line (``--version`` / ``-v`` prints the installed
    package version and exits; unknown arguments make argparse exit with an
    error) and then always prints the usage help.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description="WNetDeconv: A tool for spectral deconvolution."
    )
    cli.add_argument("--version", "-v", action="version", version=__version__)

    # NOTE: an "--include" / "-i" flag ("Print include path for C++ headers",
    # printing Path(__file__).parent / "cpp") is currently disabled, so the
    # parsed namespace carries nothing to act on and is not kept.
    cli.parse_args()

    cli.print_help()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
if __name__ == "__main__":
|
|
25
|
+
module_main()
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
#include <iostream>
|
|
2
|
+
#include <nanobind/nanobind.h>
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
// Writes a fixed greeting followed by a newline to standard output, flushes
// the stream, and returns 0.
int hello() {
    static const char kGreeting[] = "Hello, world from wnetdeconv_cpp!";
    std::cout << kGreeting << '\n' << std::flush;
    return 0;
}
|
|
9
|
+
|
|
10
|
+
// Defines the nanobind extension module `wnetdeconv_cpp` and registers the
// C++ function hello() under the Python name "hello", with the given docstring.
NB_MODULE(wnetdeconv_cpp, m) {
    m.def("hello", &hello, "A function that prints 'Hello, world from wnetdeconv_cpp!'");
}
|
|
@@ -0,0 +1,741 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from collections import namedtuple
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
from typing import Callable, Optional, Union, List, Tuple
|
|
5
|
+
import numpy as np
|
|
6
|
+
from scipy.optimize import minimize, OptimizeResult
|
|
7
|
+
|
|
8
|
+
from wnet import Distribution, WassersteinNetwork
|
|
9
|
+
|
|
10
|
+
_Flow = namedtuple("Flow", ["empirical_peak_idx", "theoretical_peak_idx", "flow"])
|
|
11
|
+
from wnet.distances import DistanceMetric
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DeconvSolver:
|
|
15
|
+
"""
|
|
16
|
+
Aligns an empirical spectrum to one or more theoretical spectra using a Wasserstein network approach.
|
|
17
|
+
Alignment of two empirical spectra E1, E2 can be performed by setting E1 as the empirical_spectrum
|
|
18
|
+
and E2 as the only element of theoretical_spectra.
|
|
19
|
+
|
|
20
|
+
Parameters
|
|
21
|
+
----------
|
|
22
|
+
empirical_spectrum : Distribution
|
|
23
|
+
The empirical spectrum to be aligned.
|
|
24
|
+
theoretical_spectra : Sequence[Distribution]
|
|
25
|
+
A sequence of theoretical spectra to align against.
|
|
26
|
+
distance : DistanceMetric
|
|
27
|
+
Metric used to compute the distance between empirical and theoretical peaks (e.g. ``DistanceMetric.LINF``).
|
|
28
|
+
max_distance : int or float
|
|
29
|
+
Maximum allowed distance for matching peaks.
|
|
30
|
+
trash_cost : int or float, optional
|
|
31
|
+
Cost for assigning unmatched peaks to trash (symmetric). Used as fallback for
|
|
32
|
+
experimental_trash_cost / theoretical_trash_cost when only one is set.
|
|
33
|
+
scale_factor : None, int, or float, optional
|
|
34
|
+
Scaling factor for intensities and costs. If None, it is computed from ``precision``.
|
|
35
|
+
precision : float, optional
|
|
36
|
+
Desired relative precision of the cost output (fraction of the theoretical cost
|
|
37
|
+
upper bound ``max_cost_per_unit_flow * max_sum_intensity``). Drives both the
|
|
38
|
+
auto scale_factor and the ``ftol`` stop criterion passed to scipy optimizers.
|
|
39
|
+
Ignored when ``scale_factor`` is supplied explicitly. Default 1e-3 (≈ 3
|
|
40
|
+
significant figures).
|
|
41
|
+
experimental_trash_cost : int or float, optional
|
|
42
|
+
Cost for discarding unmatched empirical peaks. Enables asymmetric trash mode.
|
|
43
|
+
theoretical_trash_cost : int or float, optional
|
|
44
|
+
Cost for discarding unmatched theoretical peaks. Enables asymmetric trash mode.
|
|
45
|
+
method : str, optional
|
|
46
|
+
Min-cost flow algorithm: ``"network_simplex"`` (default), ``"cycle_canceling"``, ``"cost_scaling"``, or ``"capacity_scaling"``.
|
|
47
|
+
Ignored when ``solver`` is provided.
|
|
48
|
+
solver : NetworkSimplex | CostScaling | CycleCanceling | CapacityScaling, optional
|
|
49
|
+
Solver configuration object. Takes precedence over ``method``.
|
|
50
|
+
Defaults to ``NetworkSimplex()`` (warm restarts, BLOCK_SEARCH pivot).
|
|
51
|
+
force_dense_1d : bool, optional
|
|
52
|
+
In 1D, force the O(m*n) dense factory instead of the O(m+n) chain
|
|
53
|
+
factory (default False = chain in 1D). Forwarded to
|
|
54
|
+
:class:`WassersteinNetwork`.
|
|
55
|
+
|
|
56
|
+
Attributes
|
|
57
|
+
----------
|
|
58
|
+
scale_factor : float
|
|
59
|
+
The scaling factor used for intensities and costs.
|
|
60
|
+
empirical_spectrum : Distribution
|
|
61
|
+
The scaled empirical spectrum.
|
|
62
|
+
theoretical_spectra : list[Distribution]
|
|
63
|
+
The scaled theoretical spectra.
|
|
64
|
+
graph : WassersteinNetwork
|
|
65
|
+
The underlying Wasserstein network graph.
|
|
66
|
+
point : Sequence[float] or np.ndarray or None
|
|
67
|
+
The current point for solving the alignment.
|
|
68
|
+
|
|
69
|
+
Methods
|
|
70
|
+
-------
|
|
71
|
+
set_point(point)
|
|
72
|
+
Sets the point for solving the alignment and runs the solver.
|
|
73
|
+
total_cost()
|
|
74
|
+
Returns the total cost of the alignment, rescaled to original units.
|
|
75
|
+
print()
|
|
76
|
+
Prints a string representation of the underlying graph.
|
|
77
|
+
flows()
|
|
78
|
+
Returns a list of flows (alignments) between empirical and theoretical peaks.
|
|
79
|
+
no_subgraphs()
|
|
80
|
+
Returns the number of subgraphs in the alignment network.
|
|
81
|
+
print_diagnostics(subgraphs_too=False)
|
|
82
|
+
Prints diagnostic information about the alignment and optionally about each subgraph.
|
|
83
|
+
"""
|
|
84
|
+
|
|
85
|
+
def __init__(
    self,
    empirical_spectrum: Distribution,
    theoretical_spectra: Sequence[Distribution],
    distance: DistanceMetric,
    max_distance: Union[int, float],
    trash_cost: Optional[Union[int, float]] = None,
    scale_factor: Optional[Union[int, float]] = None,
    experimental_trash_cost: Optional[Union[int, float]] = None,
    theoretical_trash_cost: Optional[Union[int, float]] = None,
    method: Optional[str] = None,
    solver=None,
    force_dense_1d: bool = False,
    precision: float = 1e-3,
) -> None:
    """
    Validate the arguments, choose a scale factor and build the network.

    The spectra are passed through ``positions_intensities_scaled(scale_factor)``
    and every cost (``max_distance`` and the trash costs) is multiplied by
    ``scale_factor`` and truncated with ``int()`` before being handed to
    :class:`WassersteinNetwork`. When ``scale_factor`` is None it is derived
    from ``precision``, the costs and the total intensities (see the comments
    in the body). After construction ``self.point`` is None until
    :meth:`set_point` is called.

    Raises
    ------
    ValueError
        If none of ``trash_cost``, ``experimental_trash_cost`` and
        ``theoretical_trash_cost`` is given; or, when the scale factor is
        auto-computed, if ``max_distance`` or an active trash cost is not
        positive, if the spectra carry no positive total intensity, or if
        the resulting scale factor cannot represent the smallest cost as a
        positive integer.
    TypeError
        If an argument has the wrong type.

    Warns
    -----
    UserWarning
        If the scale factor needed for the requested ``precision`` had to be
        clamped to stay within the integer range budget (``2**60``).
    """

    if (
        trash_cost is None
        and experimental_trash_cost is None
        and theoretical_trash_cost is None
    ):
        raise ValueError(
            "At least one of trash_cost, experimental_trash_cost, or theoretical_trash_cost must be provided."
        )

    if not isinstance(empirical_spectrum, Distribution):
        raise TypeError("empirical_spectrum must be a Distribution")
    if not isinstance(theoretical_spectra, Sequence):
        raise TypeError("theoretical_spectra must be a Sequence")
    if not all(isinstance(t, Distribution) for t in theoretical_spectra):
        raise TypeError("all theoretical_spectra elements must be Distribution")
    if not isinstance(max_distance, (int, float)):
        raise TypeError("max_distance must be a number")
    for name, val in [
        ("trash_cost", trash_cost),
        ("experimental_trash_cost", experimental_trash_cost),
        ("theoretical_trash_cost", theoretical_trash_cost),
    ]:
        if val is not None and not isinstance(val, (int, float)):
            raise TypeError(f"{name} must be a number")
    if scale_factor is not None and not isinstance(scale_factor, (int, float)):
        raise TypeError("scale_factor must be a number")

    # Asymmetric mode: either side-specific cost was given; the symmetric
    # trash_cost (possibly None) fills in for the side that was not.
    asymmetric = (
        experimental_trash_cost is not None or theoretical_trash_cost is not None
    )
    if asymmetric:
        eff_exp = (
            experimental_trash_cost
            if experimental_trash_cost is not None
            else trash_cost
        )
        eff_theo = (
            theoretical_trash_cost
            if theoretical_trash_cost is not None
            else trash_cost
        )
        active_costs = [c for c in (eff_exp, eff_theo) if c is not None]
    else:
        active_costs = [trash_cost]

    if scale_factor is None:
        ALMOST_MAXINT = 2**60
        empirical_sum_intensity = empirical_spectrum.sum_intensities
        theoretical_sum_intensity = sum(
            t.sum_intensities for t in theoretical_spectra
        )
        max_sum_intensity = max(empirical_sum_intensity, theoretical_sum_intensity)

        # Output-precision constraint (original): integer resolution
        # 1/sf^2 should be ~precision of the worst-case absolute cost
        # max_cost*max_sum_intensity, so
        #     sf >= sqrt(1/(precision * cost_scale)).
        # This used to be the only constraint. When the experimental
        # spectrum has huge unnormalized intensities (raw MS counts ~1e7+),
        # the formula drops sf so low that int(max_distance*sf) rounds to 0
        # and the graph factory builds zero edges (silent failure).
        #
        # Per-edge floor: int(min_cost_per_unit_flow * sf) must be at least
        # MIN_COST_TICKS so the cost map has usable resolution. Below ~25
        # the gradient signal is too coarse for L-BFGS-B to make progress
        # (empirical: scaled_MTD=10 on pbttt → 1 iter, scaled_MTD=25 → 36
        # iters with a real optimum). Going higher than ~25 produces more
        # accurate cost numbers but multiplies LEMON's pivot count on
        # large graphs (cold solve scales roughly with sf), so we cap the
        # auto floor at MIN_COST_TICKS rather than tying it to precision.
        # Pass scale_factor explicitly (or tighten precision) when more
        # input precision is needed.
        MIN_COST_TICKS = 25
        max_cost_per_unit_flow = max([max_distance] + active_costs)
        min_cost_per_unit_flow = min([max_distance] + active_costs)

        # The formulas below divide by the smallest cost and by
        # cost_scale. Non-positive values used to surface as
        # ZeroDivisionError, OverflowError or a bare assertion failure;
        # reject them up front with an explanation instead. ("not x > 0"
        # also catches NaN.)
        if not min_cost_per_unit_flow > 0:
            raise ValueError(
                "Cannot auto-compute scale_factor: max_distance and all trash "
                f"costs must be positive (smallest is {min_cost_per_unit_flow!r}). "
                "Use positive costs or pass an explicit scale_factor."
            )
        if not max_sum_intensity > 0:
            raise ValueError(
                "Cannot auto-compute scale_factor: the spectra carry no positive "
                f"total intensity (empirical_sum_intensity={empirical_sum_intensity!r}, "
                f"theoretical_sum_intensity={theoretical_sum_intensity!r}). "
                "Check the input spectra or pass an explicit scale_factor."
            )

        cost_scale = max_cost_per_unit_flow * max_sum_intensity
        sf_output = np.sqrt(1.0 / (precision * cost_scale))
        sf_floor = MIN_COST_TICKS / min_cost_per_unit_flow
        desired_sf = max(sf_output, sf_floor)
        # Upper bound keeping sf^2 * cost_scale within ALMOST_MAXINT.
        max_sf = np.sqrt(ALMOST_MAXINT / cost_scale)
        if desired_sf > max_sf:
            achieved_ticks = max_sf * min_cost_per_unit_flow
            achieved_out = 1.0 / (max_sf**2 * cost_scale)
            warnings.warn(
                f"Requested precision {precision} exceeds int64 capacity for this "
                f"dataset (cost_scale={cost_scale:.3g}, "
                f"min_cost={min_cost_per_unit_flow:.3g}); clamping scale_factor to "
                f"{max_sf:.3g}. Achieved cost precision {achieved_out:.2e} "
                f"(relative), min-cost integer ticks {achieved_ticks:.1f}."
            )
            scale_factor = max_sf
        else:
            scale_factor = desired_sf
        assert (
            scale_factor > 0
        ), "Can't auto-compute a sensible scale factor. You might have some luck with setting it manually, but it probably means something about your data or trash_cost is off."
        if int(min_cost_per_unit_flow * scale_factor) < 1:
            raise ValueError(
                f"Auto-computed scale_factor={scale_factor:.3g} cannot represent "
                f"min_cost_per_unit_flow={min_cost_per_unit_flow:.3g} as a "
                f"positive integer (the graph would have no edges). "
                f"empirical_sum_intensity={empirical_sum_intensity:.3g}, "
                f"theoretical_sum_intensity={theoretical_sum_intensity:.3g}. "
                f"Normalize the spectra, pass an explicit scale_factor, or "
                f"relax precision."
            )

    self.scale_factor = scale_factor
    # One integer tick of the scaled cost, expressed in original units
    # (total_cost() divides by scale_factor**2 as well).
    self._ftol = 1.0 / (scale_factor * scale_factor)
    self.empirical_spectrum = empirical_spectrum.positions_intensities_scaled(
        scale_factor
    )
    self.theoretical_spectra = [
        t.positions_intensities_scaled(scale_factor) for t in theoretical_spectra
    ]

    self.graph = WassersteinNetwork(
        self.empirical_spectrum,
        self.theoretical_spectra,
        distance,
        int(max_distance * scale_factor),
        force_dense_1d=force_dense_1d,
        method=method,
        solver=solver,
    )
    if asymmetric:
        if eff_exp is not None:
            self.graph.add_experimental_trash(int(eff_exp * scale_factor))
        if eff_theo is not None:
            self.graph.add_theoretical_trash(int(eff_theo * scale_factor))
    else:
        self.graph.add_simple_trash(int(trash_cost * scale_factor))
    self.graph.build()
    self.point = None
|
|
235
|
+
|
|
236
|
+
def set_point(self, point: Union[Sequence[float], np.ndarray]) -> None:
    """
    Record ``point`` as the current proportions and solve the graph there.

    Parameters
    ----------
    point : Sequence[float] or np.ndarray
        One proportion per theoretical spectrum. It is stored on
        ``self.point`` and passed, unchanged, to ``self.graph.solve``.

    Returns
    -------
    None
    """
    # The point is remembered before solving, so it is kept even if the
    # solve call raises.
    self.point = point
    network = self.graph
    network.solve(point)
|
|
251
|
+
|
|
252
|
+
def total_cost(self) -> float:
    """
    Total cost of the solved graph, mapped back to unscaled units.

    Only meaningful after set_point() has been called.

    Returns:
        float: ``self.graph.total_cost()`` divided by the square of
        ``self.scale_factor``.
    """
    squared_scale = self.scale_factor * self.scale_factor
    return self.graph.total_cost() / squared_scale
|
|
260
|
+
|
|
261
|
+
def print(self) -> None:
    """
    Write the string form of this instance's graph to standard output.

    Returns:
        None
    """
    graph_text = str(self.graph)
    print(graph_text)
|
|
269
|
+
|
|
270
|
+
def flows(self) -> list[_Flow]:
    """
    Collect the flow found for every theoretical spectrum.

    For each index ``i`` of ``self.theoretical_spectra`` the graph is asked
    for ``flows_for_target(i)``, which yields three values: the empirical
    peak indices, the theoretical peak indices and the raw flow. The raw
    flow is divided by ``self.scale_factor`` before being stored.

    Returns:
        list[_Flow]: One ``_Flow`` per theoretical spectrum, in order, with
        fields:
            - empirical_peak_idx: index/indices of the empirical peak(s).
            - theoretical_peak_idx: index/indices of the theoretical peak(s).
            - flow: flow between the peaks, divided by ``self.scale_factor``.
    """
    scale = self.scale_factor
    collected = []
    for target_idx, _ in enumerate(self.theoretical_spectra):
        emp_idx, theo_idx, raw_flow = self.graph.flows_for_target(target_idx)
        collected.append(_Flow(emp_idx, theo_idx, raw_flow / scale))
    return collected
|
|
291
|
+
|
|
292
|
+
def gradient(self) -> np.ndarray:
    """
    Gradient of total_cost with respect to the point (spectrum
    proportions). Can only be called after set_point().

    Returns
    -------
    np.ndarray
        One partial derivative per theoretical spectrum: the graph's
        ``spectrum_proportion_derivatives()`` cast to float and divided by
        the square of ``self.scale_factor``.
    """
    raw_derivatives = self.graph.spectrum_proportion_derivatives()
    squared_scale = self.scale_factor * self.scale_factor
    return raw_derivatives.astype(float) / squared_scale
|
|
306
|
+
|
|
307
|
+
def gradient_fast_approx(self) -> np.ndarray:
    """Cheap, APPROXIMATE gradient built from the dual-potential difference
    rather than the residual shortest-path marginal.

    It avoids the per-subgraph Dijkstra and is therefore much faster, but
    the result is a different, basis-dependent gradient: a lower bound on
    the true marginal that is exact only on the optimal flow support.
    Opt-in; do not swap it in for gradient() without validating
    convergence.

    Returns
    -------
    np.ndarray
        The graph's ``spectrum_proportion_derivatives_fast_approx()`` cast
        to float and divided by the square of ``self.scale_factor``.
    """
    raw_derivatives = self.graph.spectrum_proportion_derivatives_fast_approx()
    squared_scale = self.scale_factor * self.scale_factor
    return raw_derivatives.astype(float) / squared_scale
|
|
320
|
+
|
|
321
|
+
def optimize(self, x0: Optional[np.ndarray] = None) -> OptimizeResult:
    """
    Minimize total transport cost over non-negative spectrum proportions.

    Runs scipy's L-BFGS-B with a lower bound of 0 on every proportion and
    ``self._ftol`` as the ``ftol`` option. Every objective evaluation calls
    set_point(), so ``self.point`` and the graph state reflect the last
    point the optimizer evaluated.

    Parameters
    ----------
    x0 : np.ndarray, optional
        Starting proportions. When omitted, a vector of ones is used.

    Returns
    -------
    scipy.optimize.OptimizeResult
        Standard scipy result; ``.x`` holds the optimal proportions.
    """
    n_components = len(self.theoretical_spectra)
    start = np.ones(n_components) if x0 is None else x0

    def objective(weights):
        # jac=True below: cost and gradient are returned together after a
        # single solve at `weights`.
        self.set_point(weights)
        return self.total_cost(), self.gradient()

    nonnegative = [(0.0, None) for _ in range(n_components)]
    return minimize(
        objective,
        x0=start,
        jac=True,
        method="L-BFGS-B",
        bounds=nonnegative,
        options={"ftol": self._ftol},
    )
|
|
351
|
+
|
|
352
|
+
def no_subgraphs(self) -> int:
    """
    Number of subgraphs in the underlying Wasserstein network.

    Returns:
        int: Whatever ``self.graph.no_subgraphs()`` reports.
    """
    subgraph_count = self.graph.no_subgraphs()
    return subgraph_count
|
|
360
|
+
|
|
361
|
+
def print_diagnostics(self, subgraphs_too=False):
    """
    Print a summary of the network behind this solver to standard output.

    Parameters
    ----------
    subgraphs_too : bool, optional
        When True, a per-subgraph report follows the overall summary.

    Overall summary
    ---------------
    - Number of subgraphs
    - Number of empirical nodes
    - Number of theoretical nodes
    - Number of matching edges (dense factory)
    - Number of chain edges (1D chain factory)
    - Number of src-to-empirical edges
    - Number of theoretical-to-sink edges
    - Number of simple trash edges
    - Matching density
    - Scale factor (and its log10 value)
    - Total cost (as reported by the graph, i.e. not divided by the
      squared scale factor)

    Per-subgraph report (only if `subgraphs_too` is True)
    -----------------------------------------------------
    - Number of empirical nodes
    - Number of theoretical nodes
    - Number of matching, chain, src-to-empirical, theoretical-to-sink
      and simple trash edges
    - Cost
    - Matching density
    - Theoretical spectra involved
    """
    # (caption, name of the zero-argument graph method supplying the value)
    overall_rows = (
        ("No subgraphs:", "no_subgraphs"),
        ("No empirical nodes:", "count_empirical_nodes"),
        ("No theoretical nodes:", "count_theoretical_nodes"),
        ("No matching edges:", "count_matching_edges"),
        ("No chain edges:", "count_chain_edges"),
        ("No src-to-empirical edges:", "count_src_to_empirical_edges"),
        ("No theoretical-to-sink edges:", "count_theoretical_to_sink_edges"),
        ("No simple trash edges:", "count_simple_trash_edges"),
        ("Matching density:", "matching_density"),
    )
    subgraph_rows = (
        (" No. empirical nodes:", "count_empirical_nodes"),
        (" No. theoretical nodes:", "count_theoretical_nodes"),
        (" No. matching edges:", "count_matching_edges"),
        (" No. chain edges:", "count_chain_edges"),
        (" No. src-to-empirical edges:", "count_src_to_empirical_edges"),
        (" No. theoretical-to-sink edges:", "count_theoretical_to_sink_edges"),
        (" No. simple trash edges:", "count_simple_trash_edges"),
        (" Cost:", "total_cost"),
        (" Matching density:", "matching_density"),
        (" Theoretical spectra involved:", "theoretical_spectra_involved"),
    )

    whole = self.graph
    print("Diagnostics:")
    for caption, getter in overall_rows:
        print(caption, getattr(whole, getter)())
    print(
        "Scale factor:", self.scale_factor, f" log10: {np.log10(self.scale_factor)}"
    )
    print("Total cost:", whole.total_cost())

    if subgraphs_too:
        for index in range(whole.no_subgraphs()):
            part = whole.get_subgraph(index)
            print("Subgraph", index, ":")
            for caption, getter in subgraph_rows:
                print(caption, getattr(part, getter)())
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
class ConstrainedSolver(DeconvSolver):
    """
    DeconvSolver with a total-mass equality constraint:

        sum_s(w_s * total_intensity_s) = total_empirical_intensity

    This couples the proportions so that components with extra unmatched peaks
    (diluted libraries) are naturally down-weighted without tuning
    theo_trash_cost. The constraint is enforced during the call to
    optimize(), which uses SLSQP instead of L-BFGS-B.

    Both sides of the constraint are read from the spectra stored on the
    instance by DeconvSolver.__init__ (``self.empirical_spectrum`` and
    ``self.theoretical_spectra``), via their ``sum_intensities`` attribute.

    All DeconvSolver methods (set_point, total_cost, gradient, flows, …)
    are inherited unchanged and work identically.

    Parameters
    ----------
    Same as DeconvSolver.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Cache the totals once; they define the equality constraint used
        # by optimize().
        self._emp_total = self.empirical_spectrum.sum_intensities
        self._theo_totals = np.array(
            [t.sum_intensities for t in self.theoretical_spectra]
        )

    def optimize(self, x0: Optional[np.ndarray] = None) -> OptimizeResult:
        """
        Minimize total transport cost subject to the total-mass constraint.

        Parameters
        ----------
        x0 : np.ndarray, optional
            Initial proportions. Should satisfy the constraint (this is not
            checked here). Defaults to equal weights scaled to satisfy
            sum_s(w_s * I_s) = I_emp.

        Returns
        -------
        scipy.optimize.OptimizeResult
            Standard scipy result; .x holds the optimal proportions.

        Raises
        ------
        ValueError
            If ``x0`` is omitted and the theoretical intensities sum to zero
            or to a non-finite value (this includes the case of no
            theoretical spectra), because no finite default starting point
            exists.
        """
        n = len(self.theoretical_spectra)
        if x0 is None:
            theo_sum = self._theo_totals.sum()
            # Without this guard the division below yields inf/NaN, which
            # would be handed to SLSQP as the starting point.
            if theo_sum == 0 or not np.isfinite(theo_sum):
                raise ValueError(
                    "Cannot build a default starting point: the theoretical "
                    f"spectra have total intensity {theo_sum}. Provide "
                    "non-empty theoretical spectra or pass x0 explicitly."
                )
            x0 = np.full(n, self._emp_total / theo_sum)

        def cost_and_grad(w):
            # jac=True below: cost and gradient come from one solve at w.
            self.set_point(w)
            return self.total_cost(), self.gradient()

        constraint = {
            "type": "eq",
            "fun": lambda w: np.dot(w, self._theo_totals) - self._emp_total,
            # The constraint is linear in w, so its Jacobian is constant.
            "jac": lambda w: self._theo_totals,
        }

        return minimize(
            cost_and_grad,
            x0=x0,
            jac=True,
            method="SLSQP",
            bounds=[(0.0, None)] * n,
            constraints=constraint,
            options={"maxiter": 2000, "ftol": self._ftol},
        )
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
class MagnetsteinSolver(ConstrainedSolver):
    """
    ConstrainedSolver variant that first rescales every spectrum to unit
    total intensity, reproducing magnetstein's dual-LP problem formulation.

    Because all spectra sum to 1, the total-mass equality constraint becomes
    sum(w) = 1, matching the LP's implicit mass-balance condition.
    experimental_trash_cost = MTD and theoretical_trash_cost = MTD_th
    correspond directly to magnetstein's penalty and penalty_th parameters.

    Parameters
    ----------
    empirical_spectrum : Distribution
        The empirical spectrum (normalized internally to sum to 1).
    theoretical_spectra : Sequence[Distribution]
        A sequence of theoretical spectra (each normalized internally).
    distance : DistanceMetric
        Distance metric. Use DistanceMetric.L1 for 1D NMR spectra.
    MTD : float
        Maximum Transport Distance for the mix (experimental trash cost).
    MTD_th : float, optional
        Maximum Transport Distance for components (theoretical trash cost).
        If None, uses symmetric trash with cost MTD.
    method : str, optional
        Min-cost flow algorithm (default: ``"network_simplex"``). Ignored when ``solver`` is provided.
    solver : NetworkSimplex | CostScaling | CycleCanceling | CapacityScaling, optional
        Solver configuration object. Takes precedence over ``method``.
    precision : float, optional
        Forwarded unchanged to the base-class constructor (default 1e-3);
        see DeconvSolver for its meaning.
    """

    def __init__(
        self,
        empirical_spectrum: Distribution,
        theoretical_spectra: Sequence[Distribution],
        distance: DistanceMetric,
        MTD: float,
        MTD_th: Optional[float] = None,
        method: str = None,
        solver=None,
        precision: float = 1e-3,
    ) -> None:
        unit_empirical = empirical_spectrum.normalized()
        unit_theoreticals = [spec.normalized() for spec in theoretical_spectra]

        # Only the trash-related keywords differ between the two modes.
        if MTD_th is None:
            # Single shared trash cost.
            trash_options = {"max_distance": MTD, "trash_cost": MTD}
        else:
            # Separate experimental / theoretical trash costs; the transport
            # cap is the larger of the two.
            trash_options = {
                "max_distance": max(MTD, MTD_th),
                "experimental_trash_cost": MTD,
                "theoretical_trash_cost": MTD_th,
            }

        super().__init__(
            unit_empirical,
            unit_theoreticals,
            distance,
            method=method,
            solver=solver,
            precision=precision,
            **trash_options,
        )
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
class MassersteinSolver(DeconvSolver):
    """
    Reproduces masserstein's ``dualdeconv2`` / ``dualdeconv4``.

    All spectra are normalized to sum to 1 internally (as dualdeconv2
    requires). The distance is always LINF (= absolute distance in 1D, the
    dual of W1 / earth mover's distance used by masserstein).

    Faithful model of dualdeconv2's LP
    ----------------------------------
    dualdeconv2 prices transport at the true linear W1 cost with an
    experimental abyss at ``MTD``, and has *no theoretical abyss*: every unit
    of ``w_k * theo_k`` must reach an experimental position — a component is
    discarded only by driving ``w_k -> 0``, never by trashing theoretical
    mass. Transporting a unit farther than ``MTD`` is never optimal in that
    LP (the experimental abyss at ``MTD`` is always cheaper), so ``MTD`` is
    already the LP's *effective* transport cap. We reproduce that with:

    * ``max_distance = MTD`` — the effective cap; also keeps the 1D chain
      sparse (O(m+n)) instead of dense (O(m*n)) on real spectra;
    * ``experimental_trash_cost = MTD`` — the denoising penalty;
    * ``theoretical_trash_cost = theo_trash_mult * MTD`` (dualdeconv2 case;
      ``theo_trash_mult`` defaults to 10). This is a numerical device only:
      with experimental-only trash the inner min-cost-flow cost ``f(w)`` is
      degenerate/flat (un-routable theoretical mass is dropped for free, so
      the outer optimizer gets a zero gradient and returns its starting
      point — the old bug). Any cost strictly above the ``MTD`` transport
      cap is never chosen over transporting or lowering ``w_k``, so it
      carries no flow at the optimum (= "no theoretical abyss; drop the
      component by lowering w_k") yet makes ``f(w)`` well-defined and convex
      for every ``w``.
      Choosing the multiplier is a trade-off: the auto ``scale_factor``
      divides by ``max_cost_per_unit_flow``, so a large value shrinks it and
      loses m/z precision. A sweep (2/4/8/20x) showed 2x gives the best
      Part-1 agreement (L1 ~2e-7 vs dualdeconv2) while 8x already degrades
      it ~4x, whereas ~10x is what the minimal dense-noise divergence
      example needs (see ``theo_trash_mult`` below). The fixed-integer
      network's dynamic range makes a true +inf infeasible, so this is a
      deliberate approximation, exact for fully-placeable and
      fully-unplaceable components, slightly soft for partial placement.
      NOTE(review): this text previously described 2x as the value in use,
      while the signature default is 10.0 — confirm the sweep figures are
      still representative for the current default.

    Residual caveats:
    * dualdeconv2 solves one joint LP (proportions = exact shadow prices);
      this is a nested optimization (SLSQP over ``w``, inner MCF). The
      objective and noise/sum behaviour match, but under degeneracy
      (near-collinear components) per-component proportions agree only to
      optimizer tolerance, not bit-exactly.
    * On raw unfiltered spectra the two formulations agree closely in
      controlled tests (single/multi-component, collinear decoys, dense
      overlapping + noise — see
      ``experiments/direct_dualdeconv2_{nofilter,multi,dense}.py``):
      objective to ~1e-5, signal fraction to ~1%, decoys zeroed.
    * On DENSE-noisy mass spectra (e.g. hemoglobin Part 2 in
      ``compare_dualdeconv2.py``) this reproduction breaks structurally:
      the nested empirical->theoretical MCF matches per peak with the
      sum Σ w_j*theo_j, while dualdeconv2's joint LP couples all isotope
      positions of a component via Σ thr_ji Z_i ≤ 0. An 11-config grid
      search (``experiments/grid_search_masserstein.py``) over
      max_distance and theoretical_trash_cost found that NO setting
      bridges the gap — larger max_distance makes it worse (more noise
      targets), larger theo_trash does nothing (theo-trash never fires at
      the optimum on dense noise), and either breaks the minimal case
      first. Cross-scoring confirms it: at w_wnet, masserstein's own LP
      gives ~100x worse cost than at w_dd2 — i.e. wnetdeconv's reported
      ``fun`` is its own (lenient) model, not a competitive solution to
      masserstein's LP. For inputs in this regime use
      ``masserstein.estimate_proportions`` (which pre-filters to the
      theoretical envelope, the agreement regime) or call
      ``dualdeconv2`` directly — not this class.

    ``deconvolve()`` uses SLSQP with bounds w_k >= 0 and the explicit
    inequality constraint sum(w_k) <= 1, which dualdeconv2 enforces implicitly
    via sum(probs) + sum(abyss) = 1, abyss >= 0.

    With ``MTD_th=None`` this reproduces dualdeconv2 (theoretical trash cost
    is the ``theo_trash_mult * MTD`` proxy); with ``MTD_th`` set it
    reproduces dualdeconv4 (real theoretical penalty ``MTD_th``, and
    ``max_distance = max(MTD, MTD_th)``).

    Parameters
    ----------
    empirical_spectrum : Distribution
        Empirical spectrum (normalized internally to sum to 1).
    theoretical_spectra : Sequence[Distribution]
        Theoretical spectra (each normalized internally).
    MTD : float
        Maximum Transport Distance / denoising penalty (``penalty`` in dualdeconv2).
    MTD_th : float, optional
        Separate theoretical trash cost. None → dualdeconv2 path (the
        theoretical trash cost is ``theo_trash_mult * MTD``);
        non-None → dualdeconv4 path (the theoretical trash cost is
        ``MTD_th`` and ``theo_trash_mult`` is not used).
    theo_trash_mult : float, optional
        Multiplier on MTD for the +inf-proxy theoretical trash cost
        (dualdeconv2 path only). Default 10x is what fixes the
        minimal-divergence example
        (``experiments/minimal_dense_noise_divergence.py``); below ~10x the
        nested MCF under-prices un-routable theoretical mass relative to
        masserstein's real-distance transport. Should be at least as large as
        the maximum inter-isotope distance you expect un-routed mass to need
        to travel (in m/z units of MTD). Above ~few hundred it can lose
        precision via the auto ``scale_factor``.
    method : str, optional
        Min-cost flow algorithm. Ignored when ``solver`` is provided.
    solver : NetworkSimplex | CostScaling | CycleCanceling | CapacityScaling, optional
        Solver configuration object. Takes precedence over ``method``.
    precision : float, optional
        Forwarded unchanged to DeconvSolver (default 1e-3); see that class
        for its meaning.
    """

    def __init__(
        self,
        empirical_spectrum: Distribution,
        theoretical_spectra: Sequence[Distribution],
        MTD: float,
        MTD_th: Optional[float] = None,
        theo_trash_mult: float = 10.0,
        method: str = None,
        solver=None,
        precision: float = 1e-3,
    ) -> None:
        emp = empirical_spectrum.normalized()
        theos = [t.normalized() for t in theoretical_spectra]

        if MTD_th is None:
            # dualdeconv2 path. The theoretical trash cost is an effective
            # +inf: large enough that the optimizer prefers lowering w_k
            # over carrying flow on this edge — i.e. mimics masserstein's
            # "no theoretical abyss; transport at real distance". Default
            # 10x covers the typical asymmetric-isotope case; user can dial
            # up if inter-isotope distances >> MTD.
            max_distance = MTD
            theoretical_trash_cost = theo_trash_mult * MTD
        else:
            # dualdeconv4 path: a real theoretical penalty.
            max_distance = max(MTD, MTD_th)
            theoretical_trash_cost = MTD_th

        super().__init__(
            emp,
            theos,
            distance=DistanceMetric.LINF,
            max_distance=max_distance,
            experimental_trash_cost=MTD,
            theoretical_trash_cost=theoretical_trash_cost,
            method=method,
            solver=solver,
            precision=precision,
        )

    def deconvolve(self, x0: Optional[np.ndarray] = None) -> dict:
        """
        Find optimal component proportions, matching dualdeconv2's output format.

        Runs SLSQP with bounds ``w_k >= 0`` and the inequality constraint
        ``sum(w_k) <= 1``. Every objective evaluation calls set_point(), so
        the instance reflects the last point the optimizer evaluated.

        Parameters
        ----------
        x0 : np.ndarray, optional
            Initial proportions. Defaults to uniform 1/(2k) (interior of feasible set).

        Returns
        -------
        dict
            probs   : list[float] – weight of each theoretical spectrum
            fun     : float – optimal transport cost (= dual LP objective)
            success : bool
        """
        n = len(self.theoretical_spectra)
        if x0 is None:
            # Sums to 1/2, strictly inside both the bounds and sum(w) <= 1.
            x0 = np.ones(n) / (2 * n)

        def cost_and_grad(w):
            # jac=True below: cost and gradient come from one solve at w.
            self.set_point(w)
            return self.total_cost(), self.gradient()

        # scipy "ineq" constraints mean fun(w) >= 0, i.e. sum(w) <= 1 here.
        constraints = [{
            "type": "ineq",
            "fun": lambda w: 1.0 - w.sum(),
            "jac": lambda w: -np.ones(n),
        }]

        result = minimize(
            cost_and_grad,
            x0=x0,
            jac=True,
            method="SLSQP",
            bounds=[(0.0, None)] * n,
            constraints=constraints,
            options={"maxiter": 2000, "ftol": self._ftol},
        )
        return {"probs": list(result.x), "fun": result.fun, "success": result.success}
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
from functools import cached_property
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
from wnet import Distribution
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Spectrum(Distribution):
    """
    A class representing NMR or MS spectrum data.
    """

    def __init__(
        self,
        positions: np.ndarray,
        intensities: np.ndarray,
        label: Optional[str] = None,
    ):
        """
        Initialize a Spectrum object. Compared to Distribution, this class
        retains the original intensities (not converted to int) for more precise
        scaling operations. They are stored in the `original_intensities` attribute.
        They are still converted to int before running any alignment algorithms.

        Parameters
        ----------
        positions : np.ndarray
            The spatial coordinates of the spectrum (e.g., m/z and RT for MS).
        intensities : np.ndarray
            The intensity values corresponding to the spatial coordinates.
            Array-likes such as lists are accepted and kept as an ndarray.
        label : str, optional
            Label passed through to Distribution.
        """
        # Coerce to an ndarray so that scaled()/normalized() perform
        # element-wise arithmetic even when a plain list/tuple is passed
        # (``list * 2`` would repeat the list, ``list / x`` would raise).
        # For ndarray input np.asarray returns the same object, no copy.
        self.original_intensities = np.asarray(intensities)
        super().__init__(positions, intensities, label=label)

    @staticmethod
    def FromFeatureXML(path):
        """
        Parse a featureXML file and return a Spectrum object.

        Requires ``pyopenms`` (imported lazily here). Positions are built as
        a two-row array ``[m/z values, retention times]`` with one column per
        feature; intensities are the feature intensities.
        """
        import pyopenms as oms

        # load the featureXML file
        featureXML = oms.FeatureXMLFile()
        features = oms.FeatureMap()
        featureXML.load(path, features)
        # load m/z, rt, and intensity values from the features
        mzs = []
        rts = []
        intensities = []
        for feature in features:
            mzs.append(feature.getMZ())
            rts.append(feature.getRT())
            intensities.append(feature.getIntensity())
        # create a Spectrum object
        spectrum = Spectrum(np.array([mzs, rts]), np.array(intensities))
        return spectrum

    @cached_property
    def sum_intensities(self) -> float:
        """
        Return the sum of the original intensities.

        Computed once per instance and then cached, so later in-place
        changes to ``original_intensities`` are not reflected.
        """
        return np.sum(self.original_intensities)

    def scaled(self, factor: float) -> "Spectrum":
        """
        Return a new Spectrum object with intensities scaled by the given factor.

        Parameters
        ----------
        factor : float
            The scaling factor to apply to the intensities.

        Returns
        -------
        Spectrum
            A new Spectrum object with scaled intensities.
        """
        return Spectrum(
            self.positions, self.original_intensities * factor, label=self.label
        )

    def positions_intensities_scaled(self, scale_factor: float) -> "Spectrum":
        """
        Return a new Spectrum with both positions and intensities scaled by the given factor.

        Parameters
        ----------
        scale_factor : float
            The scaling factor to apply to positions and intensities.

        Returns
        -------
        Spectrum
            A new Spectrum object with scaled positions and intensities.
        """
        # Positions are promoted to float64 before scaling.
        new_positions = self.positions.astype(np.float64, copy=False) * scale_factor
        return Spectrum(new_positions, self.original_intensities * scale_factor, label=self.label)

    def normalized(self) -> "Spectrum":
        """
        Return a new Spectrum object with intensities normalized to sum to 1.

        Returns
        -------
        Spectrum
            A new Spectrum object with normalized intensities.

        Raises
        ------
        ValueError
            If the total intensity is 0.
        """
        total = self.sum_intensities
        if total == 0:
            raise ValueError("Cannot normalize a spectrum with total intensity of 0.")
        return Spectrum(
            self.positions, self.original_intensities / total, label=self.label
        )

    def as_distribution(self) -> Distribution:
        """
        Convert the Spectrum object to a Distribution object.

        Returns
        -------
        Distribution
            A Distribution built from ``self.positions``, ``self.intensities``
            (the attribute provided by Distribution, not
            ``original_intensities``) and ``self.label``.
        """
        return Distribution(self.positions, self.intensities, label=self.label)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def Spectrum_1D(
    positions: np.ndarray, intensities: np.ndarray, label: Optional[str] = None
) -> Spectrum:
    """
    Build a Spectrum from one-dimensional positions and intensities.

    Non-ndarray inputs are converted with ``np.array`` first. The positions
    are handed to Spectrum as a single-row 2-D array (``positions[np.newaxis, :]``).

    Parameters
    ----------
    positions : np.ndarray
        The spatial coordinates of the spectrum (e.g., m/z for MS).
    intensities : np.ndarray
        The intensity values corresponding to the spatial coordinates.
    label : str, optional
        An optional label for the spectrum.

    Returns
    -------
    Spectrum
        A 1D Spectrum object.

    Raises
    ------
    ValueError
        If either input is not one-dimensional, or their lengths differ.
    """
    pos = positions if isinstance(positions, np.ndarray) else np.array(positions)
    ints = intensities if isinstance(intensities, np.ndarray) else np.array(intensities)

    if pos.ndim != 1:
        raise ValueError(f"positions must be 1D, got shape {pos.shape}")
    if ints.ndim != 1:
        raise ValueError(f"intensities must be 1D, got shape {ints.shape}")

    n_positions, n_intensities = pos.shape[0], ints.shape[0]
    if n_positions != n_intensities:
        raise ValueError(
            f"positions and intensities must have the same length, got {n_positions} and {n_intensities}"
        )

    return Spectrum(pos[np.newaxis, :], ints, label=label)
|