wnetdeconv 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
cmake_minimum_required(VERSION 3.15)

project(wnetdeconv LANGUAGES CXX)

# Locate the interpreter before querying any Python package, so the queries
# below run under the same Python the extension module is built for rather
# than whatever `python` happens to be first on PATH.
find_package(Python 3.8
  REQUIRED COMPONENTS Interpreter Development.Module
  OPTIONAL_COMPONENTS Development.SABIModule)
find_package(nanobind REQUIRED CONFIG)

# wnetdeconv_query_include_dir(<module> <out_var>)
#
# Runs `<python> -m <module> --include` at configure time and stores the
# whitespace-stripped output in <out_var> in the caller's scope. Aborts the
# configure step if the command fails or prints nothing, instead of silently
# continuing with an empty include path.
#
# Example: wnetdeconv_query_include_dir(wnet WNET_INCLUDE_DIRS)
function(wnetdeconv_query_include_dir module out_var)
  execute_process(
    COMMAND "${Python_EXECUTABLE}" -m "${module}" --include
    RESULT_VARIABLE query_result
    OUTPUT_VARIABLE query_output
    ERROR_VARIABLE query_error
    OUTPUT_STRIP_TRAILING_WHITESPACE
  )
  if(NOT "${query_result}" STREQUAL "0" OR "${query_output}" STREQUAL "")
    message(FATAL_ERROR
      "Could not obtain include directory from '${Python_EXECUTABLE} -m ${module} --include' "
      "(result: ${query_result}): ${query_error}")
  endif()
  set(${out_var} "${query_output}" PARENT_SCOPE)
endfunction()

wnetdeconv_query_include_dir(pylmcf PYLMCF_INCLUDE_DIRS)
wnetdeconv_query_include_dir(wnet WNET_INCLUDE_DIRS)

set(CMAKE_CXX_STANDARD 20)

nanobind_add_module(wnetdeconv_cpp
  NB_STATIC NOMINSIZE
  src/wnetdeconv/cpp/wnetdeconv/wnetdeconv.cpp)

target_include_directories(wnetdeconv_cpp
  PRIVATE
    "${PYLMCF_INCLUDE_DIRS}"
    "${WNET_INCLUDE_DIRS}")
target_compile_definitions(wnetdeconv_cpp PRIVATE INCLUDE_NANOBIND_STUFF)

install(TARGETS wnetdeconv_cpp LIBRARY DESTINATION wnetdeconv)
@@ -0,0 +1,7 @@
1
+ This software is Copyright 2026 Michał Startek, and provided under terms of MIT licence, below.
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,185 @@
1
+ Metadata-Version: 2.4
2
+ Name: wnetdeconv
3
+ Version: 0.8.0
4
+ Summary: Python implementation of spectral deconvolution using Wasserstein metric
5
+ Author-Email: =?utf-8?q?Micha=C5=82_Startek?= <michal.startek@mimuw.edu.pl>
6
+ Maintainer-Email: =?utf-8?q?Micha=C5=82_Startek?= <michal.startek@mimuw.edu.pl>
7
+ License-Expression: MIT
8
+ License-File: LICENCE
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
11
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
12
+ Classifier: Development Status :: 4 - Beta
13
+ Project-URL: Homepage, https://github.com/michalsta/wnetdeconv
14
+ Project-URL: Repository, https://github.com/michalsta/wnetdeconv.git
15
+ Requires-Python: >=3.9
16
+ Requires-Dist: pylmcf
17
+ Requires-Dist: wnet>=0.9.16
18
+ Requires-Dist: numpy
19
+ Provides-Extra: extras
20
+ Requires-Dist: pyopenms; extra == "extras"
21
+ Provides-Extra: pytest
22
+ Requires-Dist: pytest; extra == "pytest"
23
+ Requires-Dist: pandas; extra == "pytest"
24
+ Requires-Dist: numpy; extra == "pytest"
25
+ Requires-Dist: scipy; extra == "pytest"
26
+ Description-Content-Type: text/markdown
27
+
28
+ # wnetdeconv
29
+
30
+ Spectral deconvolution via Wasserstein optimal transport.
31
+
32
+ Given an empirical spectrum and a library of theoretical component spectra,
33
+ `wnetdeconv` finds the mixture proportions that minimise the total Wasserstein
34
+ transport cost between the empirical signal and the weighted sum of components.
35
+ The inner problem at each set of proportions is solved exactly as a min-cost
36
+ flow (via [pylmcf](https://github.com/michalsta/pylmcf) / LEMON), giving an
37
+ exact piecewise-linear objective with exact gradients — suitable for
38
+ gradient-based outer optimisation with scipy.
39
+
40
+ Supports 1-D spectra (NMR chemical shift, m/z) and higher-dimensional data
41
+ (e.g. m/z + retention time).
42
+
43
+ ## Installation
44
+
45
+ ```bash
46
+ pip install wnetdeconv
47
+ ```
48
+
49
+ Dependencies: `pylmcf`, `wnet`, `numpy`, `scipy`.
50
+ Optional: `pyopenms` for loading featureXML files.
51
+
52
+ ## Concepts
53
+
54
+ ### Spectra as distributions
55
+
56
+ A spectrum is a set of *(position, intensity)* pairs. In 1-D (NMR chemical
57
+ shift, m/z) use `Spectrum_1D`; for higher-dimensional data (m/z + retention
58
+ time) use `Spectrum` with a `(d, n)` positions array.
59
+
60
+ ```python
61
+ from wnetdeconv import Spectrum_1D
62
+
63
+ empirical = Spectrum_1D([1.0, 2.0, 3.0], [10.0, 25.0, 15.0])
64
+ component = Spectrum_1D([1.0, 2.0, 3.0], [1.0, 2.0, 1.0])
65
+ ```
66
+
67
+ ### Transport cost
68
+
69
+ Matching a unit of intensity from an empirical peak at position *p* to a
70
+ theoretical peak at position *q* costs `distance(p, q)`. Peaks that cannot
71
+ be matched cheaply are instead routed to a *trash node* at a fixed penalty.
72
+
73
+ `max_distance` caps the farthest match considered; anything farther is cheaper
74
+ to trash. `trash_cost` (or the asymmetric pair
75
+ `experimental_trash_cost` / `theoretical_trash_cost`) sets that penalty.
76
+
77
+ ### Precision and scaling
78
+
79
+ Internally all intensities and costs are scaled to integers for the MCF solver.
80
+ The `precision` parameter (default `1e-3`) sets the desired relative accuracy
81
+ of the cost output: `precision=1e-3` gives ≈ 3 significant figures. The same
82
+ value becomes the `ftol` stop criterion for scipy optimisers, so the outer loop
83
+ stops as soon as further improvement is below the resolution the integer network
84
+ can deliver.
85
+
86
+ ## Solvers
87
+
88
+ ### `DeconvSolver` — unconstrained baseline
89
+
90
+ Solves the network at a given point and exposes `total_cost()` and
91
+ `gradient()`. Optimisation (via `optimize()`, L-BFGS-B) minimises cost with
92
+ only non-negativity bounds.
93
+
94
+ ```python
95
+ from wnetdeconv import DeconvSolver, Spectrum_1D
96
+ from wnet.distances import DistanceMetric
97
+
98
+ emp = Spectrum_1D([1.0, 100.0], [10.0, 30.0])
99
+ t1 = Spectrum_1D([1.0], [2.0]) # optimal proportion: 5
100
+ t2 = Spectrum_1D([100.0], [3.0]) # optimal proportion: 10
101
+
102
+ solver = DeconvSolver(
103
+ empirical_spectrum=emp,
104
+ theoretical_spectra=[t1, t2],
105
+ distance=DistanceMetric.LINF,
106
+ max_distance=10.0,
107
+ trash_cost=100.0,
108
+ )
109
+
110
+ result = solver.optimize()
111
+ print(result.x) # [5. 10.]
112
+ ```
113
+
114
+ You can also drive the solver manually — useful when embedding it in your own
115
+ optimisation loop:
116
+
117
+ ```python
118
+ solver.set_point([5.0, 10.0])
119
+ print(solver.total_cost()) # 0.0
120
+ print(solver.gradient()) # [0. 0.] (at the optimum)
121
+ ```
122
+
123
+ ### `ConstrainedSolver` — total-mass equality
124
+
125
+ Adds the constraint `Σ wₛ · Iₛ = I_emp` so that the mixture exactly accounts
126
+ for all empirical intensity. Uses SLSQP. Drop-in replacement for
127
+ `DeconvSolver`; call `optimize()` the same way.
128
+
129
+ ```python
130
+ from wnetdeconv import ConstrainedSolver
131
+
132
+ solver = ConstrainedSolver(
133
+ empirical_spectrum=emp,
134
+ theoretical_spectra=[t1, t2],
135
+ distance=DistanceMetric.LINF,
136
+ max_distance=10.0,
137
+ trash_cost=100.0,
138
+ )
139
+ result = solver.optimize()
140
+ ```
141
+
142
+ ## Key parameters
143
+
144
+ | Parameter | Applies to | Description |
145
+ |---|---|---|
146
+ | `max_distance` | all | Maximum peak-to-peak match distance. Also sets the sparsity of the internal network in 1-D. |
147
+ | `trash_cost` | all | Symmetric penalty for unmatched peaks. |
148
+ | `experimental_trash_cost` | `DeconvSolver` | Per-unit penalty for discarding empirical mass. |
149
+ | `theoretical_trash_cost` | `DeconvSolver` | Per-unit penalty for discarding theoretical mass. |
150
+ | `precision` | all | Desired relative cost accuracy; drives `scale_factor` and `ftol` (default `1e-3`). |
151
+ | `scale_factor` | all | Override automatic scaling (bypasses `precision`). |
152
+
153
+ ## Distance metrics
154
+
155
+ From `wnet.distances.DistanceMetric`:
156
+
157
+ - `L1` — sum of absolute coordinate differences (Manhattan / taxicab)
158
+ - `L2` — Euclidean distance
159
+ - `LINF` — maximum absolute coordinate difference (Chebyshev); dual of the W₁ earth-mover distance used by masserstein
160
+
161
+ ## Loading MS data (featureXML)
162
+
163
+ ```python
164
+ from wnetdeconv import Spectrum
165
+
166
+ emp = Spectrum.FromFeatureXML("sample.featureXML") # requires pyopenms
167
+ ```
168
+
169
+ ## Architecture
170
+
171
+ ```
172
+ wnetdeconv
173
+ ├── Spectrum / Spectrum_1D — data containers (extend wnet.Distribution)
174
+ ├── DeconvSolver — core: builds WassersteinNetwork, exposes cost + gradient
175
+ └── ConstrainedSolver — adds total-mass equality, uses SLSQP
176
+ ```
177
+
178
+ The underlying min-cost flow is provided by
179
+ [wnet](https://github.com/michalsta/wnet) (network construction) and
180
+ [pylmcf](https://github.com/michalsta/pylmcf) (LEMON-based MCF algorithms,
181
+ including warm-restart Network Simplex).
182
+
183
+ ## License
184
+
185
+ MIT
@@ -0,0 +1,158 @@
1
+ # wnetdeconv
2
+
3
+ Spectral deconvolution via Wasserstein optimal transport.
4
+
5
+ Given an empirical spectrum and a library of theoretical component spectra,
6
+ `wnetdeconv` finds the mixture proportions that minimise the total Wasserstein
7
+ transport cost between the empirical signal and the weighted sum of components.
8
+ The inner problem at each set of proportions is solved exactly as a min-cost
9
+ flow (via [pylmcf](https://github.com/michalsta/pylmcf) / LEMON), giving an
10
+ exact piecewise-linear objective with exact gradients — suitable for
11
+ gradient-based outer optimisation with scipy.
12
+
13
+ Supports 1-D spectra (NMR chemical shift, m/z) and higher-dimensional data
14
+ (e.g. m/z + retention time).
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ pip install wnetdeconv
20
+ ```
21
+
22
+ Dependencies: `pylmcf`, `wnet`, `numpy`, `scipy`.
23
+ Optional: `pyopenms` for loading featureXML files.
24
+
25
+ ## Concepts
26
+
27
+ ### Spectra as distributions
28
+
29
+ A spectrum is a set of *(position, intensity)* pairs. In 1-D (NMR chemical
30
+ shift, m/z) use `Spectrum_1D`; for higher-dimensional data (m/z + retention
31
+ time) use `Spectrum` with a `(d, n)` positions array.
32
+
33
+ ```python
34
+ from wnetdeconv import Spectrum_1D
35
+
36
+ empirical = Spectrum_1D([1.0, 2.0, 3.0], [10.0, 25.0, 15.0])
37
+ component = Spectrum_1D([1.0, 2.0, 3.0], [1.0, 2.0, 1.0])
38
+ ```
39
+
40
+ ### Transport cost
41
+
42
+ Matching a unit of intensity from an empirical peak at position *p* to a
43
+ theoretical peak at position *q* costs `distance(p, q)`. Peaks that cannot
44
+ be matched cheaply are instead routed to a *trash node* at a fixed penalty.
45
+
46
+ `max_distance` caps the farthest match considered; anything farther is cheaper
47
+ to trash. `trash_cost` (or the asymmetric pair
48
+ `experimental_trash_cost` / `theoretical_trash_cost`) sets that penalty.
49
+
50
+ ### Precision and scaling
51
+
52
+ Internally all intensities and costs are scaled to integers for the MCF solver.
53
+ The `precision` parameter (default `1e-3`) sets the desired relative accuracy
54
+ of the cost output: `precision=1e-3` gives ≈ 3 significant figures. The same
55
+ value becomes the `ftol` stop criterion for scipy optimisers, so the outer loop
56
+ stops as soon as further improvement is below the resolution the integer network
57
+ can deliver.
58
+
59
+ ## Solvers
60
+
61
+ ### `DeconvSolver` — unconstrained baseline
62
+
63
+ Solves the network at a given point and exposes `total_cost()` and
64
+ `gradient()`. Optimisation (via `optimize()`, L-BFGS-B) minimises cost with
65
+ only non-negativity bounds.
66
+
67
+ ```python
68
+ from wnetdeconv import DeconvSolver, Spectrum_1D
69
+ from wnet.distances import DistanceMetric
70
+
71
+ emp = Spectrum_1D([1.0, 100.0], [10.0, 30.0])
72
+ t1 = Spectrum_1D([1.0], [2.0]) # optimal proportion: 5
73
+ t2 = Spectrum_1D([100.0], [3.0]) # optimal proportion: 10
74
+
75
+ solver = DeconvSolver(
76
+ empirical_spectrum=emp,
77
+ theoretical_spectra=[t1, t2],
78
+ distance=DistanceMetric.LINF,
79
+ max_distance=10.0,
80
+ trash_cost=100.0,
81
+ )
82
+
83
+ result = solver.optimize()
84
+ print(result.x) # [5. 10.]
85
+ ```
86
+
87
+ You can also drive the solver manually — useful when embedding it in your own
88
+ optimisation loop:
89
+
90
+ ```python
91
+ solver.set_point([5.0, 10.0])
92
+ print(solver.total_cost()) # 0.0
93
+ print(solver.gradient()) # [0. 0.] (at the optimum)
94
+ ```
95
+
96
+ ### `ConstrainedSolver` — total-mass equality
97
+
98
+ Adds the constraint `Σ wₛ · Iₛ = I_emp` so that the mixture exactly accounts
99
+ for all empirical intensity. Uses SLSQP. Drop-in replacement for
100
+ `DeconvSolver`; call `optimize()` the same way.
101
+
102
+ ```python
103
+ from wnetdeconv import ConstrainedSolver
104
+
105
+ solver = ConstrainedSolver(
106
+ empirical_spectrum=emp,
107
+ theoretical_spectra=[t1, t2],
108
+ distance=DistanceMetric.LINF,
109
+ max_distance=10.0,
110
+ trash_cost=100.0,
111
+ )
112
+ result = solver.optimize()
113
+ ```
114
+
115
+ ## Key parameters
116
+
117
+ | Parameter | Applies to | Description |
118
+ |---|---|---|
119
+ | `max_distance` | all | Maximum peak-to-peak match distance. Also sets the sparsity of the internal network in 1-D. |
120
+ | `trash_cost` | all | Symmetric penalty for unmatched peaks. |
121
+ | `experimental_trash_cost` | `DeconvSolver` | Per-unit penalty for discarding empirical mass. |
122
+ | `theoretical_trash_cost` | `DeconvSolver` | Per-unit penalty for discarding theoretical mass. |
123
+ | `precision` | all | Desired relative cost accuracy; drives `scale_factor` and `ftol` (default `1e-3`). |
124
+ | `scale_factor` | all | Override automatic scaling (bypasses `precision`). |
125
+
126
+ ## Distance metrics
127
+
128
+ From `wnet.distances.DistanceMetric`:
129
+
130
+ - `L1` — sum of absolute coordinate differences (Manhattan / taxicab)
131
+ - `L2` — Euclidean distance
132
+ - `LINF` — maximum absolute coordinate difference (Chebyshev); dual of the W₁ earth-mover distance used by masserstein
133
+
134
+ ## Loading MS data (featureXML)
135
+
136
+ ```python
137
+ from wnetdeconv import Spectrum
138
+
139
+ emp = Spectrum.FromFeatureXML("sample.featureXML") # requires pyopenms
140
+ ```
141
+
142
+ ## Architecture
143
+
144
+ ```
145
+ wnetdeconv
146
+ ├── Spectrum / Spectrum_1D — data containers (extend wnet.Distribution)
147
+ ├── DeconvSolver — core: builds WassersteinNetwork, exposes cost + gradient
148
+ └── ConstrainedSolver — adds total-mass equality, uses SLSQP
149
+ ```
150
+
151
+ The underlying min-cost flow is provided by
152
+ [wnet](https://github.com/michalsta/wnet) (network construction) and
153
+ [pylmcf](https://github.com/michalsta/pylmcf) (LEMON-based MCF algorithms,
154
+ including warm-restart Network Simplex).
155
+
156
+ ## License
157
+
158
+ MIT
@@ -0,0 +1,42 @@
1
# Build configuration: scikit-build-core drives CMake; pylmcf and wnet are
# needed at build time because CMake queries them for C++ include paths.
[build-system]
requires = ["scikit-build-core", "nanobind", "pylmcf", "wnet>=0.9.16"]
build-backend = "scikit_build_core.build"

[project]
name = "wnetdeconv"
version = "0.8.0"
description = "Python implementation of spectral deconvolution using Wasserstein metric"
requires-python = ">=3.9"
# scipy is imported unconditionally by wnetdeconv.solver (scipy.optimize),
# which the package __init__ imports, so it is a runtime requirement.
dependencies = ["pylmcf", "wnet>=0.9.16", "numpy", "scipy"]
authors = [{ name="Michał Startek", email="michal.startek@mimuw.edu.pl" }]
maintainers = [{ name="Michał Startek", email="michal.startek@mimuw.edu.pl" }]
readme = "README.md"
license = "MIT"
license-files = ["LICENCE"]
classifiers = [
    "Programming Language :: Python :: 3",
    "Topic :: Scientific/Engineering :: Mathematics",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Development Status :: 4 - Beta",
]

[project.urls]
"Homepage" = "https://github.com/michalsta/wnetdeconv"
"Repository" = "https://github.com/michalsta/wnetdeconv.git"

[project.optional-dependencies]
# pyopenms is only needed for loading featureXML files.
extras = ["pyopenms"]
pytest = ["pytest", "pandas", "numpy", "scipy"]

[tool.pytest.ini_options]
testpaths = ["pytest"]

# Development-only files kept out of the source distribution.
[tool.scikit-build.sdist]
exclude = [
    ".github/",
    "experiments/",
    "pytest/",
    "optimization_example.py",
    "reinstall.sh",
    ".gitignore",
]
@@ -0,0 +1,11 @@
1
+ #! /usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ from . import wnetdeconv_cpp
6
+ from .solver import DeconvSolver, ConstrainedSolver, MagnetsteinSolver, MassersteinSolver
7
+ from .spectrum import Spectrum, Spectrum_1D
8
+
9
+
10
def hello():
    """Print a fixed greeting that identifies the wnetdeconv package."""
    greeting = "Hello, world from wnetdeconv!"
    print(greeting)
@@ -0,0 +1,25 @@
1
+ import importlib.metadata
2
+ from pathlib import Path
3
+
4
+ __version__ = importlib.metadata.version("wnetdeconv")
5
+
6
+
7
def module_main():
    """Command-line entry point of the package.

    Builds an argument parser with a single ``--version`` / ``-v`` flag
    (argparse's ``version`` action, fed with the module-level ``__version__``),
    parses the process arguments, and then prints the usage help.
    """
    from argparse import ArgumentParser

    cli = ArgumentParser(
        description="WNetDeconv: A tool for spectral deconvolution."
    )
    cli.add_argument("--version", "-v", action="version", version=__version__)

    # No other option is registered at present (an --include flag printing the
    # C++ header path is not enabled), so the parsed namespace is not needed:
    # parsing only handles --version and rejects unknown arguments.
    cli.parse_args()

    cli.print_help()
22
+
23
+
24
+ if __name__ == "__main__":
25
+ module_main()
@@ -0,0 +1,12 @@
1
+ #include <iostream>
2
+ #include <nanobind/nanobind.h>
3
+
4
+
5
// Writes a fixed greeting line to standard output, flushes the stream,
// and returns 0.
int hello() {
    static const char kGreeting[] = "Hello, world from wnetdeconv_cpp!";
    std::cout << kGreeting << '\n';
    std::cout.flush();
    return 0;
}
9
+
10
+ NB_MODULE(wnetdeconv_cpp, m) {
11
+ m.def("hello", &hello, "A function that prints 'Hello, world from wnetdeconv_cpp!'");
12
+ }
@@ -0,0 +1,741 @@
1
+ import warnings
2
+ from collections import namedtuple
3
+ from collections.abc import Sequence
4
+ from typing import Callable, Optional, Union, List, Tuple
5
+ import numpy as np
6
+ from scipy.optimize import minimize, OptimizeResult
7
+
8
+ from wnet import Distribution, WassersteinNetwork
9
+
10
+ _Flow = namedtuple("Flow", ["empirical_peak_idx", "theoretical_peak_idx", "flow"])
11
+ from wnet.distances import DistanceMetric
12
+
13
+
14
+ class DeconvSolver:
15
+ """
16
+ Aligns an empirical spectrum to one or more theoretical spectra using a Wasserstein network approach.
17
+ Alignment of two empirical spectra E1, E2 can be performed by setting E1 as the empirical_spectrum
18
+ and E2 as the only element of theoretical_spectra.
19
+
20
+ Parameters
21
+ ----------
22
+ empirical_spectrum : Distribution
23
+ The empirical spectrum to be aligned.
24
+ theoretical_spectra : Sequence[Distribution]
25
+ A sequence of theoretical spectra to align against.
26
+ distance : DistanceMetric
27
+ Metric used to compute the distance between empirical and theoretical peaks.
28
+ max_distance : int or float
29
+ Maximum allowed distance for matching peaks.
30
+ trash_cost : int or float, optional
31
+ Cost for assigning unmatched peaks to trash (symmetric). Used as fallback for
32
+ experimental_trash_cost / theoretical_trash_cost when only one is set.
33
+ scale_factor : None, int, or float, optional
34
+ Scaling factor for intensities and costs. If None, it is computed from ``precision``.
35
+ precision : float, optional
36
+ Desired relative precision of the cost output (fraction of the theoretical cost
37
+ upper bound ``max_cost_per_unit_flow * max_sum_intensity``). Drives both the
38
+ auto scale_factor and the ``ftol`` stop criterion passed to scipy optimizers.
39
+ Ignored when ``scale_factor`` is supplied explicitly. Default 1e-3 (≈ 3
40
+ significant figures).
41
+ experimental_trash_cost : int or float, optional
42
+ Cost for discarding unmatched empirical peaks. Enables asymmetric trash mode.
43
+ theoretical_trash_cost : int or float, optional
44
+ Cost for discarding unmatched theoretical peaks. Enables asymmetric trash mode.
45
+ method : str, optional
46
+ Min-cost flow algorithm: ``"network_simplex"`` (default), ``"cycle_canceling"``, ``"cost_scaling"``, or ``"capacity_scaling"``.
47
+ Ignored when ``solver`` is provided.
48
+ solver : NetworkSimplex | CostScaling | CycleCanceling | CapacityScaling, optional
49
+ Solver configuration object. Takes precedence over ``method``.
50
+ Defaults to ``NetworkSimplex()`` (warm restarts, BLOCK_SEARCH pivot).
51
+ force_dense_1d : bool, optional
52
+ In 1D, force the O(m*n) dense factory instead of the O(m+n) chain
53
+ factory (default False = chain in 1D). Forwarded to
54
+ :class:`WassersteinNetwork`.
55
+
56
+ Attributes
57
+ ----------
58
+ scale_factor : float
59
+ The scaling factor used for intensities and costs.
60
+ empirical_spectrum : Distribution
61
+ The scaled empirical spectrum.
62
+ theoretical_spectra : list[Distribution]
63
+ The scaled theoretical spectra.
64
+ graph : WassersteinNetwork
65
+ The underlying Wasserstein network graph.
66
+ point : Sequence[float] or np.ndarray or None
67
+ The current point for solving the alignment.
68
+
69
+ Methods
70
+ -------
71
+ set_point(point)
72
+ Sets the point for solving the alignment and runs the solver.
73
+ total_cost()
74
+ Returns the total cost of the alignment, rescaled to original units.
75
+ print()
76
+ Prints a string representation of the underlying graph.
77
+ flows()
78
+ Returns a list of flows (alignments) between empirical and theoretical peaks.
79
+ no_subgraphs()
80
+ Returns the number of subgraphs in the alignment network.
81
+ print_diagnostics(subgraphs_too=False)
82
+ Prints diagnostic information about the alignment and optionally about each subgraph.
83
+ """
84
+
85
def __init__(
    self,
    empirical_spectrum: Distribution,
    theoretical_spectra: Sequence[Distribution],
    distance: DistanceMetric,
    max_distance: Union[int, float],
    trash_cost: Optional[Union[int, float]] = None,
    scale_factor: Optional[Union[int, float]] = None,
    experimental_trash_cost: Optional[Union[int, float]] = None,
    theoretical_trash_cost: Optional[Union[int, float]] = None,
    method: Optional[str] = None,
    solver=None,
    force_dense_1d: bool = False,
    precision: float = 1e-3,
) -> None:
    """
    Validate the inputs, pick a scale factor, and build the Wasserstein network.

    The spectra are scaled by ``scale_factor`` and handed to
    :class:`WassersteinNetwork` together with the integer-truncated, scaled
    ``max_distance``. Trash edges are then added (symmetric when only
    ``trash_cost`` is given, asymmetric when ``experimental_trash_cost`` or
    ``theoretical_trash_cost`` is given, with ``trash_cost`` as fallback for
    the missing side) and the graph is built. ``self.point`` starts as None.

    When ``scale_factor`` is None it is derived from ``precision``, the
    cost bounds and the total intensities; a warning is emitted if the
    requested precision has to be clamped.

    Raises
    ------
    ValueError
        If no trash cost is provided; if an explicit ``scale_factor`` is not
        positive; or, when the scale factor is auto-computed, if ``precision``
        is not positive, if the costs / total intensity are not positive, or
        if the resulting scale factor cannot represent the smallest cost as
        a positive integer.
    TypeError
        If an argument has the wrong type.
    """

    if (
        trash_cost is None
        and experimental_trash_cost is None
        and theoretical_trash_cost is None
    ):
        raise ValueError(
            "At least one of trash_cost, experimental_trash_cost, or theoretical_trash_cost must be provided."
        )

    if not isinstance(empirical_spectrum, Distribution):
        raise TypeError("empirical_spectrum must be a Distribution")
    if not isinstance(theoretical_spectra, Sequence):
        raise TypeError("theoretical_spectra must be a Sequence")
    if not all(isinstance(t, Distribution) for t in theoretical_spectra):
        raise TypeError("all theoretical_spectra elements must be Distribution")
    if not isinstance(max_distance, (int, float)):
        raise TypeError("max_distance must be a number")
    for name, val in [
        ("trash_cost", trash_cost),
        ("experimental_trash_cost", experimental_trash_cost),
        ("theoretical_trash_cost", theoretical_trash_cost),
    ]:
        if val is not None and not isinstance(val, (int, float)):
            raise TypeError(f"{name} must be a number")
    if scale_factor is not None and not isinstance(scale_factor, (int, float)):
        raise TypeError("scale_factor must be a number")
    # `not x > 0` (rather than `x <= 0`) also rejects NaN. A zero scale factor
    # would otherwise surface as a ZeroDivisionError when computing _ftol.
    if scale_factor is not None and not scale_factor > 0:
        raise ValueError("scale_factor must be positive")

    asymmetric = (
        experimental_trash_cost is not None or theoretical_trash_cost is not None
    )
    if asymmetric:
        # Each side falls back to the symmetric trash_cost when its own cost
        # is not given; a side stays None (no trash edges) if neither is set.
        eff_exp = (
            experimental_trash_cost
            if experimental_trash_cost is not None
            else trash_cost
        )
        eff_theo = (
            theoretical_trash_cost
            if theoretical_trash_cost is not None
            else trash_cost
        )
        active_costs = [c for c in (eff_exp, eff_theo) if c is not None]
    else:
        active_costs = [trash_cost]

    if scale_factor is None:
        # precision is only consulted when the scale factor is auto-computed.
        if not precision > 0:
            raise ValueError("precision must be positive")

        ALMOST_MAXINT = 2**60
        empirical_sum_intensity = empirical_spectrum.sum_intensities
        theoretical_sum_intensity = sum(
            t.sum_intensities for t in theoretical_spectra
        )
        max_sum_intensity = max(empirical_sum_intensity, theoretical_sum_intensity)

        # Output-precision constraint (original): integer resolution
        # 1/sf^2 should be ~precision of the worst-case absolute cost
        # max_cost*max_sum_intensity, so
        #   sf >= sqrt(1/(precision * cost_scale)).
        # This used to be the only constraint. When the experimental
        # spectrum has huge unnormalized intensities (raw MS counts ~1e7+),
        # the formula drops sf so low that int(max_distance*sf) rounds to 0
        # and the graph factory builds zero edges (silent failure).
        #
        # Per-edge floor: int(min_cost_per_unit_flow * sf) must be at least
        # MIN_COST_TICKS so the cost map has usable resolution. Below ~25
        # the gradient signal is too coarse for L-BFGS-B to make progress
        # (empirical: scaled_MTD=10 on pbttt → 1 iter, scaled_MTD=25 → 36
        # iters with a real optimum). Going higher than ~25 produces more
        # accurate cost numbers but multiplies LEMON's pivot count on
        # large graphs (cold solve scales roughly with sf), so we cap the
        # auto floor at MIN_COST_TICKS rather than tying it to precision.
        # Pass scale_factor explicitly (or tighten precision) when more
        # input precision is needed.
        MIN_COST_TICKS = 25
        max_cost_per_unit_flow = max([max_distance] + active_costs)
        min_cost_per_unit_flow = min([max_distance] + active_costs)
        cost_scale = max_cost_per_unit_flow * max_sum_intensity

        # Both quantities are used as divisors below; fail with a clear
        # message instead of a ZeroDivisionError / NaN scale factor.
        if not (min_cost_per_unit_flow > 0 and cost_scale > 0):
            raise ValueError(
                "Cannot auto-compute scale_factor: max_distance, the trash costs "
                "and the total spectrum intensity must all be positive "
                f"(min_cost_per_unit_flow={min_cost_per_unit_flow}, "
                f"cost_scale={cost_scale}). Check the input data or pass an "
                "explicit scale_factor."
            )

        sf_output = np.sqrt(1.0 / (precision * cost_scale))
        sf_floor = MIN_COST_TICKS / min_cost_per_unit_flow
        desired_sf = max(sf_output, sf_floor)
        # Upper limit keeping sf^2 * cost_scale below ALMOST_MAXINT.
        max_sf = np.sqrt(ALMOST_MAXINT / cost_scale)
        if desired_sf > max_sf:
            achieved_ticks = max_sf * min_cost_per_unit_flow
            achieved_out = 1.0 / (max_sf**2 * cost_scale)
            warnings.warn(
                f"Requested precision {precision} exceeds int64 capacity for this "
                f"dataset (cost_scale={cost_scale:.3g}, "
                f"min_cost={min_cost_per_unit_flow:.3g}); clamping scale_factor to "
                f"{max_sf:.3g}. Achieved cost precision {achieved_out:.2e} "
                f"(relative), min-cost integer ticks {achieved_ticks:.1f}."
            )
            scale_factor = max_sf
        else:
            scale_factor = desired_sf
        assert (
            scale_factor > 0
        ), "Can't auto-compute a sensible scale factor. You might have some luck with setting it manually, but it probably means something about your data or trash_cost is off."
        if int(min_cost_per_unit_flow * scale_factor) < 1:
            raise ValueError(
                f"Auto-computed scale_factor={scale_factor:.3g} cannot represent "
                f"min_cost_per_unit_flow={min_cost_per_unit_flow:.3g} as a "
                f"positive integer (the graph would have no edges). "
                f"empirical_sum_intensity={empirical_sum_intensity:.3g}, "
                f"theoretical_sum_intensity={theoretical_sum_intensity:.3g}. "
                f"Normalize the spectra, pass an explicit scale_factor, or "
                f"relax precision."
            )

    self.scale_factor = scale_factor
    # Costs are reported divided by scale_factor**2, so this is the size of
    # one integer cost unit in output units; optimize() passes it as ftol.
    self._ftol = 1.0 / (scale_factor * scale_factor)
    self.empirical_spectrum = empirical_spectrum.positions_intensities_scaled(
        scale_factor
    )
    self.theoretical_spectra = [
        t.positions_intensities_scaled(scale_factor) for t in theoretical_spectra
    ]

    self.graph = WassersteinNetwork(
        self.empirical_spectrum,
        self.theoretical_spectra,
        distance,
        int(max_distance * scale_factor),
        force_dense_1d=force_dense_1d,
        method=method,
        solver=solver,
    )
    if asymmetric:
        if eff_exp is not None:
            self.graph.add_experimental_trash(int(eff_exp * scale_factor))
        if eff_theo is not None:
            self.graph.add_theoretical_trash(int(eff_theo * scale_factor))
    else:
        self.graph.add_simple_trash(int(trash_cost * scale_factor))
    self.graph.build()
    self.point = None
235
+
236
+ def set_point(self, point: Union[Sequence[float], np.ndarray]) -> None:
237
+ """
238
+ Set proportions of theoretical spectra and solve the graph at the given point.
239
+
240
+ Parameters
241
+ ----------
242
+ point : Sequence[float] or np.ndarray
243
+ Proportions for each theoretical spectrum.
244
+
245
+ Returns
246
+ -------
247
+ None
248
+ """
249
+ self.point = point
250
+ self.graph.solve(point)
251
+
252
+ def total_cost(self) -> float:
253
+ """
254
+ Calculates the total cost of the graph. Can only be called after set_point().
255
+
256
+ Returns:
257
+ float: The normalized total cost.
258
+ """
259
+ return self.graph.total_cost() / (self.scale_factor * self.scale_factor)
260
+
261
+ def print(self) -> None:
262
+ """
263
+ Prints a string representation of the graph associated with this aligner instance.
264
+
265
+ Returns:
266
+ None
267
+ """
268
+ print(str(self.graph))
269
+
270
def flows(self) -> list[_Flow]:
    """
    Collect the flows reported by the graph for every theoretical spectrum.

    For each index ``i`` of ``self.theoretical_spectra`` the graph is asked
    for ``flows_for_target(i)``; the result is wrapped in a Flow namedtuple
    with the flow component divided by ``self.scale_factor``.

    Returns:
        list[namedtuple]: One Flow namedtuple per theoretical spectrum, with fields:
            - empirical_peak_idx: empirical peak indices, as returned by the graph.
            - theoretical_peak_idx: theoretical peak indices, as returned by the graph.
            - flow: flow values divided by the scale factor.
    """

    def flow_for(target_idx):
        emp_idx, theo_idx, raw_flow = self.graph.flows_for_target(target_idx)
        return _Flow(emp_idx, theo_idx, raw_flow / self.scale_factor)

    return [flow_for(i) for i in range(len(self.theoretical_spectra))]
291
+
292
+ def gradient(self) -> np.ndarray:
293
+ """
294
+ Returns the gradient of total_cost with respect to the point
295
+ (spectrum proportions). Can only be called after set_point().
296
+
297
+ Returns
298
+ -------
299
+ np.ndarray
300
+ Array of partial derivatives, one per theoretical spectrum.
301
+ """
302
+ return (
303
+ self.graph.spectrum_proportion_derivatives().astype(float)
304
+ / (self.scale_factor * self.scale_factor)
305
+ )
306
+
307
+ def gradient_fast_approx(self) -> np.ndarray:
308
+ """Fast, APPROXIMATE gradient (dual-potential difference instead of the
309
+ residual shortest-path marginal).
310
+
311
+ Much cheaper (skips the per-subgraph Dijkstra) but returns a
312
+ different, basis-dependent gradient: a lower bound on the true
313
+ marginal, exact only on the optimal flow support. Opt-in; do not use
314
+ as a drop-in replacement for gradient() without validating convergence.
315
+ """
316
+ return (
317
+ self.graph.spectrum_proportion_derivatives_fast_approx().astype(float)
318
+ / (self.scale_factor * self.scale_factor)
319
+ )
320
+
321
def optimize(self, x0: Optional[np.ndarray] = None) -> OptimizeResult:
    """
    Minimize total transport cost over non-negative spectrum proportions.

    Runs scipy's L-BFGS-B with a lower bound of 0 on every proportion.  Each
    objective evaluation calls set_point() with the candidate proportions and
    then reads total_cost() and gradient().

    Parameters
    ----------
    x0 : np.ndarray, optional
        Initial proportions.  Defaults to a vector of ones, one entry per
        theoretical spectrum.

    Returns
    -------
    scipy.optimize.OptimizeResult
        Standard scipy result; ``.x`` holds the optimal proportions.
    """
    spectrum_count = len(self.theoretical_spectra)
    start = np.ones(spectrum_count) if x0 is None else x0

    def objective(proportions):
        # The solver state must be moved to the candidate point before the
        # cost and gradient are read.
        self.set_point(proportions)
        cost = self.total_cost()
        return cost, self.gradient()

    non_negative = [(0.0, None) for _ in range(spectrum_count)]
    return minimize(
        objective,
        x0=start,
        jac=True,
        method="L-BFGS-B",
        bounds=non_negative,
        options={"ftol": self._ftol},
    )
351
+
352
def no_subgraphs(self) -> int:
    """
    Report how many subgraphs the underlying Wasserstein network holds.

    Returns:
        int: The value reported by ``self.graph.no_subgraphs()``.
    """
    subgraph_count = self.graph.no_subgraphs()
    return subgraph_count
360
+
361
def print_diagnostics(self, subgraphs_too=False):
    """
    Print diagnostic information about the current state of the alignment.

    Parameters
    ----------
    subgraphs_too : bool, optional
        If True, a per-subgraph report follows the overall report.

    Diagnostics Printed
    ------------------
    For the whole graph, in this order:
    - Number of subgraphs
    - Number of empirical nodes
    - Number of theoretical nodes
    - Number of matching edges (dense factory)
    - Number of chain edges (1D chain factory)
    - Number of src-to-empirical edges
    - Number of theoretical-to-sink edges
    - Number of simple trash edges
    - Matching density
    - Scale factor (and its log10 value)
    - Total cost

    If `subgraphs_too` is True, for each subgraph:
    - Number of empirical nodes
    - Number of theoretical nodes
    - Number of matching, chain, src-to-empirical, theoretical-to-sink and
      simple trash edges
    - Cost
    - Matching density
    - Theoretical spectra involved
    """
    # (label, accessor name) pairs; each accessor is called immediately
    # before its line is printed, so output order and call order match.
    graph_rows = (
        ("No empirical nodes:", "count_empirical_nodes"),
        ("No theoretical nodes:", "count_theoretical_nodes"),
        ("No matching edges:", "count_matching_edges"),
        ("No chain edges:", "count_chain_edges"),
        ("No src-to-empirical edges:", "count_src_to_empirical_edges"),
        ("No theoretical-to-sink edges:", "count_theoretical_to_sink_edges"),
        ("No simple trash edges:", "count_simple_trash_edges"),
        ("Matching density:", "matching_density"),
    )
    subgraph_rows = (
        (" No. empirical nodes:", "count_empirical_nodes"),
        (" No. theoretical nodes:", "count_theoretical_nodes"),
        (" No. matching edges:", "count_matching_edges"),
        (" No. chain edges:", "count_chain_edges"),
        (" No. src-to-empirical edges:", "count_src_to_empirical_edges"),
        (" No. theoretical-to-sink edges:", "count_theoretical_to_sink_edges"),
        (" No. simple trash edges:", "count_simple_trash_edges"),
        (" Cost:", "total_cost"),
        (" Matching density:", "matching_density"),
        (" Theoretical spectra involved:", "theoretical_spectra_involved"),
    )

    print("Diagnostics:")
    print("No subgraphs:", self.graph.no_subgraphs())
    for label, accessor in graph_rows:
        print(label, getattr(self.graph, accessor)())
    print(
        "Scale factor:", self.scale_factor, f" log10: {np.log10(self.scale_factor)}"
    )
    print("Total cost:", self.graph.total_cost())

    if subgraphs_too:
        for index in range(self.graph.no_subgraphs()):
            subgraph = self.graph.get_subgraph(index)
            print("Subgraph", index, ":")
            for label, accessor in subgraph_rows:
                print(label, getattr(subgraph, accessor)())
420
+
421
+
422
class ConstrainedSolver(DeconvSolver):
    """
    DeconvSolver with a total-mass equality constraint:

        sum_s(w_s * total_intensity_s) = total_empirical_intensity

    This couples the proportions so that components with extra unmatched peaks
    (diluted libraries) are naturally down-weighted without tuning
    theo_trash_cost.  The constraint is enforced during the call to
    optimize(), which uses SLSQP instead of L-BFGS-B.

    All other DeconvSolver methods (set_point, total_cost, gradient, flows, …)
    are inherited unchanged and work identically.

    Parameters
    ----------
    Same as DeconvSolver.
    """

    def __init__(self, *args, **kwargs) -> None:
        """
        Build the parent solver, then cache the intensity totals that the
        mass constraint needs.

        All positional and keyword arguments are forwarded to the parent
        constructor untouched.
        """
        super().__init__(*args, **kwargs)
        # Right-hand side of the constraint: total empirical intensity.
        self._emp_total = self.empirical_spectrum.sum_intensities
        # Constraint coefficients: one total intensity per theoretical spectrum.
        self._theo_totals = np.array(
            [t.sum_intensities for t in self.theoretical_spectra]
        )

    def optimize(self, x0: Optional[np.ndarray] = None) -> OptimizeResult:
        """
        Minimize total transport cost subject to the total-mass constraint.

        Parameters
        ----------
        x0 : np.ndarray, optional
            Initial proportions.  Must satisfy the constraint.  Defaults to
            equal weights scaled to satisfy sum_s(w_s * I_s) = I_emp.

        Returns
        -------
        scipy.optimize.OptimizeResult
            Standard scipy result; ``.x`` holds the optimal proportions.

        Raises
        ------
        ValueError
            If ``x0`` is omitted and the theoretical spectra have a combined
            intensity of 0, so that no default starting point can be derived.
        """
        n = len(self.theoretical_spectra)
        if x0 is None:
            theo_total = self._theo_totals.sum()
            # Without this guard the division below yields inf/nan (numpy
            # only warns), and SLSQP would be started from a non-finite point.
            if theo_total == 0:
                raise ValueError(
                    "Cannot derive a default starting point: the theoretical "
                    "spectra have a total intensity of 0."
                )
            x0 = np.full(n, self._emp_total / theo_total)

        def cost_and_grad(w):
            # Move the solver to the candidate point before reading cost/gradient.
            self.set_point(w)
            return self.total_cost(), self.gradient()

        # Linear equality constraint: dot(w, theo_totals) == emp_total.
        constraint = {
            "type": "eq",
            "fun": lambda w: np.dot(w, self._theo_totals) - self._emp_total,
            "jac": lambda w: self._theo_totals,
        }

        return minimize(
            cost_and_grad,
            x0=x0,
            jac=True,
            method="SLSQP",
            bounds=[(0.0, None)] * n,
            constraints=constraint,
            options={"maxiter": 2000, "ftol": self._ftol},
        )
487
+
488
+
489
class MagnetsteinSolver(ConstrainedSolver):
    """
    ConstrainedSolver that normalizes all spectra to sum to 1 internally,
    reproducing magnetstein's dual-LP problem formulation.

    With unit-norm spectra the total-mass equality constraint reduces to
    sum(w) = 1, matching the LP's implicit mass-balance condition.
    experimental_trash_cost = MTD and theoretical_trash_cost = MTD_th
    correspond directly to magnetstein's penalty and penalty_th parameters.

    Parameters
    ----------
    empirical_spectrum : Distribution
        The empirical spectrum (normalized internally to sum to 1).
    theoretical_spectra : Sequence[Distribution]
        A sequence of theoretical spectra (each normalized internally).
    distance : DistanceMetric
        Distance metric. Use DistanceMetric.L1 for 1D NMR spectra.
    MTD : float
        Maximum Transport Distance for the mix (experimental trash cost).
    MTD_th : float, optional
        Maximum Transport Distance for components (theoretical trash cost).
        If None, uses symmetric trash with cost MTD.
    method : str, optional
        Min-cost flow algorithm (default: ``"network_simplex"``). Ignored when ``solver`` is provided.
    solver : NetworkSimplex | CostScaling | CycleCanceling | CapacityScaling, optional
        Solver configuration object. Takes precedence over ``method``.
    precision : float, optional
        Forwarded unchanged to the parent constructor (default 1e-3).
    """

    def __init__(
        self,
        empirical_spectrum: Distribution,
        theoretical_spectra: Sequence[Distribution],
        distance: DistanceMetric,
        MTD: float,
        MTD_th: Optional[float] = None,
        method: str = None,
        solver=None,
        precision: float = 1e-3,
    ) -> None:
        normalized_empirical = empirical_spectrum.normalized()
        normalized_theoreticals = [
            spectrum.normalized() for spectrum in theoretical_spectra
        ]

        # Only the cost-related keywords differ between the symmetric and
        # asymmetric set-ups; everything else is forwarded identically.
        if MTD_th is None:
            cost_settings = {"max_distance": MTD, "trash_cost": MTD}
        else:
            cost_settings = {
                "max_distance": max(MTD, MTD_th),
                "experimental_trash_cost": MTD,
                "theoretical_trash_cost": MTD_th,
            }

        super().__init__(
            normalized_empirical,
            normalized_theoreticals,
            distance,
            **cost_settings,
            method=method,
            solver=solver,
            precision=precision,
        )
554
+
555
+
556
class MassersteinSolver(DeconvSolver):
    """
    Reproduces masserstein's ``dualdeconv2`` / ``dualdeconv4``.

    All spectra are normalized to sum to 1 internally (as dualdeconv2
    requires).  The distance is always LINF (= absolute distance in 1D, the
    dual of W1 / earth mover's distance used by masserstein).

    Faithful model of dualdeconv2's LP
    ----------------------------------
    dualdeconv2 prices transport at the true linear W1 cost with an
    experimental abyss at ``MTD``, and has *no theoretical abyss*: every unit
    of ``w_k * theo_k`` must reach an experimental position — a component is
    discarded only by driving ``w_k -> 0``, never by trashing theoretical
    mass.  Transporting a unit farther than ``MTD`` is never optimal in that
    LP (the experimental abyss at ``MTD`` is always cheaper), so ``MTD`` is
    already the LP's *effective* transport cap.  We reproduce that with:

      * ``max_distance = MTD`` — the effective cap; also keeps the 1D chain
        sparse (O(m+n)) instead of dense (O(m*n)) on real spectra;
      * ``experimental_trash_cost = MTD`` — the denoising penalty;
      * ``theoretical_trash_cost = theo_trash_mult * MTD`` (dualdeconv2 case;
        the default multiplier is 10).  This is a numerical device only:
        with experimental-only trash the inner min-cost-flow cost ``f(w)``
        is degenerate/flat (un-routable theoretical mass is dropped for
        free, so the outer optimizer gets a zero gradient and returns its
        starting point — the old bug).  Any cost strictly above the ``MTD``
        transport cap is never chosen over transporting or lowering
        ``w_k``, so it carries no flow at the optimum (= "no theoretical
        abyss; drop the component by lowering w_k") yet makes ``f(w)``
        well-defined and convex for every ``w``.  The multiplier should not
        be made larger than needed: the auto ``scale_factor`` divides by
        ``max_cost_per_unit_flow``, so a large value would shrink it and
        lose m/z precision.  The fixed-integer network's dynamic range
        makes a true +inf infeasible, so this is a deliberate
        approximation, exact for fully-placeable and fully-unplaceable
        components, slightly soft for partial placement.
        NOTE(review): earlier text here described a fixed 2x multiplier and
        a sweep (2/4/8/20x) in which 2x gave the best Part-1 agreement
        (L1 ~2e-7 vs dualdeconv2) while 8x already degraded it ~4x.  That
        conflicts with the current default of 10x and with the
        ``theo_trash_mult`` notes below — confirm which finding is current.

    Residual caveats:
      * dualdeconv2 solves one joint LP (proportions = exact shadow prices);
        this is a nested optimization (SLSQP over ``w``, inner MCF).  The
        objective and noise/sum behaviour match, but under degeneracy
        (near-collinear components) per-component proportions agree only to
        optimizer tolerance, not bit-exactly.
      * On raw unfiltered spectra the two formulations agree closely in
        controlled tests (single/multi-component, collinear decoys, dense
        overlapping + noise — see
        ``experiments/direct_dualdeconv2_{nofilter,multi,dense}.py``):
        objective to ~1e-5, signal fraction to ~1%, decoys zeroed.
      * On DENSE-noisy mass spectra (e.g. hemoglobin Part 2 in
        ``compare_dualdeconv2.py``) this reproduction breaks structurally:
        the nested empirical->theoretical MCF matches per peak with the
        sum Σ w_j*theo_j, while dualdeconv2's joint LP couples all isotope
        positions of a component via Σ thr_ji Z_i ≤ 0.  An 11-config grid
        search (``experiments/grid_search_masserstein.py``) over
        max_distance and theoretical_trash_cost found that NO setting
        bridges the gap — larger max_distance makes it worse (more noise
        targets), larger theo_trash does nothing (theo-trash never fires at
        the optimum on dense noise), and either breaks the minimal case
        first.  Cross-scoring confirms it: at w_wnet, masserstein's own LP
        gives ~100x worse cost than at w_dd2 — i.e. wnetdeconv's reported
        ``fun`` is its own (lenient) model, not a competitive solution to
        masserstein's LP.  For inputs in this regime use
        ``masserstein.estimate_proportions`` (which pre-filters to the
        theoretical envelope, the agreement regime) or call
        ``dualdeconv2`` directly — not this class.

    ``deconvolve()`` uses SLSQP with bounds w_k >= 0 and the explicit
    inequality constraint sum(w_k) <= 1, which dualdeconv2 enforces implicitly
    via sum(probs) + sum(abyss) = 1, abyss >= 0.

    For the symmetric case (MTD_th=None) this reproduces dualdeconv2;
    with MTD_th set it reproduces dualdeconv4 (real theoretical penalty
    MTD_th, still with the unbounded transport metric).

    Parameters
    ----------
    empirical_spectrum : Distribution
        Empirical spectrum (normalized internally to sum to 1).
    theoretical_spectra : Sequence[Distribution]
        Theoretical spectra (each normalized internally).
    MTD : float
        Maximum Transport Distance / denoising penalty (``penalty`` in dualdeconv2).
    MTD_th : float, optional
        Separate theoretical trash cost.  None → symmetric = dualdeconv2;
        non-None → asymmetric = dualdeconv4.
    theo_trash_mult : float, optional
        Multiplier on MTD for the +inf-proxy theoretical trash cost
        (dualdeconv2 path only; ignored when ``MTD_th`` is given).  Default
        10x is what fixes the minimal-divergence example
        (``experiments/minimal_dense_noise_divergence.py``); below ~10x the
        nested MCF under-prices un-routable theoretical mass relative to
        masserstein's real-distance transport.  Should be at least as large as
        the maximum inter-isotope distance you expect un-routed mass to need
        to travel (in m/z units of MTD).  Above ~few hundred it can lose
        precision via the auto ``scale_factor``.
    method : str, optional
        Min-cost flow algorithm.  Ignored when ``solver`` is provided.
    solver : NetworkSimplex | CostScaling | CycleCanceling | CapacityScaling, optional
        Solver configuration object.  Takes precedence over ``method``.
    precision : float, optional
        Forwarded unchanged to the parent constructor (default 1e-3).
    """

    def __init__(
        self,
        empirical_spectrum: Distribution,
        theoretical_spectra: Sequence[Distribution],
        MTD: float,
        MTD_th: Optional[float] = None,
        theo_trash_mult: float = 10.0,
        method: Optional[str] = None,
        solver=None,
        precision: float = 1e-3,
    ) -> None:
        emp = empirical_spectrum.normalized()
        theos = [t.normalized() for t in theoretical_spectra]

        if MTD_th is None:
            # dualdeconv2 path.  The theoretical trash cost acts as an
            # effective +inf: large enough that the optimizer prefers
            # lowering w_k over carrying flow on this edge — i.e. it mimics
            # masserstein's "no theoretical abyss; transport at real
            # distance".  The default 10x covers the typical
            # asymmetric-isotope case; callers can dial it up if
            # inter-isotope distances >> MTD.
            max_distance = MTD
            theoretical_trash_cost = theo_trash_mult * MTD
        else:
            # dualdeconv4 path: a real theoretical penalty supplied by the caller.
            max_distance = max(MTD, MTD_th)
            theoretical_trash_cost = MTD_th

        super().__init__(
            emp,
            theos,
            distance=DistanceMetric.LINF,
            max_distance=max_distance,
            experimental_trash_cost=MTD,
            theoretical_trash_cost=theoretical_trash_cost,
            method=method,
            solver=solver,
            precision=precision,
        )

    def deconvolve(self, x0: Optional[np.ndarray] = None) -> dict:
        """
        Find optimal component proportions, matching dualdeconv2's output format.

        Parameters
        ----------
        x0 : np.ndarray, optional
            Initial proportions.  Defaults to uniform 1/(2k) (interior of feasible set).

        Returns
        -------
        dict
            probs   : list[float]  – weight of each theoretical spectrum
            fun     : float        – optimal transport cost (= dual LP objective)
            success : bool
        """
        n = len(self.theoretical_spectra)
        if x0 is None:
            # Sums to 1/2, strictly inside both the bounds and sum(w) <= 1.
            x0 = np.ones(n) / (2 * n)

        def cost_and_grad(w):
            # Move the solver to the candidate point before reading cost/gradient.
            self.set_point(w)
            return self.total_cost(), self.gradient()

        # SLSQP treats "ineq" constraints as fun(w) >= 0, i.e. sum(w) <= 1.
        constraints = [{
            "type": "ineq",
            "fun": lambda w: 1.0 - w.sum(),
            "jac": lambda w: -np.ones(n),
        }]

        result = minimize(
            cost_and_grad,
            x0=x0,
            jac=True,
            method="SLSQP",
            bounds=[(0.0, None)] * n,
            constraints=constraints,
            options={"maxiter": 2000, "ftol": self._ftol},
        )
        return {"probs": list(result.x), "fun": result.fun, "success": result.success}
@@ -0,0 +1,161 @@
1
+ from typing import Optional
2
+ from functools import cached_property
3
+
4
+ import numpy as np
5
+
6
+ from wnet import Distribution
7
+
8
+
9
class Spectrum(Distribution):
    """
    A class representing NMR or MS spectrum data.
    """

    def __init__(
        self,
        positions: np.ndarray,
        intensities: np.ndarray,
        label: Optional[str] = None,
    ):
        """
        Initialize a Spectrum object. Compared to Distribution, this class
        retains the original intensities (not converted to int) for more precise
        scaling operations. They are stored in the `original_intensities` attribute.
        They are still converted to int before running any alignment algorithms.

        Parameters
        ----------
        positions : np.ndarray
            The spatial coordinates of the spectrum (e.g., m/z and RT for MS).
        intensities : np.ndarray
            The intensity values corresponding to the spatial coordinates.
            Array-likes such as lists are coerced to an array once, here.
        label : str, optional
            An optional label, forwarded to Distribution.
        """
        # scaled(), normalized() and positions_intensities_scaled() apply
        # array arithmetic to original_intensities; a plain list would make
        # `* factor` / `/ total` fail or repeat the list.  Existing arrays
        # (including ndarray subclasses) pass through as the same object.
        intensities = np.asanyarray(intensities)
        self.original_intensities = intensities
        super().__init__(positions, intensities, label=label)

    @staticmethod
    def FromFeatureXML(path):
        """
        Parse a featureXML file and return a Spectrum object.

        pyopenms is imported lazily, so it is only required when this
        loader is actually used.  The resulting positions array has two
        rows: m/z values first, retention times second.
        """
        import pyopenms as oms

        # load the featureXML file
        featureXML = oms.FeatureXMLFile()
        features = oms.FeatureMap()
        featureXML.load(path, features)
        # load m/z, rt, and intensity values from the features
        mzs = []
        rts = []
        intensities = []
        for feature in features:
            mzs.append(feature.getMZ())
            rts.append(feature.getRT())
            intensities.append(feature.getIntensity())
        # create a Spectrum object
        spectrum = Spectrum(np.array([mzs, rts]), np.array(intensities))
        return spectrum

    @cached_property
    def sum_intensities(self) -> float:
        """
        Return the sum of the original intensities.

        The value is computed once and cached on the instance.
        """
        return np.sum(self.original_intensities)

    def scaled(self, factor: float) -> "Spectrum":
        """
        Return a new Spectrum object with intensities scaled by the given factor.

        Parameters
        ----------
        factor : float
            The scaling factor to apply to the intensities.

        Returns
        -------
        Spectrum
            A new Spectrum object with scaled intensities.
        """
        return Spectrum(
            self.positions, self.original_intensities * factor, label=self.label
        )

    def positions_intensities_scaled(self, scale_factor: float) -> "Spectrum":
        """
        Return a new Spectrum with both positions and intensities scaled by the given factor.

        Parameters
        ----------
        scale_factor : float
            The scaling factor to apply to positions and intensities.

        Returns
        -------
        Spectrum
            A new Spectrum object with scaled positions and intensities.
        """
        # Positions are viewed as float64 (no copy when already float64)
        # before the multiplication.
        new_positions = self.positions.astype(np.float64, copy=False) * scale_factor
        return Spectrum(new_positions, self.original_intensities * scale_factor, label=self.label)

    def normalized(self) -> "Spectrum":
        """
        Return a new Spectrum object with intensities normalized to sum to 1.

        Returns
        -------
        Spectrum
            A new Spectrum object with normalized intensities.

        Raises
        ------
        ValueError
            If the total of the original intensities is 0.
        """
        total = self.sum_intensities
        if total == 0:
            raise ValueError("Cannot normalize a spectrum with total intensity of 0.")
        return Spectrum(
            self.positions, self.original_intensities / total, label=self.label
        )

    def as_distribution(self) -> Distribution:
        """
        Convert the Spectrum object to a Distribution object.

        Returns
        -------
        Distribution
            A Distribution built from this object's ``positions``,
            ``intensities`` (not ``original_intensities``) and ``label``.
        """
        return Distribution(self.positions, self.intensities, label=self.label)
127
+
128
+
129
def Spectrum_1D(
    positions: np.ndarray, intensities: np.ndarray, label: Optional[str] = None
) -> Spectrum:
    """
    Build a Spectrum from one-dimensional position and intensity data.

    Inputs that are not already numpy arrays are converted first.  The
    positions are then given a leading axis of length 1 before being handed
    to Spectrum.

    Parameters
    ----------
    positions : np.ndarray
        The spatial coordinates of the spectrum (e.g., m/z for MS).
    intensities : np.ndarray
        The intensity values corresponding to the spatial coordinates.
    label : str, optional
        An optional label for the spectrum.

    Returns
    -------
    Spectrum
        A 1D Spectrum object.

    Raises
    ------
    ValueError
        If either input is not one-dimensional, or if their lengths differ.
    """
    positions = positions if isinstance(positions, np.ndarray) else np.array(positions)
    intensities = (
        intensities if isinstance(intensities, np.ndarray) else np.array(intensities)
    )

    # Validation order matters for which message the caller sees:
    # positions rank, then intensities rank, then length agreement.
    if positions.ndim != 1:
        raise ValueError(f"positions must be 1D, got shape {positions.shape}")
    if intensities.ndim != 1:
        raise ValueError(f"intensities must be 1D, got shape {intensities.shape}")

    position_count = positions.shape[0]
    intensity_count = intensities.shape[0]
    if position_count != intensity_count:
        raise ValueError(
            f"positions and intensities must have the same length, got {position_count} and {intensity_count}"
        )

    row_vector = positions[np.newaxis, :]
    return Spectrum(row_vector, intensities, label=label)