sparse-ot 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. sparse_ot-0.1.0/.clangd +6 -0
  2. sparse_ot-0.1.0/.github/workflows/ci.yml +63 -0
  3. sparse_ot-0.1.0/.github/workflows/publish.yml +107 -0
  4. sparse_ot-0.1.0/.gitignore +25 -0
  5. sparse_ot-0.1.0/CMakeLists.txt +16 -0
  6. sparse_ot-0.1.0/LICENSE +21 -0
  7. sparse_ot-0.1.0/PKG-INFO +17 -0
  8. sparse_ot-0.1.0/README.md +256 -0
  9. sparse_ot-0.1.0/benchmarks/__init__.py +0 -0
  10. sparse_ot-0.1.0/benchmarks/bench_solvers.py +293 -0
  11. sparse_ot-0.1.0/benchmarks/generate_report.py +202 -0
  12. sparse_ot-0.1.0/benchmarks/problems.py +111 -0
  13. sparse_ot-0.1.0/benchmarks/results/accuracy_mid.json +360 -0
  14. sparse_ot-0.1.0/benchmarks/results/accuracy_quick.json +106 -0
  15. sparse_ot-0.1.0/benchmarks/results/efficiency_mid.json +366 -0
  16. sparse_ot-0.1.0/benchmarks/results/efficiency_quick.json +106 -0
  17. sparse_ot-0.1.0/benchmarks/results/figures/accuracy.pdf +0 -0
  18. sparse_ot-0.1.0/benchmarks/results/figures/accuracy.png +0 -0
  19. sparse_ot-0.1.0/benchmarks/results/figures/heatmap_lemon_vs_bonneel.pdf +0 -0
  20. sparse_ot-0.1.0/benchmarks/results/figures/heatmap_lemon_vs_bonneel.png +0 -0
  21. sparse_ot-0.1.0/benchmarks/results/figures/heatmap_ortools_vs_lemon.pdf +0 -0
  22. sparse_ot-0.1.0/benchmarks/results/figures/heatmap_ortools_vs_lemon.png +0 -0
  23. sparse_ot-0.1.0/benchmarks/results/figures/lineplot_k128.pdf +0 -0
  24. sparse_ot-0.1.0/benchmarks/results/figures/lineplot_k128.png +0 -0
  25. sparse_ot-0.1.0/benchmarks/results/figures/lineplot_k2.pdf +0 -0
  26. sparse_ot-0.1.0/benchmarks/results/figures/lineplot_k2.png +0 -0
  27. sparse_ot-0.1.0/benchmarks/results/figures/lineplot_k32.pdf +0 -0
  28. sparse_ot-0.1.0/benchmarks/results/figures/lineplot_k32.png +0 -0
  29. sparse_ot-0.1.0/benchmarks/results/figures/lineplot_k512.pdf +0 -0
  30. sparse_ot-0.1.0/benchmarks/results/figures/lineplot_k512.png +0 -0
  31. sparse_ot-0.1.0/benchmarks/results/figures/lineplot_k8.pdf +0 -0
  32. sparse_ot-0.1.0/benchmarks/results/figures/lineplot_k8.png +0 -0
  33. sparse_ot-0.1.0/docs/superpowers/plans/2026-05-15-plan-1-scaffold-bonneel.md +642 -0
  34. sparse_ot-0.1.0/docs/superpowers/plans/2026-05-16-plan-2-lemon-routing.md +1149 -0
  35. sparse_ot-0.1.0/docs/superpowers/plans/2026-05-16-plan-3-ortools-benchmarks.md +1445 -0
  36. sparse_ot-0.1.0/docs/superpowers/plans/2026-05-16-plan-4-feasibility-and-bench-fixes.md +1086 -0
  37. sparse_ot-0.1.0/docs/superpowers/plans/2026-05-18-sparse-bonneel.md +1020 -0
  38. sparse_ot-0.1.0/docs/superpowers/specs/2026-05-15-sparse-ot-design.md +336 -0
  39. sparse_ot-0.1.0/docs/superpowers/specs/2026-05-18-sparse-bonneel-design.md +215 -0
  40. sparse_ot-0.1.0/pyproject.toml +35 -0
  41. sparse_ot-0.1.0/src/cpp/bonneel/VENDORING.md +78 -0
  42. sparse_ot-0.1.0/src/cpp/bonneel/bipartite_sparse_digraph.h +132 -0
  43. sparse_ot-0.1.0/src/cpp/bonneel/full_bipartitegraph.h +238 -0
  44. sparse_ot-0.1.0/src/cpp/bonneel/network_simplex_simple.h +1600 -0
  45. sparse_ot-0.1.0/src/cpp/bonneel_solver.cpp +159 -0
  46. sparse_ot-0.1.0/src/sparse_ot/__init__.py +5 -0
  47. sparse_ot-0.1.0/src/sparse_ot/_ext/__init__.py +0 -0
  48. sparse_ot-0.1.0/src/sparse_ot/emd.py +126 -0
  49. sparse_ot-0.1.0/src/sparse_ot/feasibility.py +96 -0
  50. sparse_ot-0.1.0/src/sparse_ot/sparse_utils.py +72 -0
  51. sparse_ot-0.1.0/tests/test_benchmarks.py +121 -0
  52. sparse_ot-0.1.0/tests/test_bonneel_sparse.py +153 -0
  53. sparse_ot-0.1.0/tests/test_duals.py +78 -0
  54. sparse_ot-0.1.0/tests/test_emd.py +136 -0
  55. sparse_ot-0.1.0/tests/test_feasibility_check.py +62 -0
  56. sparse_ot-0.1.0/tests/test_sparse_utils.py +127 -0
  57. sparse_ot-0.1.0/uv.lock +1260 -0
@@ -0,0 +1,6 @@
1
+ CompileFlags:
2
+ Add:
3
+ - "-I/Users/jonatanbobrutsky-haim/Documents/Code/sparse-optimal-transport/.venv/lib/python3.12/site-packages/pybind11/include"
4
+ - "-I/Users/jonatanbobrutsky-haim/.local/share/uv/python/cpython-3.12.8-macos-aarch64-none/include/python3.12"
5
+ - "-Isrc/cpp"
6
+ - "-std=c++17"
@@ -0,0 +1,63 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ concurrency:
10
+ group: ci-${{ github.ref }}
11
+ cancel-in-progress: true
12
+
13
+ jobs:
14
+ test:
15
+ name: ${{ matrix.os }} / py${{ matrix.python-version }}
16
+ runs-on: ${{ matrix.os }}
17
+ strategy:
18
+ fail-fast: false
19
+ matrix:
20
+ os: [ubuntu-latest, macos-latest]
21
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
22
+
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+
26
+ - name: Set up Python
27
+ uses: actions/setup-python@v5
28
+ with:
29
+ python-version: ${{ matrix.python-version }}
30
+ cache: pip
31
+
32
+ - name: Install build deps (Linux)
33
+ if: runner.os == 'Linux'
34
+ run: sudo apt-get update && sudo apt-get install -y build-essential cmake
35
+
36
+ - name: Install build deps (macOS)
37
+ if: runner.os == 'macOS'
38
+ run: brew install cmake
39
+
40
+ - name: Install package + test extras
41
+ run: |
42
+ python -m pip install --upgrade pip
43
+ pip install -e ".[dev]"
44
+
45
+ - name: Run tests (skip slow)
46
+ run: pytest tests/ -v -m "not slow" --timeout=300
47
+
48
+ # The slow tests (memory smoke test on a 10k×10k problem) run only on Linux
49
+ # against a single Python version (3.12), to keep CI time reasonable.
50
+ slow-tests:
51
+ name: slow tests (ubuntu / py3.12)
52
+ runs-on: ubuntu-latest
53
+ steps:
54
+ - uses: actions/checkout@v4
55
+ - uses: actions/setup-python@v5
56
+ with:
57
+ python-version: "3.12"
58
+ cache: pip
59
+ - run: sudo apt-get update && sudo apt-get install -y build-essential cmake
60
+ - run: |
61
+ python -m pip install --upgrade pip
62
+ pip install -e ".[dev]"
63
+ - run: pytest tests/ -v -m "slow" --timeout=600
@@ -0,0 +1,107 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ # Manual trigger for dry runs against TestPyPI.
7
+ workflow_dispatch:
8
+ inputs:
9
+ target:
10
+ description: "Publish target"
11
+ required: true
12
+ default: "testpypi"
13
+ type: choice
14
+ options:
15
+ - testpypi
16
+ - pypi
17
+
18
+ permissions:
19
+ contents: read
20
+
21
+ jobs:
22
+ build_wheels:
23
+ name: Wheels on ${{ matrix.os }}
24
+ runs-on: ${{ matrix.os }}
25
+ strategy:
26
+ fail-fast: false
27
+ matrix:
28
+ # macos-13 (Intel) dropped: the GitHub-hosted runner pool is
29
+ # chronically oversubscribed and stalls publish jobs by 30+ min.
30
+ # Apple Silicon dominates the macOS Python user base on 3.10+;
31
+ # users on Intel Macs can build from the sdist.
32
+ os: [ubuntu-latest, ubuntu-24.04-arm, macos-14]
33
+ steps:
34
+ - uses: actions/checkout@v4
35
+
36
+ - name: Build wheels
37
+ uses: pypa/cibuildwheel@v2.21
38
+ env:
39
+ # CPython 3.10–3.13 only; skip 32-bit builds, PyPy, and musllinux
40
+ # (scipy wheels not universally available on musl).
41
+ CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-*"
42
+ CIBW_SKIP: "*-musllinux_* *-win32 pp*"
43
+ # macOS targets: arm64 only (macos-14); the Intel macos-13 runner is not in the matrix.
44
+ CIBW_ARCHS_MACOS: "auto64"
45
+ CIBW_ARCHS_LINUX: "auto64"
46
+ # Recent scipy releases publish only manylinux_2_28 wheels for x86_64
47
+ # and aarch64; the default manylinux2014 (glibc 2.17) container falls
48
+ # back to source-building scipy, which needs OpenBLAS and fails.
49
+ CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
50
+ CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28
51
+ # Minimal smoke test: import the package and run one tiny emd() call.
52
+ # The full pytest suite already runs on every supported (os, py) pair
53
+ # in ci.yml — re-running it inside each wheel container would force
54
+ # scipy + pot + matplotlib installs we don't need just to verify the
55
+ # binary loads.
56
+ CIBW_TEST_REQUIRES: "numpy scipy"
57
+ CIBW_TEST_COMMAND: >-
58
+ python -c "import numpy as np, sparse_ot;
59
+ G = sparse_ot.emd(np.array([0.4, 0.6]), np.array([0.5, 0.5]), np.array([[0.0, 1.0], [1.0, 0.0]]));
60
+ assert G.shape == (2, 2);
61
+ print('sparse_ot wheel smoke test ok')"
62
+
63
+ - uses: actions/upload-artifact@v4
64
+ with:
65
+ name: wheels-${{ matrix.os }}
66
+ path: ./wheelhouse/*.whl
67
+
68
+ build_sdist:
69
+ name: sdist
70
+ runs-on: ubuntu-latest
71
+ steps:
72
+ - uses: actions/checkout@v4
73
+ - uses: actions/setup-python@v5
74
+ with:
75
+ python-version: "3.12"
76
+ - run: python -m pip install --upgrade build
77
+ - run: python -m build --sdist
78
+ - uses: actions/upload-artifact@v4
79
+ with:
80
+ name: sdist
81
+ path: dist/*.tar.gz
82
+
83
+ publish:
84
+ name: Publish to ${{ (github.event_name == 'workflow_dispatch' && inputs.target) || 'pypi' }}
85
+ needs: [build_wheels, build_sdist]
86
+ runs-on: ubuntu-latest
87
+ # Trusted-publishing OIDC requires an environment + id-token write.
88
+ environment:
89
+ name: ${{ (github.event_name == 'workflow_dispatch' && inputs.target) || 'pypi' }}
90
+ url: https://${{ ((github.event_name == 'workflow_dispatch' && inputs.target) || 'pypi') == 'testpypi' && 'test.' || '' }}pypi.org/p/sparse-ot
91
+ permissions:
92
+ id-token: write
93
+ steps:
94
+ - uses: actions/download-artifact@v4
95
+ with:
96
+ path: dist
97
+ merge-multiple: true
98
+
99
+ - name: Publish to PyPI
100
+ if: (github.event_name == 'workflow_dispatch' && inputs.target == 'pypi') || github.event_name == 'release'
101
+ uses: pypa/gh-action-pypi-publish@release/v1
102
+
103
+ - name: Publish to TestPyPI
104
+ if: github.event_name == 'workflow_dispatch' && inputs.target == 'testpypi'
105
+ uses: pypa/gh-action-pypi-publish@release/v1
106
+ with:
107
+ repository-url: https://test.pypi.org/legacy/
@@ -0,0 +1,25 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+
8
+ # Virtual environment
9
+ .venv/
10
+
11
+ # scikit-build-core / CMake build artifacts
12
+ _skbuild/
13
+ CMakeFiles/
14
+ *.cmake
15
+ Makefile
16
+
17
+ # Compiled extensions
18
+ *.so
19
+ *.pyd
20
+
21
+ # pytest
22
+ .pytest_cache/
23
+
24
+ # macOS
25
+ .DS_Store
@@ -0,0 +1,16 @@
1
+ cmake_minimum_required(VERSION 3.18)
2
+ project(sparse_ot_ext LANGUAGES CXX)
3
+
4
+ set(CMAKE_CXX_STANDARD 17)
5
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
6
+
7
+ find_package(pybind11 CONFIG REQUIRED)
8
+
9
+ pybind11_add_module(_bonneel src/cpp/bonneel_solver.cpp)
10
+ target_include_directories(_bonneel PRIVATE src/cpp)
11
+ target_compile_options(_bonneel PRIVATE -O3)
12
+ # Disable OpenMP for the Bonneel network-simplex wrapper: OpenMP thread overhead
13
+ # causes 30-40x slowdown on the problem sizes we handle (n ≤ ~5 000). POT's
14
+ # emd_wrap is also built without OpenMP for the same reason.
15
+ target_compile_definitions(_bonneel PRIVATE NOOMP)
16
+ install(TARGETS _bonneel DESTINATION sparse_ot/_ext)
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jonatan Bobrutsky-Haim
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.2
2
+ Name: sparse-ot
3
+ Version: 0.1.0
4
+ Summary: Sparse optimal transport with drop-in POT API
5
+ License: MIT
6
+ Requires-Python: >=3.10
7
+ Requires-Dist: numpy>=1.24
8
+ Requires-Dist: scipy>=1.10
9
+ Provides-Extra: torch
10
+ Requires-Dist: torch>=2.0; extra == "torch"
11
+ Provides-Extra: dev
12
+ Requires-Dist: pytest>=7.0; extra == "dev"
13
+ Requires-Dist: pytest-timeout>=2.0; extra == "dev"
14
+ Requires-Dist: pot>=0.9; extra == "dev"
15
+ Requires-Dist: pybind11>=2.12; extra == "dev"
16
+ Requires-Dist: matplotlib>=3.7; extra == "dev"
17
+
@@ -0,0 +1,256 @@
1
+ # sparse-ot
2
+
3
+ [![CI](https://github.com/JBobrutsky/sparse-optimal-transport/actions/workflows/ci.yml/badge.svg)](https://github.com/JBobrutsky/sparse-optimal-transport/actions/workflows/ci.yml)
4
+
5
+ Drop-in replacement for [POT](https://github.com/PythonOT/POT)'s `emd` /
6
+ `emd2`, with native support for **sparse cost matrices**. One solver
7
+ ([Bonneel's network simplex](https://github.com/nbonneel/network_simplex))
8
+ covers both regimes:
9
+
10
+ - **Dense** `numpy.ndarray` cost matrix → dense plan.
11
+ - **`scipy.sparse` CSR** cost matrix → sparse plan, with memory and per-pivot
12
+ work both scaling in the number of candidate edges `k` rather than `n × m`.
13
+
14
+ ## Quickstart
15
+
16
+ ```python
17
+ import numpy as np, scipy.sparse, sparse_ot as sot
18
+
19
+ # Dense: identical interface to ot.emd.
20
+ G = sot.emd(a, b, M)
21
+ cost = sot.emd2(a, b, M)
22
+
23
+ # Sparse: pass a CSR cost matrix. Absent entries are forbidden edges, not
24
+ # zero-cost shortcuts.
25
+ M_csr = scipy.sparse.csr_matrix(...)
26
+ G_csr = sot.emd(a, b, M_csr)
27
+
28
+ # POT-compatible log dict (cost, u, v, warning, result_code).
29
+ G, info = sot.emd(a, b, M, log=True)
30
+ ```
31
+
32
+ The `u` and `v` returned in the log dict are the dual potentials with
33
+ POT's sign convention (`u[i] + v[j] ≤ M[i,j]` at the optimum). With
34
+ `center_dual=True` (default) `u` is shifted to zero mean, preserving
35
+ `u[i] + v[j]` on every edge.
36
+
37
+ ## Why sparse?
38
+
39
+ A 10 000 × 10 000 problem with 10 candidate edges per row (k = 100 000):
40
+
41
+ | Solver path | Memory | Wall time |
42
+ |--------------------|---------------|-----------|
43
+ | Bonneel-dense | ≈ 800 MB (cost matrix) | (does not run; OOM at this scale on small machines) |
44
+ | **Bonneel-sparse** | **≈ 6 MB** | seconds |
45
+
46
+ Most real OT problems (k-NN, transformer attention masks, point-cloud
47
+ matching) are intrinsically sparse. Materialising them as dense cost
48
+ matrices is wasteful and can be infeasible. This package gives you
49
+ Bonneel's tight constants without the O(n·m) memory penalty.
50
+
51
+ ## Feasibility on sparse supports
52
+
53
+ When you pass a sparse `M`, the transport plan is restricted to the
54
+ edges you provide. The package checks that the support is connected
55
+ and that supply totals match (`check_feasibility`), but **this does
56
+ not guarantee an LP-feasible plan exists**.
57
+
58
+ A small support can fail [Hall's condition](https://en.wikipedia.org/wiki/Hall%27s_marriage_theorem):
59
+ some local block of rows `S` may collectively need to move more mass
60
+ than the columns they reach can absorb. For example, a band-7 support
61
+ (each row connects only to its 7 nearest columns) cannot route generic
62
+ Dirichlet marginals at `n = 1000` — the corner rows have nowhere to
63
+ shed their excess.
64
+
65
+ When that happens we don't lie. The solver returns its best-effort
66
+ flow, `info["result_code"] == 0`, and a `RuntimeWarning` fires
67
+ explaining that the marginals weren't met. Compare to POT, which
68
+ silently routes mass through any zero-cost or penalty edge in the
69
+ densified representation and reports `success` with an arbitrary
70
+ cost.
71
+
72
+ In practice: build supports that are slightly denser than your
73
+ marginals strictly require (k-NN with k chosen by validation, plus a
74
+ small slack), or run with very dense support whenever you don't know
75
+ the marginal distribution ahead of time.
76
+
77
+ ## Convergence and the `numItermax` knob
78
+
79
+ Bonneel's network simplex stops at `numItermax` pivots without raising.
80
+ If the iteration cap is hit before convergence the returned flow can
81
+ violate marginals by orders of magnitude more than machine epsilon. We
82
+ guard against this in two ways:
83
+
84
+ 1. The default `numItermax` is **problem-size-aware**:
85
+ `min(50M, max(100k, 100·(n + m + k)))`.
86
+ 2. After every solve we re-check the marginals. If `max(|G.sum(1) - a|,
87
+ |G.sum(0) - b|) > 1e-6`, we emit a `RuntimeWarning` and report
88
+ `result_code = 0` with a diagnostic in `info["warning"]`. No
89
+ exception is raised, matching POT's behavior.
90
+
91
+ You can pass `numItermax=…` to override.
92
+
93
+ ## Build and install
94
+
95
+ ```bash
96
+ pip install -e . --no-build-isolation
97
+ ```
98
+
99
+ `pyproject.toml` sets `editable.rebuild = true`, so the pybind11
100
+ extension is rebuilt automatically the next time `sparse_ot` is imported
101
+ after a `src/cpp/` edit.
102
+
103
+ ## Benchmarks
104
+
105
+ Two independent suites:
106
+
107
+ ```bash
108
+ python benchmarks/bench_solvers.py --mid # ~15 minutes
109
+ python benchmarks/bench_solvers.py --quick # ~30 seconds (used by CI)
110
+ python benchmarks/bench_solvers.py # full sweep, hours
111
+ ```
112
+
113
+ - **Dense suite** — fully-random `n × n` cost matrix. Compares
114
+ `bonneel_dense` against `pot_reference` (`ot.emd`).
115
+ - **Sparse suite** — feasible-by-construction kNN-grid (`benchmarks/problems.py`).
116
+ Runs `bonneel_sparse` only; there is no honest dense representation of
117
+ a kNN cost (absent edges must be +∞, which neither POT nor Bonneel's
118
+ dense path can express).
119
+
120
+ Results are written to `benchmarks/results/{efficiency,accuracy}_{tag}.json`
121
+ with the structure:
122
+
123
+ ```jsonc
124
+ {
125
+ "dense": { "<n>": { "bonneel_dense": {...}, "pot_reference": {...} } },
126
+ "sparse": { "<n>": { "<k>": { "bonneel_sparse": {...} } } }
127
+ }
128
+ ```
129
+
130
+ See "Benchmark results" below for the current numbers on the maintainer's
131
+ laptop.
132
+
133
+ ## Benchmark results
134
+
135
+ Numbers below are from `python benchmarks/bench_solvers.py --mid` on an
136
+ Apple-Silicon laptop (Sonoma, 64 GB). Wall times are from a single run; the
137
+ sparse-suite peak memory is `tracemalloc` peak during `emd()`.
138
+
139
+ ### Dense suite (fully random `n × n` cost)
140
+
141
+ | n | `bonneel_dense` | `pot_reference` | pot / dense |
142
+ |------:|----------------:|----------------:|------------:|
143
+ | 200 | 0.003s | 0.002s | 0.66× |
144
+ | 500 | 0.019s | 0.015s | 0.80× |
145
+ | 1000 | 0.085s | 0.072s | 0.85× |
146
+ | 2000 | 0.406s | 0.346s | 0.85× |
147
+ | 4000 | 2.166s | 1.414s | 0.65× |
148
+
149
+ POT and `bonneel_dense` share the same C++ engine (POT vendors Bonneel's
150
+ network simplex), so the wall-time ratio reflects pure wrapping overhead;
151
+ POT's Cython wrapper is marginally tighter than our pybind11 wrapper.
152
+ Costs agree to machine precision for n ≤ 2000. At n = 4000, POT's cost
153
+ is 1.1 % higher than ours — POT's default `numItermax = 100 000`
154
+ truncates before convergence, while our problem-size-aware default
155
+ finishes the pivots.
156
+
157
+ ### Sparse suite (knn-grid)
158
+
159
+ The same knn problem is run through both Bonneel paths so the
160
+ speed/memory tradeoff is directly comparable. `bonneel_dense` runs only
161
+ where `n ≤ MAX_DENSE_N`; above that the cost matrix doesn't fit and the
162
+ cell is sparse-only. For the dense path we densify with a finite
163
+ penalty (`max(M.data) · (n·m + 1)`) on absent edges — with the
164
+ problem-size-aware `numItermax` the optimal basis never lands on a
165
+ penalty edge.
166
+
167
+ | n | k | nnz | sparse wall | sparse peak | dense wall | dense peak | winner |
168
+ |-------:|-----:|-----------:|------------:|------------:|-----------:|-----------:|:-------------|
169
+ | 200 | 2 | 400 | 0.006s | 0.04 MB | 0.003s | 0.32 MB | dense 2.4× |
170
+ | 200 | 32 | 6 400 | 0.051s | 0.14 MB | 0.004s | 0.32 MB | dense 14× |
171
+ | 200 | 128 | 25 600 | 0.204s | 0.52 MB | 0.003s | 0.32 MB | dense 71× |
172
+ | 1 000 | 2 | 2 000 | 0.033s | 0.15 MB | 0.075s | 7.69 MB | **sparse 2.3×** |
173
+ | 1 000 | 8 | 8 000 | 0.100s | 0.22 MB | 0.178s | 7.69 MB | **sparse 1.8×** |
174
+ | 1 000 | 32 | 32 000 | 0.315s | 0.67 MB | 0.182s | 7.69 MB | dense 1.7× |
175
+ | 1 000 | 128 | 128 000 | 1.103s | 2.59 MB | 0.256s | 7.69 MB | dense 4.3× |
176
+ | 4 000 | 2 | 8 000 | 0.134s | 0.60 MB | 1.652s | 122.3 MB | **sparse 12×** |
177
+ | 4 000 | 8 | 32 000 | 0.577s | 0.87 MB | 8.918s | 122.3 MB | **sparse 15×** |
178
+ | 4 000 | 32 | 128 000 | 1.572s | 2.66 MB | 8.574s | 122.3 MB | **sparse 5.5×** |
179
+ | 4 000 | 128 | 512 000 | 4.540s | 10.35 MB | 10.298s | 122.3 MB | **sparse 2.3×** |
180
+ | 4 000 | 400 | 1 600 000 | 14.473s | 32.14 MB | 8.393s | 122.3 MB | dense 1.7× |
181
+ | 4 000 | 512 | 2 048 000 | 17.956s | 41.11 MB | 9.535s | 122.3 MB | dense 1.9× |
182
+ | 16 000 | 2 | 32 000 | 0.531s | 2.39 MB | — | — | sparse only |
183
+ | 16 000 | 32 | 512 000 | 12.066s | 10.62 MB | — | — | sparse only |
184
+ | 16 000 | 128 | 2 048 000 | 32.955s | 41.38 MB | — | — | sparse only |
185
+ | 16 000 | 512 | 8 192 000 | 75.201s | 164.4 MB | — | — | sparse only |
186
+ | 16 000 | 1600 | 25 600 000 | 269.488s | 513.1 MB | — | — | sparse only |
187
+
188
+ Reading the table:
189
+
190
+ - At **n = 200** the dense path always wins because the n² cost matrix is
191
+ tiny and Bonneel's flat-array constants dominate over the sparse
192
+ digraph's per-arc indirection.
193
+ - At **n = 1 000** the crossover is around k ≈ n / 50: below that, sparse
194
+ wins; above, dense wins.
195
+ - At **n = 4 000** sparse wins by 2–15× up to k ≈ n / 20. Above that
196
+ density, dense again wins on time but its memory cost is fixed at
197
+ 122 MB regardless of k.
198
+ - At **n = 16 000** the dense path is out of reach (cost matrix ≈ 2 GB);
199
+ only sparse runs.
200
+
201
+ ### Accuracy
202
+
203
+ Marginals stay at machine precision (worst case `2.5 × 10⁻¹⁶`) across
204
+ every cell of both suites. On the knn problems, `bonneel_sparse` and
205
+ `bonneel_dense` agree on cost to ≈ machine precision in 22 of 24 cells.
206
+ Two exceptions:
207
+
208
+ | n | k | sparse cost | dense cost | relative diff |
209
+ |------:|-----:|------------:|-----------:|--------------:|
210
+ | 4 000 | 400 | 271.7297 | 272.2578 | 1.9 × 10⁻³ |
211
+ | 4 000 | 512 | 557.3260 | 557.3270 | 1.7 × 10⁻⁶ |
212
+
213
+ In both cases `bonneel_sparse` finds a strictly lower-cost plan. The
214
+ densified-with-penalty input introduces costs on the order of
215
+ `max(M) · n² ≈ 10¹²`, and floating-point reduced-cost computations on
216
+ that scale accumulate enough rounding noise to push the pivot rule off
217
+ the true optimum. The sparse path never sees those large numbers and is
218
+ the more accurate of the two when both can run.
219
+
220
+ ## Memory cutoffs
221
+
222
+ `bench_solvers.py` skips cells beyond these defaults (16 GB target):
223
+
224
+ | Constant | Default | Effect |
225
+ |------------------|---------------|--------------------------------------------|
226
+ | `MAX_DENSE_N` | 8 192 | Dense suite skipped above this |
227
+ | `MAX_SPARSE_NNZ` | 200 000 000 | Sparse cell skipped above this nnz |
228
+
229
+ Raise the constants for larger hardware.
230
+
231
+ ## Releasing
232
+
233
+ PyPI uploads are automated via GitHub Actions and PyPI's
234
+ [trusted-publishing OIDC](https://docs.pypi.org/trusted-publishers/). To
235
+ cut a release:
236
+
237
+ 1. Bump `project.version` in `pyproject.toml`, commit, tag (`git tag vX.Y.Z`),
238
+ push (`git push --tags`).
239
+ 2. Create a GitHub Release pointing at the tag.
240
+
241
+ The `.github/workflows/publish.yml` workflow then builds wheels via
242
+ `cibuildwheel` for Linux (x86_64, arm64) and macOS (arm64 only) across
243
+ Python 3.10–3.13, builds an sdist, and uploads everything to PyPI.
244
+
245
+ First-time setup (one-time, requires owner action on pypi.org):
246
+
247
+ - Add a trusted publisher for **sparse-ot** with owner = `JBobrutsky`,
248
+ repository = `sparse-optimal-transport`, workflow = `publish.yml`,
249
+ environment = `pypi`.
250
+ - For TestPyPI dry runs, register the same on test.pypi.org with
251
+ environment = `testpypi`. Then trigger `Publish to PyPI` via the
252
+ Actions UI (workflow_dispatch) with target = `testpypi`.
253
+
254
+ ## License
255
+
256
+ [MIT](LICENSE).
File without changes