trnsparse 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trnsparse-0.3.2/.github/workflows/ci.yml +58 -0
- trnsparse-0.3.2/.pre-commit-config.yaml +7 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/CHANGELOG.md +67 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/CLAUDE.md +12 -0
- {trnsparse-0.3.0/trnsparse.egg-info → trnsparse-0.3.2}/PKG-INFO +5 -1
- {trnsparse-0.3.0 → trnsparse-0.3.2}/README.md +2 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/benchmarks/bench_bsr_spmm.py +2 -3
- trnsparse-0.3.2/benchmarks/bench_iterative.py +50 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/benchmarks/bench_spmm.py +1 -0
- trnsparse-0.3.2/docs/iterative_solvers.md +115 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/examples/sparse_fock.py +5 -3
- trnsparse-0.3.2/pyproject.toml +84 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/scripts/bench_to_md.py +3 -5
- trnsparse-0.3.2/scripts/run_simulator_tests.sh +116 -0
- trnsparse-0.3.2/tests/conftest.py +12 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/tests/test_bsr.py +1 -2
- {trnsparse-0.3.0 → trnsparse-0.3.2}/tests/test_formats.py +3 -5
- trnsparse-0.3.2/tests/test_iterative.py +135 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/tests/test_nki_bsr.py +15 -13
- trnsparse-0.3.2/tests/test_nki_sim.py +113 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/tests/test_nki_spmm.py +7 -8
- {trnsparse-0.3.0 → trnsparse-0.3.2}/tests/test_ops.py +2 -7
- {trnsparse-0.3.0 → trnsparse-0.3.2}/tests/test_screening.py +2 -3
- trnsparse-0.3.2/trnsparse/__init__.py +51 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse/formats.py +15 -10
- trnsparse-0.3.2/trnsparse/iterative.py +190 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse/nki/__init__.py +3 -1
- {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse/nki/dispatch.py +67 -24
- {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse/nki/kernels.py +9 -12
- {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse/ops.py +30 -19
- {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse/screening.py +2 -5
- {trnsparse-0.3.0 → trnsparse-0.3.2/trnsparse.egg-info}/PKG-INFO +5 -1
- {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse.egg-info/SOURCES.txt +7 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse.egg-info/requires.txt +2 -0
- trnsparse-0.3.0/.github/workflows/ci.yml +0 -20
- trnsparse-0.3.0/pyproject.toml +0 -47
- trnsparse-0.3.0/tests/conftest.py +0 -5
- trnsparse-0.3.0/trnsparse/__init__.py +0 -23
- {trnsparse-0.3.0 → trnsparse-0.3.2}/.github/workflows/notify-umbrella.yml +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/.github/workflows/publish.yml +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/.gitignore +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/CODE_OF_CONDUCT.md +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/CONTRIBUTING.md +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/LICENSE +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/benchmarks/bench_screening.py +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/benchmarks/bench_spmv.py +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/benchmarks/conftest.py +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/api.md +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/architecture.md +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/aws_setup.md +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/benchmarks.md +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/index.md +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/installation.md +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/migration_scipy.md +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/quickstart.md +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/infra/terraform/.terraform.lock.hcl +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/infra/terraform/README.md +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/infra/terraform/main.tf +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/mkdocs.yml +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/scripts/run_benchmarks.sh +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/scripts/run_neuron_tests.sh +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/setup.cfg +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse.egg-info/dependency_links.txt +0 -0
- {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
lint:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v6
|
|
13
|
+
- uses: actions/setup-python@v6
|
|
14
|
+
with:
|
|
15
|
+
python-version: "3.12"
|
|
16
|
+
- run: pip install "ruff>=0.6"
|
|
17
|
+
- run: ruff check .
|
|
18
|
+
- run: ruff format --check .
|
|
19
|
+
|
|
20
|
+
test:
|
|
21
|
+
runs-on: ubuntu-latest
|
|
22
|
+
strategy:
|
|
23
|
+
matrix:
|
|
24
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
25
|
+
steps:
|
|
26
|
+
- uses: actions/checkout@v6
|
|
27
|
+
- uses: actions/setup-python@v6
|
|
28
|
+
with:
|
|
29
|
+
python-version: ${{ matrix.python-version }}
|
|
30
|
+
- run: pip install -e ".[dev]"
|
|
31
|
+
- run: pytest tests/ -v -x --tb=short -m "not neuron and not nki_simulator" --cov=trnsparse --cov-report=xml
|
|
32
|
+
- name: Upload coverage reports to Codecov
|
|
33
|
+
if: matrix.python-version == '3.12'
|
|
34
|
+
uses: codecov/codecov-action@v5
|
|
35
|
+
with:
|
|
36
|
+
token: ${{ secrets.CODECOV_TOKEN }}
|
|
37
|
+
slug: trnsci/trnsparse
|
|
38
|
+
|
|
39
|
+
nki-simulator:
|
|
40
|
+
# Runs NKI kernels through nki.simulate(kernel)(numpy_args) on CPU.
|
|
41
|
+
# Catches Python-trace-level errors (bad kwargs, dropped ops, shape
|
|
42
|
+
# mismatches) pre-merge without AWS round-trips. MLIR verifier
|
|
43
|
+
# errors remain hardware-only (simulator explicitly skips compile).
|
|
44
|
+
runs-on: ubuntu-latest
|
|
45
|
+
steps:
|
|
46
|
+
- uses: actions/checkout@v6
|
|
47
|
+
- uses: actions/setup-python@v6
|
|
48
|
+
with:
|
|
49
|
+
python-version: "3.12"
|
|
50
|
+
- name: Install trnsparse + NKI simulator deps
|
|
51
|
+
run: |
|
|
52
|
+
pip install -e ".[dev]"
|
|
53
|
+
pip install --extra-index-url https://pip.repos.neuron.amazonaws.com \
|
|
54
|
+
"nki>=0.3.0"
|
|
55
|
+
- name: Run simulator-backed kernel tests
|
|
56
|
+
env:
|
|
57
|
+
TRNSPARSE_USE_SIMULATOR: "1"
|
|
58
|
+
run: pytest tests/ -v -m nki_simulator --tb=short
|
|
@@ -5,6 +5,73 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.3.2] — 2026-04-14
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- **`cg_bsr`** and **`power_iteration_bsr`** — Conjugate Gradient and
|
|
13
|
+
power iteration on block-sparse row matrices. Plumbing on top of
|
|
14
|
+
`bsr_spmm` (one kernel dispatch per iteration). Closes Phase 1 of
|
|
15
|
+
#22 on-chip iterative solvers.
|
|
16
|
+
- **`jacobi_preconditioner_bsr(A)`** — builds a diagonal preconditioner
|
|
17
|
+
for `cg_bsr`'s `M=` argument.
|
|
18
|
+
- **`bsr_diagonal(A)`** — extracts the main diagonal from a BSR matrix.
|
|
19
|
+
- **`docs/iterative_solvers.md`** — design note covering the v0.3.2
|
|
20
|
+
plumbing and the v0.4.0 fused-kernel goal (#24). Explains the
|
|
21
|
+
architectural win Trainium offers (A SBUF-resident across iterations)
|
|
22
|
+
vs the current per-iteration HBM round-trip.
|
|
23
|
+
- **`tests/test_iterative.py`** — 8 CPU tests including scipy parity at
|
|
24
|
+
`atol=1e-4` on a 128×128 SPD system.
|
|
25
|
+
- **`benchmarks/bench_iterative.py`** — cg_bsr vs scipy.sparse.linalg.cg.
|
|
26
|
+
At 128×128 SPD: scipy 310 μs, trnsparse 369 μs (1.19×).
|
|
27
|
+
|
|
28
|
+
### Notes
|
|
29
|
+
|
|
30
|
+
- Algorithm body for CG is a local copy of `trnsolver.iterative.cg`;
|
|
31
|
+
kept local to avoid a cross-repo runtime dependency for one function.
|
|
32
|
+
- v0.4.0 will layer the fused CG/power-iteration NKI kernel on top —
|
|
33
|
+
tracked in #24. The API stays stable across the transition; users
|
|
34
|
+
upgrading from v0.3.2 get the fused-kernel speedup automatically
|
|
35
|
+
when the fused path is available.
|
|
36
|
+
|
|
37
|
+
## [0.3.1] — 2026-04-14
|
|
38
|
+
|
|
39
|
+
### Changed
|
|
40
|
+
|
|
41
|
+
- **Migrated NKI imports to the `nki.*` namespace** (NKI 0.3.0 Stable,
|
|
42
|
+
Neuron SDK 2.29, April 2026). Legacy `neuronxcc.nki.*` shim is no
|
|
43
|
+
longer used. `pyproject.toml` `[neuron]` extra gains `nki>=0.3.0`
|
|
44
|
+
alongside the existing `neuronxcc>=2.24` and `torch-neuronx>=2.9`.
|
|
45
|
+
Hosts without an `nki` wheel (macOS, non-Linux archs) still hit
|
|
46
|
+
`HAS_NKI=False` and get the torch fallback. Kernel bodies unchanged —
|
|
47
|
+
the trnblas audit confirmed the positional `nisa.nc_matmul` +
|
|
48
|
+
`nl.copy(psum, ...)` pattern complies with NKI 0.3.0.
|
|
49
|
+
- `test` CI job now filters `-m "not neuron and not nki_simulator"` so
|
|
50
|
+
each test runs in exactly one job.
|
|
51
|
+
|
|
52
|
+
### Added
|
|
53
|
+
|
|
54
|
+
- **`TRNSPARSE_USE_SIMULATOR=1` dispatch branch** through
|
|
55
|
+
`nki.simulate(kernel)(np_args)`. Bypasses torch_xla + NEFF compile;
|
|
56
|
+
kernels run on CPU for correctness iteration. Hardware still owns
|
|
57
|
+
perf numbers.
|
|
58
|
+
- **`nki-simulator` CI job on `ubuntu-latest`** — installs `nki>=0.3.0`
|
|
59
|
+
from the AWS pip index and runs the simulator suite on every push/PR.
|
|
60
|
+
Kernel correctness gate without AWS cost. Catches Python-trace-level
|
|
61
|
+
errors (bad kwargs, dropped ops, shape mismatches); MLIR verifier
|
|
62
|
+
errors remain hardware-only (NKI 0.3.0 has no documented device-free
|
|
63
|
+
NEFF compile API).
|
|
64
|
+
- `tests/test_nki_sim.py` — curated simulator suite (4 tests: CSR
|
|
65
|
+
aligned + rectangular, BSR block-dense + block-diagonal). Skips
|
|
66
|
+
cleanly off-hardware.
|
|
67
|
+
- `scripts/run_simulator_tests.sh` — SSM runner mirroring
|
|
68
|
+
`run_neuron_tests.sh` with `TRNSPARSE_USE_SIMULATOR=1` in the env.
|
|
69
|
+
- `tests/conftest.py` — registers the `nki_simulator` pytest marker.
|
|
70
|
+
|
|
71
|
+
Addresses [trnsci/trnsparse#23](https://github.com/trnsci/trnsparse/issues/23).
|
|
72
|
+
Follows the trnblas reference commits `c693561`, `f24993b`, `77eeb82`
|
|
73
|
+
(suite-wide coordination in `trnsci/trnsci#5`).
|
|
74
|
+
|
|
8
75
|
## [0.3.0] — 2026-04-13
|
|
9
76
|
|
|
10
77
|
### Added
|
|
@@ -64,3 +64,15 @@ pip install -e ".[dev]"
|
|
|
64
64
|
pytest tests/ -v
|
|
65
65
|
python examples/sparse_fock.py --demo
|
|
66
66
|
```
|
|
67
|
+
|
|
68
|
+
## Blog posts
|
|
69
|
+
|
|
70
|
+
When asked to draft a blog post for this library for the [trnsci blog](https://trnsci.dev/blog/):
|
|
71
|
+
|
|
72
|
+
1. Read the editorial brief at [`docs/blog/AUTHOR_BRIEF.md`](https://github.com/trnsci/trnsci/blob/main/docs/blog/AUTHOR_BRIEF.md) in the umbrella repo (trnsci/trnsci). It defines voice (authorless, library-as-subject), stance (architecture-first, transparency-always), and the nine required section headings.
|
|
73
|
+
|
|
74
|
+
2. Find the prompt block for this library in [`BLOG_PROMPTS.md`](https://github.com/trnsci/trnsci/blob/main/BLOG_PROMPTS.md) at the umbrella repo root. It carries library-specific context and suggested architectural angles.
|
|
75
|
+
|
|
76
|
+
3. Draft the post following the brief. Open a PR against `trnsci/trnsci` at `docs/blog/posts/<YYYY-MM-DD>-<slug>.md`. Scott (suite director) reviews before merge.
|
|
77
|
+
|
|
78
|
+
The umbrella repo — not this one — owns the blog. Per-library retrospective posts are unsigned; library is the subject, no byline. See the brief for the full set of rules.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: trnsparse
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: Sparse matrix operations for AWS Trainium via NKI
|
|
5
5
|
Author-email: Scott Friedman <scttfrdmn@gmail.com>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -19,17 +19,21 @@ License-File: LICENSE
|
|
|
19
19
|
Requires-Dist: torch>=2.1
|
|
20
20
|
Requires-Dist: numpy>=1.24
|
|
21
21
|
Provides-Extra: neuron
|
|
22
|
+
Requires-Dist: nki>=0.3.0; extra == "neuron"
|
|
22
23
|
Requires-Dist: neuronxcc>=2.24; extra == "neuron"
|
|
23
24
|
Requires-Dist: torch-neuronx>=2.9; extra == "neuron"
|
|
24
25
|
Provides-Extra: dev
|
|
25
26
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
26
27
|
Requires-Dist: pytest-benchmark>=4.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov>=4.1; extra == "dev"
|
|
27
29
|
Requires-Dist: scipy>=1.11; extra == "dev"
|
|
28
30
|
Dynamic: license-file
|
|
29
31
|
|
|
30
32
|
# trnsparse
|
|
31
33
|
|
|
32
34
|
[CI](https://github.com/trnsci/trnsparse/actions/workflows/ci.yml)
|
|
35
|
+
[Codecov](https://codecov.io/gh/trnsci/trnsparse)
|
|
36
|
+
[Ruff](https://github.com/astral-sh/ruff)
|
|
33
37
|
[](https://pypi.org/project/trnsparse/)
|
|
34
38
|
[](https://pypi.org/project/trnsparse/)
|
|
35
39
|
[License](LICENSE)
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# trnsparse
|
|
2
2
|
|
|
3
3
|
[CI](https://github.com/trnsci/trnsparse/actions/workflows/ci.yml)
|
|
4
|
+
[Codecov](https://codecov.io/gh/trnsci/trnsparse)
|
|
5
|
+
[Ruff](https://github.com/astral-sh/ruff)
|
|
4
6
|
[](https://pypi.org/project/trnsparse/)
|
|
5
7
|
[](https://pypi.org/project/trnsparse/)
|
|
6
8
|
[License](LICENSE)
|
|
@@ -21,8 +21,7 @@ import torch
|
|
|
21
21
|
import trnsparse
|
|
22
22
|
from trnsparse.nki.dispatch import HAS_NKI
|
|
23
23
|
|
|
24
|
-
|
|
25
|
-
M_BLOCKS = [4, 8] # matrix has M_BLOCKS * 128 rows
|
|
24
|
+
M_BLOCKS = [4, 8] # matrix has M_BLOCKS * 128 rows
|
|
26
25
|
N_BLOCKS = [4, 8]
|
|
27
26
|
BLOCK_DENSITIES = [0.1, 0.25, 0.5]
|
|
28
27
|
RHS_COLS = [128, 256]
|
|
@@ -58,7 +57,7 @@ def bsr_and_B(m_blocks, n_blocks, block_density, bsr_rhs_cols):
|
|
|
58
57
|
for i in range(m_blocks):
|
|
59
58
|
for j in range(n_blocks):
|
|
60
59
|
if mask[i, j]:
|
|
61
|
-
A[i * b:(i + 1) * b, j * b:(j + 1) * b] = torch.randn(b, b)
|
|
60
|
+
A[i * b : (i + 1) * b, j * b : (j + 1) * b] = torch.randn(b, b)
|
|
62
61
|
bsr = trnsparse.BSRMatrix.from_dense(A, block_size=b)
|
|
63
62
|
B = torch.randn(N, bsr_rhs_cols)
|
|
64
63
|
return bsr, B, A
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Iterative-solver benchmarks: trnsparse.cg_bsr vs scipy baseline.
|
|
2
|
+
|
|
3
|
+
The v0.3.2 plumbing dispatches one `bsr_spmm` call per CG iteration.
|
|
4
|
+
On NKI that means one kernel launch + HBM round-trip per iteration.
|
|
5
|
+
Expect the current path to be dominated by dispatch overhead compared
|
|
6
|
+
to scipy's compiled C loop, which motivates the v0.4.0 fused-kernel
|
|
7
|
+
follow-up.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import pytest
|
|
13
|
+
import torch
|
|
14
|
+
|
|
15
|
+
import trnsparse
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@pytest.fixture(params=[128, 256])
|
|
19
|
+
def iter_size(request):
|
|
20
|
+
return request.param
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.fixture
|
|
24
|
+
def spd_bsr_and_dense(iter_size):
|
|
25
|
+
torch.manual_seed(0)
|
|
26
|
+
n = iter_size
|
|
27
|
+
M = torch.randn(n, n)
|
|
28
|
+
A_dense = M @ M.T + n * torch.eye(n)
|
|
29
|
+
A_bsr = trnsparse.BSRMatrix.from_dense(A_dense, block_size=128)
|
|
30
|
+
b = torch.randn(n)
|
|
31
|
+
return A_dense, A_bsr, b
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_cg_bsr_trnsparse(benchmark, spd_bsr_and_dense):
|
|
35
|
+
_, A_bsr, b = spd_bsr_and_dense
|
|
36
|
+
benchmark(lambda: trnsparse.cg_bsr(A_bsr, b, tol=1e-8, max_iter=2 * b.shape[0]))
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_cg_scipy(benchmark, spd_bsr_and_dense):
|
|
40
|
+
sp = pytest.importorskip("scipy.sparse")
|
|
41
|
+
spla = pytest.importorskip("scipy.sparse.linalg")
|
|
42
|
+
A_dense, _, b = spd_bsr_and_dense
|
|
43
|
+
A_scipy = sp.csr_matrix(A_dense.numpy())
|
|
44
|
+
b_np = b.numpy()
|
|
45
|
+
benchmark(lambda: spla.cg(A_scipy, b_np, rtol=1e-8, maxiter=2 * len(b_np)))
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_power_iteration_trnsparse(benchmark, spd_bsr_and_dense):
|
|
49
|
+
_, A_bsr, _ = spd_bsr_and_dense
|
|
50
|
+
benchmark(lambda: trnsparse.power_iteration_bsr(A_bsr, max_iter=500, tol=1e-9))
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# Iterative solvers over BSR
|
|
2
|
+
|
|
3
|
+
trnsparse v0.3.2 adds `cg_bsr` and `power_iteration_bsr` — Conjugate
|
|
4
|
+
Gradient and power iteration on block-sparse row matrices. The API is
|
|
5
|
+
stable; the architectural story below explains why there's a v0.4.0
|
|
6
|
+
follow-up.
|
|
7
|
+
|
|
8
|
+
## Why this matters
|
|
9
|
+
|
|
10
|
+
Large SPD linear systems and dominant-eigenpair problems show up
|
|
11
|
+
across scientific computing:
|
|
12
|
+
|
|
13
|
+
- **Quantum chemistry**: Hamiltonian eigenvalue problems (HF, DFT,
|
|
14
|
+
CI), response equations (CPSCF).
|
|
15
|
+
- **PDE discretizations**: stiffness-matrix solves for finite element
|
|
16
|
+
methods, graph Laplacian systems.
|
|
17
|
+
- **Graph learning**: spectral embeddings, PageRank-like iterations.
|
|
18
|
+
|
|
19
|
+
The matrix `A` in each case is typically block-sparse (Fock matrices
|
|
20
|
+
after Schwarz screening; FEM stiffness tied to mesh connectivity;
|
|
21
|
+
graph adjacency). BSR is the Trainium-native representation for those
|
|
22
|
+
matrices (see `architecture.md`).
|
|
23
|
+
|
|
24
|
+
## v0.3.2 — plumbing
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import trnsparse
|
|
28
|
+
|
|
29
|
+
A = trnsparse.BSRMatrix.from_dense(fock_matrix, block_size=128)
|
|
30
|
+
b = compute_rhs()
|
|
31
|
+
|
|
32
|
+
x, iters, rel = trnsparse.cg_bsr(A, b, tol=1e-6, max_iter=1000)
|
|
33
|
+
# Jacobi-preconditioned variant:
|
|
34
|
+
M = trnsparse.jacobi_preconditioner_bsr(A)
|
|
35
|
+
x, iters, rel = trnsparse.cg_bsr(A, b, tol=1e-6, M=M)
|
|
36
|
+
|
|
37
|
+
lam, v, iters = trnsparse.power_iteration_bsr(A, max_iter=500)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Under the hood, each CG iteration calls `bsr_spmm(A, x.unsqueeze(1))`
|
|
41
|
+
once. On the NKI backend that's one kernel dispatch + one HBM
|
|
42
|
+
round-trip per iteration. On CPU it's `torch.sparse`-backed and
|
|
43
|
+
roughly on par with `scipy.sparse.linalg.cg` (benchmarked: 369 μs vs
|
|
44
|
+
310 μs at 128×128 SPD, 1.19× slower).
|
|
45
|
+
|
|
46
|
+
## v0.4.0 — fused kernel with SBUF-resident A
|
|
47
|
+
|
|
48
|
+
The architectural claim from
|
|
49
|
+
[#22](https://github.com/trnsci/trnsparse/issues/22): Trainium's 32 GB
|
|
50
|
+
SBUF per NeuronCore fits a 5000×5000 BSR Hamiltonian on-chip. CG
|
|
51
|
+
doesn't need to round-trip `A` to HBM at all — only `x`, `r`, and `p`.
|
|
52
|
+
|
|
53
|
+
The shape of the v0.4.0 kernel:
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
@nki.jit
|
|
57
|
+
def _cg_spd_kernel(A_blocks, A_cols, A_row_ptrs, b, max_iter):
|
|
58
|
+
# Load A blocks once into SBUF at the top.
|
|
59
|
+
A_sbuf = nl.load(A_blocks)
|
|
60
|
+
|
|
61
|
+
# State: x, r, p in SBUF registers.
|
|
62
|
+
x = nl.zeros(...)
|
|
63
|
+
r = nl.copy(b)
|
|
64
|
+
p = nl.copy(r)
|
|
65
|
+
rr = nl.reduce(r * r)
|
|
66
|
+
|
|
67
|
+
for k in nl.affine_range(max_iter):
|
|
68
|
+
Ap = _bsr_matvec_sbuf(A_sbuf, A_cols, A_row_ptrs, p) # all SBUF
|
|
69
|
+
pAp = nl.reduce(p * Ap)
|
|
70
|
+
alpha = rr / pAp
|
|
71
|
+
x = x + alpha * p
|
|
72
|
+
r = r - alpha * Ap
|
|
73
|
+
rr_new = nl.reduce(r * r)
|
|
74
|
+
beta = rr_new / rr
|
|
75
|
+
p = r + beta * p
|
|
76
|
+
rr = rr_new
|
|
77
|
+
|
|
78
|
+
return x, residual_norm_history
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Fixed `max_iter`, no early exit — returning the full residual history
|
|
82
|
+
lets the host pick the convergence point post-hoc (standard
|
|
83
|
+
NKI-inside-loop constraint: no dynamic control flow).
|
|
84
|
+
|
|
85
|
+
**Expected regime where this wins**: when
|
|
86
|
+
`max_iter × dispatch_overhead > fused_kernel_cost + hbm_load_A_once`.
|
|
87
|
+
For a 4000×4000 BSR Hamiltonian with ~100 iterations to convergence,
|
|
88
|
+
that's roughly an order of magnitude of wall-time reduction.
|
|
89
|
+
|
|
90
|
+
**What needs to land first**:
|
|
91
|
+
|
|
92
|
+
1. Simulator-iterated kernel skeleton (now tractable thanks to the
|
|
93
|
+
`nki-simulator` CI gate — see the NKI 0.3.0 migration in v0.3.1).
|
|
94
|
+
2. SBUF sizing model for `A` — some workloads overflow 32 GB; then
|
|
95
|
+
fall back to the v0.3.2 plumbing.
|
|
96
|
+
3. Dispatcher logic in `cg_bsr` that picks the fused kernel when
|
|
97
|
+
`max_iter * n` exceeds a threshold.
|
|
98
|
+
|
|
99
|
+
Tracked in the dedicated sub-issue [#24](https://github.com/trnsci/trnsparse/issues/24) on `trnsci/trnsparse`.
|
|
100
|
+
|
|
101
|
+
## Why CG and power iteration, not GMRES / Lanczos / Davidson
|
|
102
|
+
|
|
103
|
+
v0.3.2 covers the two most common algorithm families:
|
|
104
|
+
|
|
105
|
+
- **CG** for SPD systems — the workhorse for Hamiltonian solves,
|
|
106
|
+
stiffness systems, and many PDE discretizations.
|
|
107
|
+
- **Power iteration** for dominant eigenpairs — the starting point
|
|
108
|
+
for spectral methods, PageRank-like fixed points, and the iteration
|
|
109
|
+
core inside Lanczos / Arnoldi / Davidson.
|
|
110
|
+
|
|
111
|
+
GMRES (general non-symmetric systems), Lanczos / Arnoldi (full
|
|
112
|
+
spectrum), and Davidson (interior eigenvalues) are follow-ups when
|
|
113
|
+
users ask for them. The v0.4.0 fused kernel's structure (load A once,
|
|
114
|
+
iterate on-chip) generalizes trivially to those variants — it's the
|
|
115
|
+
same architectural pattern.
|
|
@@ -12,7 +12,9 @@ Usage:
|
|
|
12
12
|
|
|
13
13
|
import argparse
|
|
14
14
|
import time
|
|
15
|
+
|
|
15
16
|
import torch
|
|
17
|
+
|
|
16
18
|
import trnsparse
|
|
17
19
|
|
|
18
20
|
|
|
@@ -27,7 +29,7 @@ def main():
|
|
|
27
29
|
args.nbasis = 50
|
|
28
30
|
|
|
29
31
|
n = args.nbasis
|
|
30
|
-
print(
|
|
32
|
+
print("Sparse Fock build:")
|
|
31
33
|
print(f" Basis functions: {n}")
|
|
32
34
|
print(f" Threshold: {args.threshold:.0e}")
|
|
33
35
|
|
|
@@ -40,7 +42,7 @@ def main():
|
|
|
40
42
|
|
|
41
43
|
# Screen
|
|
42
44
|
stats = trnsparse.sparsity_stats(Q, args.threshold)
|
|
43
|
-
print(
|
|
45
|
+
print("\n Sparsity statistics:")
|
|
44
46
|
print(f" Total shell pairs: {stats['total_pairs']}")
|
|
45
47
|
print(f" Significant pairs: {stats['significant_pairs']}")
|
|
46
48
|
print(f" Pair sparsity: {stats['pair_sparsity']:.1%}")
|
|
@@ -51,7 +53,7 @@ def main():
|
|
|
51
53
|
integrals_dense = torch.randn(n, n) * Q * 0.01
|
|
52
54
|
integrals_dense[~mask] = 0.0
|
|
53
55
|
integrals_sparse = trnsparse.from_dense(integrals_dense)
|
|
54
|
-
print(f" Integral matrix nnz: {integrals_sparse.nnz} / {n*n}")
|
|
56
|
+
print(f" Integral matrix nnz: {integrals_sparse.nnz} / {n * n}")
|
|
55
57
|
|
|
56
58
|
# Density matrix (random SPD for demo)
|
|
57
59
|
P = torch.randn(n, n) * 0.1
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "setuptools-scm>=8.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "trnsparse"
|
|
7
|
+
version = "0.3.2"
|
|
8
|
+
description = "Sparse matrix operations for AWS Trainium via NKI"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "Apache-2.0"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Scott Friedman", email = "scttfrdmn@gmail.com" },
|
|
14
|
+
]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 3 - Alpha",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Topic :: Scientific/Engineering",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Mathematics",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"torch>=2.1",
|
|
24
|
+
"numpy>=1.24",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.optional-dependencies]
|
|
28
|
+
neuron = [
|
|
29
|
+
# NKI 0.3.0 Stable (Neuron SDK 2.29+, April 2026). trnsparse imports
|
|
30
|
+
# from the canonical `nki.*` namespace; the legacy `neuronxcc.nki.*`
|
|
31
|
+
# shim is not used. SDK 2.29 DLAMI bundles nki==0.3.0 alongside
|
|
32
|
+
# neuronxcc==2.24.5133. Hosts without an `nki` wheel (macOS,
|
|
33
|
+
# non-Linux archs) still hit HAS_NKI=False and get the torch fallback.
|
|
34
|
+
"nki>=0.3.0",
|
|
35
|
+
"neuronxcc>=2.24",
|
|
36
|
+
"torch-neuronx>=2.9",
|
|
37
|
+
]
|
|
38
|
+
dev = ["pytest>=7.0", "pytest-benchmark>=4.0", "pytest-cov>=4.1", "scipy>=1.11"]
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/trnsci/trnsparse"
|
|
42
|
+
Documentation = "https://trnsci.dev/trnsparse/"
|
|
43
|
+
Repository = "https://github.com/trnsci/trnsparse"
|
|
44
|
+
Issues = "https://github.com/trnsci/trnsparse/issues"
|
|
45
|
+
|
|
46
|
+
[tool.setuptools.packages.find]
|
|
47
|
+
include = ["trnsparse*"]
|
|
48
|
+
|
|
49
|
+
[tool.ruff]
|
|
50
|
+
# Match the rest of the trnsci suite. Line length 100 is a compromise
|
|
51
|
+
# between black's 88 and the wider tables common in scientific code.
|
|
52
|
+
line-length = 100
|
|
53
|
+
target-version = "py310"
|
|
54
|
+
extend-exclude = ["site", "infra/terraform*/.terraform"]
|
|
55
|
+
|
|
56
|
+
[tool.ruff.lint]
|
|
57
|
+
# Sensible default selection: pycodestyle (E/W), Pyflakes (F), isort (I),
|
|
58
|
+
# pyupgrade (UP), flake8-bugbear (B), flake8-simplify (SIM). Skip docstring
|
|
59
|
+
# rules (D*) — we're deliberately light on docstrings in this project.
|
|
60
|
+
select = ["E", "W", "F", "I", "UP", "B", "SIM"]
|
|
61
|
+
ignore = [
|
|
62
|
+
"E501", # line too long — formatter handles it
|
|
63
|
+
"B008", # function call in default arg — common + intentional here
|
|
64
|
+
"SIM108", # if/else over ternary — readability call, not worth fighting
|
|
65
|
+
"SIM300", # Yoda-condition false positives on array comparisons
|
|
66
|
+
"E741", # ambiguous single-letter names — `I` for identity matrix is idiomatic in linear algebra
|
|
67
|
+
]
|
|
68
|
+
|
|
69
|
+
[tool.ruff.lint.per-file-ignores]
|
|
70
|
+
# NKI kernels use patterns that trip some lint rules (SBUF writes that
|
|
71
|
+
# look unused to the linter, etc). Don't fight the kernel authoring loop.
|
|
72
|
+
"trnsparse/nki/dispatch.py" = ["F841", "B007"]
|
|
73
|
+
"trnsparse/nki/kernels.py" = ["F841", "B007"]
|
|
74
|
+
"scripts/*" = ["F401"]
|
|
75
|
+
"tests/*" = ["F401", "F811"]
|
|
76
|
+
|
|
77
|
+
[tool.ruff.format]
|
|
78
|
+
# Black-compatible defaults.
|
|
79
|
+
quote-style = "double"
|
|
80
|
+
indent-style = "space"
|
|
81
|
+
|
|
82
|
+
[tool.pytest.ini_options]
|
|
83
|
+
markers = ["neuron: requires Neuron hardware"]
|
|
84
|
+
testpaths = ["tests"]
|
|
@@ -67,7 +67,7 @@ def render_markdown(rows: dict, machine_info: dict | None = None) -> str:
|
|
|
67
67
|
out.append("| Operation | Variant | Param | Median (μs) | vs trnsparse-PyTorch |")
|
|
68
68
|
out.append("|-----------|---------|-------|------------:|-------------------:|")
|
|
69
69
|
|
|
70
|
-
for
|
|
70
|
+
for group, op, param in sorted(rows.keys()):
|
|
71
71
|
variants = rows[(group, op, param)]
|
|
72
72
|
baseline = variants.get("trnrand_pytorch")
|
|
73
73
|
for variant in ("nki", "trnrand_pytorch", "torch"):
|
|
@@ -82,11 +82,9 @@ def render_markdown(rows: dict, machine_info: dict | None = None) -> str:
|
|
|
82
82
|
if ratio >= 1.0:
|
|
83
83
|
speedup = f"{ratio:.2f}× faster"
|
|
84
84
|
else:
|
|
85
|
-
speedup = f"{1/ratio:.2f}× slower"
|
|
85
|
+
speedup = f"{1 / ratio:.2f}× slower"
|
|
86
86
|
param_disp = param.replace("_", " ") if param else "-"
|
|
87
|
-
out.append(
|
|
88
|
-
f"| {group}.{op} | {label} | {param_disp} | {us:>10.2f} | {speedup} |"
|
|
89
|
-
)
|
|
87
|
+
out.append(f"| {group}.{op} | {label} | {param_disp} | {us:>10.2f} | {speedup} |")
|
|
90
88
|
return "\n".join(out) + "\n"
|
|
91
89
|
|
|
92
90
|
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
#
|
|
3
|
+
# Run NKI simulator-backed tests on the trn1 CI instance.
|
|
4
|
+
#
|
|
5
|
+
# Simulator bypasses torch_xla + NEFF compile: kernels run on CPU via
|
|
6
|
+
# nki.simulate(kernel)(numpy_args). Use for correctness / constraint
|
|
7
|
+
# iteration; hardware still owns perf numbers.
|
|
8
|
+
#
|
|
9
|
+
# Usage:
|
|
10
|
+
# AWS_PROFILE=aws ./scripts/run_simulator_tests.sh
|
|
11
|
+
#
|
|
12
|
+
# Same trap-stop pattern as run_neuron_tests.sh. Runs the nki_simulator-
|
|
13
|
+
# marked test suite with TRNSPARSE_USE_SIMULATOR=1 set in the SSM env.
|
|
14
|
+
#
|
|
15
|
+
# Still AWS-resident for now (the nki wheel is linux_x86_64 only + lives
|
|
16
|
+
# on the AWS pip index, not a common macOS target). The CI job on
|
|
17
|
+
# ubuntu-latest covers the same surface on every push.
|
|
18
|
+
|
|
19
|
+
set -euo pipefail
|
|
20
|
+
|
|
21
|
+
INSTANCE_TYPE="${INSTANCE_TYPE:-trn1}"
|
|
22
|
+
TAG="trnsparse-ci-${INSTANCE_TYPE}"
|
|
23
|
+
REGION="${AWS_REGION:-us-east-1}"
|
|
24
|
+
SHA="$(git rev-parse HEAD)"
|
|
25
|
+
|
|
26
|
+
: "${AWS_PROFILE:?Set AWS_PROFILE, e.g. AWS_PROFILE=aws ./scripts/run_simulator_tests.sh}"
|
|
27
|
+
|
|
28
|
+
echo "Looking up instance with Name=$TAG in $REGION..."
|
|
29
|
+
INSTANCE_ID=$(aws ec2 describe-instances \
|
|
30
|
+
--filters "Name=tag:Name,Values=$TAG" \
|
|
31
|
+
"Name=instance-state-name,Values=stopped,running,pending" \
|
|
32
|
+
--query 'Reservations[0].Instances[0].InstanceId' \
|
|
33
|
+
--output text \
|
|
34
|
+
--region "$REGION")
|
|
35
|
+
|
|
36
|
+
if [[ -z "$INSTANCE_ID" || "$INSTANCE_ID" == "None" ]]; then
|
|
37
|
+
echo "ERROR: No instance found with Name=$TAG" >&2
|
|
38
|
+
exit 1
|
|
39
|
+
fi
|
|
40
|
+
echo "Instance: $INSTANCE_ID"
|
|
41
|
+
|
|
42
|
+
cleanup() {
|
|
43
|
+
local exit_code=$?
|
|
44
|
+
echo ""
|
|
45
|
+
echo "Stopping $INSTANCE_ID..."
|
|
46
|
+
aws ec2 stop-instances --instance-ids "$INSTANCE_ID" --region "$REGION" >/dev/null
|
|
47
|
+
exit "$exit_code"
|
|
48
|
+
}
|
|
49
|
+
trap cleanup EXIT
|
|
50
|
+
|
|
51
|
+
STATE=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" --region "$REGION" \
|
|
52
|
+
--query 'Reservations[0].Instances[0].State.Name' --output text)
|
|
53
|
+
|
|
54
|
+
if [[ "$STATE" == "stopped" ]]; then
|
|
55
|
+
echo "Starting instance..."
|
|
56
|
+
aws ec2 start-instances --instance-ids "$INSTANCE_ID" --region "$REGION" >/dev/null
|
|
57
|
+
fi
|
|
58
|
+
|
|
59
|
+
echo "Waiting for instance-running..."
|
|
60
|
+
aws ec2 wait instance-running --instance-ids "$INSTANCE_ID" --region "$REGION"
|
|
61
|
+
echo "Waiting for SSM agent..."
|
|
62
|
+
for _ in $(seq 1 60); do
|
|
63
|
+
PING=$(aws ssm describe-instance-information \
|
|
64
|
+
--filters "Key=InstanceIds,Values=$INSTANCE_ID" \
|
|
65
|
+
--region "$REGION" \
|
|
66
|
+
--query 'InstanceInformationList[0].PingStatus' --output text 2>/dev/null || true)
|
|
67
|
+
[[ "$PING" == "Online" ]] && break
|
|
68
|
+
sleep 5
|
|
69
|
+
done
|
|
70
|
+
if [[ "$PING" != "Online" ]]; then
|
|
71
|
+
echo "ERROR: SSM agent not Online after 5 minutes (last PingStatus=$PING)" >&2
|
|
72
|
+
exit 1
|
|
73
|
+
fi
|
|
74
|
+
|
|
75
|
+
echo "Sending simulator test command (SHA=$SHA)..."
|
|
76
|
+
CMD_ID=$(aws ssm send-command \
|
|
77
|
+
--instance-ids "$INSTANCE_ID" \
|
|
78
|
+
--document-name "AWS-RunShellScript" \
|
|
79
|
+
--comment "trnsparse nki simulator tests @ $SHA" \
|
|
80
|
+
--parameters "commands=[
|
|
81
|
+
\"bash -c 'set -euo pipefail; cd /home/ubuntu/trnsparse && sudo -u ubuntu git fetch --all && sudo -u ubuntu git checkout $SHA && NEURON_VENV=\$(ls -d /opt/aws_neuronx_venv_pytorch_* | head -1) && sudo -u ubuntu \$NEURON_VENV/bin/pip install -e /home/ubuntu/trnsparse[dev] --quiet && sudo -u ubuntu env PATH=\$NEURON_VENV/bin:/usr/bin:/bin TRNSPARSE_USE_SIMULATOR=1 \$NEURON_VENV/bin/pytest /home/ubuntu/trnsparse/tests/ -v -m nki_simulator --tb=short'\"
|
|
82
|
+
]" \
|
|
83
|
+
--region "$REGION" \
|
|
84
|
+
--output text --query 'Command.CommandId')
|
|
85
|
+
|
|
86
|
+
echo "Command ID: $CMD_ID"
|
|
87
|
+
echo "Waiting for command to complete..."
|
|
88
|
+
for _ in $(seq 1 60); do
|
|
89
|
+
STATUS=$(aws ssm get-command-invocation \
|
|
90
|
+
--command-id "$CMD_ID" \
|
|
91
|
+
--instance-id "$INSTANCE_ID" \
|
|
92
|
+
--region "$REGION" \
|
|
93
|
+
--query 'Status' --output text 2>/dev/null || echo "InProgress")
|
|
94
|
+
[[ "$STATUS" != "InProgress" && "$STATUS" != "Pending" ]] && break
|
|
95
|
+
sleep 15
|
|
96
|
+
done
|
|
97
|
+
|
|
98
|
+
echo ""
|
|
99
|
+
echo "=== STDOUT ==="
|
|
100
|
+
aws ssm get-command-invocation \
|
|
101
|
+
--command-id "$CMD_ID" \
|
|
102
|
+
--instance-id "$INSTANCE_ID" \
|
|
103
|
+
--region "$REGION" \
|
|
104
|
+
--query 'StandardOutputContent' --output text
|
|
105
|
+
|
|
106
|
+
echo ""
|
|
107
|
+
echo "=== STDERR ==="
|
|
108
|
+
aws ssm get-command-invocation \
|
|
109
|
+
--command-id "$CMD_ID" \
|
|
110
|
+
--instance-id "$INSTANCE_ID" \
|
|
111
|
+
--region "$REGION" \
|
|
112
|
+
--query 'StandardErrorContent' --output text | head -20
|
|
113
|
+
|
|
114
|
+
echo ""
|
|
115
|
+
echo "=== Status: $STATUS ==="
|
|
116
|
+
[[ "$STATUS" == "Success" ]]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Test configuration."""
|
|
2
|
+
|
|
3
|
+
import pytest # noqa: F401 — imported for marker side-effect exposure
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def pytest_configure(config):
|
|
7
|
+
config.addinivalue_line("markers", "neuron: requires Neuron hardware")
|
|
8
|
+
config.addinivalue_line(
|
|
9
|
+
"markers",
|
|
10
|
+
"nki_simulator: runs NKI kernels via nki.simulate on CPU "
|
|
11
|
+
"(requires TRNSPARSE_USE_SIMULATOR=1 + nki>=0.3.0)",
|
|
12
|
+
)
|