trnsparse 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. trnsparse-0.3.2/.github/workflows/ci.yml +58 -0
  2. trnsparse-0.3.2/.pre-commit-config.yaml +7 -0
  3. {trnsparse-0.3.0 → trnsparse-0.3.2}/CHANGELOG.md +67 -0
  4. {trnsparse-0.3.0 → trnsparse-0.3.2}/CLAUDE.md +12 -0
  5. {trnsparse-0.3.0/trnsparse.egg-info → trnsparse-0.3.2}/PKG-INFO +5 -1
  6. {trnsparse-0.3.0 → trnsparse-0.3.2}/README.md +2 -0
  7. {trnsparse-0.3.0 → trnsparse-0.3.2}/benchmarks/bench_bsr_spmm.py +2 -3
  8. trnsparse-0.3.2/benchmarks/bench_iterative.py +50 -0
  9. {trnsparse-0.3.0 → trnsparse-0.3.2}/benchmarks/bench_spmm.py +1 -0
  10. trnsparse-0.3.2/docs/iterative_solvers.md +115 -0
  11. {trnsparse-0.3.0 → trnsparse-0.3.2}/examples/sparse_fock.py +5 -3
  12. trnsparse-0.3.2/pyproject.toml +84 -0
  13. {trnsparse-0.3.0 → trnsparse-0.3.2}/scripts/bench_to_md.py +3 -5
  14. trnsparse-0.3.2/scripts/run_simulator_tests.sh +116 -0
  15. trnsparse-0.3.2/tests/conftest.py +12 -0
  16. {trnsparse-0.3.0 → trnsparse-0.3.2}/tests/test_bsr.py +1 -2
  17. {trnsparse-0.3.0 → trnsparse-0.3.2}/tests/test_formats.py +3 -5
  18. trnsparse-0.3.2/tests/test_iterative.py +135 -0
  19. {trnsparse-0.3.0 → trnsparse-0.3.2}/tests/test_nki_bsr.py +15 -13
  20. trnsparse-0.3.2/tests/test_nki_sim.py +113 -0
  21. {trnsparse-0.3.0 → trnsparse-0.3.2}/tests/test_nki_spmm.py +7 -8
  22. {trnsparse-0.3.0 → trnsparse-0.3.2}/tests/test_ops.py +2 -7
  23. {trnsparse-0.3.0 → trnsparse-0.3.2}/tests/test_screening.py +2 -3
  24. trnsparse-0.3.2/trnsparse/__init__.py +51 -0
  25. {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse/formats.py +15 -10
  26. trnsparse-0.3.2/trnsparse/iterative.py +190 -0
  27. {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse/nki/__init__.py +3 -1
  28. {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse/nki/dispatch.py +67 -24
  29. {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse/nki/kernels.py +9 -12
  30. {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse/ops.py +30 -19
  31. {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse/screening.py +2 -5
  32. {trnsparse-0.3.0 → trnsparse-0.3.2/trnsparse.egg-info}/PKG-INFO +5 -1
  33. {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse.egg-info/SOURCES.txt +7 -0
  34. {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse.egg-info/requires.txt +2 -0
  35. trnsparse-0.3.0/.github/workflows/ci.yml +0 -20
  36. trnsparse-0.3.0/pyproject.toml +0 -47
  37. trnsparse-0.3.0/tests/conftest.py +0 -5
  38. trnsparse-0.3.0/trnsparse/__init__.py +0 -23
  39. {trnsparse-0.3.0 → trnsparse-0.3.2}/.github/workflows/notify-umbrella.yml +0 -0
  40. {trnsparse-0.3.0 → trnsparse-0.3.2}/.github/workflows/publish.yml +0 -0
  41. {trnsparse-0.3.0 → trnsparse-0.3.2}/.gitignore +0 -0
  42. {trnsparse-0.3.0 → trnsparse-0.3.2}/CODE_OF_CONDUCT.md +0 -0
  43. {trnsparse-0.3.0 → trnsparse-0.3.2}/CONTRIBUTING.md +0 -0
  44. {trnsparse-0.3.0 → trnsparse-0.3.2}/LICENSE +0 -0
  45. {trnsparse-0.3.0 → trnsparse-0.3.2}/benchmarks/bench_screening.py +0 -0
  46. {trnsparse-0.3.0 → trnsparse-0.3.2}/benchmarks/bench_spmv.py +0 -0
  47. {trnsparse-0.3.0 → trnsparse-0.3.2}/benchmarks/conftest.py +0 -0
  48. {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/api.md +0 -0
  49. {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/architecture.md +0 -0
  50. {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/aws_setup.md +0 -0
  51. {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/benchmarks.md +0 -0
  52. {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/index.md +0 -0
  53. {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/installation.md +0 -0
  54. {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/migration_scipy.md +0 -0
  55. {trnsparse-0.3.0 → trnsparse-0.3.2}/docs/quickstart.md +0 -0
  56. {trnsparse-0.3.0 → trnsparse-0.3.2}/infra/terraform/.terraform.lock.hcl +0 -0
  57. {trnsparse-0.3.0 → trnsparse-0.3.2}/infra/terraform/README.md +0 -0
  58. {trnsparse-0.3.0 → trnsparse-0.3.2}/infra/terraform/main.tf +0 -0
  59. {trnsparse-0.3.0 → trnsparse-0.3.2}/mkdocs.yml +0 -0
  60. {trnsparse-0.3.0 → trnsparse-0.3.2}/scripts/run_benchmarks.sh +0 -0
  61. {trnsparse-0.3.0 → trnsparse-0.3.2}/scripts/run_neuron_tests.sh +0 -0
  62. {trnsparse-0.3.0 → trnsparse-0.3.2}/setup.cfg +0 -0
  63. {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse.egg-info/dependency_links.txt +0 -0
  64. {trnsparse-0.3.0 → trnsparse-0.3.2}/trnsparse.egg-info/top_level.txt +0 -0
@@ -0,0 +1,58 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ lint:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v6
13
+ - uses: actions/setup-python@v6
14
+ with:
15
+ python-version: "3.12"
16
+ - run: pip install "ruff>=0.6"
17
+ - run: ruff check .
18
+ - run: ruff format --check .
19
+
20
+ test:
21
+ runs-on: ubuntu-latest
22
+ strategy:
23
+ matrix:
24
+ python-version: ["3.10", "3.11", "3.12"]
25
+ steps:
26
+ - uses: actions/checkout@v6
27
+ - uses: actions/setup-python@v6
28
+ with:
29
+ python-version: ${{ matrix.python-version }}
30
+ - run: pip install -e ".[dev]"
31
+ - run: pytest tests/ -v -x --tb=short -m "not neuron and not nki_simulator" --cov=trnsparse --cov-report=xml
32
+ - name: Upload coverage reports to Codecov
33
+ if: matrix.python-version == '3.12'
34
+ uses: codecov/codecov-action@v5
35
+ with:
36
+ token: ${{ secrets.CODECOV_TOKEN }}
37
+ slug: trnsci/trnsparse
38
+
39
+ nki-simulator:
40
+ # Runs NKI kernels through nki.simulate(kernel)(numpy_args) on CPU.
41
+ # Catches Python-trace-level errors (bad kwargs, dropped ops, shape
42
+ # mismatches) pre-merge without AWS round-trips. MLIR verifier
43
+ # errors remain hardware-only (simulator explicitly skips compile).
44
+ runs-on: ubuntu-latest
45
+ steps:
46
+ - uses: actions/checkout@v6
47
+ - uses: actions/setup-python@v6
48
+ with:
49
+ python-version: "3.12"
50
+ - name: Install trnsparse + NKI simulator deps
51
+ run: |
52
+ pip install -e ".[dev]"
53
+ pip install --extra-index-url https://pip.repos.neuron.amazonaws.com \
54
+ "nki>=0.3.0"
55
+ - name: Run simulator-backed kernel tests
56
+ env:
57
+ TRNSPARSE_USE_SIMULATOR: "1"
58
+ run: pytest tests/ -v -m nki_simulator --tb=short
@@ -0,0 +1,7 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.8.6
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix]
7
+ - id: ruff-format
@@ -5,6 +5,73 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.3.2] — 2026-04-14
9
+
10
+ ### Added
11
+
12
+ - **`cg_bsr`** and **`power_iteration_bsr`** — Conjugate Gradient and
13
+ power iteration on block-sparse row matrices. Plumbing on top of
14
+ `bsr_spmm` (one kernel dispatch per iteration). Closes Phase 1 of
15
+ #22 (on-chip iterative solvers).
16
+ - **`jacobi_preconditioner_bsr(A)`** — builds a diagonal preconditioner
17
+ for `cg_bsr`'s `M=` argument.
18
+ - **`bsr_diagonal(A)`** — extracts the main diagonal from a BSR matrix.
19
+ - **`docs/iterative_solvers.md`** — design note covering the v0.3.2
20
+ plumbing and the v0.4.0 fused-kernel goal (#24). Explains the
21
+ architectural win Trainium offers (A SBUF-resident across iterations)
22
+ vs the current per-iteration HBM round-trip.
23
+ - **`tests/test_iterative.py`** — 8 CPU tests including scipy parity at
24
+ `atol=1e-4` on a 128×128 SPD system.
25
+ - **`benchmarks/bench_iterative.py`** — cg_bsr vs scipy.sparse.linalg.cg.
26
+ At 128×128 SPD: scipy 310 μs, trnsparse 369 μs (trnsparse 1.19× slower).
27
+
28
+ ### Notes
29
+
30
+ - Algorithm body for CG is a local copy of `trnsolver.iterative.cg`;
31
+ kept local to avoid a cross-repo runtime dependency for one function.
32
+ - v0.4.0 will layer the fused CG/power-iteration NKI kernel on top —
33
+ tracked in #24. The API stays stable across the transition; users
34
+ upgrading from v0.3.2 get the fused-kernel speedup automatically
35
+ when the fused path is available.
36
+
37
+ ## [0.3.1] — 2026-04-14
38
+
39
+ ### Changed
40
+
41
+ - **Migrated NKI imports to the `nki.*` namespace** (NKI 0.3.0 Stable,
42
+ Neuron SDK 2.29, April 2026). Legacy `neuronxcc.nki.*` shim is no
43
+ longer used. `pyproject.toml` `[neuron]` extra gains `nki>=0.3.0`
44
+ alongside the existing `neuronxcc>=2.24` and `torch-neuronx>=2.9`.
45
+ Hosts without an `nki` wheel (macOS, non-Linux archs) still hit
46
+ `HAS_NKI=False` and get the torch fallback. Kernel bodies unchanged —
47
+ the trnblas audit confirmed the positional `nisa.nc_matmul` +
48
+ `nl.copy(psum, ...)` pattern complies with NKI 0.3.0.
49
+ - `test` CI job now filters `-m "not neuron and not nki_simulator"` so
50
+ each test runs in exactly one job.
51
+
52
+ ### Added
53
+
54
+ - **`TRNSPARSE_USE_SIMULATOR=1` dispatch branch** through
55
+ `nki.simulate(kernel)(np_args)`. Bypasses torch_xla + NEFF compile;
56
+ kernels run on CPU for correctness iteration. Hardware still owns
57
+ perf numbers.
58
+ - **`nki-simulator` CI job on `ubuntu-latest`** — installs `nki>=0.3.0`
59
+ from the AWS pip index and runs the simulator suite on every push/PR.
60
+ Kernel correctness gate without AWS cost. Catches Python-trace-level
61
+ errors (bad kwargs, dropped ops, shape mismatches); MLIR verifier
62
+ errors remain hardware-only (NKI 0.3.0 has no documented device-free
63
+ NEFF compile API).
64
+ - `tests/test_nki_sim.py` — curated simulator suite (4 tests: CSR
65
+ aligned + rectangular, BSR block-dense + block-diagonal). Skips
66
+ cleanly off-hardware.
67
+ - `scripts/run_simulator_tests.sh` — SSM runner mirroring
68
+ `run_neuron_tests.sh` with `TRNSPARSE_USE_SIMULATOR=1` in the env.
69
+ - `tests/conftest.py` — registers the `nki_simulator` pytest marker.
70
+
71
+ Addresses [trnsci/trnsparse#23](https://github.com/trnsci/trnsparse/issues/23).
72
+ Follows the trnblas reference commits `c693561`, `f24993b`, `77eeb82`
73
+ (suite-wide coordination in `trnsci/trnsci#5`).
74
+
8
75
  ## [0.3.0] — 2026-04-13
9
76
 
10
77
  ### Added
@@ -64,3 +64,15 @@ pip install -e ".[dev]"
64
64
  pytest tests/ -v
65
65
  python examples/sparse_fock.py --demo
66
66
  ```
67
+
68
+ ## Blog posts
69
+
70
+ When asked to draft a blog post for this library for the [trnsci blog](https://trnsci.dev/blog/):
71
+
72
+ 1. Read the editorial brief at [`docs/blog/AUTHOR_BRIEF.md`](https://github.com/trnsci/trnsci/blob/main/docs/blog/AUTHOR_BRIEF.md) in the umbrella repo (trnsci/trnsci). It defines voice (authorless, library-as-subject), stance (architecture-first, transparency-always), and the nine required section headings.
73
+
74
+ 2. Find the prompt block for this library in [`BLOG_PROMPTS.md`](https://github.com/trnsci/trnsci/blob/main/BLOG_PROMPTS.md) at the umbrella repo root. It carries library-specific context and suggested architectural angles.
75
+
76
+ 3. Draft the post following the brief. Open a PR against `trnsci/trnsci` at `docs/blog/posts/<YYYY-MM-DD>-<slug>.md`. Scott (suite director) reviews before merge.
77
+
78
+ The umbrella repo — not this one — owns the blog. Per-library retrospective posts are unsigned; library is the subject, no byline. See the brief for the full set of rules.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: trnsparse
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Sparse matrix operations for AWS Trainium via NKI
5
5
  Author-email: Scott Friedman <scttfrdmn@gmail.com>
6
6
  License-Expression: Apache-2.0
@@ -19,17 +19,21 @@ License-File: LICENSE
19
19
  Requires-Dist: torch>=2.1
20
20
  Requires-Dist: numpy>=1.24
21
21
  Provides-Extra: neuron
22
+ Requires-Dist: nki>=0.3.0; extra == "neuron"
22
23
  Requires-Dist: neuronxcc>=2.24; extra == "neuron"
23
24
  Requires-Dist: torch-neuronx>=2.9; extra == "neuron"
24
25
  Provides-Extra: dev
25
26
  Requires-Dist: pytest>=7.0; extra == "dev"
26
27
  Requires-Dist: pytest-benchmark>=4.0; extra == "dev"
28
+ Requires-Dist: pytest-cov>=4.1; extra == "dev"
27
29
  Requires-Dist: scipy>=1.11; extra == "dev"
28
30
  Dynamic: license-file
29
31
 
30
32
  # trnsparse
31
33
 
32
34
  [![CI](https://github.com/trnsci/trnsparse/actions/workflows/ci.yml/badge.svg)](https://github.com/trnsci/trnsparse/actions/workflows/ci.yml)
35
+ [![codecov](https://codecov.io/gh/trnsci/trnsparse/graph/badge.svg)](https://codecov.io/gh/trnsci/trnsparse)
36
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
33
37
  [![PyPI](https://img.shields.io/pypi/v/trnsparse)](https://pypi.org/project/trnsparse/)
34
38
  [![Python](https://img.shields.io/pypi/pyversions/trnsparse)](https://pypi.org/project/trnsparse/)
35
39
  [![License](https://img.shields.io/github/license/trnsci/trnsparse)](LICENSE)
@@ -1,6 +1,8 @@
1
1
  # trnsparse
2
2
 
3
3
  [![CI](https://github.com/trnsci/trnsparse/actions/workflows/ci.yml/badge.svg)](https://github.com/trnsci/trnsparse/actions/workflows/ci.yml)
4
+ [![codecov](https://codecov.io/gh/trnsci/trnsparse/graph/badge.svg)](https://codecov.io/gh/trnsci/trnsparse)
5
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
4
6
  [![PyPI](https://img.shields.io/pypi/v/trnsparse)](https://pypi.org/project/trnsparse/)
5
7
  [![Python](https://img.shields.io/pypi/pyversions/trnsparse)](https://pypi.org/project/trnsparse/)
6
8
  [![License](https://img.shields.io/github/license/trnsci/trnsparse)](LICENSE)
@@ -21,8 +21,7 @@ import torch
21
21
  import trnsparse
22
22
  from trnsparse.nki.dispatch import HAS_NKI
23
23
 
24
-
25
- M_BLOCKS = [4, 8] # matrix has M_BLOCKS * 128 rows
24
+ M_BLOCKS = [4, 8] # matrix has M_BLOCKS * 128 rows
26
25
  N_BLOCKS = [4, 8]
27
26
  BLOCK_DENSITIES = [0.1, 0.25, 0.5]
28
27
  RHS_COLS = [128, 256]
@@ -58,7 +57,7 @@ def bsr_and_B(m_blocks, n_blocks, block_density, bsr_rhs_cols):
58
57
  for i in range(m_blocks):
59
58
  for j in range(n_blocks):
60
59
  if mask[i, j]:
61
- A[i * b:(i + 1) * b, j * b:(j + 1) * b] = torch.randn(b, b)
60
+ A[i * b : (i + 1) * b, j * b : (j + 1) * b] = torch.randn(b, b)
62
61
  bsr = trnsparse.BSRMatrix.from_dense(A, block_size=b)
63
62
  B = torch.randn(N, bsr_rhs_cols)
64
63
  return bsr, B, A
@@ -0,0 +1,50 @@
1
+ """Iterative-solver benchmarks: trnsparse.cg_bsr vs scipy baseline.
2
+
3
+ The v0.3.2 plumbing dispatches one `bsr_spmm` call per CG iteration.
4
+ On NKI that means one kernel launch + HBM round-trip per iteration.
5
+ Expect the current path to be dominated by dispatch overhead compared
6
+ to scipy's compiled C loop, which motivates the v0.4.0 fused-kernel
7
+ follow-up.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import pytest
13
+ import torch
14
+
15
+ import trnsparse
16
+
17
+
18
+ @pytest.fixture(params=[128, 256])
19
+ def iter_size(request):
20
+ return request.param
21
+
22
+
23
+ @pytest.fixture
24
+ def spd_bsr_and_dense(iter_size):
25
+ torch.manual_seed(0)
26
+ n = iter_size
27
+ M = torch.randn(n, n)
28
+ A_dense = M @ M.T + n * torch.eye(n)
29
+ A_bsr = trnsparse.BSRMatrix.from_dense(A_dense, block_size=128)
30
+ b = torch.randn(n)
31
+ return A_dense, A_bsr, b
32
+
33
+
34
+ def test_cg_bsr_trnsparse(benchmark, spd_bsr_and_dense):
35
+ _, A_bsr, b = spd_bsr_and_dense
36
+ benchmark(lambda: trnsparse.cg_bsr(A_bsr, b, tol=1e-8, max_iter=2 * b.shape[0]))
37
+
38
+
39
+ def test_cg_scipy(benchmark, spd_bsr_and_dense):
40
+ sp = pytest.importorskip("scipy.sparse")
41
+ spla = pytest.importorskip("scipy.sparse.linalg")
42
+ A_dense, _, b = spd_bsr_and_dense
43
+ A_scipy = sp.csr_matrix(A_dense.numpy())
44
+ b_np = b.numpy()
45
+ benchmark(lambda: spla.cg(A_scipy, b_np, rtol=1e-8, maxiter=2 * len(b_np)))
46
+
47
+
48
+ def test_power_iteration_trnsparse(benchmark, spd_bsr_and_dense):
49
+ _, A_bsr, _ = spd_bsr_and_dense
50
+ benchmark(lambda: trnsparse.power_iteration_bsr(A_bsr, max_iter=500, tol=1e-9))
@@ -14,6 +14,7 @@ On Trainium + neuronxcc installed, all four run in one pytest invocation.
14
14
  """
15
15
 
16
16
  import pytest
17
+
17
18
  import trnsparse
18
19
  from trnsparse.nki.dispatch import HAS_NKI
19
20
 
@@ -0,0 +1,115 @@
1
+ # Iterative solvers over BSR
2
+
3
+ trnsparse v0.3.2 adds `cg_bsr` and `power_iteration_bsr` — Conjugate
4
+ Gradient and power iteration on block-sparse row matrices. The API is
5
+ stable; the architectural story below explains why there's a v0.4.0
6
+ follow-up.
7
+
8
+ ## Why this matters
9
+
10
+ Large SPD linear systems and dominant-eigenpair problems show up
11
+ across scientific computing:
12
+
13
+ - **Quantum chemistry**: Hamiltonian eigenvalue problems (HF, DFT,
14
+ CI), response equations (CPSCF).
15
+ - **PDE discretizations**: stiffness-matrix solves for finite element
16
+ methods, graph Laplacian systems.
17
+ - **Graph learning**: spectral embeddings, PageRank-like iterations.
18
+
19
+ The matrix `A` in each case is typically block-sparse (Fock matrices
20
+ after Schwarz screening; FEM stiffness tied to mesh connectivity;
21
+ graph adjacency). BSR is the Trainium-native representation for those
22
+ matrices (see `architecture.md`).
23
+
24
+ ## v0.3.2 — plumbing
25
+
26
+ ```python
27
+ import trnsparse
28
+
29
+ A = trnsparse.BSRMatrix.from_dense(fock_matrix, block_size=128)
30
+ b = compute_rhs()
31
+
32
+ x, iters, rel = trnsparse.cg_bsr(A, b, tol=1e-6, max_iter=1000)
33
+ # Jacobi-preconditioned variant:
34
+ M = trnsparse.jacobi_preconditioner_bsr(A)
35
+ x, iters, rel = trnsparse.cg_bsr(A, b, tol=1e-6, M=M)
36
+
37
+ lam, v, iters = trnsparse.power_iteration_bsr(A, max_iter=500)
38
+ ```
39
+
40
+ Under the hood, each CG iteration calls `bsr_spmm(A, x.unsqueeze(1))`
41
+ once. On the NKI backend that's one kernel dispatch + one HBM
42
+ round-trip per iteration. On CPU it's `torch.sparse`-backed and
43
+ roughly on par with `scipy.sparse.linalg.cg` (benchmarked: 369 μs vs
44
+ 310 μs at 128×128 SPD, 1.19× slower).
45
+
46
+ ## v0.4.0 — fused kernel with SBUF-resident A
47
+
48
+ The architectural claim from
49
+ [#22](https://github.com/trnsci/trnsparse/issues/22): Trainium's 32 GB
50
+ SBUF per NeuronCore fits a 5000×5000 BSR Hamiltonian on-chip. CG
51
+ doesn't need to round-trip `A` to HBM at all — only `x`, `r`, and `p`.
52
+
53
+ The shape of the v0.4.0 kernel:
54
+
55
+ ```python
56
+ @nki.jit
57
+ def _cg_spd_kernel(A_blocks, A_cols, A_row_ptrs, b, max_iter):
58
+ # Load A blocks once into SBUF at the top.
59
+ A_sbuf = nl.load(A_blocks)
60
+
61
+ # State: x, r, p in SBUF registers.
62
+ x = nl.zeros(...)
63
+ r = nl.copy(b)
64
+ p = nl.copy(r)
65
+ rr = nl.reduce(r * r)
66
+
67
+ for k in nl.affine_range(max_iter):
68
+ Ap = _bsr_matvec_sbuf(A_sbuf, A_cols, A_row_ptrs, p) # all SBUF
69
+ pAp = nl.reduce(p * Ap)
70
+ alpha = rr / pAp
71
+ x = x + alpha * p
72
+ r = r - alpha * Ap
73
+ rr_new = nl.reduce(r * r)
74
+ beta = rr_new / rr
75
+ p = r + beta * p
76
+ rr = rr_new
77
+
78
+ return x, residual_norm_history
79
+ ```
80
+
81
+ Fixed `max_iter`, no early exit — returning the full residual history
82
+ lets the host pick the convergence point post-hoc (standard
83
+ NKI-inside-loop constraint: no dynamic control flow).
84
+
85
+ **Expected regime where this wins**: when
86
+ `max_iter × dispatch_overhead > fused_kernel_cost + hbm_load_A_once`.
87
+ For a 4000×4000 BSR Hamiltonian with ~100 iterations to convergence,
88
+ that's roughly an order of magnitude of wall-time reduction.
89
+
90
+ **What needs to land first**:
91
+
92
+ 1. Simulator-iterated kernel skeleton (now tractable thanks to the
93
+ `nki-simulator` CI gate — see the NKI 0.3.0 migration in v0.3.1).
94
+ 2. SBUF sizing model for `A` — some workloads overflow 32 GB; then
95
+ fall back to the v0.3.2 plumbing.
96
+ 3. Dispatcher logic in `cg_bsr` that picks the fused kernel when
97
+ `max_iter * n` exceeds a threshold.
98
+
99
+ Tracked in [#24](https://github.com/trnsci/trnsparse/issues/24) on `trnsci/trnsparse`.
100
+
101
+ ## Why CG and power iteration, not GMRES / Lanczos / Davidson
102
+
103
+ v0.3.2 covers the two most common algorithm families:
104
+
105
+ - **CG** for SPD systems — the workhorse for Hamiltonian solves,
106
+ stiffness systems, and many PDE discretizations.
107
+ - **Power iteration** for dominant eigenpairs — the starting point
108
+ for spectral methods, PageRank-like fixed points, and the iteration
109
+ core inside Lanczos / Arnoldi / Davidson.
110
+
111
+ GMRES (general non-symmetric systems), Lanczos / Arnoldi (full
112
+ spectrum), and Davidson (interior eigenvalues) are follow-ups when
113
+ users ask for them. The v0.4.0 fused kernel's structure (load A once,
114
+ iterate on-chip) generalizes trivially to those variants — it's the
115
+ same architectural pattern.
@@ -12,7 +12,9 @@ Usage:
12
12
 
13
13
  import argparse
14
14
  import time
15
+
15
16
  import torch
17
+
16
18
  import trnsparse
17
19
 
18
20
 
@@ -27,7 +29,7 @@ def main():
27
29
  args.nbasis = 50
28
30
 
29
31
  n = args.nbasis
30
- print(f"Sparse Fock build:")
32
+ print("Sparse Fock build:")
31
33
  print(f" Basis functions: {n}")
32
34
  print(f" Threshold: {args.threshold:.0e}")
33
35
 
@@ -40,7 +42,7 @@ def main():
40
42
 
41
43
  # Screen
42
44
  stats = trnsparse.sparsity_stats(Q, args.threshold)
43
- print(f"\n Sparsity statistics:")
45
+ print("\n Sparsity statistics:")
44
46
  print(f" Total shell pairs: {stats['total_pairs']}")
45
47
  print(f" Significant pairs: {stats['significant_pairs']}")
46
48
  print(f" Pair sparsity: {stats['pair_sparsity']:.1%}")
@@ -51,7 +53,7 @@ def main():
51
53
  integrals_dense = torch.randn(n, n) * Q * 0.01
52
54
  integrals_dense[~mask] = 0.0
53
55
  integrals_sparse = trnsparse.from_dense(integrals_dense)
54
- print(f" Integral matrix nnz: {integrals_sparse.nnz} / {n*n}")
56
+ print(f" Integral matrix nnz: {integrals_sparse.nnz} / {n * n}")
55
57
 
56
58
  # Density matrix (random SPD for demo)
57
59
  P = torch.randn(n, n) * 0.1
@@ -0,0 +1,84 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "setuptools-scm>=8.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "trnsparse"
7
+ version = "0.3.2"
8
+ description = "Sparse matrix operations for AWS Trainium via NKI"
9
+ readme = "README.md"
10
+ license = "Apache-2.0"
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ { name = "Scott Friedman", email = "scttfrdmn@gmail.com" },
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Intended Audience :: Science/Research",
18
+ "Programming Language :: Python :: 3",
19
+ "Topic :: Scientific/Engineering",
20
+ "Topic :: Scientific/Engineering :: Mathematics",
21
+ ]
22
+ dependencies = [
23
+ "torch>=2.1",
24
+ "numpy>=1.24",
25
+ ]
26
+
27
+ [project.optional-dependencies]
28
+ neuron = [
29
+ # NKI 0.3.0 Stable (Neuron SDK 2.29+, April 2026). trnsparse imports
30
+ # from the canonical `nki.*` namespace; the legacy `neuronxcc.nki.*`
31
+ # shim is not used. SDK 2.29 DLAMI bundles nki==0.3.0 alongside
32
+ # neuronxcc==2.24.5133. Hosts without an `nki` wheel (macOS,
33
+ # non-Linux archs) still hit HAS_NKI=False and get the torch fallback.
34
+ "nki>=0.3.0",
35
+ "neuronxcc>=2.24",
36
+ "torch-neuronx>=2.9",
37
+ ]
38
+ dev = ["pytest>=7.0", "pytest-benchmark>=4.0", "pytest-cov>=4.1", "scipy>=1.11"]
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/trnsci/trnsparse"
42
+ Documentation = "https://trnsci.dev/trnsparse/"
43
+ Repository = "https://github.com/trnsci/trnsparse"
44
+ Issues = "https://github.com/trnsci/trnsparse/issues"
45
+
46
+ [tool.setuptools.packages.find]
47
+ include = ["trnsparse*"]
48
+
49
+ [tool.ruff]
50
+ # Match the rest of the trnsci suite. Line length 100 is a compromise
51
+ # between black's 88 and the wider tables common in scientific code.
52
+ line-length = 100
53
+ target-version = "py310"
54
+ extend-exclude = ["site", "infra/terraform*/.terraform"]
55
+
56
+ [tool.ruff.lint]
57
+ # Sensible default selection: pycodestyle (E/W), Pyflakes (F), isort (I),
58
+ # pyupgrade (UP), flake8-bugbear (B), flake8-simplify (SIM). Skip docstring
59
+ # rules (D*) — we're deliberately light on docstrings in this project.
60
+ select = ["E", "W", "F", "I", "UP", "B", "SIM"]
61
+ ignore = [
62
+ "E501", # line too long — formatter handles it
63
+ "B008", # function call in default arg — common + intentional here
64
+ "SIM108", # if/else over ternary — readability call, not worth fighting
65
+ "SIM300", # Yoda-condition false positives on array comparisons
66
+ "E741", # ambiguous single-letter names — `I` for identity matrix is idiomatic in linear algebra
67
+ ]
68
+
69
+ [tool.ruff.lint.per-file-ignores]
70
+ # NKI kernels use patterns that trip some lint rules (SBUF writes that
71
+ # look unused to the linter, etc). Don't fight the kernel authoring loop.
72
+ "trnsparse/nki/dispatch.py" = ["F841", "B007"]
73
+ "trnsparse/nki/kernels.py" = ["F841", "B007"]
74
+ "scripts/*" = ["F401"]
75
+ "tests/*" = ["F401", "F811"]
76
+
77
+ [tool.ruff.format]
78
+ # Black-compatible defaults.
79
+ quote-style = "double"
80
+ indent-style = "space"
81
+
82
+ [tool.pytest.ini_options]
83
+ markers = ["neuron: requires Neuron hardware"]
84
+ testpaths = ["tests"]
@@ -67,7 +67,7 @@ def render_markdown(rows: dict, machine_info: dict | None = None) -> str:
67
67
  out.append("| Operation | Variant | Param | Median (μs) | vs trnsparse-PyTorch |")
68
68
  out.append("|-----------|---------|-------|------------:|-------------------:|")
69
69
 
70
- for (group, op, param) in sorted(rows.keys()):
70
+ for group, op, param in sorted(rows.keys()):
71
71
  variants = rows[(group, op, param)]
72
72
  baseline = variants.get("trnrand_pytorch")
73
73
  for variant in ("nki", "trnrand_pytorch", "torch"):
@@ -82,11 +82,9 @@ def render_markdown(rows: dict, machine_info: dict | None = None) -> str:
82
82
  if ratio >= 1.0:
83
83
  speedup = f"{ratio:.2f}× faster"
84
84
  else:
85
- speedup = f"{1/ratio:.2f}× slower"
85
+ speedup = f"{1 / ratio:.2f}× slower"
86
86
  param_disp = param.replace("_", " ") if param else "-"
87
- out.append(
88
- f"| {group}.{op} | {label} | {param_disp} | {us:>10.2f} | {speedup} |"
89
- )
87
+ out.append(f"| {group}.{op} | {label} | {param_disp} | {us:>10.2f} | {speedup} |")
90
88
  return "\n".join(out) + "\n"
91
89
 
92
90
 
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Run NKI simulator-backed tests on the trn1 CI instance.
4
+ #
5
+ # Simulator bypasses torch_xla + NEFF compile: kernels run on CPU via
6
+ # nki.simulate(kernel)(numpy_args). Use for correctness / constraint
7
+ # iteration; hardware still owns perf numbers.
8
+ #
9
+ # Usage:
10
+ # AWS_PROFILE=aws ./scripts/run_simulator_tests.sh
11
+ #
12
+ # Same trap-stop pattern as run_neuron_tests.sh. Runs the nki_simulator-
13
+ # marked test suite with TRNSPARSE_USE_SIMULATOR=1 set in the SSM env.
14
+ #
15
+ # Still AWS-resident for now (the nki wheel is linux_x86_64 only + lives
16
+ # on the AWS pip index, not a common macOS target). The CI job on
17
+ # ubuntu-latest covers the same surface on every push.
18
+
19
+ set -euo pipefail
20
+
21
+ INSTANCE_TYPE="${INSTANCE_TYPE:-trn1}"
22
+ TAG="trnsparse-ci-${INSTANCE_TYPE}"
23
+ REGION="${AWS_REGION:-us-east-1}"
24
+ SHA="$(git rev-parse HEAD)"
25
+
26
+ : "${AWS_PROFILE:?Set AWS_PROFILE, e.g. AWS_PROFILE=aws ./scripts/run_simulator_tests.sh}"
27
+
28
+ echo "Looking up instance with Name=$TAG in $REGION..."
29
+ INSTANCE_ID=$(aws ec2 describe-instances \
30
+ --filters "Name=tag:Name,Values=$TAG" \
31
+ "Name=instance-state-name,Values=stopped,running,pending" \
32
+ --query 'Reservations[0].Instances[0].InstanceId' \
33
+ --output text \
34
+ --region "$REGION")
35
+
36
+ if [[ -z "$INSTANCE_ID" || "$INSTANCE_ID" == "None" ]]; then
37
+ echo "ERROR: No instance found with Name=$TAG" >&2
38
+ exit 1
39
+ fi
40
+ echo "Instance: $INSTANCE_ID"
41
+
42
+ cleanup() {
43
+ local exit_code=$?
44
+ echo ""
45
+ echo "Stopping $INSTANCE_ID..."
46
+ aws ec2 stop-instances --instance-ids "$INSTANCE_ID" --region "$REGION" >/dev/null
47
+ exit "$exit_code"
48
+ }
49
+ trap cleanup EXIT
50
+
51
+ STATE=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" --region "$REGION" \
52
+ --query 'Reservations[0].Instances[0].State.Name' --output text)
53
+
54
+ if [[ "$STATE" == "stopped" ]]; then
55
+ echo "Starting instance..."
56
+ aws ec2 start-instances --instance-ids "$INSTANCE_ID" --region "$REGION" >/dev/null
57
+ fi
58
+
59
+ echo "Waiting for instance-running..."
60
+ aws ec2 wait instance-running --instance-ids "$INSTANCE_ID" --region "$REGION"
61
+ echo "Waiting for SSM agent..."
62
+ for _ in $(seq 1 60); do
63
+ PING=$(aws ssm describe-instance-information \
64
+ --filters "Key=InstanceIds,Values=$INSTANCE_ID" \
65
+ --region "$REGION" \
66
+ --query 'InstanceInformationList[0].PingStatus' --output text 2>/dev/null || true)
67
+ [[ "$PING" == "Online" ]] && break
68
+ sleep 5
69
+ done
70
+ if [[ "$PING" != "Online" ]]; then
71
+ echo "ERROR: SSM agent not Online after 5 minutes (last PingStatus=$PING)" >&2
72
+ exit 1
73
+ fi
74
+
75
+ echo "Sending simulator test command (SHA=$SHA)..."
76
+ CMD_ID=$(aws ssm send-command \
77
+ --instance-ids "$INSTANCE_ID" \
78
+ --document-name "AWS-RunShellScript" \
79
+ --comment "trnsparse nki simulator tests @ $SHA" \
80
+ --parameters "commands=[
81
+ \"bash -c 'set -euo pipefail; cd /home/ubuntu/trnsparse && sudo -u ubuntu git fetch --all && sudo -u ubuntu git checkout $SHA && NEURON_VENV=\$(ls -d /opt/aws_neuronx_venv_pytorch_* | head -1) && sudo -u ubuntu \$NEURON_VENV/bin/pip install -e /home/ubuntu/trnsparse[dev] --quiet && sudo -u ubuntu env PATH=\$NEURON_VENV/bin:/usr/bin:/bin TRNSPARSE_USE_SIMULATOR=1 \$NEURON_VENV/bin/pytest /home/ubuntu/trnsparse/tests/ -v -m nki_simulator --tb=short'\"
82
+ ]" \
83
+ --region "$REGION" \
84
+ --output text --query 'Command.CommandId')
85
+
86
+ echo "Command ID: $CMD_ID"
87
+ echo "Waiting for command to complete..."
88
+ for _ in $(seq 1 60); do
89
+ STATUS=$(aws ssm get-command-invocation \
90
+ --command-id "$CMD_ID" \
91
+ --instance-id "$INSTANCE_ID" \
92
+ --region "$REGION" \
93
+ --query 'Status' --output text 2>/dev/null || echo "InProgress")
94
+ [[ "$STATUS" != "InProgress" && "$STATUS" != "Pending" ]] && break
95
+ sleep 15
96
+ done
97
+
98
+ echo ""
99
+ echo "=== STDOUT ==="
100
+ aws ssm get-command-invocation \
101
+ --command-id "$CMD_ID" \
102
+ --instance-id "$INSTANCE_ID" \
103
+ --region "$REGION" \
104
+ --query 'StandardOutputContent' --output text
105
+
106
+ echo ""
107
+ echo "=== STDERR ==="
108
+ aws ssm get-command-invocation \
109
+ --command-id "$CMD_ID" \
110
+ --instance-id "$INSTANCE_ID" \
111
+ --region "$REGION" \
112
+ --query 'StandardErrorContent' --output text | head -20
113
+
114
+ echo ""
115
+ echo "=== Status: $STATUS ==="
116
+ [[ "$STATUS" == "Success" ]]
@@ -0,0 +1,12 @@
1
+ """Test configuration."""
2
+
3
+ import pytest # noqa: F401 — imported for marker side-effect exposure
4
+
5
+
6
+ def pytest_configure(config):
7
+ config.addinivalue_line("markers", "neuron: requires Neuron hardware")
8
+ config.addinivalue_line(
9
+ "markers",
10
+ "nki_simulator: runs NKI kernels via nki.simulate on CPU "
11
+ "(requires TRNSPARSE_USE_SIMULATOR=1 + nki>=0.3.0)",
12
+ )