rns-engine 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Evan Wesley
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,178 @@
1
+ Metadata-Version: 2.4
2
+ Name: rns_engine
3
+ Version: 0.1.0
4
+ Summary: Exact integer arithmetic via AVX2-accelerated Residue Number System
5
+ License: MIT License
6
+
7
+ Copyright (c) 2026 Evan Wesley
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
26
+
27
+ Project-URL: Homepage, https://github.com/playfularchitect/rns_engine
28
+ Project-URL: Issues, https://github.com/playfularchitect/rns_engine/issues
29
+ Keywords: arithmetic,exact,integer,RNS,residue,SIMD,AVX2
30
+ Classifier: Development Status :: 3 - Alpha
31
+ Classifier: Intended Audience :: Science/Research
32
+ Classifier: Intended Audience :: Developers
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Programming Language :: Python :: 3
35
+ Classifier: Programming Language :: Python :: 3.9
36
+ Classifier: Programming Language :: Python :: 3.10
37
+ Classifier: Programming Language :: Python :: 3.11
38
+ Classifier: Programming Language :: Python :: 3.12
39
+ Classifier: Programming Language :: C++
40
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
41
+ Classifier: Topic :: Software Development :: Libraries
42
+ Requires-Python: >=3.9
43
+ Description-Content-Type: text/markdown
44
+ License-File: LICENSE
45
+ Requires-Dist: numpy>=1.21
46
+ Dynamic: license-file
47
+
48
+ # rns_engine
49
+
50
+ **Exact integer arithmetic via AVX2-accelerated Residue Number System (RNS).**
51
+
52
+ No floating point. No approximation. Errors are structurally impossible.
53
+
54
+ ---
55
+
56
+ ## What it does
57
+
58
+ Standard Python integers are exact but slow. NumPy is fast but uses floating point or silently overflows. `rns_engine` gives you **exact integer arithmetic at hundreds of millions of operations per second** — the best of both worlds.
59
+
60
+ It works by decomposing integers into residues across three coprime moduli (127, 8191, 65536), performing all operations in residue space using AVX2 SIMD instructions, and reconstructing exact results via the Chinese Remainder Theorem.
61
+
62
+ **Dynamic range:** `[0, 68,174,282,752)` — about 68 billion.
63
+
64
+ ## Install
65
+
66
+ ```bash
67
+ pip install rns_engine
68
+ ```
69
+
70
+ Requires a CPU with AVX2 (any Intel/AMD since ~2013). Falls back to scalar arithmetic on ARM and older hardware.
71
+
72
+ ## Quick start
73
+
74
+ ```python
75
+ import rns_engine as rns
76
+ import numpy as np
77
+
78
+ # Works on arrays of uint64
79
+ a = np.array([123456789, 999999999], dtype=np.uint64)
80
+ b = np.array([987654321, 111111111], dtype=np.uint64)
81
+
82
+ # Encode once
83
+ ea = rns.encode(a) # returns (r0, r1, r2) residue arrays
84
+ eb = rns.encode(b)
85
+
86
+ # Operate in residue space — no intermediate decode needed
87
+ result = rns.decode(*rns.mul(*ea, *eb)) # exact multiplication
88
+
89
+ # Chain multiple operations — decode once at the end
90
+ s1 = rns.add(*ea, *eb) # a + b
91
+ s2 = rns.mul(*s1, *eb) # (a + b) * b
92
+ s3 = rns.sub(*s2, *ea) # (a + b) * b - a
93
+ out = rns.decode(*s3) # one decode, three operations
94
+ ```
95
+
96
+ ## Operations
97
+
98
+ | Function | Description |
99
+ |----------|-------------|
100
+ | `rns.encode(x)` | `uint64[]` → `(r0, r1, r2)` residue arrays |
101
+ | `rns.decode(r0, r1, r2)` | Residues → `uint64[]` via Garner's algorithm |
102
+ | `rns.add(*ea, *eb)` | Exact addition |
103
+ | `rns.sub(*ea, *eb)` | Exact subtraction |
104
+ | `rns.mul(*ea, *eb)` | Exact multiplication |
105
+ | `rns.div_(*ea, *eb)` | Exact division (b must be coprime to all moduli) |
106
+ | `rns.op(*ea, *eb, code)` | Generic: `0`=add `1`=mul `2`=sub `3`=div |
107
+
108
+ ### Division constraint
109
+
110
+ Division requires `b` to be invertible on all three rails:
111
+ - `b % 127 != 0`
112
+ - `b % 8191 != 0`
113
+ - `b % 65536` is **odd** (coprime to 2^16)
114
+
115
+ ```python
116
+ # Safe way to ensure b is valid for division:
117
+ b = np.where(b % 2 == 0, b + 1, b) # make odd
118
+ b = np.where(b % 127 == 0, b + 2, b)
119
+ b = np.where(b % 8191 == 0, b + 4, b)
120
+ b = b % rns.M
121
+ ```
122
+
123
+ ## Performance
124
+
125
+ On a machine with AVX2 (tested on Google Colab's CPU):
126
+
127
+ | Operation | Throughput |
128
+ |-----------|-----------|
129
+ | add | ~200–400 M ops/sec |
130
+ | sub | ~200–400 M ops/sec |
131
+ | mul | ~200–400 M ops/sec |
132
+ | div | ~1.6 M ops/sec (scalar modinv per element) |
133
+
134
+ ## Why RNS?
135
+
136
+ In a Residue Number System, **addition and multiplication have no carry propagation between digits**. Each residue rail is independent. This makes RNS ideal for:
137
+
138
+ - **Exact arithmetic** — results are always correct within the dynamic range
139
+ - **Parallel computation** — rails can run simultaneously
140
+ - **Error detection** — CRT reconstruction fails loudly if any rail is corrupted
141
+ - **Cryptography** — modular arithmetic is the native language of RSA, ECC, etc.
142
+
143
+ ## How it works
144
+
145
+ Three coprime moduli: `m0 = 127`, `m1 = 8191`, `m2 = 65536`
146
+
147
+ Dynamic range: `M = 127 × 8191 × 65536 = 68,174,282,752`
148
+
149
+ **Encode:** `x → (x mod 127, x mod 8191, x mod 65536)`
150
+
151
+ **Operate:** each rail independently, e.g. add: `(a+b) mod mᵢ` per rail
152
+
153
+ **Decode (Garner's algorithm):**
154
+ ```
155
+ t0 = r0
156
+ t1 = (r1 - t0) × inv(127, 8191) mod 8191
157
+ t2 = (r2 - t0 - t1×127) × inv(127×8191, 65536) mod 65536
158
+ x = t0 + t1×127 + t2×127×8191
159
+ ```
160
+
161
+ Mod 127 and mod 8191 reductions use the Mersenne-prime trick:
162
+ `x mod (2^k - 1) = (x & mask) + (x >> k)` — no division needed.
163
+
164
+ ## Building from source
165
+
166
+ ```bash
167
+ git clone https://github.com/playfularchitect/rns_engine
168
+ cd rns_engine
169
+ pip install pybind11 numpy
170
+ pip install -e .
171
+ pytest tests/ -v
172
+ ```
173
+
174
+ Requires `g++` with C++17 support.
175
+
176
+ ## License
177
+
178
+ MIT
@@ -0,0 +1,131 @@
1
+ # rns_engine
2
+
3
+ **Exact integer arithmetic via AVX2-accelerated Residue Number System (RNS).**
4
+
5
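+ No floating point. No approximation. Results are exact modulo `M`: they equal ordinary integer arithmetic whenever the true result lies in the dynamic range `[0, M)`, and wrap around modulo `M` otherwise.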
6
+
7
+ ---
8
+
9
+ ## What it does
10
+
11
+ Standard Python integers are exact but slow. NumPy is fast but uses floating point or silently overflows. `rns_engine` gives you **exact integer arithmetic at hundreds of millions of operations per second** — the best of both worlds.
12
+
13
+ It works by decomposing integers into residues across three coprime moduli (127, 8191, 65536), performing all operations in residue space using AVX2 SIMD instructions, and reconstructing exact results via the Chinese Remainder Theorem.
14
+
15
+ **Dynamic range:** `[0, 68,174,282,752)` — about 68 billion.
16
+
17
+ ## Install
18
+
19
+ ```bash
20
+ pip install rns_engine
21
+ ```
22
+
23
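+ x86-64 builds are compiled with AVX2 enabled and need an AVX2-capable CPU (most Intel/AMD chips since ~2013). Builds for ARM and other architectures use a scalar code path. The choice is made at compile time; check `rns.HAS_AVX2` to see which one you have.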
24
+
25
+ ## Quick start
26
+
27
+ ```python
28
+ import rns_engine as rns
29
+ import numpy as np
30
+
31
+ # Works on arrays of uint64
32
+ a = np.array([123456789, 999999999], dtype=np.uint64)
33
+ b = np.array([987654321, 111111111], dtype=np.uint64)
34
+
35
+ # Encode once
36
+ ea = rns.encode(a) # returns (r0, r1, r2) residue arrays
37
+ eb = rns.encode(b)
38
+
39
+ # Operate in residue space — no intermediate decode needed
40
+ result = rns.decode(*rns.mul(*ea, *eb)) # exact multiplication
41
+
42
+ # Chain multiple operations — decode once at the end
43
+ s1 = rns.add(*ea, *eb) # a + b
44
+ s2 = rns.mul(*s1, *eb) # (a + b) * b
45
+ s3 = rns.sub(*s2, *ea) # (a + b) * b - a
46
+ out = rns.decode(*s3) # one decode, three operations
47
+ ```
48
+
49
+ ## Operations
50
+
51
+ | Function | Description |
52
+ |----------|-------------|
53
+ | `rns.encode(x)` | `uint64[]` → `(r0, r1, r2)` residue arrays |
54
+ | `rns.decode(r0, r1, r2)` | Residues → `uint64[]` via Garner's algorithm |
55
+ | `rns.add(*ea, *eb)` | Exact addition |
56
+ | `rns.sub(*ea, *eb)` | Exact subtraction |
57
+ | `rns.mul(*ea, *eb)` | Exact multiplication |
58
+ | `rns.div_(*ea, *eb)` | Modular division `a · b⁻¹ mod M` — equals `a / b` when `b` divides `a` exactly (b must be coprime to all moduli) |
59
+ | `rns.op(*ea, *eb, code)` | Generic: `0`=add `1`=mul `2`=sub `3`=div |
60
+
61
+ ### Division constraint
62
+
63
+ Division requires `b` to be invertible on all three rails:
64
+ - `b % 127 != 0`
65
+ - `b % 8191 != 0`
66
+ - `b % 65536` is **odd** (coprime to 2^16)
67
+
68
+ ```python
69
+ # Nudge b toward a valid divisor. The last step can make b divisible by 127 again, so re-check all three conditions before dividing:
70
+ b = np.where(b % 2 == 0, b + 1, b) # make odd
71
+ b = np.where(b % 127 == 0, b + 2, b)
72
+ b = np.where(b % 8191 == 0, b + 4, b)
73
+ b = b % rns.M
74
+ ```
75
+
76
+ ## Performance
77
+
78
+ On a machine with AVX2 (tested on Google Colab's CPU):
79
+
80
+ | Operation | Throughput |
81
+ |-----------|-----------|
82
+ | add | ~200–400 M ops/sec |
83
+ | sub | ~200–400 M ops/sec |
84
+ | mul | ~200–400 M ops/sec |
85
+ | div | ~1.6 M ops/sec (scalar modinv per element) |
86
+
87
+ ## Why RNS?
88
+
89
+ In a Residue Number System, **addition and multiplication have no carry propagation between digits**. Each residue rail is independent. This makes RNS ideal for:
90
+
91
+ - **Exact arithmetic** — results are always correct within the dynamic range
92
+ - **Parallel computation** — rails can run simultaneously
93
+ - **Error detection** — an RNS can detect corrupted rails when redundant moduli are added; this package has no redundant modulus, so `decode` does not check for corruption
94
+ - **Cryptography** — modular arithmetic is the native language of RSA, ECC, etc.
95
+
96
+ ## How it works
97
+
98
+ Three coprime moduli: `m0 = 127`, `m1 = 8191`, `m2 = 65536`
99
+
100
+ Dynamic range: `M = 127 × 8191 × 65536 = 68,174,282,752`
101
+
102
+ **Encode:** `x → (x mod 127, x mod 8191, x mod 65536)`
103
+
104
+ **Operate:** each rail independently, e.g. add: `(a+b) mod mᵢ` per rail
105
+
106
+ **Decode (Garner's algorithm):**
107
+ ```
108
+ t0 = r0
109
+ t1 = (r1 - t0) × inv(127, 8191) mod 8191
110
+ t2 = (r2 - t0 - t1×127) × inv(127×8191, 65536) mod 65536
111
+ x = t0 + t1×127 + t2×127×8191
112
+ ```
113
+
114
+ Mod 127 and mod 8191 reductions use the Mersenne-prime trick:
115
+ `x ≡ (x & mask) + (x >> k)  (mod 2^k - 1)` — the fold is applied once or twice and a result equal to `2^k - 1` is mapped to `0`, so no division is needed.
116
+
117
+ ## Building from source
118
+
119
+ ```bash
120
+ git clone https://github.com/playfularchitect/rns_engine
121
+ cd rns_engine
122
+ pip install pybind11 numpy
123
+ pip install -e .
124
+ pytest tests/ -v
125
+ ```
126
+
127
+ Requires `g++` with C++17 support.
128
+
129
+ ## License
130
+
131
+ MIT
@@ -0,0 +1,54 @@
1
+ [build-system]
2
+ requires = ["setuptools>=64", "pybind11>=2.11"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "rns_engine"
7
+ version = "0.1.0"
8
+ description = "Exact integer arithmetic via AVX2-accelerated Residue Number System"
9
+ readme = "README.md"
10
+ license = { file = "LICENSE" }
11
+ requires-python = ">=3.9"
12
+ dependencies = ["numpy>=1.21"]
13
+ keywords = ["arithmetic", "exact", "integer", "RNS", "residue", "SIMD", "AVX2"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Science/Research",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.9",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: C++",
25
+ "Topic :: Scientific/Engineering :: Mathematics",
26
+ "Topic :: Software Development :: Libraries",
27
+ ]
28
+
29
+ [project.urls]
30
+ Homepage = "https://github.com/playfularchitect/rns_engine"
31
+ Issues = "https://github.com/playfularchitect/rns_engine/issues"
32
+
33
+ [tool.setuptools.packages.find]
34
+ where = ["src"]
35
+
36
+ [tool.setuptools.package-data]
37
+ rns_engine = ["*.pyi"]
38
+
39
+ [tool.cibuildwheel]
40
+ # Build for CPython 3.9–3.12 only (not PyPy)
41
+ build = "cp39-* cp310-* cp311-* cp312-*"
42
+ # Skip 32-bit builds and musllinux (AVX2 not worth supporting there)
43
+ skip = "*-win32 *-manylinux_i686 *-musllinux*"
44
+ dependency-versions = "latest"
45
+
46
+ [tool.cibuildwheel.linux]
47
+ archs = ["x86_64"]
48
+
49
+ [tool.cibuildwheel.macos]
50
+ # x86_64 supports AVX2; arm64 (M1/M2) uses scalar fallback
51
+ archs = ["x86_64", "arm64"]
52
+
53
+ [tool.cibuildwheel.windows]
54
+ archs = ["AMD64"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,51 @@
1
+ """
2
+ setup.py — builds the rns_engine C++ extension.
3
+
4
+ Enables AVX2 compiler flags on x86-64 builds (chosen by platform, not by probing the CPU).
5
+ Other architectures compile the scalar code path.
6
+ """
7
+
8
+ import sys
9
+ import platform
10
+ from setuptools import setup, Extension
11
+ import pybind11
12
+
13
+
14
def get_compile_args():
    """Return compiler flags appropriate for the current platform.

    Returns:
        list[str]: Flags for ``Extension(extra_compile_args=...)``.

        * Linux/macOS on x86-64: GCC/Clang flags with ``-mavx2``.
        * Windows: MSVC flags with ``/arch:AVX2``.
        * Anything else: GCC/Clang flags without SIMD options, so the C++
          source compiles its scalar code path.

    ``-march=native`` is deliberately not used: it tunes the binary for the
    machine running the build, so a wheel built on a newer CPU could contain
    instructions that crash with "illegal instruction" elsewhere. ``-mavx2``
    alone pins the requirement to AVX2.
    """
    system = platform.system()
    machine = platform.machine()

    if system == "Windows":
        # MSVC uses its own flag syntax; none of the GCC-style flags apply.
        return ["/std:c++17", "/O2", "/DNDEBUG", "/arch:AVX2"]

    common = ["-std=c++17", "-O3", "-DNDEBUG", "-Wno-unused-function"]

    if system in ("Linux", "Darwin") and machine in ("x86_64", "AMD64"):
        # x86-64 Linux/Mac: enable the AVX2 kernel.
        return common + ["-mavx2", "-funroll-loops"]

    # ARM, RISC-V, etc. — scalar fallback.
    return common
34
+
35
+
36
def get_link_args():
    """Return extra linker flags for the extension.

    No platform currently needs any, so this is always an empty list. The
    function is kept as the single place to add linker options later.
    """
    return []
40
+
41
+
42
# Build options for the compiled ``rns_engine._core`` module.
_EXT_OPTIONS = {
    "sources": ["src/rns_engine/_core.cpp"],
    "include_dirs": [pybind11.get_include()],
    "extra_compile_args": get_compile_args(),
    "extra_link_args": get_link_args(),
    "language": "c++",
}

ext = Extension("rns_engine._core", **_EXT_OPTIONS)

# All remaining project metadata comes from pyproject.toml.
setup(ext_modules=[ext])
@@ -0,0 +1,60 @@
1
+ """
2
+ rns_engine — Exact integer arithmetic via Residue Number System (RNS).
3
+
4
+ Dynamic range: [0, 68,174,282,752) = 127 × 8191 × 65536
5
+ AVX2-accelerated on x86; scalar fallback on all other platforms.
6
+
7
+ Quick start
8
+ -----------
9
+ >>> import rns_engine as rns
10
+ >>> import numpy as np
11
+ >>>
12
+ >>> a = np.array([123456789, 999999999], dtype=np.uint64)
13
+ >>> b = np.array([987654321, 111111111], dtype=np.uint64)
14
+ >>>
15
+ >>> ea = rns.encode(a) # -> (r0, r1, r2) residue arrays
16
+ >>> eb = rns.encode(b)
17
+ >>>
18
+ >>> result = rns.decode(*rns.mul(*ea, *eb)) # exact multiplication
19
+ >>> # stays in residue space for multi-step expressions:
20
+ >>> s1 = rns.add(*ea, *eb) # a + b
21
+ >>> s2 = rns.mul(*s1, *eb) # (a + b) * b
22
+ >>> out = rns.decode(*s2) # decode once at the end
23
+
24
+ Notes
25
+ -----
26
+ - All values must be in [0, M) where M = rns.M = 68,174,282,752
27
+ - Values outside this range are reduced mod M on encode
28
+ - Division requires b to be coprime to all moduli:
29
+ b % 127 != 0
30
+ b % 8191 != 0
31
+ b % 65536 is odd (coprime to 2^16)
32
+ """
33
+
34
+ from ._core import (
35
+ encode,
36
+ decode,
37
+ op,
38
+ add,
39
+ sub,
40
+ mul,
41
+ div_,
42
+ M,
43
+ M0,
44
+ M1,
45
+ M2,
46
+ HAS_AVX2,
47
+ )
48
+
49
+ __version__ = "0.1.0"
50
+ __all__ = ["encode", "decode", "op", "add", "sub", "mul", "div_",
51
+ "M", "M0", "M1", "M2", "HAS_AVX2"]
52
+
53
+
54
def info():
    """Print the version, dynamic range, moduli, AVX2 status and operations."""
    avx2_status = "yes" if HAS_AVX2 else "no (scalar fallback)"
    report = (
        f"rns_engine v{__version__}",
        f"  Dynamic range : [0, {M:,})",
        f"  Moduli        : {M0} × {M1} × {M2}",
        f"  AVX2          : {avx2_status}",
        "  Operations    : add sub mul div_",
    )
    for line in report:
        print(line)
@@ -0,0 +1,243 @@
1
+ /*
2
+ * rns_engine/_core.cpp
3
+ * 3-rail RNS exact integer arithmetic. AVX2 + scalar fallback.
4
+ * Works on Windows/MSVC, Linux/GCC, Mac/Clang.
5
+ */
6
+
7
+ #include <pybind11/pybind11.h>
8
+ #include <pybind11/numpy.h>
9
+ #include <stdint.h>
10
+ #include <stdexcept>
11
+
12
+ namespace py = pybind11;
13
+ using arr16 = py::array_t<uint16_t>;
14
+ using arr32 = py::array_t<uint32_t>;
15
+ using arr64 = py::array_t<uint64_t>;
16
+
17
// The three pairwise-coprime moduli ("rails") and their product.
static constexpr uint32_t M0 = 127;      // 2^7  - 1
static constexpr uint32_t M1 = 8191;     // 2^13 - 1
static constexpr uint32_t M2 = 65536;    // 2^16
static constexpr uint64_t BM = (uint64_t)M0 * M1 * M2;   // dynamic range M

// Precomputed inverses used by garner():
//   INV01  = 127^-1          mod 8191   (127 * 129 = 2 * 8191 + 1)
//   INV012 = (127 * 8191)^-1 mod 65536
static constexpr uint32_t INV01  = 129;
static constexpr uint32_t INV012 = 24705;

// Number of 16-bit lanes handled per AVX2 iteration. A typed constant is used
// instead of a one-letter macro so the name cannot rewrite identifiers in
// headers included later (e.g. <immintrin.h>).
static constexpr int L = 16;
24
+
25
// Reduce x modulo 127 without a division.
// Since 2^7 ≡ 1 (mod 127), the bits above bit 6 can be added onto the low
// 7 bits. Two folds are enough for x < 2^14, which covers every sum and
// product of two residues in [0, 126]; larger inputs may come back unreduced.
static inline uint16_t r127s(uint32_t x) {
    const uint32_t once  = (x >> 7) + (x & 0x7F);
    const uint32_t twice = (once >> 7) + (once & 0x7F);
    if (twice == 127) return 0;          // 127 ≡ 0, the only non-canonical value left
    return (uint16_t)twice;
}
30
// Reduce x modulo 8191 without a division.
// Since 2^13 ≡ 1 (mod 8191), the high bits are folded onto the low 13 bits.
// Two folds are enough for x < 2^26, which covers every sum and product of
// two residues in [0, 8190]; larger inputs may come back unreduced.
static inline uint32_t r8191s(uint64_t x) {
    const uint64_t once  = (x >> 13) + (x & 0x1FFF);
    const uint64_t twice = (once >> 13) + (once & 0x1FFF);
    if (twice == 8191) return 0;         // 8191 ≡ 0
    return (uint32_t)twice;
}
35
// Extended Euclid (recursive).
// Returns g = gcd(a, b) and stores Bezout coefficients in *x and *y such that
// a * (*x) + b * (*y) == g.
static int64_t egcd(int64_t a, int64_t b, int64_t *x, int64_t *y) {
    if (a == 0) {
        // Base case: gcd(0, b) = b = 0 * 0 + b * 1.
        *x = 0;
        *y = 1;
        return b;
    }
    int64_t sub_x = 0, sub_y = 0;
    const int64_t g = egcd(b % a, a, &sub_x, &sub_y);
    const int64_t q = b / a;
    // Back-substitute the coefficients of the (b % a, a) sub-problem.
    *x = sub_y - q * sub_x;
    *y = sub_x;
    return g;
}
42
+ static uint32_t inv_s(int64_t a, int64_t m) {
43
+ a = ((a % m) + m) % m;
44
+ if (!a) return 0;
45
+ int64_t x, y;
46
+ if (egcd(a, m, &x, &y) != 1) return 0;
47
+ return (uint32_t)(((x % m) + m) % m);
48
+ }
49
+ static inline uint64_t garner(uint16_t r0, uint32_t r1, uint16_t r2) {
50
+ uint32_t t0 = r0;
51
+ uint32_t t1 = (uint32_t)(
52
+ ((int64_t)r1 - (int64_t)(t0 % 8191) + 8191) % 8191
53
+ * (uint64_t)INV01 % 8191);
54
+ uint64_t base = t0 + (uint64_t)t1 * 127;
55
+ int64_t d = ((int64_t)r2 - (int64_t)(base % 65536) + 131072LL) % 65536;
56
+ return base + (uint64_t)(d * (uint64_t)INV012 % 65536) * 127ULL * 8191ULL;
57
+ }
58
+
59
+ // int64_t used for loop counters — works on Windows, Linux, Mac
60
+ static void kernel_scalar(
61
+ const uint16_t *a0, const uint32_t *a1, const uint16_t *a2,
62
+ const uint16_t *b0, const uint32_t *b1, const uint16_t *b2,
63
+ uint16_t *r0, uint32_t *r1, uint16_t *r2,
64
+ int64_t n, int op)
65
+ {
66
+ for (int64_t i = 0; i < n; i++) {
67
+ if (op == 0) {
68
+ r0[i] = r127s(a0[i] + b0[i]);
69
+ r1[i] = r8191s((uint64_t)a1[i] + b1[i]);
70
+ r2[i] = (uint16_t)((a2[i] + b2[i]) & 0xFFFF);
71
+ } else if (op == 1) {
72
+ r0[i] = r127s((uint32_t)a0[i] * b0[i]);
73
+ r1[i] = r8191s((uint64_t)a1[i] * b1[i]);
74
+ r2[i] = (uint16_t)((uint32_t)a2[i] * b2[i]);
75
+ } else if (op == 2) {
76
+ r0[i] = r127s(127 + a0[i] - b0[i] % 127);
77
+ r1[i] = r8191s(8191 + (uint64_t)a1[i] - b1[i] % 8191);
78
+ r2[i] = (uint16_t)((65536 + a2[i] - b2[i] % 65536) & 0xFFFF);
79
+ } else {
80
+ r0[i] = r127s ((uint32_t)a0[i] * inv_s(b0[i], 127));
81
+ r1[i] = r8191s((uint64_t)a1[i] * inv_s(b1[i], 8191));
82
+ r2[i] = (uint16_t)(((uint32_t)a2[i] * inv_s(b2[i], 65536)) & 0xFFFF);
83
+ }
84
+ }
85
+ }
86
+
87
+ #if defined(__AVX2__)
88
+ #include <immintrin.h>
89
+ #define HAVE_AVX2 1
90
// A 256-bit register treated as 16 lanes of 16 bits.
using vec16 = __m256i;
// Short wrappers around the 16-bit-lane intrinsics used by the kernels below.
// V1: broadcast x to every lane.
static inline vec16 V1(int x)              { return _mm256_set1_epi16((short)x); }
// Va / Vs: lane-wise add / subtract (wrapping in 16 bits).
static inline vec16 Va(vec16 a, vec16 b)   { return _mm256_add_epi16(a, b); }
static inline vec16 Vs(vec16 a, vec16 b)   { return _mm256_sub_epi16(a, b); }
// Vm: lane-wise multiply, keeping the low 16 bits of each product.
static inline vec16 Vm(vec16 a, vec16 b)   { return _mm256_mullo_epi16(a, b); }
// Vn: bitwise AND.
static inline vec16 Vn(vec16 a, vec16 b)   { return _mm256_and_si256(a, b); }
// Vh: lane-wise logical right shift by s bits.
static inline vec16 Vh(vec16 a, int s)     { return _mm256_srli_epi16(a, s); }
// Ve: lane-wise equality; a lane is all-ones where a == b, else zero.
static inline vec16 Ve(vec16 a, vec16 b)   { return _mm256_cmpeq_epi16(a, b); }
98
+ static inline vec16 r127v(vec16 x) {
99
+ vec16 t = Va(Vn(x, V1(0x7F)), Vh(x, 7));
100
+ t = Va(Vn(t, V1(0x7F)), Vh(t, 7));
101
+ return Vs(t, Vn(V1(127), Ve(t, V1(127))));
102
+ }
103
+ static inline vec16 r8191v(vec16 x) {
104
+ vec16 t = Va(Vn(x, V1(0x1FFF)), Vh(x, 13));
105
+ return Vs(t, Vn(V1(8191), Ve(t, V1(8191))));
106
+ }
107
// Lane-wise (a * b) mod 8191 for 16 lanes of 16 bits.
// The products do not fit in 16 bits, so each half of the register is widened
// to eight 32-bit lanes, multiplied and reduced there, then narrowed again.
static inline vec16 mul8191v(vec16 a, vec16 b) {
    __m256i mk = _mm256_set1_epi32(0x1FFF);   // low-13-bit mask
    // f: reduce each 32-bit lane mod 8191 (two folds, then 8191 -> 0).
    auto f = [&](__m256i x) {
        x = _mm256_add_epi32(_mm256_and_si256(x, mk), _mm256_srli_epi32(x, 13));
        x = _mm256_add_epi32(_mm256_and_si256(x, mk), _mm256_srli_epi32(x, 13));
        return _mm256_sub_epi32(x, _mm256_and_si256(
            _mm256_cmpeq_epi32(x, _mm256_set1_epi32(8191)),
            _mm256_set1_epi32(8191)));
    };
    // pl: products of lanes 0..7 (low 128 bits), zero-extended to 32 bits.
    __m256i pl = _mm256_mullo_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(a)),
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b)));
    // ph: products of lanes 8..15 (high 128 bits).
    __m256i ph = _mm256_mullo_epi32(
        _mm256_cvtepu16_epi32(_mm256_extracti128_si256(a, 1)),
        _mm256_cvtepu16_epi32(_mm256_extracti128_si256(b, 1)));
    // packus works per 128-bit half, giving [pl0-3, ph0-3 | pl4-7, ph4-7];
    // the 64-bit permute (order 0,2,1,3) restores lane order 0..15.
    return _mm256_permute4x64_epi64(
        _mm256_packus_epi32(f(pl), f(ph)), 0b11011000);
}
125
// AVX2 kernel: same contract as kernel_scalar(), processing L elements per
// iteration for add (0), mul (1) and sub (2). Division (3) and the tail that
// does not fill a whole vector are delegated to kernel_scalar().
static void kernel_avx2(
    const uint16_t *a0, const uint32_t *a1, const uint16_t *a2,
    const uint16_t *b0, const uint32_t *b1, const uint16_t *b2,
    uint16_t *r0, uint32_t *r1, uint16_t *r2,
    int64_t n, int op)
{
    // Division needs a per-element modular inverse; no vector path exists.
    if (op == 3) { kernel_scalar(a0,a1,a2,b0,b1,b2,r0,r1,r2,n,op); return; }
    int64_t full = (n / L) * L;   // number of elements covered by whole vectors
    for (int64_t base = 0; base < full; base += L) {
        // Stage the operands in 32-byte-aligned 16-bit buffers so aligned
        // loads can be used. The mod-8191 rail is stored as uint32 and is
        // narrowed to 16 bits here.
        // NOTE(review): assumes every a1/b1 value fits in 16 bits (true for
        // residues < 8191) — confirm callers never pass larger values.
        alignas(32) int16_t ta0[L],tb0[L],ta1[L],tb1[L],ta2[L],tb2[L];
        for (int l = 0; l < L; l++) {
            ta0[l]=(int16_t)a0[base+l]; tb0[l]=(int16_t)b0[base+l];
            ta1[l]=(int16_t)a1[base+l]; tb1[l]=(int16_t)b1[base+l];
            ta2[l]=(int16_t)a2[base+l]; tb2[l]=(int16_t)b2[base+l];
        }
        vec16 va0=_mm256_load_si256((vec16*)ta0), vb0=_mm256_load_si256((vec16*)tb0);
        vec16 va1=_mm256_load_si256((vec16*)ta1), vb1=_mm256_load_si256((vec16*)tb1);
        vec16 va2=_mm256_load_si256((vec16*)ta2), vb2=_mm256_load_si256((vec16*)tb2);
        vec16 vr0, vr1, vr2;
        if (op == 0) {
            // add: the mod-65536 rail needs no reduction, 16-bit lanes wrap.
            vr0 = r127v(Va(va0,vb0)); vr1 = r8191v(Va(va1,vb1)); vr2 = Va(va2,vb2);
        } else if (op == 1) {
            // mul: rail 1 goes through the widened 32-bit multiply.
            vr0 = r127v(Vm(va0,vb0)); vr1 = mul8191v(va1,vb1); vr2 = Vm(va2,vb2);
        } else {
            // sub: add the negated operand (modulus - b) on each rail.
            vr0 = r127v (Va(va0, r127v (Vs(V1(127),  vb0))));
            vr1 = r8191v(Va(va1, r8191v(Vs(V1(8191), vb1))));
            vr2 = Va(va2, Vs(V1(0), vb2));
        }
        // Spill the results and widen rail 1 back to the uint32 output array.
        alignas(32) int16_t tr0[L], tr1[L], tr2[L];
        _mm256_store_si256((vec16*)tr0, vr0);
        _mm256_store_si256((vec16*)tr1, vr1);
        _mm256_store_si256((vec16*)tr2, vr2);
        for (int l = 0; l < L; l++) {
            r0[base+l]=(uint16_t)tr0[l];
            r1[base+l]=(uint16_t)tr1[l];
            r2[base+l]=(uint16_t)tr2[l];
        }
    }
    // Remaining n - full (< L) elements.
    kernel_scalar(a0+full,a1+full,a2+full,b0+full,b1+full,b2+full,
                  r0+full,r1+full,r2+full, n-full, op);
}
166
+ #else
167
+ #define HAVE_AVX2 0
168
+ #endif
169
+
170
// Entry point used by the Python bindings. The implementation is chosen at
// compile time: the AVX2 kernel when HAVE_AVX2 is 1, the scalar one otherwise.
static void kernel(
    const uint16_t *a0, const uint32_t *a1, const uint16_t *a2,
    const uint16_t *b0, const uint32_t *b1, const uint16_t *b2,
    uint16_t *r0, uint32_t *r1, uint16_t *r2,
    int64_t n, int op)
{
#if HAVE_AVX2
    kernel_avx2(a0,a1,a2,b0,b1,b2,r0,r1,r2,n,op);
#else
    kernel_scalar(a0,a1,a2,b0,b1,b2,r0,r1,r2,n,op);
#endif
}
182
+
183
+ py::tuple py_encode(arr64 x_in) {
184
+ auto x = x_in.unchecked<1>();
185
+ int64_t n = (int64_t)x_in.shape(0);
186
+ arr16 o0({n}); arr32 o1({n}); arr16 o2({n});
187
+ auto p0 = o0.mutable_unchecked<1>();
188
+ auto p1 = o1.mutable_unchecked<1>();
189
+ auto p2 = o2.mutable_unchecked<1>();
190
+ for (int64_t i = 0; i < n; i++) {
191
+ uint64_t v = x(i) % BM;
192
+ p0(i) = (uint16_t)(v % 127);
193
+ p1(i) = (uint32_t)(v % 8191);
194
+ p2(i) = (uint16_t)(v % 65536);
195
+ }
196
+ return py::make_tuple(o0, o1, o2);
197
+ }
198
+
199
+ arr64 py_decode(arr16 r0_, arr32 r1_, arr16 r2_) {
200
+ int64_t n = (int64_t)r0_.shape(0);
201
+ if (r1_.shape(0) != (size_t)n || r2_.shape(0) != (size_t)n)
202
+ throw std::invalid_argument("array length mismatch");
203
+ arr64 out({n});
204
+ auto r0 = r0_.unchecked<1>();
205
+ auto r1 = r1_.unchecked<1>();
206
+ auto r2 = r2_.unchecked<1>();
207
+ auto o = out.mutable_unchecked<1>();
208
+ for (int64_t i = 0; i < n; i++) o(i) = garner(r0(i), r1(i), r2(i));
209
+ return out;
210
+ }
211
+
212
+ py::tuple py_op(arr16 a0, arr32 a1, arr16 a2,
213
+ arr16 b0, arr32 b1, arr16 b2, int opcode) {
214
+ if (opcode < 0 || opcode > 3)
215
+ throw std::invalid_argument("opcode must be 0=add 1=mul 2=sub 3=div");
216
+ int64_t n = (int64_t)a0.shape(0);
217
+ arr16 r0({n}); arr32 r1({n}); arr16 r2({n});
218
+ kernel(a0.data(), a1.data(), a2.data(),
219
+ b0.data(), b1.data(), b2.data(),
220
+ r0.mutable_data(), r1.mutable_data(), r2.mutable_data(),
221
+ n, opcode);
222
+ return py::make_tuple(r0, r1, r2);
223
+ }
224
+
225
+ PYBIND11_MODULE(_core, m) {
226
+ m.doc() = "rns_engine._core: AVX2-accelerated 3-rail RNS exact integer arithmetic.";
227
+ m.attr("M") = (uint64_t)BM;
228
+ m.attr("M0") = (uint32_t)M0;
229
+ m.attr("M1") = (uint32_t)M1;
230
+ m.attr("M2") = (uint32_t)M2;
231
+ m.attr("HAS_AVX2") = (bool)HAVE_AVX2;
232
+ m.def("encode", &py_encode, "uint64[] -> (r0,r1,r2)");
233
+ m.def("decode", &py_decode, "(r0,r1,r2) -> uint64[]");
234
+ m.def("op", &py_op, "opcode: 0=add 1=mul 2=sub 3=div");
235
+ m.def("add", [](arr16 a0,arr32 a1,arr16 a2,arr16 b0,arr32 b1,arr16 b2)
236
+ { return py_op(a0,a1,a2,b0,b1,b2,0); }, "Exact addition.");
237
+ m.def("sub", [](arr16 a0,arr32 a1,arr16 a2,arr16 b0,arr32 b1,arr16 b2)
238
+ { return py_op(a0,a1,a2,b0,b1,b2,2); }, "Exact subtraction.");
239
+ m.def("mul", [](arr16 a0,arr32 a1,arr16 a2,arr16 b0,arr32 b1,arr16 b2)
240
+ { return py_op(a0,a1,a2,b0,b1,b2,1); }, "Exact multiplication.");
241
+ m.def("div_", [](arr16 a0,arr32 a1,arr16 a2,arr16 b0,arr32 b1,arr16 b2)
242
+ { return py_op(a0,a1,a2,b0,b1,b2,3); }, "Exact division.");
243
+ }
@@ -0,0 +1,178 @@
1
+ Metadata-Version: 2.4
2
+ Name: rns_engine
3
+ Version: 0.1.0
4
+ Summary: Exact integer arithmetic via AVX2-accelerated Residue Number System
5
+ License: MIT License
6
+
7
+ Copyright (c) 2026 Evan Wesley
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
26
+
27
+ Project-URL: Homepage, https://github.com/playfularchitect/rns_engine
28
+ Project-URL: Issues, https://github.com/playfularchitect/rns_engine/issues
29
+ Keywords: arithmetic,exact,integer,RNS,residue,SIMD,AVX2
30
+ Classifier: Development Status :: 3 - Alpha
31
+ Classifier: Intended Audience :: Science/Research
32
+ Classifier: Intended Audience :: Developers
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Programming Language :: Python :: 3
35
+ Classifier: Programming Language :: Python :: 3.9
36
+ Classifier: Programming Language :: Python :: 3.10
37
+ Classifier: Programming Language :: Python :: 3.11
38
+ Classifier: Programming Language :: Python :: 3.12
39
+ Classifier: Programming Language :: C++
40
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
41
+ Classifier: Topic :: Software Development :: Libraries
42
+ Requires-Python: >=3.9
43
+ Description-Content-Type: text/markdown
44
+ License-File: LICENSE
45
+ Requires-Dist: numpy>=1.21
46
+ Dynamic: license-file
47
+
48
+ # rns_engine
49
+
50
+ **Exact integer arithmetic via AVX2-accelerated Residue Number System (RNS).**
51
+
52
+ No floating point. No approximation. Errors are structurally impossible.
53
+
54
+ ---
55
+
56
+ ## What it does
57
+
58
+ Standard Python integers are exact but slow. NumPy is fast but uses floating point or silently overflows. `rns_engine` gives you **exact integer arithmetic at hundreds of millions of operations per second** — the best of both worlds.
59
+
60
+ It works by decomposing integers into residues across three coprime moduli (127, 8191, 65536), performing all operations in residue space using AVX2 SIMD instructions, and reconstructing exact results via the Chinese Remainder Theorem.
61
+
62
+ **Dynamic range:** `[0, 68,174,282,752)` — about 68 billion.
63
+
64
+ ## Install
65
+
66
+ ```bash
67
+ pip install rns_engine
68
+ ```
69
+
70
+ Requires a CPU with AVX2 (any Intel/AMD since ~2013). Falls back to scalar arithmetic on ARM and older hardware.
71
+
72
+ ## Quick start
73
+
74
+ ```python
75
+ import rns_engine as rns
76
+ import numpy as np
77
+
78
+ # Works on arrays of uint64
79
+ a = np.array([123456789, 999999999], dtype=np.uint64)
80
+ b = np.array([987654321, 111111111], dtype=np.uint64)
81
+
82
+ # Encode once
83
+ ea = rns.encode(a) # returns (r0, r1, r2) residue arrays
84
+ eb = rns.encode(b)
85
+
86
+ # Operate in residue space — no intermediate decode needed
87
+ result = rns.decode(*rns.mul(*ea, *eb)) # exact multiplication
88
+
89
+ # Chain multiple operations — decode once at the end
90
+ s1 = rns.add(*ea, *eb) # a + b
91
+ s2 = rns.mul(*s1, *eb) # (a + b) * b
92
+ s3 = rns.sub(*s2, *ea) # (a + b) * b - a
93
+ out = rns.decode(*s3) # one decode, three operations
94
+ ```
95
+
96
+ ## Operations
97
+
98
+ | Function | Description |
99
+ |----------|-------------|
100
+ | `rns.encode(x)` | `uint64[]` → `(r0, r1, r2)` residue arrays |
101
+ | `rns.decode(r0, r1, r2)` | Residues → `uint64[]` via Garner's algorithm |
102
+ | `rns.add(*ea, *eb)` | Exact addition |
103
+ | `rns.sub(*ea, *eb)` | Exact subtraction |
104
+ | `rns.mul(*ea, *eb)` | Exact multiplication |
105
+ | `rns.div_(*ea, *eb)` | Exact division (b must be coprime to all moduli) |
106
+ | `rns.op(*ea, *eb, code)` | Generic: `0`=add `1`=mul `2`=sub `3`=div |
107
+
108
+ ### Division constraint
109
+
110
+ Division requires `b` to be invertible on all three rails:
111
+ - `b % 127 != 0`
112
+ - `b % 8191 != 0`
113
+ - `b % 65536` is **odd** (coprime to 2^16)
114
+
115
+ ```python
116
+ # Safe way to ensure b is valid for division:
117
+ b = np.where(b % 2 == 0, b + 1, b) # make odd
118
+ b = np.where(b % 127 == 0, b + 2, b)
119
+ b = np.where(b % 8191 == 0, b + 4, b)
120
+ b = b % rns.M
121
+ ```
122
+
123
+ ## Performance
124
+
125
+ On a machine with AVX2 (tested on Google Colab's CPU):
126
+
127
+ | Operation | Throughput |
128
+ |-----------|-----------|
129
+ | add | ~200–400 M ops/sec |
130
+ | sub | ~200–400 M ops/sec |
131
+ | mul | ~200–400 M ops/sec |
132
+ | div | ~1.6 M ops/sec (scalar modinv per element) |
133
+
134
+ ## Why RNS?
135
+
136
+ In a Residue Number System, **addition and multiplication have no carry propagation between digits**. Each residue rail is independent. This makes RNS ideal for:
137
+
138
+ - **Exact arithmetic** — results are always correct within the dynamic range
139
+ - **Parallel computation** — rails can run simultaneously
140
+ - **Error detection** — CRT reconstruction fails loudly if any rail is corrupted
141
+ - **Cryptography** — modular arithmetic is the native language of RSA, ECC, etc.
142
+
143
+ ## How it works
144
+
145
+ Three coprime moduli: `m0 = 127`, `m1 = 8191`, `m2 = 65536`
146
+
147
+ Dynamic range: `M = 127 × 8191 × 65536 = 68,174,282,752`
148
+
149
+ **Encode:** `x → (x mod 127, x mod 8191, x mod 65536)`
150
+
151
+ **Operate:** each rail independently, e.g. add: `(a+b) mod mᵢ` per rail
152
+
153
+ **Decode (Garner's algorithm):**
154
+ ```
155
+ t0 = r0
156
+ t1 = (r1 - t0) × inv(127, 8191) mod 8191
157
+ t2 = (r2 - t0 - t1×127) × inv(127×8191, 65536) mod 65536
158
+ x = t0 + t1×127 + t2×127×8191
159
+ ```
160
+
161
+ Mod 127 and mod 8191 reductions use the Mersenne-prime trick:
162
+ `x mod (2^k - 1) = (x & mask) + (x >> k)` — no division needed.
163
+
164
+ ## Building from source
165
+
166
+ ```bash
167
+ git clone https://github.com/playfularchitect/rns_engine
168
+ cd rns_engine
169
+ pip install pybind11 numpy
170
+ pip install -e .
171
+ pytest tests/ -v
172
+ ```
173
+
174
+ Requires `g++` with C++17 support.
175
+
176
+ ## License
177
+
178
+ MIT
@@ -0,0 +1,12 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ src/rns_engine/__init__.py
6
+ src/rns_engine/_core.cpp
7
+ src/rns_engine.egg-info/PKG-INFO
8
+ src/rns_engine.egg-info/SOURCES.txt
9
+ src/rns_engine.egg-info/dependency_links.txt
10
+ src/rns_engine.egg-info/requires.txt
11
+ src/rns_engine.egg-info/top_level.txt
12
+ tests/test_rns.py
@@ -0,0 +1 @@
1
+ numpy>=1.21
@@ -0,0 +1 @@
1
+ rns_engine
@@ -0,0 +1,236 @@
1
+ """
2
+ tests/test_rns.py — correctness tests for rns_engine.
3
+
4
+ Run with: pytest tests/ -v
5
+ """
6
+
7
+ import pytest
8
+ import numpy as np
9
+ import rns_engine as rns
10
+
11
+
12
+ M = rns.M
13
+
14
+
15
+ # ── helpers ───────────────────────────────────────────────────────────────
16
def make(n, seed=42, odd_b=False):
    """Return a reproducible pair ``(a, b)`` of uint64 operand arrays.

    Both arrays have length ``n`` and come from a generator seeded with
    ``seed``. ``a`` is drawn from ``[0, M)``. ``b`` is drawn the same way
    unless ``odd_b`` is true, in which case each element is ``2 * k + 1``
    with ``k`` drawn from ``[0, M // 2)``, i.e. always odd.
    """
    gen = np.random.default_rng(seed)
    # ``a`` is always drawn first so the random stream is consumed in a
    # fixed order for a given seed.
    a = gen.integers(0, M, size=n, dtype=np.uint64)
    if not odd_b:
        return a, gen.integers(0, M, size=n, dtype=np.uint64)
    half = gen.integers(0, M // 2, size=n, dtype=np.uint64)
    return a, half * 2 + 1
24
+
25
+
26
def oracle(a_np, b_np, op):
    """Exact Python arbitrary-precision oracle.

    Computes ``(a op b) % M`` element by element with Python ints, so the
    intermediate sum, difference or product cannot overflow.

    Args:
        a_np: left-hand operands (array of unsigned integers).
        b_np: right-hand operands, paired element-wise with ``a_np``.
        op: one of ``"add"``, ``"sub"`` or ``"mul"``.

    Returns:
        A ``uint64`` array with one reduced result per element of ``a_np``.

    Raises:
        ValueError: if ``op`` is not a supported operation. A misspelled
            operation used to yield an all-zero array silently, which could
            make a broken test look like a legitimate comparison.
    """
    operations = {
        "add": lambda x, y: x + y,
        "sub": lambda x, y: x - y,
        "mul": lambda x, y: x * y,
    }
    try:
        combine = operations[op]
    except KeyError:
        raise ValueError(f"unknown op: {op!r}") from None

    # int() converts each NumPy scalar to an unbounded Python int before the
    # arithmetic; Python's % then always yields a value in [0, M).
    reduced = [combine(int(x), int(y)) % M for x, y in zip(a_np, b_np)]
    return np.array(reduced, dtype=np.uint64)
36
+
37
+
38
+ # ── basic sanity ──────────────────────────────────────────────────────────
39
def test_constants():
    """The module exports the three moduli and their product ``M``."""
    assert rns.M == 127 * 8191 * 65536
    for attr, modulus in (("M0", 127), ("M1", 8191), ("M2", 65536)):
        assert getattr(rns, attr) == modulus
44
+
45
+
46
def test_info_runs():
    """Smoke test: ``rns.info()`` completes without raising."""
    rns.info()
48
+
49
+
50
+ # ── encode / decode ───────────────────────────────────────────────────────
51
def test_roundtrip_small():
    """Values at and around each modulus boundary survive encode -> decode."""
    boundary_values = [0, 1, 126, 127, 8190, 8191, 65535, M - 1]
    vals = np.array(boundary_values, dtype=np.uint64)
    residues = rns.encode(vals)
    restored = rns.decode(*residues)
    assert np.array_equal(vals, restored)
54
+
55
+
56
def test_roundtrip_random():
    """10,000 random values in ``[0, M)`` survive encode -> decode."""
    original, _unused = make(10_000)
    residues = rns.encode(original)
    restored = rns.decode(*residues)
    assert np.array_equal(original, restored)
59
+
60
+
61
def test_encode_reduces_mod_M():
    """Inputs of ``M`` or more come back from the round trip reduced mod ``M``."""
    vals = np.array([M, M + 1, M * 2], dtype=np.uint64)
    expected = np.array([0, 1, 0], dtype=np.uint64)
    residues = rns.encode(vals)
    decoded = rns.decode(*residues)
    assert np.array_equal(decoded, expected)
66
+
67
+
68
+ # ── addition ──────────────────────────────────────────────────────────────
69
def test_add_correctness():
    """RNS addition agrees with the big-integer oracle on 5,000 random pairs."""
    a, b = make(5_000)
    ea = rns.encode(a)
    eb = rns.encode(b)
    total = rns.add(*ea, *eb)
    got = rns.decode(*total)
    assert np.array_equal(got, oracle(a, b, "add"))
74
+
75
+
76
def test_add_zero():
    """Adding zero leaves every operand unchanged."""
    a, _unused = make(100)
    zeros = np.zeros(100, dtype=np.uint64)
    ea = rns.encode(a)
    ez = rns.encode(zeros)
    got = rns.decode(*rns.add(*ea, *ez))
    assert np.array_equal(got, a)
81
+
82
+
83
def test_add_wraps():
    """``(M - 1) + 1`` wraps around to 0."""
    top = rns.encode(np.array([M - 1], dtype=np.uint64))
    one = rns.encode(np.array([1], dtype=np.uint64))
    got = rns.decode(*rns.add(*top, *one))
    assert got[0] == 0
88
+
89
+
90
+ # ── subtraction ───────────────────────────────────────────────────────────
91
def test_sub_correctness():
    """RNS subtraction agrees with the big-integer oracle on 5,000 random pairs."""
    a, b = make(5_000)
    ea = rns.encode(a)
    eb = rns.encode(b)
    difference = rns.sub(*ea, *eb)
    got = rns.decode(*difference)
    assert np.array_equal(got, oracle(a, b, "sub"))
96
+
97
+
98
def test_sub_self_is_zero():
    """``a - a`` decodes to 0 for every element."""
    a, _unused = make(100)
    ea = rns.encode(a)
    # Encode a second time, as the original test did, rather than reusing ``ea``.
    ea_again = rns.encode(a)
    got = rns.decode(*rns.sub(*ea, *ea_again))
    assert not np.any(got != 0)
102
+
103
+
104
def test_sub_wraps():
    """``0 - 1`` wraps around to ``M - 1``."""
    zero = rns.encode(np.array([0], dtype=np.uint64))
    one = rns.encode(np.array([1], dtype=np.uint64))
    got = rns.decode(*rns.sub(*zero, *one))
    assert got[0] == M - 1
109
+
110
+
111
+ # ── multiplication ────────────────────────────────────────────────────────
112
def test_mul_correctness():
    """RNS multiplication agrees with the big-integer oracle on 5,000 random pairs."""
    a, b = make(5_000)
    ea = rns.encode(a)
    eb = rns.encode(b)
    product = rns.mul(*ea, *eb)
    got = rns.decode(*product)
    assert np.array_equal(got, oracle(a, b, "mul"))
117
+
118
+
119
def test_mul_by_zero():
    """Multiplying by zero yields zero for every element."""
    a, _unused = make(100)
    zeros = np.zeros(100, dtype=np.uint64)
    ea = rns.encode(a)
    ez = rns.encode(zeros)
    got = rns.decode(*rns.mul(*ea, *ez))
    assert not np.any(got != 0)
124
+
125
+
126
def test_mul_by_one():
    """Multiplying by one leaves every operand unchanged."""
    a, _unused = make(100)
    ones = np.ones(100, dtype=np.uint64)
    ea = rns.encode(a)
    e_ones = rns.encode(ones)
    got = rns.decode(*rns.mul(*ea, *e_ones))
    assert np.array_equal(got, a)
131
+
132
+
133
+ # ── division ──────────────────────────────────────────────────────────────
134
def test_div_correctness():
    """``div_`` matches a per-rail modular-inverse oracle for invertible divisors.

    The divisors are forced to be units on all three rails: odd (for 65536)
    and not a multiple of 127 or 8191. The expected value is built by
    computing ``a * b^-1`` independently on each rail in Python and decoding
    the resulting residues.
    """
    a, b = make(1_000, odd_b=True)

    # ``b`` is odd, so it is already a unit mod 65536. Step by 2, never 1:
    # adding 1 would make the value even and destroy that invertibility.
    # Loop because moving off a multiple of one prime may land on a multiple
    # of the other.
    not_unit = (b % 127 == 0) | (b % 8191 == 0)
    while not_unit.any():
        b = np.where(not_unit, b + 2, b)
        not_unit = (b % 127 == 0) | (b % 8191 == 0)
    # Every modulus divides M, so this reduction keeps all three residues.
    b = b % M

    got = rns.decode(*rns.div_(*rns.encode(a), *rns.encode(b)))

    def mod_inverse(value, modulus):
        """Return ``value^-1 mod modulus`` (extended Euclid), or 0 if none exists."""
        value %= modulus
        # Invariant: coeff * value == remainder (mod modulus).
        remainder, next_remainder = value, modulus
        coeff, next_coeff = 1, 0
        while next_remainder:
            quotient = remainder // next_remainder
            remainder, next_remainder = (
                next_remainder,
                remainder - quotient * next_remainder,
            )
            coeff, next_coeff = next_coeff, coeff - quotient * next_coeff
        return coeff % modulus if remainder == 1 else 0

    # Work on Python ints so the per-rail products cannot overflow.
    a_ints = [int(x) for x in a]
    b_ints = [int(y) for y in b]

    def rail_quotients(modulus, dtype):
        """Residues of ``a * b^-1`` on one rail, for every element."""
        return np.array(
            [
                x % modulus * mod_inverse(y, modulus) % modulus
                for x, y in zip(a_ints, b_ints)
            ],
            dtype=dtype,
        )

    exp = rns.decode(
        rail_quotients(127, np.uint16),
        rail_quotients(8191, np.uint32),
        rail_quotients(65536, np.uint16),
    )
    assert np.array_equal(got, exp)
161
+
162
+
163
+ # ── algebraic identities ──────────────────────────────────────────────────
164
def test_identity_sub_add(n=500):
    """a - b + b == a"""
    a, b = make(n)
    enc_a = rns.encode(a)
    enc_b = rns.encode(b)
    difference = rns.sub(*enc_a, *enc_b)
    restored = rns.add(*difference, *enc_b)
    decoded = rns.decode(*restored)
    assert np.array_equal(decoded, a)
171
+
172
+
173
def test_identity_mul_div(n=200):
    """a * b / b == a (b invertible)

    ``b`` must be a unit on every rail for the identity to hold: odd (for
    65536) and not a multiple of 127 or 8191.
    """
    a, b = make(n, odd_b=True)

    # ``b`` starts odd. Step by 2, never 1: adding 1 would make the value
    # even, which has no inverse mod 65536 and breaks the identity. Loop
    # because moving off a multiple of one prime may land on a multiple of
    # the other.
    not_unit = (b % 127 == 0) | (b % 8191 == 0)
    while not_unit.any():
        b = np.where(not_unit, b + 2, b)
        not_unit = (b % 127 == 0) | (b % 8191 == 0)
    # Every modulus divides M, so this reduction keeps all three residues.
    b = b % M

    ea, eb = rns.encode(a), rns.encode(b)
    product = rns.mul(*ea, *eb)
    quotient = rns.div_(*product, *eb)
    assert np.array_equal(rns.decode(*quotient), a)
183
+
184
+
185
def test_identity_distributive(n=500):
    """(a + b) * c == a*c + b*c"""
    gen = np.random.default_rng(99)
    # Draw a, b, c in that order so the random stream matches a fixed seed.
    a, b, c = (
        gen.integers(0, M, size=n, dtype=np.uint64) for _ in range(3)
    )
    ea = rns.encode(a)
    eb = rns.encode(b)
    ec = rns.encode(c)

    sum_ab = rns.add(*ea, *eb)
    lhs = rns.decode(*rns.mul(*sum_ab, *ec))

    prod_ac = rns.mul(*ea, *ec)
    prod_bc = rns.mul(*eb, *ec)
    rhs = rns.decode(*rns.add(*prod_ac, *prod_bc))

    assert np.array_equal(lhs, rhs)
195
+
196
+
197
def test_identity_additive_inverse(n=500):
    """a + (-a) == 0"""
    a, _unused = make(n)
    enc_a = rns.encode(a)
    enc_zero = rns.encode(np.zeros(n, dtype=np.uint64))
    # The negation is formed as 0 - a.
    negated = rns.sub(*enc_zero, *enc_a)
    total = rns.decode(*rns.add(*enc_a, *negated))
    assert not np.any(total != 0)
205
+
206
+
207
def test_chain_expression(n=1000):
    """((a + b) * c - d) * e matches Python exact arithmetic"""
    gen = np.random.default_rng(7)
    operands = []
    for _ in range(5):
        operands.append(gen.integers(0, 1000, size=n, dtype=np.uint64))
    a, b, c, d, e = operands

    # An object-dtype array makes NumPy do the arithmetic on Python ints.
    exact = a.astype(object)
    py_exp = (((exact + b) * c - d) * e) % M

    acc = rns.add(*rns.encode(a), *rns.encode(b))
    acc = rns.mul(*acc, *rns.encode(c))
    acc = rns.sub(*acc, *rns.encode(d))
    acc = rns.mul(*acc, *rns.encode(e))
    result = rns.decode(*acc)

    assert np.array_equal(result, py_exp.astype(np.uint64))
220
+
221
+
222
+ # ── op() generic interface ────────────────────────────────────────────────
223
def test_op_matches_named_functions():
    """``rns.op`` with codes 0, 1, 2 agrees with ``add``, ``mul``, ``sub``."""
    a, b = make(100)
    ea = rns.encode(a)
    eb = rns.encode(b)
    named_by_code = {0: rns.add, 1: rns.mul, 2: rns.sub}
    for code, named in named_by_code.items():
        via_op = rns.decode(*rns.op(*ea, *eb, code))
        via_name = rns.decode(*named(*ea, *eb))
        assert np.array_equal(via_op, via_name)
230
+
231
+
232
def test_op_invalid_opcode():
    """``rns.op`` raises when given opcode 99."""
    values, _unused = make(10)
    encoded = rns.encode(values)
    with pytest.raises(Exception):
        rns.op(*encoded, *encoded, 99)