PyPI - rns-engine - Versions diffs - 0.1.0__tar.gz - Mend

rns-engine 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

rns_engine-0.1.0/LICENSE +21 -0
rns_engine-0.1.0/PKG-INFO +178 -0
rns_engine-0.1.0/README.md +131 -0
rns_engine-0.1.0/pyproject.toml +54 -0
rns_engine-0.1.0/setup.cfg +4 -0
rns_engine-0.1.0/setup.py +51 -0
rns_engine-0.1.0/src/rns_engine/__init__.py +60 -0
rns_engine-0.1.0/src/rns_engine/_core.cpp +243 -0
rns_engine-0.1.0/src/rns_engine.egg-info/PKG-INFO +178 -0
rns_engine-0.1.0/src/rns_engine.egg-info/SOURCES.txt +12 -0
rns_engine-0.1.0/src/rns_engine.egg-info/dependency_links.txt +1 -0
rns_engine-0.1.0/src/rns_engine.egg-info/requires.txt +1 -0
rns_engine-0.1.0/src/rns_engine.egg-info/top_level.txt +1 -0
rns_engine-0.1.0/tests/test_rns.py +236 -0

rns_engine-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Evan Wesley
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

rns_engine-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,178 @@
+Metadata-Version: 2.4
+Name: rns_engine
+Version: 0.1.0
+Summary: Exact integer arithmetic via AVX2-accelerated Residue Number System
+License: MIT License
+        Copyright (c) 2026 Evan Wesley
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+Project-URL: Homepage, https://github.com/playfularchitect/rns_engine
+Project-URL: Issues, https://github.com/playfularchitect/rns_engine/issues
+Keywords: arithmetic,exact,integer,RNS,residue,SIMD,AVX2
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: C++
+Classifier: Topic :: Scientific/Engineering :: Mathematics
+Classifier: Topic :: Software Development :: Libraries
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy>=1.21
+Dynamic: license-file
+# rns_engine
+**Exact integer arithmetic via AVX2-accelerated Residue Number System (RNS).**
+No floating point. No approximation. Errors are structurally impossible.
+---
+## What it does
+Standard Python integers are exact but slow. NumPy is fast but uses floating point or silently overflows. `rns_engine` gives you **exact integer arithmetic at hundreds of millions of operations per second** — the best of both worlds.
+It works by decomposing integers into residues across three coprime moduli (127, 8191, 65536), performing all operations in residue space using AVX2 SIMD instructions, and reconstructing exact results via the Chinese Remainder Theorem.
+**Dynamic range:** `[0, 68,174,282,752)` — about 68 billion.
+## Install
+```bash
+pip install rns_engine
+```
+Uses AVX2 SIMD when the extension is compiled with AVX2 enabled (available on Intel/AMD CPUs since ~2013). Builds without AVX2, such as on ARM, use the scalar code path instead.
+## Quick start
+```python
+import rns_engine as rns
+import numpy as np
+# Works on arrays of uint64
+a = np.array([123456789, 999999999], dtype=np.uint64)
+b = np.array([987654321, 111111111], dtype=np.uint64)
+# Encode once
+ea = rns.encode(a)   # returns (r0, r1, r2) residue arrays
+eb = rns.encode(b)
+# Operate in residue space — no intermediate decode needed
+result = rns.decode(*rns.mul(*ea, *eb))   # exact multiplication
+# Chain multiple operations — decode once at the end
+s1 = rns.add(*ea, *eb)      # a + b
+s2 = rns.mul(*s1, *eb)      # (a + b) * b
+s3 = rns.sub(*s2, *ea)      # (a + b) * b - a
+out = rns.decode(*s3)        # one decode, three operations
+```
+## Operations
+| Function | Description |
+|----------|-------------|
+| `rns.encode(x)` | `uint64[]` → `(r0, r1, r2)` residue arrays |
+| `rns.decode(r0, r1, r2)` | Residues → `uint64[]` via Garner's algorithm |
+| `rns.add(*ea, *eb)` | Exact addition |
+| `rns.sub(*ea, *eb)` | Exact subtraction |
+| `rns.mul(*ea, *eb)` | Exact multiplication |
+| `rns.div_(*ea, *eb)` | Exact division (b must be coprime to all moduli) |
+| `rns.op(*ea, *eb, code)` | Generic: `0`=add `1`=mul `2`=sub `3`=div |
+### Division constraint
+Division requires `b` to be invertible on all three rails:
+- `b % 127  != 0`
+- `b % 8191 != 0`
+- `b % 65536` is **odd** (coprime to 2^16)
+```python
+# Safe way to ensure b is valid for division:
+b = np.where(b % 2 == 0, b + 1, b)   # make odd
+b = np.where(b % 127  == 0, b + 2, b)
+b = np.where(b % 8191 == 0, b + 4, b)
+b = b % rns.M
+```
+## Performance
+On a machine with AVX2 (tested on Google Colab's CPU):
+| Operation | Throughput |
+|-----------|-----------|
+| add | ~200–400 M ops/sec |
+| sub | ~200–400 M ops/sec |
+| mul | ~200–400 M ops/sec |
+| div | ~1.6 M ops/sec (scalar modinv per element) |
+## Why RNS?
+In a Residue Number System, **addition and multiplication have no carry propagation between digits**. Each residue rail is independent. This makes RNS ideal for:
+- **Exact arithmetic** — results are always correct within the dynamic range
+- **Parallel computation** — rails can run simultaneously
+- **Error detection** — CRT reconstruction fails loudly if any rail is corrupted
+- **Cryptography** — modular arithmetic is the native language of RSA, ECC, etc.
+## How it works
+Three coprime moduli: `m0 = 127`, `m1 = 8191`, `m2 = 65536`
+Dynamic range: `M = 127 × 8191 × 65536 = 68,174,282,752`
+**Encode:** `x → (x mod 127, x mod 8191, x mod 65536)`
+**Operate:** each rail independently, e.g. add: `(a+b) mod mᵢ` per rail
+**Decode (Garner's algorithm):**
+```
+t0 = r0
+t1 = (r1 - t0) × inv(127, 8191)  mod 8191
+t2 = (r2 - t0 - t1×127) × inv(127×8191, 65536)  mod 65536
+x  = t0 + t1×127 + t2×127×8191
+```
+Mod 127 and mod 8191 reductions use the Mersenne-prime trick:
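+`x ≡ (x & mask) + (x >> k)  (mod 2^k - 1)`, with `mask = 2^k - 1` — folding the high bits onto the low bits (repeated until the value fits in `k` bits) replaces the division.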
+## Building from source
+```bash
+git clone https://github.com/playfularchitect/rns_engine
+cd rns_engine
+pip install pybind11 numpy
+pip install -e .
+pytest tests/ -v
+```
+Requires `g++` with C++17 support.
+## License
+MIT

rns_engine-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,131 @@
+# rns_engine
+**Exact integer arithmetic via AVX2-accelerated Residue Number System (RNS).**
+No floating point. No approximation. Errors are structurally impossible.
+---
+## What it does
+Standard Python integers are exact but slow. NumPy is fast but uses floating point or silently overflows. `rns_engine` gives you **exact integer arithmetic at hundreds of millions of operations per second** — the best of both worlds.
+It works by decomposing integers into residues across three coprime moduli (127, 8191, 65536), performing all operations in residue space using AVX2 SIMD instructions, and reconstructing exact results via the Chinese Remainder Theorem.
+**Dynamic range:** `[0, 68,174,282,752)` — about 68 billion.
+## Install
+```bash
+pip install rns_engine
+```
+Uses AVX2 SIMD when the extension is compiled with AVX2 enabled (available on Intel/AMD CPUs since ~2013). Builds without AVX2, such as on ARM, use the scalar code path instead.
+## Quick start
+```python
+import rns_engine as rns
+import numpy as np
+# Works on arrays of uint64
+a = np.array([123456789, 999999999], dtype=np.uint64)
+b = np.array([987654321, 111111111], dtype=np.uint64)
+# Encode once
+ea = rns.encode(a)   # returns (r0, r1, r2) residue arrays
+eb = rns.encode(b)
+# Operate in residue space — no intermediate decode needed
+result = rns.decode(*rns.mul(*ea, *eb))   # exact multiplication
+# Chain multiple operations — decode once at the end
+s1 = rns.add(*ea, *eb)      # a + b
+s2 = rns.mul(*s1, *eb)      # (a + b) * b
+s3 = rns.sub(*s2, *ea)      # (a + b) * b - a
+out = rns.decode(*s3)        # one decode, three operations
+```
+## Operations
+| Function | Description |
+|----------|-------------|
+| `rns.encode(x)` | `uint64[]` → `(r0, r1, r2)` residue arrays |
+| `rns.decode(r0, r1, r2)` | Residues → `uint64[]` via Garner's algorithm |
+| `rns.add(*ea, *eb)` | Exact addition |
+| `rns.sub(*ea, *eb)` | Exact subtraction |
+| `rns.mul(*ea, *eb)` | Exact multiplication |
+| `rns.div_(*ea, *eb)` | Exact division (b must be coprime to all moduli) |
+| `rns.op(*ea, *eb, code)` | Generic: `0`=add `1`=mul `2`=sub `3`=div |
+### Division constraint
+Division requires `b` to be invertible on all three rails:
+- `b % 127  != 0`
+- `b % 8191 != 0`
+- `b % 65536` is **odd** (coprime to 2^16)
+```python
+# Safe way to ensure b is valid for division:
+b = np.where(b % 2 == 0, b + 1, b)   # make odd
+b = np.where(b % 127  == 0, b + 2, b)
+b = np.where(b % 8191 == 0, b + 4, b)
+b = b % rns.M
+```
+## Performance
+On a machine with AVX2 (tested on Google Colab's CPU):
+| Operation | Throughput |
+|-----------|-----------|
+| add | ~200–400 M ops/sec |
+| sub | ~200–400 M ops/sec |
+| mul | ~200–400 M ops/sec |
+| div | ~1.6 M ops/sec (scalar modinv per element) |
+## Why RNS?
+In a Residue Number System, **addition and multiplication have no carry propagation between digits**. Each residue rail is independent. This makes RNS ideal for:
+- **Exact arithmetic** — results are always correct within the dynamic range
+- **Parallel computation** — rails can run simultaneously
+- **Error detection** — CRT reconstruction fails loudly if any rail is corrupted
+- **Cryptography** — modular arithmetic is the native language of RSA, ECC, etc.
+## How it works
+Three coprime moduli: `m0 = 127`, `m1 = 8191`, `m2 = 65536`
+Dynamic range: `M = 127 × 8191 × 65536 = 68,174,282,752`
+**Encode:** `x → (x mod 127, x mod 8191, x mod 65536)`
+**Operate:** each rail independently, e.g. add: `(a+b) mod mᵢ` per rail
+**Decode (Garner's algorithm):**
+```
+t0 = r0
+t1 = (r1 - t0) × inv(127, 8191)  mod 8191
+t2 = (r2 - t0 - t1×127) × inv(127×8191, 65536)  mod 65536
+x  = t0 + t1×127 + t2×127×8191
+```
+Mod 127 and mod 8191 reductions use the Mersenne-prime trick:
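+`x ≡ (x & mask) + (x >> k)  (mod 2^k - 1)`, with `mask = 2^k - 1` — folding the high bits onto the low bits (repeated until the value fits in `k` bits) replaces the division.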
+## Building from source
+```bash
+git clone https://github.com/playfularchitect/rns_engine
+cd rns_engine
+pip install pybind11 numpy
+pip install -e .
+pytest tests/ -v
+```
+Requires `g++` with C++17 support.
+## License
+MIT

rns_engine-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,54 @@
+[build-system]
+requires = ["setuptools>=64", "pybind11>=2.11"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "rns_engine"
+version = "0.1.0"
+description = "Exact integer arithmetic via AVX2-accelerated Residue Number System"
+readme = "README.md"
+license = { file = "LICENSE" }
+requires-python = ">=3.9"
+dependencies = ["numpy>=1.21"]
+keywords = ["arithmetic", "exact", "integer", "RNS", "residue", "SIMD", "AVX2"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Science/Research",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: C++",
+    "Topic :: Scientific/Engineering :: Mathematics",
+    "Topic :: Software Development :: Libraries",
+]
+[project.urls]
+Homepage = "https://github.com/playfularchitect/rns_engine"
+Issues   = "https://github.com/playfularchitect/rns_engine/issues"
+[tool.setuptools.packages.find]
+where = ["src"]
+[tool.setuptools.package-data]
+rns_engine = ["*.pyi"]
+[tool.cibuildwheel]
+# Build for CPython 3.9–3.12 only (not PyPy)
+build = "cp39-* cp310-* cp311-* cp312-*"
+# Skip 32-bit builds and musllinux (AVX2 not worth supporting there)
+skip  = "*-win32 *-manylinux_i686 *-musllinux*"
+dependency-versions = "latest"
+[tool.cibuildwheel.linux]
+archs = ["x86_64"]
+[tool.cibuildwheel.macos]
+# x86_64 supports AVX2; arm64 (M1/M2) uses scalar fallback
+archs = ["x86_64", "arm64"]
+[tool.cibuildwheel.windows]
+archs = ["AMD64"]

rns_engine-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

rns_engine-0.1.0/setup.py ADDED Viewed

@@ -0,0 +1,51 @@
+"""
+setup.py — builds the rns_engine C++ extension.
+Detects AVX2 support and enables it when available.
+Falls back to scalar on non-x86 or older hardware.
+"""
+import sys
+import platform
+from setuptools import setup, Extension
+import pybind11
+def get_compile_args():
+    """Return compiler flags appropriate for the current platform."""
+    system = platform.system()
+    machine = platform.machine()
+    common = ["-std=c++17", "-O3", "-DNDEBUG", "-Wno-unused-function"]
+    if system in ("Linux", "Darwin") and machine in ("x86_64", "AMD64"):
+        # Enable AVX2 on x86-64 Linux/Mac
+        return common + ["-mavx2", "-march=native", "-funroll-loops"]
+    elif system == "Windows":
+        # MSVC flags
+        return ["/std:c++17", "/O2", "/DNDEBUG", "/arch:AVX2"]
+    else:
+        # ARM, RISC-V, etc. — scalar fallback, still fast
+        return common
+    return common
+def get_link_args():
+    if platform.system() == "Windows":
+        return []
+    return []
+# The single compiled module of the package, importable as ``rns_engine._core``.
+ext = Extension(
+    "rns_engine._core",
+    sources=["src/rns_engine/_core.cpp"],
+    include_dirs=[pybind11.get_include()],  # location of the pybind11 headers
+    extra_compile_args=get_compile_args(),
+    extra_link_args=get_link_args(),
+    language="c++",
+)
+# Only the extension is declared here; name, version and the other project
+# metadata are declared in pyproject.toml.
+setup(ext_modules=[ext])

rns_engine-0.1.0/src/rns_engine/__init__.py ADDED Viewed

@@ -0,0 +1,60 @@
+"""
+rns_engine — Exact integer arithmetic via Residue Number System (RNS).
+Dynamic range: [0, 68,174,282,752)  =  127 × 8191 × 65536
+AVX2-accelerated on x86; scalar fallback on all other platforms.
+Quick start
+-----------
+>>> import rns_engine as rns
+>>> import numpy as np
+>>>
+>>> a = np.array([123456789, 999999999], dtype=np.uint64)
+>>> b = np.array([987654321, 111111111], dtype=np.uint64)
+>>>
+>>> ea = rns.encode(a)          # -> (r0, r1, r2) residue arrays
+>>> eb = rns.encode(b)
+>>>
+>>> result = rns.decode(*rns.mul(*ea, *eb))   # exact multiplication
+>>> # stays in residue space for multi-step expressions:
+>>> s1 = rns.add(*ea, *eb)      # a + b
+>>> s2 = rns.mul(*s1, *eb)      # (a + b) * b
+>>> out = rns.decode(*s2)       # decode once at the end
+Notes
+-----
+- All values must be in [0, M) where M = rns.M = 68,174,282,752
+- Values outside this range are reduced mod M on encode
+- Division requires b to be coprime to all moduli:
+    b % 127  != 0
+    b % 8191 != 0
+    b % 65536 is odd  (coprime to 2^16)
+"""
+from ._core import (
+    encode,
+    decode,
+    op,
+    add,
+    sub,
+    mul,
+    div_,
+    M,
+    M0,
+    M1,
+    M2,
+    HAS_AVX2,
+)
+__version__ = "0.1.0"
+__all__ = ["encode", "decode", "op", "add", "sub", "mul", "div_",
+           "M", "M0", "M1", "M2", "HAS_AVX2"]
+def info():
+    """Print a summary of the engine configuration."""
+    print(f"rns_engine v{__version__}")
+    print(f"  Dynamic range : [0, {M:,})")
+    print(f"  Moduli        : {M0} × {M1} × {M2}")
+    print(f"  AVX2          : {'yes' if HAS_AVX2 else 'no (scalar fallback)'}")
+    print(f"  Operations    : add  sub  mul  div_")

rns_engine-0.1.0/src/rns_engine/_core.cpp ADDED Viewed

@@ -0,0 +1,243 @@
+/*
+ * rns_engine/_core.cpp
+ * 3-rail RNS exact integer arithmetic. AVX2 + scalar fallback.
+ * Works on Windows/MSVC, Linux/GCC, Mac/Clang.
+ */
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <stdint.h>
+#include <stdexcept>
+namespace py = pybind11;
+// NumPy array aliases: rails 0 and 2 travel as uint16, rail 1 as uint32,
+// and decoded values as uint64.
+using arr16 = py::array_t<uint16_t>;
+using arr32 = py::array_t<uint32_t>;
+using arr64 = py::array_t<uint64_t>;
+// The three pairwise-coprime moduli: 2^7 - 1, 2^13 - 1 and 2^16.
+static constexpr uint32_t M0     = 127;
+static constexpr uint32_t M1     = 8191;
+static constexpr uint32_t M2     = 65536;
+// Dynamic range: the product of the three moduli.
+static constexpr uint64_t BM     = (uint64_t)M0 * M1 * M2;
+// Inverse of 127 modulo 8191 (127 * 129 = 16383 = 2 * 8191 + 1).
+static constexpr uint32_t INV01  = 129;
+// Inverse of 127 * 8191 modulo 65536, used for the last Garner digit.
+static constexpr uint32_t INV012 = 24705;
+// Elements handled per AVX2 iteration: sixteen 16-bit lanes in one 256-bit register.
+#define L 16
+static inline uint16_t r127s(uint32_t x) {
+    // 2^7 == 1 (mod 127), so adding the bits above bit 6 onto the low seven
+    // bits keeps the residue. Two folding rounds are applied, which is enough
+    // for the small sums and products this file passes in.
+    for (int pass = 0; pass < 2; ++pass)
+        x = (x & 0x7F) + (x >> 7);
+    // A folded value of exactly 127 represents residue 0.
+    if (x == 127) return 0;
+    return (uint16_t)x;
+}
+static inline uint32_t r8191s(uint64_t x) {
+    // Same folding idea as the mod-127 reduction, with 2^13 == 1 (mod 8191):
+    // add the bits above bit 12 onto the low thirteen bits, twice.
+    for (int pass = 0; pass < 2; ++pass)
+        x = (x & 0x1FFF) + (x >> 13);
+    // A folded value of exactly 8191 represents residue 0.
+    if (x == 8191) return 0;
+    return (uint32_t)x;
+}
+static int64_t egcd(int64_t a, int64_t b, int64_t *x, int64_t *y) {
+    // Recursive extended Euclid: returns g = gcd(a, b) and stores
+    // coefficients in *x and *y such that a * (*x) + b * (*y) == g.
+    if (a == 0) {
+        // Base case: gcd(0, b) = b = a * 0 + b * 1.
+        *x = 0;
+        *y = 1;
+        return b;
+    }
+    int64_t sub_x, sub_y;
+    const int64_t g = egcd(b % a, a, &sub_x, &sub_y);
+    // Lift the solution for (b % a, a) using b % a == b - (b / a) * a.
+    const int64_t quotient = b / a;
+    *x = sub_y - quotient * sub_x;
+    *y = sub_x;
+    return g;
+}
+static uint32_t inv_s(int64_t a, int64_t m) {
+    // Modular inverse of a modulo m. The value 0 doubles as the
+    // "no inverse exists" result (a == 0 mod m, or gcd(a, m) != 1).
+    const int64_t residue = ((a % m) + m) % m;   // bring a into [0, m)
+    if (residue == 0) return 0;
+    int64_t coef_a, coef_m;
+    const bool invertible = (egcd(residue, m, &coef_a, &coef_m) == 1);
+    if (!invertible) return 0;
+    // The Bezout coefficient may be negative; normalise it into [0, m).
+    return (uint32_t)(((coef_a % m) + m) % m);
+}
+static inline uint64_t garner(uint16_t r0, uint32_t r1, uint16_t r2) {
+    // Mixed-radix (Garner) reconstruction: x = t0 + t1*127 + t2*127*8191.
+    const uint32_t t0 = r0;
+    // Second digit: (r1 - t0) * 127^-1 mod 8191; the +8191 keeps the
+    // difference non-negative before the modulo.
+    const int64_t diff1 = ((int64_t)r1 - (int64_t)(t0 % 8191) + 8191) % 8191;
+    const uint32_t t1 = (uint32_t)(diff1 * (uint64_t)INV01 % 8191);
+    const uint64_t base = t0 + (uint64_t)t1 * 127;
+    // Third digit: (r2 - base) * (127*8191)^-1 mod 65536; the +131072 keeps
+    // the difference non-negative before the modulo.
+    const int64_t diff2 = ((int64_t)r2 - (int64_t)(base % 65536) + 131072LL) % 65536;
+    const uint64_t t2 = (uint64_t)(diff2 * (uint64_t)INV012 % 65536);
+    return base + t2 * 127ULL * 8191ULL;
+}
+// Loop counters are int64_t so the same code works on Windows, Linux and Mac.
+static void kernel_scalar(
+    const uint16_t *a0, const uint32_t *a1, const uint16_t *a2,
+    const uint16_t *b0, const uint32_t *b1, const uint16_t *b2,
+    uint16_t *r0,       uint32_t *r1,       uint16_t *r2,
+    int64_t n, int op)
+{
+    // Element-wise operation on n residue triples; each rail is handled
+    // independently. op: 0 = add, 1 = mul, 2 = sub, any other value = divide.
+    for (int64_t k = 0; k < n; ++k) {
+        switch (op) {
+        case 0:
+            r0[k] = r127s(a0[k] + b0[k]);
+            r1[k] = r8191s((uint64_t)a1[k] + b1[k]);
+            r2[k] = (uint16_t)((a2[k] + b2[k]) & 0xFFFF);
+            break;
+        case 1:
+            r0[k] = r127s((uint32_t)a0[k] * b0[k]);
+            r1[k] = r8191s((uint64_t)a1[k] * b1[k]);
+            r2[k] = (uint16_t)((uint32_t)a2[k] * b2[k]);
+            break;
+        case 2:
+            // Add the modulus first so the difference cannot go negative.
+            r0[k] = r127s(127 + a0[k] - b0[k] % 127);
+            r1[k] = r8191s(8191 + (uint64_t)a1[k] - b1[k] % 8191);
+            r2[k] = (uint16_t)((65536 + a2[k] - b2[k] % 65536) & 0xFFFF);
+            break;
+        default:
+            // Division multiplies by the per-rail modular inverse of b.
+            // inv_s returns 0 for a non-invertible divisor, so that rail
+            // becomes 0 rather than signalling an error.
+            r0[k] = r127s ((uint32_t)a0[k] * inv_s(b0[k], 127));
+            r1[k] = r8191s((uint64_t)a1[k] * inv_s(b1[k], 8191));
+            r2[k] = (uint16_t)(((uint32_t)a2[k] * inv_s(b2[k], 65536)) & 0xFFFF);
+            break;
+        }
+    }
+}
+#if defined(__AVX2__)
+#include <immintrin.h>
+#define HAVE_AVX2 1
+// One 256-bit register, treated throughout as sixteen 16-bit lanes.
+using vec16 = __m256i;
+// Short wrappers around the 16-bit-lane intrinsics used by the kernels:
+// V1 = broadcast, Va = add, Vs = subtract, Vm = multiply (low 16 bits),
+// Vn = bitwise and, Vh = logical shift right, Ve = lane-wise equality mask.
+static inline vec16 V1(int x)            { return _mm256_set1_epi16((short)x); }
+static inline vec16 Va(vec16 a, vec16 b) { return _mm256_add_epi16(a, b); }
+static inline vec16 Vs(vec16 a, vec16 b) { return _mm256_sub_epi16(a, b); }
+static inline vec16 Vm(vec16 a, vec16 b) { return _mm256_mullo_epi16(a, b); }
+static inline vec16 Vn(vec16 a, vec16 b) { return _mm256_and_si256(a, b); }
+static inline vec16 Vh(vec16 a, int s)   { return _mm256_srli_epi16(a, s); }
+static inline vec16 Ve(vec16 a, vec16 b) { return _mm256_cmpeq_epi16(a, b); }
+// Lane-wise counterpart of r127s: reduces each 16-bit lane modulo 127.
+static inline vec16 r127v(vec16 x) {
+    // Two folding rounds: low seven bits plus the bits above them.
+    vec16 t = Va(Vn(x, V1(0x7F)), Vh(x, 7));
+    t = Va(Vn(t, V1(0x7F)), Vh(t, 7));
+    // Lanes equal to 127 get 127 subtracted (the compare mask selects them),
+    // so they end up as 0; all other lanes are unchanged.
+    return Vs(t, Vn(V1(127), Ve(t, V1(127))));
+}
+// Reduces each 16-bit lane modulo 8191 with a single folding round.
+// NOTE(review): one round fully reduces lanes up to the sum of two residues
+// (at most 16380); larger lane values would need a second round — confirm
+// callers never pass more.
+static inline vec16 r8191v(vec16 x) {
+    vec16 t = Va(Vn(x, V1(0x1FFF)), Vh(x, 13));
+    // Lanes equal to 8191 are mapped to 0 via the compare mask.
+    return Vs(t, Vn(V1(8191), Ve(t, V1(8191))));
+}
+// Lane-wise (a * b) mod 8191 for sixteen 16-bit lanes. The products do not
+// fit in 16 bits, so each half of the register is widened to 32-bit lanes.
+static inline vec16 mul8191v(vec16 a, vec16 b) {
+    __m256i mk = _mm256_set1_epi32(0x1FFF);
+    // f: reduce each 32-bit lane modulo 8191 — two folding rounds, then map
+    // a lane equal to 8191 to 0.
+    auto f = [&](__m256i x) {
+        x = _mm256_add_epi32(_mm256_and_si256(x, mk), _mm256_srli_epi32(x, 13));
+        x = _mm256_add_epi32(_mm256_and_si256(x, mk), _mm256_srli_epi32(x, 13));
+        return _mm256_sub_epi32(x, _mm256_and_si256(
+            _mm256_cmpeq_epi32(x, _mm256_set1_epi32(8191)),
+            _mm256_set1_epi32(8191)));
+    };
+    // Products of the low eight lanes, zero-extended to 32 bits.
+    __m256i pl = _mm256_mullo_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(a)),
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b)));
+    // Products of the high eight lanes.
+    __m256i ph = _mm256_mullo_epi32(
+        _mm256_cvtepu16_epi32(_mm256_extracti128_si256(a, 1)),
+        _mm256_cvtepu16_epi32(_mm256_extracti128_si256(b, 1)));
+    // packus narrows back to 16 bits but works per 128-bit half, which
+    // interleaves the two inputs; the 64-bit permute (order 0,2,1,3) puts
+    // the elements back in their original positions.
+    return _mm256_permute4x64_epi64(
+        _mm256_packus_epi32(f(pl), f(ph)), 0b11011000);
+}
+// AVX2 version of kernel_scalar: processes L (16) elements per iteration for
+// add/mul/sub and hands division, plus any leftover tail, to kernel_scalar.
+static void kernel_avx2(
+    const uint16_t *a0, const uint32_t *a1, const uint16_t *a2,
+    const uint16_t *b0, const uint32_t *b1, const uint16_t *b2,
+    uint16_t *r0,       uint32_t *r1,       uint16_t *r2,
+    int64_t n, int op)
+{
+    // Division needs a per-element modular inverse, so it is not vectorised.
+    if (op == 3) { kernel_scalar(a0,a1,a2,b0,b1,b2,r0,r1,r2,n,op); return; }
+    // Largest multiple of L not exceeding n.
+    int64_t full = (n / L) * L;
+    for (int64_t base = 0; base < full; base += L) {
+        // Stage the inputs into aligned 16-bit buffers. Rail-1 values are
+        // stored as uint32 and are narrowed to 16 bits here, which keeps the
+        // value only when it fits in 16 bits (true for residues below 8191).
+        alignas(32) int16_t ta0[L],tb0[L],ta1[L],tb1[L],ta2[L],tb2[L];
+        for (int l = 0; l < L; l++) {
+            ta0[l]=(int16_t)a0[base+l]; tb0[l]=(int16_t)b0[base+l];
+            ta1[l]=(int16_t)a1[base+l]; tb1[l]=(int16_t)b1[base+l];
+            ta2[l]=(int16_t)a2[base+l]; tb2[l]=(int16_t)b2[base+l];
+        }
+        vec16 va0=_mm256_load_si256((vec16*)ta0), vb0=_mm256_load_si256((vec16*)tb0);
+        vec16 va1=_mm256_load_si256((vec16*)ta1), vb1=_mm256_load_si256((vec16*)tb1);
+        vec16 va2=_mm256_load_si256((vec16*)ta2), vb2=_mm256_load_si256((vec16*)tb2);
+        vec16 vr0, vr1, vr2;
+        // Rail 2 (mod 65536) needs no explicit reduction: 16-bit lanes wrap.
+        if (op == 0) {
+            vr0 = r127v(Va(va0,vb0)); vr1 = r8191v(Va(va1,vb1)); vr2 = Va(va2,vb2);
+        } else if (op == 1) {
+            vr0 = r127v(Vm(va0,vb0)); vr1 = mul8191v(va1,vb1);   vr2 = Vm(va2,vb2);
+        } else {
+            // Subtraction is computed as a + (m - b) on each rail.
+            vr0 = r127v (Va(va0, r127v (Vs(V1(127),  vb0))));
+            vr1 = r8191v(Va(va1, r8191v(Vs(V1(8191), vb1))));
+            vr2 = Va(va2, Vs(V1(0), vb2));
+        }
+        // Copy the lanes back out; rail-1 results are widened to uint32 again.
+        alignas(32) int16_t tr0[L], tr1[L], tr2[L];
+        _mm256_store_si256((vec16*)tr0, vr0);
+        _mm256_store_si256((vec16*)tr1, vr1);
+        _mm256_store_si256((vec16*)tr2, vr2);
+        for (int l = 0; l < L; l++) {
+            r0[base+l]=(uint16_t)tr0[l];
+            r1[base+l]=(uint16_t)tr1[l];
+            r2[base+l]=(uint16_t)tr2[l];
+        }
+    }
+    // Remaining n - full elements (fewer than L) go through the scalar path.
+    kernel_scalar(a0+full,a1+full,a2+full,b0+full,b1+full,b2+full,
+                  r0+full,r1+full,r2+full, n-full, op);
+}
+#else
+#define HAVE_AVX2 0
+#endif
+// Entry point used by the Python bindings. The implementation is chosen at
+// compile time: the AVX2 kernel when HAVE_AVX2 is 1, otherwise the scalar one.
+static void kernel(
+    const uint16_t *a0, const uint32_t *a1, const uint16_t *a2,
+    const uint16_t *b0, const uint32_t *b1, const uint16_t *b2,
+    uint16_t *r0,       uint32_t *r1,       uint16_t *r2,
+    int64_t n, int op)
+{
+#if HAVE_AVX2
+    kernel_avx2(a0,a1,a2,b0,b1,b2,r0,r1,r2,n,op);
+#else
+    kernel_scalar(a0,a1,a2,b0,b1,b2,r0,r1,r2,n,op);
+#endif
+}
+py::tuple py_encode(arr64 x_in) {
+    // Split every input value into its three residues and return the rails
+    // as a (uint16[], uint32[], uint16[]) tuple.
+    auto src = x_in.unchecked<1>();
+    int64_t count = (int64_t)x_in.shape(0);
+    arr16 rail0({count});
+    arr32 rail1({count});
+    arr16 rail2({count});
+    auto w0 = rail0.mutable_unchecked<1>();
+    auto w1 = rail1.mutable_unchecked<1>();
+    auto w2 = rail2.mutable_unchecked<1>();
+    for (int64_t k = 0; k < count; ++k) {
+        // Values at or above the dynamic range are first reduced modulo BM.
+        const uint64_t reduced = x(k) % BM;
+        w0(k) = (uint16_t)(reduced % 127);
+        w1(k) = (uint32_t)(reduced % 8191);
+        w2(k) = (uint16_t)(reduced % 65536);
+    }
+    return py::make_tuple(rail0, rail1, rail2);
+}
+arr64 py_decode(arr16 r0_, arr32 r1_, arr16 r2_) {
+    // Rebuild one uint64 per residue triple with garner(); all three rails
+    // must have the same length.
+    const auto len0 = r0_.shape(0);
+    const bool same_length = (r1_.shape(0) == len0) && (r2_.shape(0) == len0);
+    if (!same_length)
+        throw std::invalid_argument("array length mismatch");
+    int64_t count = (int64_t)len0;
+    arr64 out({count});
+    auto in0 = r0_.unchecked<1>();
+    auto in1 = r1_.unchecked<1>();
+    auto in2 = r2_.unchecked<1>();
+    auto dst = out.mutable_unchecked<1>();
+    for (int64_t k = 0; k < count; ++k) {
+        dst(k) = garner(in0(k), in1(k), in2(k));
+    }
+    return out;
+}
+// Apply one element-wise operation to two encoded operands.
+//   (a0, a1, a2), (b0, b1, b2): residue rails of the operands; all six must
+//                               be 1-D arrays of the same length.
+//   opcode: 0 = add, 1 = mul, 2 = sub, 3 = div.
+// Returns the (r0, r1, r2) rails of the result.
+// Throws std::invalid_argument for a bad opcode, non-1-D input, or arrays of
+// differing length.
+py::tuple py_op(arr16 a0, arr32 a1, arr16 a2,
+                arr16 b0, arr32 b1, arr16 b2, int opcode) {
+    if (opcode < 0 || opcode > 3)
+        throw std::invalid_argument("opcode must be 0=add 1=mul 2=sub 3=div");
+    // The kernels walk raw pointers with unit stride, so every operand must
+    // be dense. The default array_t flags do not guarantee that (a strided
+    // view passes through unchanged); ensure() copies only when needed.
+    using dense16 = py::array_t<uint16_t, py::array::c_style | py::array::forcecast>;
+    using dense32 = py::array_t<uint32_t, py::array::c_style | py::array::forcecast>;
+    dense16 da0 = dense16::ensure(a0);
+    dense32 da1 = dense32::ensure(a1);
+    dense16 da2 = dense16::ensure(a2);
+    dense16 db0 = dense16::ensure(b0);
+    dense32 db1 = dense32::ensure(b1);
+    dense16 db2 = dense16::ensure(b2);
+    if (!da0 || !da1 || !da2 || !db0 || !db1 || !db2)
+        throw std::invalid_argument("operands could not be converted to contiguous arrays");
+    if (da0.ndim() != 1 || da1.ndim() != 1 || da2.ndim() != 1 ||
+        db0.ndim() != 1 || db1.ndim() != 1 || db2.ndim() != 1)
+        throw std::invalid_argument("operands must be 1-D arrays");
+    // Without this check a shorter rail would be read past its end.
+    const auto len = da0.shape(0);
+    if (da1.shape(0) != len || da2.shape(0) != len ||
+        db0.shape(0) != len || db1.shape(0) != len || db2.shape(0) != len)
+        throw std::invalid_argument("array length mismatch");
+    int64_t n = (int64_t)len;
+    arr16 r0({n}); arr32 r1({n}); arr16 r2({n});
+    kernel(da0.data(), da1.data(), da2.data(),
+           db0.data(), db1.data(), db2.data(),
+           r0.mutable_data(), r1.mutable_data(), r2.mutable_data(),
+           n, opcode);
+    return py::make_tuple(r0, r1, r2);
+}
+// Python module definition: exports the constants, encode/decode, the generic
+// op() entry point, and one named wrapper per opcode.
+PYBIND11_MODULE(_core, m) {
+    m.doc() = "rns_engine._core: AVX2-accelerated 3-rail RNS exact integer arithmetic.";
+    // Constants: dynamic range, the three moduli, and the compile-time AVX2 flag.
+    m.attr("M")        = (uint64_t)BM;
+    m.attr("M0")       = (uint32_t)M0;
+    m.attr("M1")       = (uint32_t)M1;
+    m.attr("M2")       = (uint32_t)M2;
+    m.attr("HAS_AVX2") = (bool)HAVE_AVX2;
+    m.def("encode", &py_encode, "uint64[] -> (r0,r1,r2)");
+    m.def("decode", &py_decode, "(r0,r1,r2) -> uint64[]");
+    m.def("op",     &py_op,     "opcode: 0=add 1=mul 2=sub 3=div");
+    // Convenience wrappers that forward to py_op with a fixed opcode
+    // (note: sub is opcode 2 and mul is opcode 1).
+    m.def("add",  [](arr16 a0,arr32 a1,arr16 a2,arr16 b0,arr32 b1,arr16 b2)
+          { return py_op(a0,a1,a2,b0,b1,b2,0); }, "Exact addition.");
+    m.def("sub",  [](arr16 a0,arr32 a1,arr16 a2,arr16 b0,arr32 b1,arr16 b2)
+          { return py_op(a0,a1,a2,b0,b1,b2,2); }, "Exact subtraction.");
+    m.def("mul",  [](arr16 a0,arr32 a1,arr16 a2,arr16 b0,arr32 b1,arr16 b2)
+          { return py_op(a0,a1,a2,b0,b1,b2,1); }, "Exact multiplication.");
+    m.def("div_", [](arr16 a0,arr32 a1,arr16 a2,arr16 b0,arr32 b1,arr16 b2)
+          { return py_op(a0,a1,a2,b0,b1,b2,3); }, "Exact division.");
+}

rns_engine-0.1.0/src/rns_engine.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,178 @@
+Metadata-Version: 2.4
+Name: rns_engine
+Version: 0.1.0
+Summary: Exact integer arithmetic via AVX2-accelerated Residue Number System
+License: MIT License
+        Copyright (c) 2026 Evan Wesley
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+Project-URL: Homepage, https://github.com/playfularchitect/rns_engine
+Project-URL: Issues, https://github.com/playfularchitect/rns_engine/issues
+Keywords: arithmetic,exact,integer,RNS,residue,SIMD,AVX2
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: C++
+Classifier: Topic :: Scientific/Engineering :: Mathematics
+Classifier: Topic :: Software Development :: Libraries
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy>=1.21
+Dynamic: license-file
+# rns_engine
+**Exact integer arithmetic via AVX2-accelerated Residue Number System (RNS).**
+No floating point. No approximation. Errors are structurally impossible.
+---
+## What it does
+Standard Python integers are exact but slow. NumPy is fast but uses floating point or silently overflows. `rns_engine` gives you **exact integer arithmetic at hundreds of millions of operations per second** — the best of both worlds.
+It works by decomposing integers into residues across three coprime moduli (127, 8191, 65536), performing all operations in residue space using AVX2 SIMD instructions, and reconstructing exact results via the Chinese Remainder Theorem.
+**Dynamic range:** `[0, 68,174,282,752)` — about 68 billion.
+## Install
+```bash
+pip install rns_engine
+```
+Uses AVX2 SIMD when the extension is compiled with AVX2 enabled (available on Intel/AMD CPUs since ~2013). Builds without AVX2, such as on ARM, use the scalar code path instead.
+## Quick start
+```python
+import rns_engine as rns
+import numpy as np
+# Works on arrays of uint64
+a = np.array([123456789, 999999999], dtype=np.uint64)
+b = np.array([987654321, 111111111], dtype=np.uint64)
+# Encode once
+ea = rns.encode(a)   # returns (r0, r1, r2) residue arrays
+eb = rns.encode(b)
+# Operate in residue space — no intermediate decode needed
+result = rns.decode(*rns.mul(*ea, *eb))   # exact multiplication
+# Chain multiple operations — decode once at the end
+s1 = rns.add(*ea, *eb)      # a + b
+s2 = rns.mul(*s1, *eb)      # (a + b) * b
+s3 = rns.sub(*s2, *ea)      # (a + b) * b - a
+out = rns.decode(*s3)        # one decode, three operations
+```
+## Operations
+| Function | Description |
+|----------|-------------|
+| `rns.encode(x)` | `uint64[]` → `(r0, r1, r2)` residue arrays |
+| `rns.decode(r0, r1, r2)` | Residues → `uint64[]` via Garner's algorithm |
+| `rns.add(*ea, *eb)` | Exact addition |
+| `rns.sub(*ea, *eb)` | Exact subtraction |
+| `rns.mul(*ea, *eb)` | Exact multiplication |
+| `rns.div_(*ea, *eb)` | Exact division (b must be coprime to all moduli) |
+| `rns.op(*ea, *eb, code)` | Generic: `0`=add `1`=mul `2`=sub `3`=div |
+### Division constraint
+Division requires `b` to be invertible on all three rails:
+- `b % 127  != 0`
+- `b % 8191 != 0`
+- `b % 65536` is **odd** (coprime to 2^16)
+```python
+# Safe way to ensure b is valid for division:
+b = np.where(b % 2 == 0, b + 1, b)   # make odd
+b = np.where(b % 127  == 0, b + 2, b)
+b = np.where(b % 8191 == 0, b + 4, b)
+b = b % rns.M
+```
+## Performance
+On a machine with AVX2 (tested on Google Colab's CPU):
+| Operation | Throughput |
+|-----------|-----------|
+| add | ~200–400 M ops/sec |
+| sub | ~200–400 M ops/sec |
+| mul | ~200–400 M ops/sec |
+| div | ~1.6 M ops/sec (scalar modinv per element) |
+## Why RNS?
+In a Residue Number System, **addition and multiplication have no carry propagation between digits**. Each residue rail is independent. This makes RNS ideal for:
+- **Exact arithmetic** — results are always correct within the dynamic range
+- **Parallel computation** — rails can run simultaneously
+- **Error detection** — CRT reconstruction fails loudly if any rail is corrupted
+- **Cryptography** — modular arithmetic is the native language of RSA, ECC, etc.
+## How it works
+Three coprime moduli: `m0 = 127`, `m1 = 8191`, `m2 = 65536`
+Dynamic range: `M = 127 × 8191 × 65536 = 68,174,282,752`
+**Encode:** `x → (x mod 127, x mod 8191, x mod 65536)`
+**Operate:** each rail independently, e.g. add: `(a+b) mod mᵢ` per rail
+**Decode (Garner's algorithm):**
+```
+t0 = r0
+t1 = (r1 - t0) × inv(127, 8191)  mod 8191
+t2 = (r2 - t0 - t1×127) × inv(127×8191, 65536)  mod 65536
+x  = t0 + t1×127 + t2×127×8191
+```
+Mod 127 and mod 8191 reductions use the Mersenne-prime trick:
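+`x ≡ (x & mask) + (x >> k)  (mod 2^k - 1)`, with `mask = 2^k - 1` — folding the high bits onto the low bits (repeated until the value fits in `k` bits) replaces the division.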
+## Building from source
+```bash
+git clone https://github.com/playfularchitect/rns_engine
+cd rns_engine
+pip install pybind11 numpy
+pip install -e .
+pytest tests/ -v
+```
+Requires `g++` with C++17 support.
+## License
+MIT

rns_engine-0.1.0/src/rns_engine.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,12 @@
+LICENSE
+README.md
+pyproject.toml
+setup.py
+src/rns_engine/__init__.py
+src/rns_engine/_core.cpp
+src/rns_engine.egg-info/PKG-INFO
+src/rns_engine.egg-info/SOURCES.txt
+src/rns_engine.egg-info/dependency_links.txt
+src/rns_engine.egg-info/requires.txt
+src/rns_engine.egg-info/top_level.txt
+tests/test_rns.py

rns_engine-0.1.0/src/rns_engine.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

rns_engine-0.1.0/src/rns_engine.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ numpy>=1.21

rns_engine-0.1.0/src/rns_engine.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ rns_engine

rns_engine-0.1.0/tests/test_rns.py ADDED Viewed

@@ -0,0 +1,236 @@
+"""
+tests/test_rns.py — correctness tests for rns_engine.
+Run with:  pytest tests/ -v
+"""
+import pytest
+import numpy as np
+import rns_engine as rns
+M = rns.M
+# ── helpers ───────────────────────────────────────────────────────────────
+def make(n, seed=42, odd_b=False):
+    rng = np.random.default_rng(seed)
+    a = rng.integers(0, M, size=n, dtype=np.uint64)
+    if odd_b:
+        b = rng.integers(0, M // 2, size=n, dtype=np.uint64) * 2 + 1
+    else:
+        b = rng.integers(0, M, size=n, dtype=np.uint64)
+    return a, b
+def oracle(a_np, b_np, op):
+    """Exact Python arbitrary-precision oracle."""
+    n = len(a_np)
+    result = np.zeros(n, dtype=np.uint64)
+    for i in range(n):
+        ai, bi = int(a_np[i]), int(b_np[i])
+        if   op == "add": result[i] = (ai + bi) % M
+        elif op == "sub": result[i] = (ai - bi) % M
+        elif op == "mul": result[i] = (ai * bi) % M
+    return result
+# ── basic sanity ──────────────────────────────────────────────────────────
+def test_constants():
+    assert rns.M  == 127 * 8191 * 65536
+    assert rns.M0 == 127
+    assert rns.M1 == 8191
+    assert rns.M2 == 65536
+def test_info_runs():
+    """Smoke test: calling rns.info() must not raise."""
+    rns.info()   # just check it doesn't crash
+# ── encode / decode ───────────────────────────────────────────────────────
+def test_roundtrip_small():
+    vals = np.array([0, 1, 126, 127, 8190, 8191, 65535, M-1], dtype=np.uint64)
+    assert np.array_equal(vals, rns.decode(*rns.encode(vals)))
+def test_roundtrip_random():
+    a, _ = make(10_000)
+    assert np.array_equal(a, rns.decode(*rns.encode(a)))
+def test_encode_reduces_mod_M():
+    # values >= M should be reduced
+    vals = np.array([M, M+1, M*2], dtype=np.uint64)
+    decoded = rns.decode(*rns.encode(vals))
+    assert np.array_equal(decoded, np.array([0, 1, 0], dtype=np.uint64))
+# ── addition ──────────────────────────────────────────────────────────────
+def test_add_correctness():
+    a, b = make(5_000)
+    got = rns.decode(*rns.add(*rns.encode(a), *rns.encode(b)))
+    exp = oracle(a, b, "add")
+    assert np.array_equal(got, exp)
+def test_add_zero():
+    a, _ = make(100)
+    z = np.zeros(100, dtype=np.uint64)
+    got = rns.decode(*rns.add(*rns.encode(a), *rns.encode(z)))
+    assert np.array_equal(got, a)
+def test_add_wraps():
+    a = np.array([M - 1], dtype=np.uint64)
+    b = np.array([1],     dtype=np.uint64)
+    got = rns.decode(*rns.add(*rns.encode(a), *rns.encode(b)))
+    assert got[0] == 0
+# ── subtraction ───────────────────────────────────────────────────────────
+def test_sub_correctness():
+    a, b = make(5_000)
+    got = rns.decode(*rns.sub(*rns.encode(a), *rns.encode(b)))
+    exp = oracle(a, b, "sub")
+    assert np.array_equal(got, exp)
+def test_sub_self_is_zero():
+    a, _ = make(100)
+    got = rns.decode(*rns.sub(*rns.encode(a), *rns.encode(a)))
+    assert np.all(got == 0)
+def test_sub_wraps():
+    a = np.array([0], dtype=np.uint64)
+    b = np.array([1], dtype=np.uint64)
+    got = rns.decode(*rns.sub(*rns.encode(a), *rns.encode(b)))
+    assert got[0] == M - 1
+# ── multiplication ────────────────────────────────────────────────────────
+def test_mul_correctness():
+    a, b = make(5_000)
+    got = rns.decode(*rns.mul(*rns.encode(a), *rns.encode(b)))
+    exp = oracle(a, b, "mul")
+    assert np.array_equal(got, exp)
+def test_mul_by_zero():
+    a, _ = make(100)
+    z = np.zeros(100, dtype=np.uint64)
+    got = rns.decode(*rns.mul(*rns.encode(a), *rns.encode(z)))
+    assert np.all(got == 0)
+def test_mul_by_one():
+    a, _ = make(100)
+    one = np.ones(100, dtype=np.uint64)
+    got = rns.decode(*rns.mul(*rns.encode(a), *rns.encode(one)))
+    assert np.array_equal(got, a)
+# ── division ──────────────────────────────────────────────────────────────
+def test_div_correctness():
+    # Use odd b values that are invertible on all rails
+    a, b = make(1_000, odd_b=True)
+    # also ensure nonzero mod 127 and 8191
+    b = np.where(b % 127  == 0, b + 1, b)
+    b = np.where(b % 8191 == 0, b + 2, b)
+    b = b % M
+    got  = rns.decode(*rns.div_(*rns.encode(a), *rns.encode(b)))
+    # oracle: per-integer Python division in the field
+    from math import gcd
+    def mi(a, m):
+        a = a % m
+        if a == 0: return 0
+        def eg(a, b):
+            if not a: return b, 0, 1
+            g, x, y = eg(b % a, a)
+            return g, y - (b // a) * x, x
+        g, x, _ = eg(a, m)
+        return x % m if g == 1 else 0
+    exp = np.array([
+        int(rns.decode(
+            np.array([int(a[i]) % 127  * mi(int(b[i]) % 127,  127)  % 127],  dtype=np.uint16),
+            np.array([int(a[i]) % 8191 * mi(int(b[i]) % 8191, 8191) % 8191], dtype=np.uint32),
+            np.array([int(a[i]) % 65536* mi(int(b[i]) % 65536,65536)% 65536],dtype=np.uint16),
+        )[0]) for i in range(len(a))
+    ], dtype=np.uint64)
+    assert np.array_equal(got, exp)
+# ── algebraic identities ──────────────────────────────────────────────────
+def test_identity_sub_add(n=500):
+    """a - b + b == a"""
+    a, b = make(n)
+    ea, eb = rns.encode(a), rns.encode(b)
+    s1 = rns.sub(*ea, *eb)
+    s2 = rns.add(*s1,  *eb)
+    assert np.array_equal(rns.decode(*s2), a)
+def test_identity_mul_div(n=200):
+    """a * b / b == a  (b invertible)"""
+    a, b = make(n, odd_b=True)
+    b = np.where(b % 127  == 0, b + 1, b)
+    b = np.where(b % 8191 == 0, b + 2, b)
+    b = b % M
+    ea, eb = rns.encode(a), rns.encode(b)
+    s1 = rns.mul( *ea, *eb)
+    s2 = rns.div_(*s1, *eb)
+    assert np.array_equal(rns.decode(*s2), a)
+def test_identity_distributive(n=500):
+    """(a + b) * c == a*c + b*c"""
+    rng = np.random.default_rng(99)
+    a = rng.integers(0, M, size=n, dtype=np.uint64)
+    b = rng.integers(0, M, size=n, dtype=np.uint64)
+    c = rng.integers(0, M, size=n, dtype=np.uint64)
+    ea, eb, ec = rns.encode(a), rns.encode(b), rns.encode(c)
+    lhs = rns.decode(*rns.mul(*rns.add(*ea, *eb), *ec))
+    rhs = rns.decode(*rns.add(*rns.mul(*ea, *ec), *rns.mul(*eb, *ec)))
+    assert np.array_equal(lhs, rhs)
+def test_identity_additive_inverse(n=500):
+    """a + (-a) == 0"""
+    a, _ = make(n)
+    ea = rns.encode(a)
+    ez = rns.encode(np.zeros(n, dtype=np.uint64))
+    neg_a = rns.sub(*ez, *ea)
+    result = rns.decode(*rns.add(*ea, *neg_a))
+    assert np.all(result == 0)
+def test_chain_expression(n=1000):
+    """((a + b) * c - d) * e  matches Python exact arithmetic"""
+    rng = np.random.default_rng(7)
+    a,b,c,d,e = [rng.integers(0, 1000, size=n, dtype=np.uint64) for _ in range(5)]
+    py_exp = ((a.astype(object)+b)*c-d)*e % M
+    s1 = rns.add(*rns.encode(a), *rns.encode(b))
+    s2 = rns.mul(*s1,             *rns.encode(c))
+    s3 = rns.sub(*s2,             *rns.encode(d))
+    s4 = rns.mul(*s3,             *rns.encode(e))
+    result = rns.decode(*s4)
+    assert np.array_equal(result, py_exp.astype(np.uint64))
+# ── op() generic interface ────────────────────────────────────────────────
+def test_op_matches_named_functions():
+    a, b = make(100)
+    ea, eb = rns.encode(a), rns.encode(b)
+    for code, fn in [(0, rns.add), (1, rns.mul), (2, rns.sub)]:
+        via_op   = rns.decode(*rns.op(*ea, *eb, code))
+        via_name = rns.decode(*fn(*ea, *eb))
+        assert np.array_equal(via_op, via_name)
+def test_op_invalid_opcode():
+    """rns.op must raise when given an opcode outside the documented 0-3."""
+    a, _ = make(10)
+    ea = rns.encode(a)
+    # NOTE(review): Exception is deliberately broad here; narrow it once the
+    # concrete exception type raised by the extension is confirmed.
+    with pytest.raises(Exception):
+        rns.op(*ea, *ea, 99)