numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,773 @@
1
+ /**
2
+ * @brief SIMD-accelerated Spatial Similarity Measures for NEON.
3
+ * @file include/numkong/spatial/neon.h
4
+ * @author Ash Vardanian
5
+ * @date December 27, 2025
6
+ *
7
+ * @sa include/numkong/spatial.h
8
+ *
9
+ * @section spatial_neon_instructions Key NEON Spatial Instructions
10
+ *
11
+ * ARM NEON instructions for distance computations:
12
+ *
13
+ * Intrinsic Instruction Latency Throughput
14
+ * A76 M4+/V1+/Oryon
15
+ * vfmaq_f32 FMLA (V.4S, V.4S, V.4S) 4cy 2/cy 4/cy
16
+ * vmulq_f32 FMUL (V.4S, V.4S, V.4S) 3cy 2/cy 4/cy
17
+ * vaddq_f32 FADD (V.4S, V.4S, V.4S) 2cy 2/cy 4/cy
18
+ * vsubq_f32 FSUB (V.4S, V.4S, V.4S) 2cy 2/cy 4/cy
19
+ * vrsqrteq_f32 FRSQRTE (V.4S, V.4S) 2cy 2/cy 2/cy
20
+ * vsqrtq_f32 FSQRT (V.4S, V.4S) 9-12cy 0.25/cy 0.25/cy
21
+ * vrecpeq_f32 FRECPE (V.4S, V.4S) 2cy 2/cy 2/cy
22
+ *
23
+ * FRSQRTE provides ~8-bit precision; two Newton-Raphson iterations via vrsqrtsq_f32 achieve
24
+ * ~23-bit precision, sufficient for f32. This is much faster than FSQRT (0.25/cy).
25
+ *
26
+ * Distance computations (L2, angular) benefit from 2x throughput on 4-pipe cores (Apple M4+,
27
+ * Graviton3+, Oryon), but FSQRT remains slow on all cores. Use rsqrt+NR when precision allows.
28
+ */
29
+ #ifndef NK_SPATIAL_NEON_H
30
+ #define NK_SPATIAL_NEON_H
31
+
32
+ #if NK_TARGET_ARM_
33
+ #if NK_TARGET_NEON
34
+
35
+ #include "numkong/types.h"
36
+ #include "numkong/scalar/neon.h" // `nk_f32_sqrt_neon`
37
+ #include "numkong/dot/neon.h" // `nk_dot_stable_sum_f64x2_neon_`
38
+
39
+ #if defined(__cplusplus)
40
+ extern "C" {
41
+ #endif
42
+
43
+ #if defined(__clang__)
44
+ #pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
45
+ #elif defined(__GNUC__)
46
+ #pragma GCC push_options
47
+ #pragma GCC target("arch=armv8-a+simd")
48
+ #endif
49
+
50
/**
 *  @brief  Approximate reciprocal square root of four f32 lanes, refined to ~23 bits.
 *
 *  Starts from the hardware `vrsqrteq_f32` estimate (~8-bit accuracy) and applies two
 *  Newton-Raphson steps via `vrsqrtsq_f32`, which is sufficient for full f32 precision.
 *  Far cheaper than `vsqrtq_f32` (2 cy vs 9-12 cy latency, 2/cy vs 0.25/cy throughput).
 */
NK_INTERNAL float32x4_t nk_rsqrt_f32x4_neon_(float32x4_t x) {
    float32x4_t estimate = vrsqrteq_f32(x);
    // Each `vrsqrtsq_f32` produces the Newton-Raphson correction factor (3 - x·e²) / 2.
    for (int step = 0; step < 2; ++step)
        estimate = vmulq_f32(estimate, vrsqrtsq_f32(vmulq_f32(x, estimate), estimate));
    return estimate;
}
63
+
64
/**
 *  @brief  Approximate reciprocal square root of two f64 lanes, refined to ~48 bits.
 *
 *  Starts from the hardware `vrsqrteq_f64` estimate (~8-bit accuracy) and applies three
 *  Newton-Raphson steps via `vrsqrtsq_f64` — reasonable for f64 distance computations
 *  where results are often narrowed to f32. For full 52-bit mantissa fidelity, prefer
 *  `vsqrtq_f64` instead.
 */
NK_INTERNAL float64x2_t nk_rsqrt_f64x2_neon_(float64x2_t x) {
    float64x2_t estimate = vrsqrteq_f64(x);
    // Three corrections: each `vrsqrtsq_f64` yields the Newton-Raphson factor (3 - x·e²) / 2.
    for (int step = 0; step < 3; ++step)
        estimate = vmulq_f64(estimate, vrsqrtsq_f64(vmulq_f64(x, estimate), estimate));
    return estimate;
}
79
+
80
/**
 *  @brief  Turns raw f32 dot-products into angular distance: 1 - ab / (‖a‖·‖b‖).
 *
 *  Inverts both squared norms at once through a 2-lane `vrsqrte_f32` estimate. Unlike x86,
 *  Arm NEON manuals don't explicitly state the accuracy of the `rsqrt` approximation;
 *  third-party research suggests an error of about 1.5×2⁻¹², so two rounds of
 *  Newton-Raphson refinement are applied:
 *  https://github.com/lighttransport/embree-aarch64/issues/24
 *  https://github.com/lighttransport/embree-aarch64/blob/3f75f8cb4e553d13dced941b5fefd4c826835a6b/common/math/math.h#L137-L145
 *  https://en.wikipedia.org/wiki/Newton%27s_method
 *  The result is clamped below at zero to absorb rounding noise.
 */
NK_INTERNAL nk_f32_t nk_angular_normalize_f32_neon_(nk_f32_t ab, nk_f32_t a2, nk_f32_t b2) {
    // Two zero-length vectors are considered identical; a zero dot-product means orthogonal.
    if (a2 == 0 && b2 == 0) return 0;
    if (ab == 0) return 1;
    nk_f32_t norms_arr[2] = {a2, b2};
    float32x2_t norms = vld1_f32(norms_arr);
    float32x2_t inv_norms = vrsqrte_f32(norms);
    for (int step = 0; step < 2; ++step)
        inv_norms = vmul_f32(inv_norms, vrsqrts_f32(vmul_f32(norms, inv_norms), inv_norms));
    vst1_f32(norms_arr, inv_norms);
    nk_f32_t distance = 1 - ab * norms_arr[0] * norms_arr[1];
    return distance > 0 ? distance : 0;
}
99
+
100
/**
 *  @brief  Turns raw f64 dot-products into angular distance: 1 - ab / (‖a‖·‖b‖).
 *
 *  Inverts both squared norms at once through a 2-lane `vrsqrteq_f64` estimate. Unlike x86,
 *  Arm NEON manuals don't explicitly state the accuracy of the `rsqrt` approximation;
 *  third-party research suggests an error of about 1.5×2⁻¹², so three rounds of
 *  Newton-Raphson refinement are applied for f64 precision (~48 bits):
 *  https://github.com/lighttransport/embree-aarch64/issues/24
 *  https://github.com/lighttransport/embree-aarch64/blob/3f75f8cb4e553d13dced941b5fefd4c826835a6b/common/math/math.h#L137-L145
 *  https://en.wikipedia.org/wiki/Newton%27s_method
 *  The result is clamped below at zero to absorb rounding noise.
 */
NK_INTERNAL nk_f64_t nk_angular_normalize_f64_neon_(nk_f64_t ab, nk_f64_t a2, nk_f64_t b2) {
    // Two zero-length vectors are considered identical; a zero dot-product means orthogonal.
    if (a2 == 0 && b2 == 0) return 0;
    if (ab == 0) return 1;
    nk_f64_t norms_arr[2] = {a2, b2};
    float64x2_t norms = vld1q_f64(norms_arr);
    float64x2_t inv_norms = vrsqrteq_f64(norms);
    for (int step = 0; step < 3; ++step)
        inv_norms = vmulq_f64(inv_norms, vrsqrtsq_f64(vmulq_f64(norms, inv_norms), inv_norms));
    vst1q_f64(norms_arr, inv_norms);
    nk_f64_t distance = 1 - ab * norms_arr[0] * norms_arr[1];
    return distance > 0 ? distance : 0;
}
121
+
122
+ #pragma region - Traditional Floats
123
+
124
/**
 *  @brief  Squared Euclidean (L2²) distance between two f32 vectors.
 *  @param[in]  a, b    Input vectors of @p n f32 elements each.
 *  @param[in]  n       Number of elements; n == 0 yields 0.
 *  @param[out] result  Receives the accumulated f64 squared distance.
 *
 *  Each pair of f32 lanes is first widened to f64 (`vcvt_f64_f32`) and only then
 *  subtracted, so the difference is computed exactly. The previous order — subtracting
 *  in f32 before widening — could drop low-order bits of the difference before the f64
 *  accumulation, and disagreed with the scalar tail below, which already widened first.
 */
NK_PUBLIC void nk_sqeuclidean_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
    // Accumulate in f64 for numerical stability (2 f32s per iteration, avoids slow vget_low/high)
    float64x2_t sum_f64x2 = vdupq_n_f64(0);
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        // Widen before subtracting: f32 -> f64 is exact, and so is the f64 subtraction of
        // two exactly-represented f32 values.
        float64x2_t a_f64x2 = vcvt_f64_f32(vld1_f32(a + i));
        float64x2_t b_f64x2 = vcvt_f64_f32(vld1_f32(b + i));
        float64x2_t diff_f64x2 = vsubq_f64(a_f64x2, b_f64x2);
        sum_f64x2 = vfmaq_f64(sum_f64x2, diff_f64x2, diff_f64x2);
    }
    nk_f64_t sum_f64 = vaddvq_f64(sum_f64x2);
    // Scalar tail for an odd trailing element: same widen-then-subtract order as above.
    for (; i < n; ++i) {
        nk_f64_t diff_f64 = (nk_f64_t)a[i] - (nk_f64_t)b[i];
        sum_f64 += diff_f64 * diff_f64;
    }
    *result = sum_f64;
}
142
+
143
/**
 *  @brief  Euclidean (L2) distance between two f32 vectors: square root of the
 *          squared distance computed by `nk_sqeuclidean_f32_neon`.
 */
NK_PUBLIC void nk_euclidean_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
    nk_f64_t squared_distance;
    nk_sqeuclidean_f32_neon(a, b, n, &squared_distance);
    *result = nk_f64_sqrt_neon(squared_distance);
}
147
+
148
/**
 *  @brief  Angular (cosine) distance between two f32 vectors.
 *
 *  All three running sums — a·b, ‖a‖², ‖b‖² — are kept in f64 lanes for numerical
 *  stability; pairs of f32 values are widened with `vcvt_f64_f32`, which sidesteps the
 *  slow vget_low/vget_high shuffles a 4-wide variant would need.
 */
NK_PUBLIC void nk_angular_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
    float64x2_t dot_f64x2 = vdupq_n_f64(0);
    float64x2_t norm_a_f64x2 = vdupq_n_f64(0);
    float64x2_t norm_b_f64x2 = vdupq_n_f64(0);
    nk_size_t idx = 0;
    while (idx + 2 <= n) {
        float64x2_t a_f64x2 = vcvt_f64_f32(vld1_f32(a + idx));
        float64x2_t b_f64x2 = vcvt_f64_f32(vld1_f32(b + idx));
        dot_f64x2 = vfmaq_f64(dot_f64x2, a_f64x2, b_f64x2);
        norm_a_f64x2 = vfmaq_f64(norm_a_f64x2, a_f64x2, a_f64x2);
        norm_b_f64x2 = vfmaq_f64(norm_b_f64x2, b_f64x2, b_f64x2);
        idx += 2;
    }
    nk_f64_t dot = vaddvq_f64(dot_f64x2);
    nk_f64_t norm_a = vaddvq_f64(norm_a_f64x2);
    nk_f64_t norm_b = vaddvq_f64(norm_b_f64x2);
    // Scalar tail for an odd trailing element, also in f64.
    for (; idx < n; ++idx) {
        nk_f64_t ai = (nk_f64_t)a[idx], bi = (nk_f64_t)b[idx];
        dot += ai * bi;
        norm_a += ai * ai;
        norm_b += bi * bi;
    }
    *result = nk_angular_normalize_f64_neon_(dot, norm_a, norm_b);
}
172
+
173
/**
 *  @brief  Squared Euclidean (L2²) distance between two f64 vectors.
 *
 *  Streams 2 doubles per step; a final partial pair (n < 2) goes through
 *  `nk_partial_load_b64x2_serial_`, whose padded lanes are presumed equal for both
 *  inputs so they contribute a zero difference to the accumulator.
 */
NK_PUBLIC void nk_sqeuclidean_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
    float64x2_t acc_f64x2 = vdupq_n_f64(0);
    do {
        float64x2_t a_f64x2, b_f64x2;
        if (n < 2) {
            nk_b128_vec_t a_tail, b_tail;
            nk_partial_load_b64x2_serial_(a, &a_tail, n);
            nk_partial_load_b64x2_serial_(b, &b_tail, n);
            a_f64x2 = a_tail.f64x2;
            b_f64x2 = b_tail.f64x2;
            n = 0;
        }
        else {
            a_f64x2 = vld1q_f64(a);
            b_f64x2 = vld1q_f64(b);
            a += 2, b += 2, n -= 2;
        }
        float64x2_t delta_f64x2 = vsubq_f64(a_f64x2, b_f64x2);
        acc_f64x2 = vfmaq_f64(acc_f64x2, delta_f64x2, delta_f64x2);
    } while (n);
    *result = vaddvq_f64(acc_f64x2);
}
197
+
198
/**
 *  @brief  Euclidean (L2) distance between two f64 vectors: square root of the
 *          squared distance computed by `nk_sqeuclidean_f64_neon`.
 */
NK_PUBLIC void nk_euclidean_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
    nk_f64_t squared_distance;
    nk_sqeuclidean_f64_neon(a, b, n, &squared_distance);
    *result = nk_f64_sqrt_neon(squared_distance);
}
202
+
203
/**
 *  @brief  Angular (cosine) distance between two f64 vectors with compensated summation.
 *  @param[in]  a, b    Input vectors of @p n f64 elements each.
 *  @param[in]  n       Number of elements.
 *  @param[out] result  Receives the angular distance via `nk_angular_normalize_f64_neon_`.
 *
 *  The cross-product a·b may suffer catastrophic cancellation, so it is accumulated with
 *  the Dot2 scheme (Ogita, Rump & Oishi, "Accurate Sum and Dot Product"): every product
 *  is split into a rounded value plus an exact FMA-recovered error, and every addition
 *  into a rounded sum plus a Knuth TwoSum error; errors collect in a compensation vector.
 *  The self-products a² and b² are all non-negative — no cancellation — so plain FMA
 *  accumulation suffices for them. The statement order below is the algorithm: do not
 *  reassociate or "simplify" these expressions.
 */
NK_PUBLIC void nk_angular_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
    // Dot2 (Ogita-Rump-Oishi) for cross-product ab (may have cancellation),
    // simple FMA for self-products a2/b2 (all positive, no cancellation)
    float64x2_t ab_sum_f64x2 = vdupq_n_f64(0);
    float64x2_t ab_compensation_f64x2 = vdupq_n_f64(0);
    float64x2_t a2_f64x2 = vdupq_n_f64(0);
    float64x2_t b2_f64x2 = vdupq_n_f64(0);
    float64x2_t a_f64x2, b_f64x2;

nk_angular_f64_neon_cycle:
    if (n < 2) {
        // Tail of fewer than 2 elements: route through a padded partial load.
        nk_b128_vec_t a_tail, b_tail;
        nk_partial_load_b64x2_serial_(a, &a_tail, n);
        nk_partial_load_b64x2_serial_(b, &b_tail, n);
        a_f64x2 = a_tail.f64x2;
        b_f64x2 = b_tail.f64x2;
        n = 0;
    }
    else {
        a_f64x2 = vld1q_f64(a);
        b_f64x2 = vld1q_f64(b);
        a += 2, b += 2, n -= 2;
    }
    // TwoProd for ab: product = a*b, error = fma(a,b,-product)
    // `vfmsq_f64(p, a, b)` computes p - a*b in one rounding, so its negation is the
    // exact low-order part of the product.
    float64x2_t product_f64x2 = vmulq_f64(a_f64x2, b_f64x2);
    float64x2_t product_error_f64x2 = vnegq_f64(vfmsq_f64(product_f64x2, a_f64x2, b_f64x2));
    // TwoSum: (t, q) = TwoSum(sum, product)
    // Knuth's branch-free TwoSum: tentative sum, reconstructed addend, then the exact
    // rounding error of the addition — valid for any magnitude ordering of the operands.
    float64x2_t tentative_sum_f64x2 = vaddq_f64(ab_sum_f64x2, product_f64x2);
    float64x2_t virtual_addend_f64x2 = vsubq_f64(tentative_sum_f64x2, ab_sum_f64x2);
    float64x2_t sum_error_f64x2 = vaddq_f64(
        vsubq_f64(ab_sum_f64x2, vsubq_f64(tentative_sum_f64x2, virtual_addend_f64x2)),
        vsubq_f64(product_f64x2, virtual_addend_f64x2));
    ab_sum_f64x2 = tentative_sum_f64x2;
    ab_compensation_f64x2 = vaddq_f64(ab_compensation_f64x2, vaddq_f64(sum_error_f64x2, product_error_f64x2));
    // Simple FMA for self-products (no cancellation)
    a2_f64x2 = vfmaq_f64(a2_f64x2, a_f64x2, a_f64x2);
    b2_f64x2 = vfmaq_f64(b2_f64x2, b_f64x2, b_f64x2);
    if (n) goto nk_angular_f64_neon_cycle;

    // Fold sum + compensation into one stable scalar before normalization.
    *result = nk_angular_normalize_f64_neon_( //
        nk_dot_stable_sum_f64x2_neon_(ab_sum_f64x2, ab_compensation_f64x2), vaddvq_f64(a2_f64x2), vaddvq_f64(b2_f64x2));
}
245
+
246
+ #pragma endregion - Traditional Floats
247
+ #pragma region - Smaller Floats
248
+
249
/**
 *  @brief  Squared Euclidean (L2²) distance between two bf16 vectors.
 *
 *  A bf16 value is exactly the top 16 bits of an f32, so widening is a pure 16-bit left
 *  shift (`vshll_n_u16`) reinterpreted as f32 — lossless, no conversion instruction.
 *  Differences and squares accumulate in f32 lanes, 8 elements per step; a final partial
 *  group routes through `nk_partial_load_b16x8_serial_`.
 */
NK_PUBLIC void nk_sqeuclidean_bf16_neon(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
    float32x4_t acc_f32x4 = vdupq_n_f32(0);
    do {
        uint16x8_t a_u16x8, b_u16x8;
        if (n < 8) {
            nk_b128_vec_t a_vec, b_vec;
            nk_partial_load_b16x8_serial_(a, &a_vec, n);
            nk_partial_load_b16x8_serial_(b, &b_vec, n);
            a_u16x8 = a_vec.u16x8;
            b_u16x8 = b_vec.u16x8;
            n = 0;
        }
        else {
            a_u16x8 = vld1q_u16((nk_u16_t const *)a);
            b_u16x8 = vld1q_u16((nk_u16_t const *)b);
            a += 8, b += 8, n -= 8;
        }
        float32x4_t a_lo_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(a_u16x8), 16));
        float32x4_t a_hi_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(a_u16x8), 16));
        float32x4_t b_lo_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(b_u16x8), 16));
        float32x4_t b_hi_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(b_u16x8), 16));
        float32x4_t delta_lo_f32x4 = vsubq_f32(a_lo_f32x4, b_lo_f32x4);
        float32x4_t delta_hi_f32x4 = vsubq_f32(a_hi_f32x4, b_hi_f32x4);
        acc_f32x4 = vfmaq_f32(acc_f32x4, delta_lo_f32x4, delta_lo_f32x4);
        acc_f32x4 = vfmaq_f32(acc_f32x4, delta_hi_f32x4, delta_hi_f32x4);
    } while (n);
    *result = vaddvq_f32(acc_f32x4);
}
277
+
278
/**
 *  @brief  Euclidean (L2) distance between two bf16 vectors: square root of the
 *          squared distance computed by `nk_sqeuclidean_bf16_neon`.
 */
NK_PUBLIC void nk_euclidean_bf16_neon(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
    nk_f32_t squared_distance;
    nk_sqeuclidean_bf16_neon(a, b, n, &squared_distance);
    *result = nk_f32_sqrt_neon(squared_distance);
}
282
+
283
/**
 *  @brief  Angular (cosine) distance between two bf16 vectors.
 *
 *  Widens bf16 to f32 losslessly via a 16-bit left shift (`vshll_n_u16`), then keeps
 *  three f32 accumulators — a·b, ‖a‖², ‖b‖² — 8 elements per step. A final partial
 *  group routes through `nk_partial_load_b16x8_serial_`; the combined sums are reduced
 *  and normalized by `nk_angular_normalize_f32_neon_`.
 */
NK_PUBLIC void nk_angular_bf16_neon(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
    float32x4_t dot_f32x4 = vdupq_n_f32(0);
    float32x4_t norm_a_f32x4 = vdupq_n_f32(0);
    float32x4_t norm_b_f32x4 = vdupq_n_f32(0);
    do {
        uint16x8_t a_u16x8, b_u16x8;
        if (n < 8) {
            nk_b128_vec_t a_vec, b_vec;
            nk_partial_load_b16x8_serial_(a, &a_vec, n);
            nk_partial_load_b16x8_serial_(b, &b_vec, n);
            a_u16x8 = a_vec.u16x8;
            b_u16x8 = b_vec.u16x8;
            n = 0;
        }
        else {
            a_u16x8 = vld1q_u16((nk_u16_t const *)a);
            b_u16x8 = vld1q_u16((nk_u16_t const *)b);
            a += 8, b += 8, n -= 8;
        }
        float32x4_t a_lo_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(a_u16x8), 16));
        float32x4_t a_hi_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(a_u16x8), 16));
        float32x4_t b_lo_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(b_u16x8), 16));
        float32x4_t b_hi_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(b_u16x8), 16));
        dot_f32x4 = vfmaq_f32(dot_f32x4, a_lo_f32x4, b_lo_f32x4);
        dot_f32x4 = vfmaq_f32(dot_f32x4, a_hi_f32x4, b_hi_f32x4);
        norm_a_f32x4 = vfmaq_f32(norm_a_f32x4, a_lo_f32x4, a_lo_f32x4);
        norm_a_f32x4 = vfmaq_f32(norm_a_f32x4, a_hi_f32x4, a_hi_f32x4);
        norm_b_f32x4 = vfmaq_f32(norm_b_f32x4, b_lo_f32x4, b_lo_f32x4);
        norm_b_f32x4 = vfmaq_f32(norm_b_f32x4, b_hi_f32x4, b_hi_f32x4);
    } while (n);
    *result = nk_angular_normalize_f32_neon_(vaddvq_f32(dot_f32x4), vaddvq_f32(norm_a_f32x4), vaddvq_f32(norm_b_f32x4));
}
318
+
319
/**
 *  @brief  Squared Euclidean (L2²) distance between two e2m3 (8-bit float) vectors.
 *
 *  Decodes 8 packed e2m3 values to f16 via `nk_e2m3x8_to_f16x8_neon_`, widens the halves
 *  to f32 with `vcvt_f32_f16`, and accumulates squared differences in f32 lanes. A final
 *  partial group routes through `nk_partial_load_b8x8_serial_`.
 */
NK_PUBLIC void nk_sqeuclidean_e2m3_neon(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
    float32x4_t acc_f32x4 = vdupq_n_f32(0);
    do {
        float16x8_t a_f16x8, b_f16x8;
        if (n < 8) {
            nk_b64_vec_t a_vec, b_vec;
            nk_partial_load_b8x8_serial_(a, &a_vec, n);
            nk_partial_load_b8x8_serial_(b, &b_vec, n);
            a_f16x8 = nk_e2m3x8_to_f16x8_neon_(a_vec.u8x8);
            b_f16x8 = nk_e2m3x8_to_f16x8_neon_(b_vec.u8x8);
            n = 0;
        }
        else {
            a_f16x8 = nk_e2m3x8_to_f16x8_neon_(vld1_u8(a));
            b_f16x8 = nk_e2m3x8_to_f16x8_neon_(vld1_u8(b));
            a += 8, b += 8, n -= 8;
        }
        float32x4_t a_lo_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
        float32x4_t a_hi_f32x4 = vcvt_f32_f16(vget_high_f16(a_f16x8));
        float32x4_t b_lo_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
        float32x4_t b_hi_f32x4 = vcvt_f32_f16(vget_high_f16(b_f16x8));
        float32x4_t delta_lo_f32x4 = vsubq_f32(a_lo_f32x4, b_lo_f32x4);
        float32x4_t delta_hi_f32x4 = vsubq_f32(a_hi_f32x4, b_hi_f32x4);
        acc_f32x4 = vfmaq_f32(acc_f32x4, delta_lo_f32x4, delta_lo_f32x4);
        acc_f32x4 = vfmaq_f32(acc_f32x4, delta_hi_f32x4, delta_hi_f32x4);
    } while (n);
    *result = vaddvq_f32(acc_f32x4);
}
347
+
348
/**
 *  @brief  Euclidean (L2) distance between two e2m3 vectors: square root of the
 *          squared distance computed by `nk_sqeuclidean_e2m3_neon`.
 */
NK_PUBLIC void nk_euclidean_e2m3_neon(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
    nk_f32_t squared_distance;
    nk_sqeuclidean_e2m3_neon(a, b, n, &squared_distance);
    *result = nk_f32_sqrt_neon(squared_distance);
}
352
+
353
/** @brief Angular (cosine) distance between two packed E2M3 vectors, accumulated in f32. */
NK_PUBLIC void nk_angular_e2m3_neon(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
    float32x4_t dot_f32x4 = vdupq_n_f32(0);    // Σ aᵢ·bᵢ
    float32x4_t lhs_sq_f32x4 = vdupq_n_f32(0); // Σ aᵢ²
    float32x4_t rhs_sq_f32x4 = vdupq_n_f32(0); // Σ bᵢ²
    for (;;) {
        float16x8_t lhs_f16x8, rhs_f16x8;
        if (n < 8) {
            // Tail step: gather the remaining bytes into zero-padded 64-bit lanes.
            nk_b64_vec_t lhs_vec, rhs_vec;
            nk_partial_load_b8x8_serial_(a, &lhs_vec, n);
            nk_partial_load_b8x8_serial_(b, &rhs_vec, n);
            lhs_f16x8 = nk_e2m3x8_to_f16x8_neon_(lhs_vec.u8x8);
            rhs_f16x8 = nk_e2m3x8_to_f16x8_neon_(rhs_vec.u8x8);
            n = 0;
        }
        else {
            // Full step: widen 8 packed E2M3 scalars to f16 per operand.
            lhs_f16x8 = nk_e2m3x8_to_f16x8_neon_(vld1_u8(a));
            rhs_f16x8 = nk_e2m3x8_to_f16x8_neon_(vld1_u8(b));
            a += 8, b += 8, n -= 8;
        }
        // Promote each half to f32 and update all three running sums.
        float32x4_t lhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(lhs_f16x8));
        float32x4_t lhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(lhs_f16x8));
        float32x4_t rhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(rhs_f16x8));
        float32x4_t rhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(rhs_f16x8));
        dot_f32x4 = vfmaq_f32(dot_f32x4, lhs_lo_f32x4, rhs_lo_f32x4);
        dot_f32x4 = vfmaq_f32(dot_f32x4, lhs_hi_f32x4, rhs_hi_f32x4);
        lhs_sq_f32x4 = vfmaq_f32(lhs_sq_f32x4, lhs_lo_f32x4, lhs_lo_f32x4);
        lhs_sq_f32x4 = vfmaq_f32(lhs_sq_f32x4, lhs_hi_f32x4, lhs_hi_f32x4);
        rhs_sq_f32x4 = vfmaq_f32(rhs_sq_f32x4, rhs_lo_f32x4, rhs_lo_f32x4);
        rhs_sq_f32x4 = vfmaq_f32(rhs_sq_f32x4, rhs_hi_f32x4, rhs_hi_f32x4);
        if (!n) break;
    }
    // Reduce lanes and normalize: 1 − dot / √(Σa² · Σb²), with edge cases handled inside.
    nk_f32_t dot = vaddvq_f32(dot_f32x4);
    nk_f32_t lhs_sq = vaddvq_f32(lhs_sq_f32x4);
    nk_f32_t rhs_sq = vaddvq_f32(rhs_sq_f32x4);
    *result = nk_angular_normalize_f32_neon_(dot, lhs_sq, rhs_sq);
}
388
+
389
/** @brief Squared Euclidean distance between two packed E3M2 vectors, accumulated in f32. */
NK_PUBLIC void nk_sqeuclidean_e3m2_neon(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
    float32x4_t acc_f32x4 = vdupq_n_f32(0);
    for (;;) {
        float16x8_t lhs_f16x8, rhs_f16x8;
        if (n < 8) {
            // Tail step: gather the remaining bytes into zero-padded 64-bit lanes.
            nk_b64_vec_t lhs_vec, rhs_vec;
            nk_partial_load_b8x8_serial_(a, &lhs_vec, n);
            nk_partial_load_b8x8_serial_(b, &rhs_vec, n);
            lhs_f16x8 = nk_e3m2x8_to_f16x8_neon_(lhs_vec.u8x8);
            rhs_f16x8 = nk_e3m2x8_to_f16x8_neon_(rhs_vec.u8x8);
            n = 0;
        }
        else {
            // Full step: widen 8 packed E3M2 scalars to f16 per operand.
            lhs_f16x8 = nk_e3m2x8_to_f16x8_neon_(vld1_u8(a));
            rhs_f16x8 = nk_e3m2x8_to_f16x8_neon_(vld1_u8(b));
            a += 8, b += 8, n -= 8;
        }
        // Promote each half to f32 and accumulate (lhs − rhs)² with fused multiply-add.
        float32x4_t lhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(lhs_f16x8));
        float32x4_t lhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(lhs_f16x8));
        float32x4_t rhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(rhs_f16x8));
        float32x4_t rhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(rhs_f16x8));
        float32x4_t delta_lo_f32x4 = vsubq_f32(lhs_lo_f32x4, rhs_lo_f32x4);
        float32x4_t delta_hi_f32x4 = vsubq_f32(lhs_hi_f32x4, rhs_hi_f32x4);
        acc_f32x4 = vfmaq_f32(acc_f32x4, delta_lo_f32x4, delta_lo_f32x4);
        acc_f32x4 = vfmaq_f32(acc_f32x4, delta_hi_f32x4, delta_hi_f32x4);
        if (!n) break;
    }
    *result = vaddvq_f32(acc_f32x4);
}
417
+
418
/** @brief Euclidean (L2) distance between two packed E3M2 vectors: √(Σ (aᵢ − bᵢ)²). */
NK_PUBLIC void nk_euclidean_e3m2_neon(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
    // Reuse the squared-distance kernel, then take the square root of the scalar result.
    nk_sqeuclidean_e3m2_neon(a, b, n, result);
    *result = nk_f32_sqrt_neon(*result);
}
422
+
423
/** @brief Angular (cosine) distance between two packed E3M2 vectors, accumulated in f32. */
NK_PUBLIC void nk_angular_e3m2_neon(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
    float32x4_t dot_f32x4 = vdupq_n_f32(0);    // Σ aᵢ·bᵢ
    float32x4_t lhs_sq_f32x4 = vdupq_n_f32(0); // Σ aᵢ²
    float32x4_t rhs_sq_f32x4 = vdupq_n_f32(0); // Σ bᵢ²
    for (;;) {
        float16x8_t lhs_f16x8, rhs_f16x8;
        if (n < 8) {
            // Tail step: gather the remaining bytes into zero-padded 64-bit lanes.
            nk_b64_vec_t lhs_vec, rhs_vec;
            nk_partial_load_b8x8_serial_(a, &lhs_vec, n);
            nk_partial_load_b8x8_serial_(b, &rhs_vec, n);
            lhs_f16x8 = nk_e3m2x8_to_f16x8_neon_(lhs_vec.u8x8);
            rhs_f16x8 = nk_e3m2x8_to_f16x8_neon_(rhs_vec.u8x8);
            n = 0;
        }
        else {
            // Full step: widen 8 packed E3M2 scalars to f16 per operand.
            lhs_f16x8 = nk_e3m2x8_to_f16x8_neon_(vld1_u8(a));
            rhs_f16x8 = nk_e3m2x8_to_f16x8_neon_(vld1_u8(b));
            a += 8, b += 8, n -= 8;
        }
        // Promote each half to f32 and update all three running sums.
        float32x4_t lhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(lhs_f16x8));
        float32x4_t lhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(lhs_f16x8));
        float32x4_t rhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(rhs_f16x8));
        float32x4_t rhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(rhs_f16x8));
        dot_f32x4 = vfmaq_f32(dot_f32x4, lhs_lo_f32x4, rhs_lo_f32x4);
        dot_f32x4 = vfmaq_f32(dot_f32x4, lhs_hi_f32x4, rhs_hi_f32x4);
        lhs_sq_f32x4 = vfmaq_f32(lhs_sq_f32x4, lhs_lo_f32x4, lhs_lo_f32x4);
        lhs_sq_f32x4 = vfmaq_f32(lhs_sq_f32x4, lhs_hi_f32x4, lhs_hi_f32x4);
        rhs_sq_f32x4 = vfmaq_f32(rhs_sq_f32x4, rhs_lo_f32x4, rhs_lo_f32x4);
        rhs_sq_f32x4 = vfmaq_f32(rhs_sq_f32x4, rhs_hi_f32x4, rhs_hi_f32x4);
        if (!n) break;
    }
    // Reduce lanes and normalize: 1 − dot / √(Σa² · Σb²), with edge cases handled inside.
    nk_f32_t dot = vaddvq_f32(dot_f32x4);
    nk_f32_t lhs_sq = vaddvq_f32(lhs_sq_f32x4);
    nk_f32_t rhs_sq = vaddvq_f32(rhs_sq_f32x4);
    *result = nk_angular_normalize_f32_neon_(dot, lhs_sq, rhs_sq);
}
458
+
459
/** @brief Squared Euclidean distance between two packed E4M3 vectors, accumulated in f32. */
NK_PUBLIC void nk_sqeuclidean_e4m3_neon(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
    float32x4_t acc_f32x4 = vdupq_n_f32(0);
    for (;;) {
        float16x8_t lhs_f16x8, rhs_f16x8;
        if (n < 8) {
            // Tail step: gather the remaining bytes into zero-padded 64-bit lanes.
            nk_b64_vec_t lhs_vec, rhs_vec;
            nk_partial_load_b8x8_serial_(a, &lhs_vec, n);
            nk_partial_load_b8x8_serial_(b, &rhs_vec, n);
            lhs_f16x8 = nk_e4m3x8_to_f16x8_neon_(lhs_vec.u8x8);
            rhs_f16x8 = nk_e4m3x8_to_f16x8_neon_(rhs_vec.u8x8);
            n = 0;
        }
        else {
            // Full step: widen 8 packed E4M3 scalars to f16 per operand.
            lhs_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(a));
            rhs_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(b));
            a += 8, b += 8, n -= 8;
        }
        // Promote each half to f32 and accumulate (lhs − rhs)² with fused multiply-add.
        float32x4_t lhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(lhs_f16x8));
        float32x4_t lhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(lhs_f16x8));
        float32x4_t rhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(rhs_f16x8));
        float32x4_t rhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(rhs_f16x8));
        float32x4_t delta_lo_f32x4 = vsubq_f32(lhs_lo_f32x4, rhs_lo_f32x4);
        float32x4_t delta_hi_f32x4 = vsubq_f32(lhs_hi_f32x4, rhs_hi_f32x4);
        acc_f32x4 = vfmaq_f32(acc_f32x4, delta_lo_f32x4, delta_lo_f32x4);
        acc_f32x4 = vfmaq_f32(acc_f32x4, delta_hi_f32x4, delta_hi_f32x4);
        if (!n) break;
    }
    *result = vaddvq_f32(acc_f32x4);
}
487
+
488
/** @brief Euclidean (L2) distance between two packed E4M3 vectors: √(Σ (aᵢ − bᵢ)²). */
NK_PUBLIC void nk_euclidean_e4m3_neon(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
    // Reuse the squared-distance kernel, then take the square root of the scalar result.
    nk_sqeuclidean_e4m3_neon(a, b, n, result);
    *result = nk_f32_sqrt_neon(*result);
}
492
+
493
/** @brief Angular (cosine) distance between two packed E4M3 vectors, accumulated in f32. */
NK_PUBLIC void nk_angular_e4m3_neon(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
    float32x4_t dot_f32x4 = vdupq_n_f32(0);    // Σ aᵢ·bᵢ
    float32x4_t lhs_sq_f32x4 = vdupq_n_f32(0); // Σ aᵢ²
    float32x4_t rhs_sq_f32x4 = vdupq_n_f32(0); // Σ bᵢ²
    for (;;) {
        float16x8_t lhs_f16x8, rhs_f16x8;
        if (n < 8) {
            // Tail step: gather the remaining bytes into zero-padded 64-bit lanes.
            nk_b64_vec_t lhs_vec, rhs_vec;
            nk_partial_load_b8x8_serial_(a, &lhs_vec, n);
            nk_partial_load_b8x8_serial_(b, &rhs_vec, n);
            lhs_f16x8 = nk_e4m3x8_to_f16x8_neon_(lhs_vec.u8x8);
            rhs_f16x8 = nk_e4m3x8_to_f16x8_neon_(rhs_vec.u8x8);
            n = 0;
        }
        else {
            // Full step: widen 8 packed E4M3 scalars to f16 per operand.
            lhs_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(a));
            rhs_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(b));
            a += 8, b += 8, n -= 8;
        }
        // Promote each half to f32 and update all three running sums.
        float32x4_t lhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(lhs_f16x8));
        float32x4_t lhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(lhs_f16x8));
        float32x4_t rhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(rhs_f16x8));
        float32x4_t rhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(rhs_f16x8));
        dot_f32x4 = vfmaq_f32(dot_f32x4, lhs_lo_f32x4, rhs_lo_f32x4);
        dot_f32x4 = vfmaq_f32(dot_f32x4, lhs_hi_f32x4, rhs_hi_f32x4);
        lhs_sq_f32x4 = vfmaq_f32(lhs_sq_f32x4, lhs_lo_f32x4, lhs_lo_f32x4);
        lhs_sq_f32x4 = vfmaq_f32(lhs_sq_f32x4, lhs_hi_f32x4, lhs_hi_f32x4);
        rhs_sq_f32x4 = vfmaq_f32(rhs_sq_f32x4, rhs_lo_f32x4, rhs_lo_f32x4);
        rhs_sq_f32x4 = vfmaq_f32(rhs_sq_f32x4, rhs_hi_f32x4, rhs_hi_f32x4);
        if (!n) break;
    }
    // Reduce lanes and normalize: 1 − dot / √(Σa² · Σb²), with edge cases handled inside.
    nk_f32_t dot = vaddvq_f32(dot_f32x4);
    nk_f32_t lhs_sq = vaddvq_f32(lhs_sq_f32x4);
    nk_f32_t rhs_sq = vaddvq_f32(rhs_sq_f32x4);
    *result = nk_angular_normalize_f32_neon_(dot, lhs_sq, rhs_sq);
}
528
+
529
/** @brief Squared Euclidean distance between two packed E5M2 vectors, accumulated in f32. */
NK_PUBLIC void nk_sqeuclidean_e5m2_neon(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
    float32x4_t acc_f32x4 = vdupq_n_f32(0);
    for (;;) {
        float16x8_t lhs_f16x8, rhs_f16x8;
        if (n < 8) {
            // Tail step: gather the remaining bytes into zero-padded 64-bit lanes.
            nk_b64_vec_t lhs_vec, rhs_vec;
            nk_partial_load_b8x8_serial_(a, &lhs_vec, n);
            nk_partial_load_b8x8_serial_(b, &rhs_vec, n);
            lhs_f16x8 = nk_e5m2x8_to_f16x8_neon_(lhs_vec.u8x8);
            rhs_f16x8 = nk_e5m2x8_to_f16x8_neon_(rhs_vec.u8x8);
            n = 0;
        }
        else {
            // Full step: widen 8 packed E5M2 scalars to f16 per operand.
            lhs_f16x8 = nk_e5m2x8_to_f16x8_neon_(vld1_u8(a));
            rhs_f16x8 = nk_e5m2x8_to_f16x8_neon_(vld1_u8(b));
            a += 8, b += 8, n -= 8;
        }
        // Promote each half to f32 and accumulate (lhs − rhs)² with fused multiply-add.
        float32x4_t lhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(lhs_f16x8));
        float32x4_t lhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(lhs_f16x8));
        float32x4_t rhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(rhs_f16x8));
        float32x4_t rhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(rhs_f16x8));
        float32x4_t delta_lo_f32x4 = vsubq_f32(lhs_lo_f32x4, rhs_lo_f32x4);
        float32x4_t delta_hi_f32x4 = vsubq_f32(lhs_hi_f32x4, rhs_hi_f32x4);
        acc_f32x4 = vfmaq_f32(acc_f32x4, delta_lo_f32x4, delta_lo_f32x4);
        acc_f32x4 = vfmaq_f32(acc_f32x4, delta_hi_f32x4, delta_hi_f32x4);
        if (!n) break;
    }
    *result = vaddvq_f32(acc_f32x4);
}
557
+
558
/** @brief Euclidean (L2) distance between two packed E5M2 vectors: √(Σ (aᵢ − bᵢ)²). */
NK_PUBLIC void nk_euclidean_e5m2_neon(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
    // Reuse the squared-distance kernel, then take the square root of the scalar result.
    nk_sqeuclidean_e5m2_neon(a, b, n, result);
    *result = nk_f32_sqrt_neon(*result);
}
562
+
563
/** @brief Angular (cosine) distance between two packed E5M2 vectors, accumulated in f32. */
NK_PUBLIC void nk_angular_e5m2_neon(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
    float32x4_t dot_f32x4 = vdupq_n_f32(0);    // Σ aᵢ·bᵢ
    float32x4_t lhs_sq_f32x4 = vdupq_n_f32(0); // Σ aᵢ²
    float32x4_t rhs_sq_f32x4 = vdupq_n_f32(0); // Σ bᵢ²
    for (;;) {
        float16x8_t lhs_f16x8, rhs_f16x8;
        if (n < 8) {
            // Tail step: gather the remaining bytes into zero-padded 64-bit lanes.
            nk_b64_vec_t lhs_vec, rhs_vec;
            nk_partial_load_b8x8_serial_(a, &lhs_vec, n);
            nk_partial_load_b8x8_serial_(b, &rhs_vec, n);
            lhs_f16x8 = nk_e5m2x8_to_f16x8_neon_(lhs_vec.u8x8);
            rhs_f16x8 = nk_e5m2x8_to_f16x8_neon_(rhs_vec.u8x8);
            n = 0;
        }
        else {
            // Full step: widen 8 packed E5M2 scalars to f16 per operand.
            lhs_f16x8 = nk_e5m2x8_to_f16x8_neon_(vld1_u8(a));
            rhs_f16x8 = nk_e5m2x8_to_f16x8_neon_(vld1_u8(b));
            a += 8, b += 8, n -= 8;
        }
        // Promote each half to f32 and update all three running sums.
        float32x4_t lhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(lhs_f16x8));
        float32x4_t lhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(lhs_f16x8));
        float32x4_t rhs_lo_f32x4 = vcvt_f32_f16(vget_low_f16(rhs_f16x8));
        float32x4_t rhs_hi_f32x4 = vcvt_f32_f16(vget_high_f16(rhs_f16x8));
        dot_f32x4 = vfmaq_f32(dot_f32x4, lhs_lo_f32x4, rhs_lo_f32x4);
        dot_f32x4 = vfmaq_f32(dot_f32x4, lhs_hi_f32x4, rhs_hi_f32x4);
        lhs_sq_f32x4 = vfmaq_f32(lhs_sq_f32x4, lhs_lo_f32x4, lhs_lo_f32x4);
        lhs_sq_f32x4 = vfmaq_f32(lhs_sq_f32x4, lhs_hi_f32x4, lhs_hi_f32x4);
        rhs_sq_f32x4 = vfmaq_f32(rhs_sq_f32x4, rhs_lo_f32x4, rhs_lo_f32x4);
        rhs_sq_f32x4 = vfmaq_f32(rhs_sq_f32x4, rhs_hi_f32x4, rhs_hi_f32x4);
        if (!n) break;
    }
    // Reduce lanes and normalize: 1 − dot / √(Σa² · Σb²), with edge cases handled inside.
    nk_f32_t dot = vaddvq_f32(dot_f32x4);
    nk_f32_t lhs_sq = vaddvq_f32(lhs_sq_f32x4);
    nk_f32_t rhs_sq = vaddvq_f32(rhs_sq_f32x4);
    *result = nk_angular_normalize_f32_neon_(dot, lhs_sq, rhs_sq);
}
598
+
599
/** @brief Angular from_dot: computes 1 − dot × rsqrt(query_sumsq × target_sumsq) for 4 pairs in f64.
 *
 *  @param dots          Four precomputed dot products ⟨query, targetₖ⟩, two per 128-bit half.
 *  @param query_sumsq   The query's squared L2 norm, broadcast across all lanes.
 *  @param target_sumsqs Four target squared L2 norms, matching the layout of @p dots.
 *  @param results       Output: four angular distances, clamped to be non-negative.
 */
NK_INTERNAL void nk_angular_through_f64_from_dot_neon_(nk_b256_vec_t dots, nk_f64_t query_sumsq,
                                                       nk_b256_vec_t target_sumsqs, nk_b256_vec_t *results) {
    float64x2_t dots_ab_f64x2 = dots.f64x2s[0];
    float64x2_t dots_cd_f64x2 = dots.f64x2s[1];
    float64x2_t query_sumsq_f64x2 = vdupq_n_f64(query_sumsq);
    float64x2_t target_sumsqs_ab_f64x2 = target_sumsqs.f64x2s[0];
    float64x2_t target_sumsqs_cd_f64x2 = target_sumsqs.f64x2s[1];

    // products = query_sumsq * target_sumsq
    float64x2_t products_ab_f64x2 = vmulq_f64(query_sumsq_f64x2, target_sumsqs_ab_f64x2);
    float64x2_t products_cd_f64x2 = vmulq_f64(query_sumsq_f64x2, target_sumsqs_cd_f64x2);

    // rsqrt with Newton-Raphson (2 iterations for ~48-bit precision)
    // Each step multiplies the current estimate by `vrsqrtsq_f64(x·est, est)` = (3 − x·est²) / 2.
    float64x2_t rsqrt_ab_f64x2 = vrsqrteq_f64(products_ab_f64x2);
    float64x2_t rsqrt_cd_f64x2 = vrsqrteq_f64(products_cd_f64x2);
    rsqrt_ab_f64x2 = vmulq_f64(rsqrt_ab_f64x2,
                               vrsqrtsq_f64(vmulq_f64(products_ab_f64x2, rsqrt_ab_f64x2), rsqrt_ab_f64x2));
    rsqrt_cd_f64x2 = vmulq_f64(rsqrt_cd_f64x2,
                               vrsqrtsq_f64(vmulq_f64(products_cd_f64x2, rsqrt_cd_f64x2), rsqrt_cd_f64x2));
    rsqrt_ab_f64x2 = vmulq_f64(rsqrt_ab_f64x2,
                               vrsqrtsq_f64(vmulq_f64(products_ab_f64x2, rsqrt_ab_f64x2), rsqrt_ab_f64x2));
    rsqrt_cd_f64x2 = vmulq_f64(rsqrt_cd_f64x2,
                               vrsqrtsq_f64(vmulq_f64(products_cd_f64x2, rsqrt_cd_f64x2), rsqrt_cd_f64x2));

    // angular = 1 − dot × rsqrt(product)
    float64x2_t ones_f64x2 = vdupq_n_f64(1.0);
    float64x2_t zeros_f64x2 = vdupq_n_f64(0.0);
    float64x2_t result_ab_f64x2 = vsubq_f64(ones_f64x2, vmulq_f64(dots_ab_f64x2, rsqrt_ab_f64x2));
    float64x2_t result_cd_f64x2 = vsubq_f64(ones_f64x2, vmulq_f64(dots_cd_f64x2, rsqrt_cd_f64x2));

    // Clamp to [0, inf) — rounding can push a perfect match slightly negative.
    result_ab_f64x2 = vmaxq_f64(result_ab_f64x2, zeros_f64x2);
    result_cd_f64x2 = vmaxq_f64(result_cd_f64x2, zeros_f64x2);

    // Handle edge cases with vectorized selects: comparisons yield all-ones/all-zeros lane masks.
    uint64x2_t products_zero_ab_u64x2 = vceqq_f64(products_ab_f64x2, zeros_f64x2);
    uint64x2_t products_zero_cd_u64x2 = vceqq_f64(products_cd_f64x2, zeros_f64x2);
    uint64x2_t dots_zero_ab_u64x2 = vceqq_f64(dots_ab_f64x2, zeros_f64x2);
    uint64x2_t dots_zero_cd_u64x2 = vceqq_f64(dots_cd_f64x2, zeros_f64x2);

    // Both zero → result = 0; products zero but dots nonzero → result = 1
    uint64x2_t both_zero_ab_u64x2 = vandq_u64(products_zero_ab_u64x2, dots_zero_ab_u64x2);
    uint64x2_t both_zero_cd_u64x2 = vandq_u64(products_zero_cd_u64x2, dots_zero_cd_u64x2);
    result_ab_f64x2 = vbslq_f64(both_zero_ab_u64x2, zeros_f64x2, result_ab_f64x2);
    result_cd_f64x2 = vbslq_f64(both_zero_cd_u64x2, zeros_f64x2, result_cd_f64x2);

    // NEON has no 64-bit vector NOT, so invert the mask through a u32 reinterpret; this is safe
    // because compare results are all-ones or all-zeros per 64-bit lane.
    uint64x2_t prod_zero_dot_nonzero_ab_u64x2 = vandq_u64(
        products_zero_ab_u64x2, vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(dots_zero_ab_u64x2))));
    uint64x2_t prod_zero_dot_nonzero_cd_u64x2 = vandq_u64(
        products_zero_cd_u64x2, vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(dots_zero_cd_u64x2))));
    result_ab_f64x2 = vbslq_f64(prod_zero_dot_nonzero_ab_u64x2, ones_f64x2, result_ab_f64x2);
    result_cd_f64x2 = vbslq_f64(prod_zero_dot_nonzero_cd_u64x2, ones_f64x2, result_cd_f64x2);

    results->f64x2s[0] = result_ab_f64x2;
    results->f64x2s[1] = result_cd_f64x2;
}
656
+
657
/** @brief Euclidean from_dot: computes √(query_sumsq + target_sumsq − 2 × dot) for 4 pairs in f64. */
NK_INTERNAL void nk_euclidean_through_f64_from_dot_neon_(nk_b256_vec_t dots, nk_f64_t query_sumsq,
                                                         nk_b256_vec_t target_sumsqs, nk_b256_vec_t *results) {
    float64x2_t query_f64x2 = vdupq_n_f64(query_sumsq);
    float64x2_t minus_two_f64x2 = vdupq_n_f64(-2.0);
    float64x2_t floor_f64x2 = vdupq_n_f64(0.0);

    // dist² = ‖q‖² + ‖t‖² − 2·⟨q,t⟩, evaluated per 2-wide half with a fused multiply-add.
    float64x2_t norms_ab_f64x2 = vaddq_f64(query_f64x2, target_sumsqs.f64x2s[0]);
    float64x2_t norms_cd_f64x2 = vaddq_f64(query_f64x2, target_sumsqs.f64x2s[1]);
    float64x2_t dist_sq_ab_f64x2 = vfmaq_f64(norms_ab_f64x2, minus_two_f64x2, dots.f64x2s[0]);
    float64x2_t dist_sq_cd_f64x2 = vfmaq_f64(norms_cd_f64x2, minus_two_f64x2, dots.f64x2s[1]);

    // Clamp tiny negatives introduced by rounding, then take the square root in f64.
    results->f64x2s[0] = vsqrtq_f64(vmaxq_f64(dist_sq_ab_f64x2, floor_f64x2));
    results->f64x2s[1] = vsqrtq_f64(vmaxq_f64(dist_sq_cd_f64x2, floor_f64x2));
}
683
+
684
/** @brief Angular from_dot: computes 1 − dot × rsqrt(query_sumsq × target_sumsq) for 4 pairs in f32. */
NK_INTERNAL void nk_angular_through_f32_from_dot_neon_(nk_b128_vec_t dots, nk_f32_t query_sumsq,
                                                       nk_b128_vec_t target_sumsqs, nk_b128_vec_t *results) {
    // Per-pair product of squared norms.
    float32x4_t norm_products_f32x4 = vmulq_f32(vdupq_n_f32(query_sumsq), target_sumsqs.f32x4);

    // Estimate 1/√x, then refine with two Newton-Raphson steps:
    // each step multiplies the estimate by `vrsqrtsq_f32(x·est, est)` = (3 − x·est²) / 2.
    float32x4_t inv_norm_f32x4 = vrsqrteq_f32(norm_products_f32x4);
    inv_norm_f32x4 = vmulq_f32(inv_norm_f32x4, vrsqrtsq_f32(vmulq_f32(norm_products_f32x4, inv_norm_f32x4), inv_norm_f32x4));
    inv_norm_f32x4 = vmulq_f32(inv_norm_f32x4, vrsqrtsq_f32(vmulq_f32(norm_products_f32x4, inv_norm_f32x4), inv_norm_f32x4));

    // distance = 1 − cos(θ), clamped so rounding never yields a negative distance.
    float32x4_t cosine_f32x4 = vmulq_f32(dots.f32x4, inv_norm_f32x4);
    float32x4_t distance_f32x4 = vsubq_f32(vdupq_n_f32(1.0f), cosine_f32x4);
    results->f32x4 = vmaxq_f32(distance_f32x4, vdupq_n_f32(0.0f));
}
700
+
701
/** @brief Euclidean from_dot: computes √(query_sumsq + target_sumsq − 2 × dot) for 4 pairs in f32. */
NK_INTERNAL void nk_euclidean_through_f32_from_dot_neon_(nk_b128_vec_t dots, nk_f32_t query_sumsq,
                                                         nk_b128_vec_t target_sumsqs, nk_b128_vec_t *results) {
    // dist² = ‖q‖² + ‖t‖² − 2·⟨q,t⟩, with the subtraction fused via multiply-subtract.
    float32x4_t norms_f32x4 = vaddq_f32(vdupq_n_f32(query_sumsq), target_sumsqs.f32x4);
    float32x4_t dist_sq_f32x4 = vfmsq_f32(norms_f32x4, vdupq_n_f32(2.0f), dots.f32x4);
    // Clamp tiny negatives from rounding, then take the square root.
    results->f32x4 = vsqrtq_f32(vmaxq_f32(dist_sq_f32x4, vdupq_n_f32(0.0f)));
}
713
+
714
/** @brief Angular from_dot for i32 accumulators: cast to f32, rsqrt+NR, clamp. 4 pairs. */
NK_INTERNAL void nk_angular_through_i32_from_dot_neon_(nk_b128_vec_t dots, nk_i32_t query_sumsq,
                                                       nk_b128_vec_t target_sumsqs, nk_b128_vec_t *results) {
    // Widen the integer accumulators to f32 before normalizing.
    float32x4_t dot_f32x4 = vcvtq_f32_s32(dots.i32x4);
    float32x4_t norm_products_f32x4 =
        vmulq_f32(vdupq_n_f32((nk_f32_t)query_sumsq), vcvtq_f32_s32(target_sumsqs.i32x4));
    // distance = 1 − dot / √(‖q‖²·‖t‖²), clamped to stay non-negative.
    float32x4_t cosine_f32x4 = vmulq_f32(dot_f32x4, nk_rsqrt_f32x4_neon_(norm_products_f32x4));
    float32x4_t distance_f32x4 = vsubq_f32(vdupq_n_f32(1.0f), cosine_f32x4);
    results->f32x4 = vmaxq_f32(distance_f32x4, vdupq_n_f32(0.0f));
}
725
+
726
/** @brief Euclidean from_dot for i32 accumulators: cast to f32, then √(a² + b² − 2ab). 4 pairs. */
NK_INTERNAL void nk_euclidean_through_i32_from_dot_neon_(nk_b128_vec_t dots, nk_i32_t query_sumsq,
                                                         nk_b128_vec_t target_sumsqs, nk_b128_vec_t *results) {
    // Widen the integer accumulators to f32 before combining.
    float32x4_t dot_f32x4 = vcvtq_f32_s32(dots.i32x4);
    float32x4_t norms_f32x4 = vaddq_f32(vdupq_n_f32((nk_f32_t)query_sumsq), vcvtq_f32_s32(target_sumsqs.i32x4));
    // dist² = ‖q‖² + ‖t‖² − 2·⟨q,t⟩, fused via multiply-subtract; clamp, then sqrt.
    float32x4_t dist_sq_f32x4 = vfmsq_f32(norms_f32x4, vdupq_n_f32(2.0f), dot_f32x4);
    results->f32x4 = vsqrtq_f32(vmaxq_f32(dist_sq_f32x4, vdupq_n_f32(0.0f)));
}
736
+
737
/** @brief Angular from_dot for u32 accumulators: cast to f32, rsqrt+NR, clamp. 4 pairs. */
NK_INTERNAL void nk_angular_through_u32_from_dot_neon_(nk_b128_vec_t dots, nk_u32_t query_sumsq,
                                                       nk_b128_vec_t target_sumsqs, nk_b128_vec_t *results) {
    // Widen the unsigned accumulators to f32 before normalizing.
    float32x4_t dot_f32x4 = vcvtq_f32_u32(dots.u32x4);
    float32x4_t norm_products_f32x4 =
        vmulq_f32(vdupq_n_f32((nk_f32_t)query_sumsq), vcvtq_f32_u32(target_sumsqs.u32x4));
    // distance = 1 − dot / √(‖q‖²·‖t‖²), clamped to stay non-negative.
    float32x4_t cosine_f32x4 = vmulq_f32(dot_f32x4, nk_rsqrt_f32x4_neon_(norm_products_f32x4));
    float32x4_t distance_f32x4 = vsubq_f32(vdupq_n_f32(1.0f), cosine_f32x4);
    results->f32x4 = vmaxq_f32(distance_f32x4, vdupq_n_f32(0.0f));
}
748
+
749
/** @brief Euclidean from_dot for u32 accumulators: cast to f32, then √(a² + b² − 2ab). 4 pairs. */
NK_INTERNAL void nk_euclidean_through_u32_from_dot_neon_(nk_b128_vec_t dots, nk_u32_t query_sumsq,
                                                         nk_b128_vec_t target_sumsqs, nk_b128_vec_t *results) {
    // Widen the unsigned accumulators to f32 before combining.
    float32x4_t dot_f32x4 = vcvtq_f32_u32(dots.u32x4);
    float32x4_t norms_f32x4 = vaddq_f32(vdupq_n_f32((nk_f32_t)query_sumsq), vcvtq_f32_u32(target_sumsqs.u32x4));
    // dist² = ‖q‖² + ‖t‖² − 2·⟨q,t⟩, fused via multiply-subtract; clamp, then sqrt.
    float32x4_t dist_sq_f32x4 = vfmsq_f32(norms_f32x4, vdupq_n_f32(2.0f), dot_f32x4);
    results->f32x4 = vsqrtq_f32(vmaxq_f32(dist_sq_f32x4, vdupq_n_f32(0.0f)));
}
759
+
760
+ #if defined(__clang__)
761
+ #pragma clang attribute pop
762
+ #elif defined(__GNUC__)
763
+ #pragma GCC pop_options
764
+ #endif
765
+
766
+ #if defined(__cplusplus)
767
+ } // extern "C"
768
+ #endif
769
+
770
+ #pragma endregion - Smaller Floats
771
+ #endif // NK_TARGET_NEON
772
+ #endif // NK_TARGET_ARM_
773
+ #endif // NK_SPATIAL_NEON_H