numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,960 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Spatial Similarity Measures for Haswell.
|
|
3
|
+
* @file include/numkong/spatial/haswell.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date December 27, 2025
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/spatial.h
|
|
8
|
+
*
|
|
9
|
+
* @section spatial_haswell_instructions Key AVX2 Spatial Instructions
|
|
10
|
+
*
|
|
11
|
+
* Intrinsic Instruction Latency Throughput Ports
|
|
12
|
+
* _mm256_fmadd_ps VFMADD (YMM, YMM, YMM) 5cy 0.5/cy p01
|
|
13
|
+
* _mm256_mul_ps VMULPS (YMM, YMM, YMM) 5cy 0.5/cy p01
|
|
14
|
+
* _mm256_add_ps VADDPS (YMM, YMM, YMM) 3cy 1/cy p01
|
|
15
|
+
* _mm256_sub_ps VSUBPS (YMM, YMM, YMM) 3cy 1/cy p01
|
|
16
|
+
* _mm_rsqrt_ps VRSQRTPS (XMM, XMM) 5cy 1/cy p0
|
|
17
|
+
* _mm_sqrt_ps VSQRTPS (XMM, XMM) 11cy 7cy p0
|
|
18
|
+
* _mm256_sqrt_ps VSQRTPS (YMM, YMM) 12cy 14cy p0
|
|
19
|
+
*
|
|
20
|
+
* For angular distance normalization, `_mm_rsqrt_ps` provides ~12-bit precision (1.5 x 2⁻¹² error).
|
|
21
|
+
* Newton-Raphson refinement doubles precision to ~22-24 bits, sufficient for f32. For f64 we use
|
|
22
|
+
* the exact `_mm_sqrt_pd` instruction since fast rsqrt approximations lack f64 precision.
|
|
23
|
+
*/
|
|
24
|
+
#ifndef NK_SPATIAL_HASWELL_H
|
|
25
|
+
#define NK_SPATIAL_HASWELL_H
|
|
26
|
+
|
|
27
|
+
#if NK_TARGET_X86_
|
|
28
|
+
#if NK_TARGET_HASWELL
|
|
29
|
+
|
|
30
|
+
#include "numkong/types.h"
|
|
31
|
+
#include "numkong/scalar/haswell.h" // `nk_f32_sqrt_haswell`
|
|
32
|
+
#include "numkong/dot/haswell.h" // `nk_dot_f32x4_state_haswell_t`
|
|
33
|
+
#include "numkong/reduce/haswell.h" // `nk_reduce_add_f32x8_haswell_`
|
|
34
|
+
|
|
35
|
+
#if defined(__cplusplus)
|
|
36
|
+
extern "C" {
|
|
37
|
+
#endif
|
|
38
|
+
|
|
39
|
+
#if defined(__clang__)
|
|
40
|
+
#pragma clang attribute push(__attribute__((target("avx2,f16c,fma,bmi,bmi2"))), apply_to = function)
|
|
41
|
+
#elif defined(__GNUC__)
|
|
42
|
+
#pragma GCC push_options
|
|
43
|
+
#pragma GCC target("avx2", "f16c", "fma", "bmi", "bmi2")
|
|
44
|
+
#endif
|
|
45
|
+
|
|
46
|
+
/** @brief Reciprocal square root of 4 floats with Newton-Raphson refinement. */
|
|
47
|
+
NK_INTERNAL __m128 nk_rsqrt_f32x4_haswell_(__m128 x) {
|
|
48
|
+
__m128 rsqrt_f32x4 = _mm_rsqrt_ps(x);
|
|
49
|
+
__m128 nr_f32x4 = _mm_mul_ps(_mm_mul_ps(x, rsqrt_f32x4), rsqrt_f32x4);
|
|
50
|
+
nr_f32x4 = _mm_sub_ps(_mm_set1_ps(3.0f), nr_f32x4);
|
|
51
|
+
return _mm_mul_ps(_mm_mul_ps(_mm_set1_ps(0.5f), rsqrt_f32x4), nr_f32x4);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/** @brief Safe square root of 4 floats with zero-clamping for numerical stability. */
|
|
55
|
+
NK_INTERNAL __m128 nk_safe_sqrt_f32x4_haswell_(__m128 x) { return _mm_sqrt_ps(_mm_max_ps(x, _mm_setzero_ps())); }
|
|
56
|
+
|
|
57
|
+
/** @brief Angular from_dot: computes 1 − dot × rsqrt(query_sumsq × target_sumsq) for 4 pairs. */
|
|
58
|
+
NK_INTERNAL void nk_angular_through_f32_from_dot_haswell_(nk_b128_vec_t dots, nk_f32_t query_sumsq,
|
|
59
|
+
nk_b128_vec_t target_sumsqs, nk_b128_vec_t *results) {
|
|
60
|
+
__m128 dots_f32x4 = dots.xmm_ps;
|
|
61
|
+
__m128 query_sumsq_f32x4 = _mm_set1_ps(query_sumsq);
|
|
62
|
+
__m128 products_f32x4 = _mm_mul_ps(query_sumsq_f32x4, target_sumsqs.xmm_ps);
|
|
63
|
+
__m128 rsqrt_f32x4 = nk_rsqrt_f32x4_haswell_(products_f32x4);
|
|
64
|
+
__m128 normalized_f32x4 = _mm_mul_ps(dots_f32x4, rsqrt_f32x4);
|
|
65
|
+
__m128 angular_f32x4 = _mm_sub_ps(_mm_set1_ps(1.0f), normalized_f32x4);
|
|
66
|
+
results->xmm_ps = _mm_max_ps(angular_f32x4, _mm_setzero_ps());
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
/** @brief Euclidean from_dot: computes √(query_sumsq + target_sumsq − 2 × dot) for 4 pairs. */
|
|
70
|
+
NK_INTERNAL void nk_euclidean_through_f32_from_dot_haswell_(nk_b128_vec_t dots, nk_f32_t query_sumsq,
|
|
71
|
+
nk_b128_vec_t target_sumsqs, nk_b128_vec_t *results) {
|
|
72
|
+
__m128 dots_f32x4 = dots.xmm_ps;
|
|
73
|
+
__m128 query_sumsq_f32x4 = _mm_set1_ps(query_sumsq);
|
|
74
|
+
__m128 sum_sq_f32x4 = _mm_add_ps(query_sumsq_f32x4, target_sumsqs.xmm_ps);
|
|
75
|
+
__m128 dist_sq_f32x4 = _mm_fnmadd_ps(_mm_set1_ps(2.0f), dots_f32x4, sum_sq_f32x4);
|
|
76
|
+
results->xmm_ps = nk_safe_sqrt_f32x4_haswell_(dist_sq_f32x4);
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
/** @brief Angular from_dot for native f64: 1 − dot / √(query_sumsq × target_sumsq) for 4 pairs. */
|
|
80
|
+
NK_INTERNAL void nk_angular_through_f64_from_dot_haswell_(nk_b256_vec_t dots, nk_f64_t query_sumsq,
|
|
81
|
+
nk_b256_vec_t target_sumsqs, nk_b256_vec_t *results) {
|
|
82
|
+
__m256d dots_f64x4 = dots.ymm_pd;
|
|
83
|
+
__m256d query_sumsq_f64x4 = _mm256_set1_pd(query_sumsq);
|
|
84
|
+
__m256d products_f64x4 = _mm256_mul_pd(query_sumsq_f64x4, target_sumsqs.ymm_pd);
|
|
85
|
+
__m256d sqrt_products_f64x4 = _mm256_sqrt_pd(products_f64x4);
|
|
86
|
+
__m256d normalized_f64x4 = _mm256_div_pd(dots_f64x4, sqrt_products_f64x4);
|
|
87
|
+
__m256d angular_f64x4 = _mm256_sub_pd(_mm256_set1_pd(1.0), normalized_f64x4);
|
|
88
|
+
results->ymm_pd = _mm256_max_pd(angular_f64x4, _mm256_setzero_pd());
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
/** @brief Euclidean from_dot for native f64: √(query_sumsq + target_sumsq − 2 × dot) for 4 pairs. */
|
|
92
|
+
NK_INTERNAL void nk_euclidean_through_f64_from_dot_haswell_(nk_b256_vec_t dots, nk_f64_t query_sumsq,
|
|
93
|
+
nk_b256_vec_t target_sumsqs, nk_b256_vec_t *results) {
|
|
94
|
+
__m256d dots_f64x4 = dots.ymm_pd;
|
|
95
|
+
__m256d query_sumsq_f64x4 = _mm256_set1_pd(query_sumsq);
|
|
96
|
+
__m256d sum_sq_f64x4 = _mm256_add_pd(query_sumsq_f64x4, target_sumsqs.ymm_pd);
|
|
97
|
+
__m256d dist_sq_f64x4 = _mm256_fnmadd_pd(_mm256_set1_pd(2.0), dots_f64x4, sum_sq_f64x4);
|
|
98
|
+
results->ymm_pd = _mm256_sqrt_pd(_mm256_max_pd(dist_sq_f64x4, _mm256_setzero_pd()));
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
/** @brief Angular from_dot for i32 accumulators: cast to f32, rsqrt+NR, clamp. 4 pairs. */
|
|
102
|
+
NK_INTERNAL void nk_angular_through_i32_from_dot_haswell_(nk_b128_vec_t dots, nk_i32_t query_sumsq,
|
|
103
|
+
nk_b128_vec_t target_sumsqs, nk_b128_vec_t *results) {
|
|
104
|
+
__m128 dots_f32x4 = _mm_cvtepi32_ps(dots.xmm);
|
|
105
|
+
__m128 query_sumsq_f32x4 = _mm_set1_ps((nk_f32_t)query_sumsq);
|
|
106
|
+
__m128 products_f32x4 = _mm_mul_ps(query_sumsq_f32x4, _mm_cvtepi32_ps(target_sumsqs.xmm));
|
|
107
|
+
__m128 rsqrt_f32x4 = nk_rsqrt_f32x4_haswell_(products_f32x4);
|
|
108
|
+
__m128 normalized_f32x4 = _mm_mul_ps(dots_f32x4, rsqrt_f32x4);
|
|
109
|
+
__m128 angular_f32x4 = _mm_sub_ps(_mm_set1_ps(1.0f), normalized_f32x4);
|
|
110
|
+
results->xmm_ps = _mm_max_ps(angular_f32x4, _mm_setzero_ps());
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
/** @brief Euclidean from_dot for i32 accumulators: cast to f32, then √(a² + b² − 2ab). 4 pairs. */
|
|
114
|
+
NK_INTERNAL void nk_euclidean_through_i32_from_dot_haswell_(nk_b128_vec_t dots, nk_i32_t query_sumsq,
|
|
115
|
+
nk_b128_vec_t target_sumsqs, nk_b128_vec_t *results) {
|
|
116
|
+
__m128 dots_f32x4 = _mm_cvtepi32_ps(dots.xmm);
|
|
117
|
+
__m128 query_sumsq_f32x4 = _mm_set1_ps((nk_f32_t)query_sumsq);
|
|
118
|
+
__m128 sum_sq_f32x4 = _mm_add_ps(query_sumsq_f32x4, _mm_cvtepi32_ps(target_sumsqs.xmm));
|
|
119
|
+
__m128 dist_sq_f32x4 = _mm_fnmadd_ps(_mm_set1_ps(2.0f), dots_f32x4, sum_sq_f32x4);
|
|
120
|
+
results->xmm_ps = nk_safe_sqrt_f32x4_haswell_(dist_sq_f32x4);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
/** @brief Angular from_dot for u32 accumulators: cast to f32, rsqrt+NR, clamp. 4 pairs. */
|
|
124
|
+
NK_INTERNAL void nk_angular_through_u32_from_dot_haswell_(nk_b128_vec_t dots, nk_u32_t query_sumsq,
|
|
125
|
+
nk_b128_vec_t target_sumsqs, nk_b128_vec_t *results) {
|
|
126
|
+
__m128 dots_f32x4 = _mm_cvtepi32_ps(dots.xmm);
|
|
127
|
+
__m128 query_sumsq_f32x4 = _mm_set1_ps((nk_f32_t)query_sumsq);
|
|
128
|
+
__m128 products_f32x4 = _mm_mul_ps(query_sumsq_f32x4, _mm_cvtepi32_ps(target_sumsqs.xmm));
|
|
129
|
+
__m128 rsqrt_f32x4 = nk_rsqrt_f32x4_haswell_(products_f32x4);
|
|
130
|
+
__m128 normalized_f32x4 = _mm_mul_ps(dots_f32x4, rsqrt_f32x4);
|
|
131
|
+
__m128 angular_f32x4 = _mm_sub_ps(_mm_set1_ps(1.0f), normalized_f32x4);
|
|
132
|
+
results->xmm_ps = _mm_max_ps(angular_f32x4, _mm_setzero_ps());
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/** @brief Euclidean from_dot for u32 accumulators: cast to f32, then √(a² + b² − 2ab). 4 pairs. */
|
|
136
|
+
NK_INTERNAL void nk_euclidean_through_u32_from_dot_haswell_(nk_b128_vec_t dots, nk_u32_t query_sumsq,
|
|
137
|
+
nk_b128_vec_t target_sumsqs, nk_b128_vec_t *results) {
|
|
138
|
+
__m128 dots_f32x4 = _mm_cvtepi32_ps(dots.xmm);
|
|
139
|
+
__m128 query_sumsq_f32x4 = _mm_set1_ps((nk_f32_t)query_sumsq);
|
|
140
|
+
__m128 sum_sq_f32x4 = _mm_add_ps(query_sumsq_f32x4, _mm_cvtepi32_ps(target_sumsqs.xmm));
|
|
141
|
+
__m128 dist_sq_f32x4 = _mm_fnmadd_ps(_mm_set1_ps(2.0f), dots_f32x4, sum_sq_f32x4);
|
|
142
|
+
results->xmm_ps = nk_safe_sqrt_f32x4_haswell_(dist_sq_f32x4);
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
/** @brief Turns a raw f64 dot product and the two squared norms into the angular distance
 *         1 − ab / (√a2 × √b2), clamped to be non-negative. Degenerate inputs short-circuit. */
NK_INTERNAL nk_f64_t nk_angular_normalize_f64_haswell_(nk_f64_t ab, nk_f64_t a2, nk_f64_t b2) {

    // If both vectors have magnitude 0, the distance is 0.
    if (a2 == 0 && b2 == 0) return 0;
    // If any one of the vectors is 0, the square root of the product is 0,
    // the division is illformed, and the result is 1.
    else if (ab == 0) return 1;

    // Design note: We use exact `_mm_sqrt_pd` instead of fast rsqrt approximation.
    // The f32 `_mm_rsqrt_ps` has max relative error of 1.5 × 2⁻¹² (~11 bits precision).
    // Even with Newton-Raphson refinement (doubles precision to ~22-24 bits), this is
    // insufficient for f64's 52-bit mantissa, causing ULP errors in the hundreds of millions.
    // The `_mm_sqrt_pd` instruction has ~13 cycle latency but provides full f64 precision.
    // https://web.archive.org/web/20210208132927/http://assemblyrequired.crashworks.org/timing-square-root/
    // Lane layout: `_mm_set_pd(a2, b2)` places `a2` in the high lane and `b2` in the low lane,
    // so both square roots are computed with a single instruction.
    __m128d squares_f64x2 = _mm_set_pd(a2, b2);
    __m128d sqrts_f64x2 = _mm_sqrt_pd(squares_f64x2);
    // Extract √a2 from the high lane (via unpackhi broadcast) and √b2 from the low lane.
    nk_f64_t a_sqrt = _mm_cvtsd_f64(_mm_unpackhi_pd(sqrts_f64x2, sqrts_f64x2));
    nk_f64_t b_sqrt = _mm_cvtsd_f64(sqrts_f64x2);
    nk_f64_t result = 1 - ab / (a_sqrt * b_sqrt);
    // Clamp tiny negative results caused by rounding when the vectors are nearly parallel.
    return result > 0 ? result : 0;
}
|
|
166
|
+
|
|
167
|
+
/** @brief Turns a raw f32 dot product and the two squared norms into the angular distance
 *         1 − ab × rsqrt(a2) × rsqrt(b2), clamped to be non-negative. Degenerate inputs short-circuit. */
NK_INTERNAL nk_f32_t nk_angular_normalize_f32_haswell_(nk_f32_t ab, nk_f32_t a2, nk_f32_t b2) {

    // If both vectors have magnitude 0, the distance is 0.
    if (a2 == 0.0f && b2 == 0.0f) return 0.0f;
    // If any one of the vectors is 0, the square root of the product is 0,
    // the division is illformed, and the result is 1.
    else if (ab == 0.0f) return 1.0f;

    // Load the squares into an __m128 register for single-precision floating-point operations.
    // Lane layout (low to high): b2, a2, b2, a2 — replicated to make use of the full register.
    __m128 squares = _mm_set_ps(a2, b2, a2, b2); // We replicate to make use of full register

    // Compute the reciprocal square root of the squares using `_mm_rsqrt_ps` (single-precision)
    // The hardware estimate carries ~12 bits of precision (max relative error 1.5 × 2⁻¹²).
    __m128 rsqrts = _mm_rsqrt_ps(squares);

    // Perform one iteration of Newton-Raphson refinement to improve the precision of rsqrt:
    // Formula: y' = y × (1.5 - 0.5 × x × y × y)
    __m128 half = _mm_set1_ps(0.5f);
    __m128 three_halves = _mm_set1_ps(1.5f);
    rsqrts = _mm_mul_ps(rsqrts,
                        _mm_sub_ps(three_halves, _mm_mul_ps(half, _mm_mul_ps(squares, _mm_mul_ps(rsqrts, rsqrts)))));

    // Extract the reciprocal square roots of a2 and b2 from the __m128 register:
    // lane 1 holds rsqrt(a2), lane 0 holds rsqrt(b2) (see the lane layout above).
    nk_f32_t a2_reciprocal = _mm_cvtss_f32(_mm_shuffle_ps(rsqrts, rsqrts, _MM_SHUFFLE(0, 0, 0, 1)));
    nk_f32_t b2_reciprocal = _mm_cvtss_f32(rsqrts);

    // Calculate the angular distance: 1 - dot_product × a2_reciprocal × b2_reciprocal
    nk_f32_t result = 1.0f - ab * a2_reciprocal * b2_reciprocal;
    // Clamp tiny negative results caused by rounding when the vectors are nearly parallel.
    return result > 0 ? result : 0;
}
|
|
196
|
+
|
|
197
|
+
#pragma region - Smaller Floats
|
|
198
|
+
|
|
199
|
+
/** @brief Squared Euclidean distance between two f16 vectors, accumulated in f32. */
NK_PUBLIC void nk_sqeuclidean_f16_haswell(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
    __m256 distance_sq_f32x8 = _mm256_setzero_ps();
    do {
        __m256 a_f32x8, b_f32x8;
        if (n < 8) {
            // Final (or empty) tail: partial loads widen the remaining f16s to f32.
            nk_b256_vec_t a_tail, b_tail;
            nk_partial_load_f16x8_to_f32x8_haswell_(a, &a_tail, n);
            nk_partial_load_f16x8_to_f32x8_haswell_(b, &b_tail, n);
            a_f32x8 = a_tail.ymm_ps;
            b_f32x8 = b_tail.ymm_ps;
            n = 0;
        }
        else {
            // Full step: load 8 halves and widen with F16C.
            a_f32x8 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const *)a));
            b_f32x8 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const *)b));
            n -= 8, a += 8, b += 8;
        }
        // Accumulate (a − b)² with a fused multiply-add.
        __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
        distance_sq_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, distance_sq_f32x8);
    } while (n);
    *result = nk_reduce_add_f32x8_haswell_(distance_sq_f32x8);
}
|
|
223
|
+
|
|
224
|
+
/** @brief Euclidean distance between two f16 vectors: square root of the squared-distance kernel. */
NK_PUBLIC void nk_euclidean_f16_haswell(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
    nk_sqeuclidean_f16_haswell(a, b, n, result);
    *result = nk_f32_sqrt_haswell(*result);
}
|
|
228
|
+
|
|
229
|
+
/** @brief Angular (cosine) distance between two f16 vectors, accumulated in f32. */
NK_PUBLIC void nk_angular_f16_haswell(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
    // Three running accumulators: ⟨a,b⟩, ‖a‖², and ‖b‖².
    __m256 ab_f32x8 = _mm256_setzero_ps();
    __m256 a_sumsq_f32x8 = _mm256_setzero_ps();
    __m256 b_sumsq_f32x8 = _mm256_setzero_ps();
    do {
        __m256 a_f32x8, b_f32x8;
        if (n < 8) {
            // Final (or empty) tail: partial loads widen the remaining f16s to f32.
            nk_b256_vec_t a_tail, b_tail;
            nk_partial_load_f16x8_to_f32x8_haswell_(a, &a_tail, n);
            nk_partial_load_f16x8_to_f32x8_haswell_(b, &b_tail, n);
            a_f32x8 = a_tail.ymm_ps;
            b_f32x8 = b_tail.ymm_ps;
            n = 0;
        }
        else {
            // Full step: load 8 halves and widen with F16C.
            a_f32x8 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const *)a));
            b_f32x8 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const *)b));
            n -= 8, a += 8, b += 8;
        }
        ab_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, ab_f32x8);
        a_sumsq_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, a_sumsq_f32x8);
        b_sumsq_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, b_sumsq_f32x8);
    } while (n);
    // Reduce each accumulator and normalize into the angular distance.
    nk_f32_t ab_f32 = nk_reduce_add_f32x8_haswell_(ab_f32x8);
    nk_f32_t a_sumsq_f32 = nk_reduce_add_f32x8_haswell_(a_sumsq_f32x8);
    nk_f32_t b_sumsq_f32 = nk_reduce_add_f32x8_haswell_(b_sumsq_f32x8);
    *result = nk_angular_normalize_f32_haswell_(ab_f32, a_sumsq_f32, b_sumsq_f32);
}
|
|
258
|
+
|
|
259
|
+
/** @brief Squared Euclidean distance between two bf16 vectors, accumulated in f32. */
NK_PUBLIC void nk_sqeuclidean_bf16_haswell(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
    __m256 distance_sq_f32x8 = _mm256_setzero_ps();
    do {
        __m256 a_f32x8, b_f32x8;
        if (n < 8) {
            // Final (or empty) tail: partial loads widen the remaining bf16s to f32.
            nk_b256_vec_t a_tail, b_tail;
            nk_partial_load_bf16x8_to_f32x8_haswell_(a, &a_tail, n);
            nk_partial_load_bf16x8_to_f32x8_haswell_(b, &b_tail, n);
            a_f32x8 = a_tail.ymm_ps;
            b_f32x8 = b_tail.ymm_ps;
            n = 0;
        }
        else {
            // Full step: load 8 bf16s and widen with the shift-based helper.
            a_f32x8 = nk_bf16x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
            b_f32x8 = nk_bf16x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
            n -= 8, a += 8, b += 8;
        }
        // Accumulate (a − b)² with a fused multiply-add.
        __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
        distance_sq_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, distance_sq_f32x8);
    } while (n);
    *result = nk_reduce_add_f32x8_haswell_(distance_sq_f32x8);
}
|
|
283
|
+
|
|
284
|
+
/** @brief Euclidean distance between two bf16 vectors: square root of the squared-distance kernel. */
NK_PUBLIC void nk_euclidean_bf16_haswell(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
    nk_sqeuclidean_bf16_haswell(a, b, n, result);
    *result = nk_f32_sqrt_haswell(*result);
}
|
|
288
|
+
|
|
289
|
+
/** @brief Angular (cosine) distance between two bf16 vectors, accumulated in f32. */
NK_PUBLIC void nk_angular_bf16_haswell(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
    // Three running accumulators: ⟨a,b⟩, ‖a‖², and ‖b‖².
    __m256 ab_f32x8 = _mm256_setzero_ps();
    __m256 a_sumsq_f32x8 = _mm256_setzero_ps();
    __m256 b_sumsq_f32x8 = _mm256_setzero_ps();
    do {
        __m256 a_f32x8, b_f32x8;
        if (n < 8) {
            // Final (or empty) tail: partial loads widen the remaining bf16s to f32.
            nk_b256_vec_t a_tail, b_tail;
            nk_partial_load_bf16x8_to_f32x8_haswell_(a, &a_tail, n);
            nk_partial_load_bf16x8_to_f32x8_haswell_(b, &b_tail, n);
            a_f32x8 = a_tail.ymm_ps;
            b_f32x8 = b_tail.ymm_ps;
            n = 0;
        }
        else {
            // Full step: load 8 bf16s and widen with the shift-based helper.
            a_f32x8 = nk_bf16x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
            b_f32x8 = nk_bf16x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
            n -= 8, a += 8, b += 8;
        }
        ab_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, ab_f32x8);
        a_sumsq_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, a_sumsq_f32x8);
        b_sumsq_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, b_sumsq_f32x8);
    } while (n);
    // Reduce each accumulator and normalize into the angular distance.
    nk_f32_t ab_f32 = nk_reduce_add_f32x8_haswell_(ab_f32x8);
    nk_f32_t a_sumsq_f32 = nk_reduce_add_f32x8_haswell_(a_sumsq_f32x8);
    nk_f32_t b_sumsq_f32 = nk_reduce_add_f32x8_haswell_(b_sumsq_f32x8);
    *result = nk_angular_normalize_f32_haswell_(ab_f32, a_sumsq_f32, b_sumsq_f32);
}
|
|
318
|
+
|
|
319
|
+
#pragma endregion - Smaller Floats
|
|
320
|
+
#pragma region - Small Integers
|
|
321
|
+
|
|
322
|
+
NK_PUBLIC void nk_sqeuclidean_i8_haswell(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_u32_t *result) {
    // Optimized i8 L2-squared using saturating subtraction + VPMADDWD.
    //
    // Each iteration handles 32 lanes:
    //  1. XOR with 0x80 reinterprets signed i8 as unsigned u8; the shift is
    //     distance-preserving, i.e. |a-b| = |(a^0x80) - (b^0x80)|.
    //  2. |a-b| via unsigned saturating subtraction in both directions, OR-ed:
    //     `subs_epu8` clamps would-be-negative results to 0, so the OR yields
    //     the true magnitude. A signed `subs_epi8` would clamp at -128 and
    //     corrupt this trick.
    //  3. Zero-extend u8 -> u16 with unpack instructions (1cy latency @ p5).
    //  4. Square and pairwise-accumulate with `vpmaddwd` into i32 lanes.
    //
    // Using absolute differences is valid here since |a-b|^2 == (a-b)^2.
    __m256i const bias_i8x32 = _mm256_set1_epi8((char)0x80);
    __m256i const zeros_i8x32 = _mm256_setzero_si256();
    __m256i low_acc_i32x8 = _mm256_setzero_si256();
    __m256i high_acc_i32x8 = _mm256_setzero_si256();

    // Main loop: 32 elements per iteration with 256-bit loads
    nk_size_t i = 0;
    for (; i + 32 <= n; i += 32) {
        // Flip the sign bit to move both operands into unsigned space
        __m256i a_u8x32 = _mm256_xor_si256(_mm256_loadu_si256((__m256i const *)(a + i)), bias_i8x32);
        __m256i b_u8x32 = _mm256_xor_si256(_mm256_loadu_si256((__m256i const *)(b + i)), bias_i8x32);

        // Absolute difference via bidirectional saturating subtraction
        __m256i abs_diff_u8x32 = _mm256_or_si256(_mm256_subs_epu8(a_u8x32, b_u8x32), _mm256_subs_epu8(b_u8x32, a_u8x32));

        // Widen to 16 bits (zero-extension via in-lane unpacks)
        __m256i diff_low_i16x16 = _mm256_unpacklo_epi8(abs_diff_u8x32, zeros_i8x32);
        __m256i diff_high_i16x16 = _mm256_unpackhi_epi8(abs_diff_u8x32, zeros_i8x32);

        // Square and pairwise-accumulate into 32-bit lanes
        low_acc_i32x8 = _mm256_add_epi32(low_acc_i32x8, _mm256_madd_epi16(diff_low_i16x16, diff_low_i16x16));
        high_acc_i32x8 = _mm256_add_epi32(high_acc_i32x8, _mm256_madd_epi16(diff_high_i16x16, diff_high_i16x16));
    }

    // Combine both accumulators and reduce horizontally to a scalar
    nk_i32_t distance_sq_i32 = nk_reduce_add_i32x8_haswell_(_mm256_add_epi32(low_acc_i32x8, high_acc_i32x8));

    // Scalar tail for the trailing `n % 32` elements
    for (; i < n; ++i) {
        nk_i32_t diff_i32 = (nk_i32_t)(a[i]) - b[i];
        distance_sq_i32 += diff_i32 * diff_i32;
    }

    *result = (nk_u32_t)distance_sq_i32;
}
|
|
383
|
+
|
|
384
|
+
NK_PUBLIC void nk_euclidean_i8_haswell(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result) {
    // Euclidean distance is the square root of the squared Euclidean distance.
    nk_u32_t sq_u32;
    nk_sqeuclidean_i8_haswell(a, b, n, &sq_u32);
    *result = nk_f32_sqrt_haswell((nk_f32_t)sq_u32);
}
|
|
389
|
+
|
|
390
|
+
NK_PUBLIC void nk_angular_i8_haswell(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result) {
    // Angular (cosine) distance over signed 8-bit integers.
    //
    // AVX2 lacks a signed-i8 dot-product instruction; its `vpmaddubsw` handles a
    // mixed signed-by-unsigned case, which one could exploit as:
    //
    //      __m256i a_i8_abs_vec = _mm256_abs_epi8(a_i8_vec);
    //      __m256i b_i8_flipped_vec = _mm256_sign_epi8(b_i8_vec, a_i8_vec);
    //      __m256i ab_i16_vec = _mm256_maddubs_epi16(a_i8_abs_vec, b_i8_flipped_vec);
    //
    // That trick, however, mishandles `-128`: flipping its sign is a no-op, so the
    // products come out wrong and can noticeably bias the final result. We instead
    // sign-extend to 16 bits and take the exact `vpmaddwd` path at half the width.
    __m256i ab_acc_i32x8 = _mm256_setzero_si256();
    __m256i aa_acc_i32x8 = _mm256_setzero_si256();
    __m256i bb_acc_i32x8 = _mm256_setzero_si256();

    nk_size_t i = 0;
    for (; i + 16 <= n; i += 16) {
        // Sign-extend i8 -> i16 directly (128-bit -> 256-bit, no port-5 pressure)
        __m256i a_i16x16 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i const *)(a + i)));
        __m256i b_i16x16 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i const *)(b + i)));

        // Pairwise multiply-add i16 pairs, accumulating into i32 lanes
        ab_acc_i32x8 = _mm256_add_epi32(ab_acc_i32x8, _mm256_madd_epi16(a_i16x16, b_i16x16));
        aa_acc_i32x8 = _mm256_add_epi32(aa_acc_i32x8, _mm256_madd_epi16(a_i16x16, a_i16x16));
        bb_acc_i32x8 = _mm256_add_epi32(bb_acc_i32x8, _mm256_madd_epi16(b_i16x16, b_i16x16));
    }

    // Horizontal reductions to scalars
    nk_i32_t dot_product_i32 = nk_reduce_add_i32x8_haswell_(ab_acc_i32x8);
    nk_i32_t a_norm_sq_i32 = nk_reduce_add_i32x8_haswell_(aa_acc_i32x8);
    nk_i32_t b_norm_sq_i32 = nk_reduce_add_i32x8_haswell_(bb_acc_i32x8);

    // Scalar tail for the trailing `n % 16` elements
    for (; i < n; ++i) {
        nk_i32_t a_element_i32 = a[i], b_element_i32 = b[i];
        dot_product_i32 += a_element_i32 * b_element_i32;
        a_norm_sq_i32 += a_element_i32 * a_element_i32;
        b_norm_sq_i32 += b_element_i32 * b_element_i32;
    }

    *result = nk_angular_normalize_f32_haswell_(dot_product_i32, a_norm_sq_i32, b_norm_sq_i32);
}
|
|
438
|
+
|
|
439
|
+
NK_PUBLIC void nk_sqeuclidean_u8_haswell(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
    // Squared Euclidean distance for `u8` vectors, 32 elements per iteration.
    __m256i const zeros_i8x32 = _mm256_setzero_si256();
    __m256i low_acc_i32x8 = _mm256_setzero_si256();
    __m256i high_acc_i32x8 = _mm256_setzero_si256();

    nk_size_t i = 0;
    for (; i + 32 <= n; i += 32) {
        __m256i a_u8x32 = _mm256_loadu_si256((__m256i const *)(a + i));
        __m256i b_u8x32 = _mm256_loadu_si256((__m256i const *)(b + i));

        // |a-b| through saturating subtraction in both directions, OR-combined:
        // `subs_epu8` clamps would-be-negative lanes to zero.
        __m256i abs_diff_u8x32 = _mm256_or_si256(_mm256_subs_epu8(a_u8x32, b_u8x32), _mm256_subs_epu8(b_u8x32, a_u8x32));

        // Zero-extend u8 -> i16 with cheap in-lane unpacks rather than extracts
        __m256i diff_low_i16x16 = _mm256_unpacklo_epi8(abs_diff_u8x32, zeros_i8x32);
        __m256i diff_high_i16x16 = _mm256_unpackhi_epi8(abs_diff_u8x32, zeros_i8x32);

        // Square and pairwise-accumulate into 32-bit lanes
        low_acc_i32x8 = _mm256_add_epi32(low_acc_i32x8, _mm256_madd_epi16(diff_low_i16x16, diff_low_i16x16));
        high_acc_i32x8 = _mm256_add_epi32(high_acc_i32x8, _mm256_madd_epi16(diff_high_i16x16, diff_high_i16x16));
    }

    // Combine the two accumulators and reduce horizontally
    nk_i32_t distance_sq_i32 = nk_reduce_add_i32x8_haswell_(_mm256_add_epi32(low_acc_i32x8, high_acc_i32x8));

    // Scalar tail for the trailing `n % 32` elements
    for (; i < n; ++i) {
        nk_i32_t diff_i32 = (nk_i32_t)(a[i]) - b[i];
        distance_sq_i32 += diff_i32 * diff_i32;
    }

    *result = (nk_u32_t)distance_sq_i32;
}
|
|
477
|
+
|
|
478
|
+
NK_PUBLIC void nk_euclidean_u8_haswell(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result) {
    // Euclidean distance is the square root of the squared Euclidean distance.
    nk_u32_t sq_u32;
    nk_sqeuclidean_u8_haswell(a, b, n, &sq_u32);
    *result = nk_f32_sqrt_haswell((nk_f32_t)sq_u32);
}
|
|
483
|
+
|
|
484
|
+
NK_PUBLIC void nk_angular_u8_haswell(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result) {

    __m256i dot_product_low_i32x8 = _mm256_setzero_si256();
    __m256i dot_product_high_i32x8 = _mm256_setzero_si256();
    __m256i a_norm_sq_low_i32x8 = _mm256_setzero_si256();
    __m256i a_norm_sq_high_i32x8 = _mm256_setzero_si256();
    __m256i b_norm_sq_low_i32x8 = _mm256_setzero_si256();
    __m256i b_norm_sq_high_i32x8 = _mm256_setzero_si256();
    __m256i const zeros_i8x32 = _mm256_setzero_si256();

    // Angular (cosine) distance over unsigned 8-bit integers.
    // NOTE(review): the comment previously here described the signed-i8
    // `vpmaddubsw` + `-128` workaround, which belongs to `nk_angular_i8_haswell`
    // and does not apply to this unsigned kernel. Here u8 values fit in 0..255,
    // so a plain zero-extension to i16 followed by exact `vpmaddwd` pairwise
    // multiply-adds is sufficient, processing 32 elements per iteration.
    nk_size_t i = 0;
    for (; i + 32 <= n; i += 32) {
        __m256i a_u8x32 = _mm256_loadu_si256((__m256i const *)(a + i));
        __m256i b_u8x32 = _mm256_loadu_si256((__m256i const *)(b + i));

        // Upcast `uint8` to `int16`. Unlike the signed version, we can use the unpacking
        // instructions instead of extracts, as they are much faster and more efficient.
        __m256i a_low_i16x16 = _mm256_unpacklo_epi8(a_u8x32, zeros_i8x32);
        __m256i a_high_i16x16 = _mm256_unpackhi_epi8(a_u8x32, zeros_i8x32);
        __m256i b_low_i16x16 = _mm256_unpacklo_epi8(b_u8x32, zeros_i8x32);
        __m256i b_high_i16x16 = _mm256_unpackhi_epi8(b_u8x32, zeros_i8x32);

        // Multiply and accumulate as `int16`, accumulate products as `int32`
        dot_product_low_i32x8 = _mm256_add_epi32(dot_product_low_i32x8, _mm256_madd_epi16(a_low_i16x16, b_low_i16x16));
        dot_product_high_i32x8 = _mm256_add_epi32(dot_product_high_i32x8,
                                                  _mm256_madd_epi16(a_high_i16x16, b_high_i16x16));
        a_norm_sq_low_i32x8 = _mm256_add_epi32(a_norm_sq_low_i32x8, _mm256_madd_epi16(a_low_i16x16, a_low_i16x16));
        a_norm_sq_high_i32x8 = _mm256_add_epi32(a_norm_sq_high_i32x8, _mm256_madd_epi16(a_high_i16x16, a_high_i16x16));
        b_norm_sq_low_i32x8 = _mm256_add_epi32(b_norm_sq_low_i32x8, _mm256_madd_epi16(b_low_i16x16, b_low_i16x16));
        b_norm_sq_high_i32x8 = _mm256_add_epi32(b_norm_sq_high_i32x8, _mm256_madd_epi16(b_high_i16x16, b_high_i16x16));
    }

    // Further reduce to a single sum for each vector
    nk_i32_t dot_product_i32 = nk_reduce_add_i32x8_haswell_(
        _mm256_add_epi32(dot_product_low_i32x8, dot_product_high_i32x8));
    nk_i32_t a_norm_sq_i32 = nk_reduce_add_i32x8_haswell_(_mm256_add_epi32(a_norm_sq_low_i32x8, a_norm_sq_high_i32x8));
    nk_i32_t b_norm_sq_i32 = nk_reduce_add_i32x8_haswell_(_mm256_add_epi32(b_norm_sq_low_i32x8, b_norm_sq_high_i32x8));

    // Take care of the tail:
    for (; i < n; ++i) {
        nk_i32_t a_element_i32 = a[i], b_element_i32 = b[i];
        dot_product_i32 += a_element_i32 * b_element_i32;
        a_norm_sq_i32 += a_element_i32 * a_element_i32;
        b_norm_sq_i32 += b_element_i32 * b_element_i32;
    }

    *result = nk_angular_normalize_f32_haswell_(dot_product_i32, a_norm_sq_i32, b_norm_sq_i32);
}
|
|
544
|
+
|
|
545
|
+
#pragma endregion - Small Integers
|
|
546
|
+
#pragma region - Traditional Floats
|
|
547
|
+
|
|
548
|
+
NK_PUBLIC void nk_sqeuclidean_f32_haswell(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
    // Squared Euclidean distance for `f32` inputs, accumulating in `f64`
    // for higher precision.
    __m256d acc_f64x4 = _mm256_setzero_pd();
    nk_size_t i = 0;

    // Main loop: widen 4 lanes at a time and fuse the square into the accumulator
    for (; i + 4 <= n; i += 4) {
        __m256d a_f64x4 = _mm256_cvtps_pd(_mm_loadu_ps(a + i));
        __m256d b_f64x4 = _mm256_cvtps_pd(_mm_loadu_ps(b + i));
        __m256d delta_f64x4 = _mm256_sub_pd(a_f64x4, b_f64x4);
        acc_f64x4 = _mm256_fmadd_pd(delta_f64x4, delta_f64x4, acc_f64x4);
    }

    // Horizontal reduction, then a scalar tail in `f64`
    nk_f64_t total_f64 = nk_reduce_add_f64x4_haswell_(acc_f64x4);
    for (; i < n; ++i) {
        nk_f64_t delta_f64 = (nk_f64_t)a[i] - b[i];
        total_f64 += delta_f64 * delta_f64;
    }

    *result = total_f64;
}
|
|
569
|
+
|
|
570
|
+
NK_PUBLIC void nk_euclidean_f32_haswell(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
    // Euclidean distance: delegate to the squared-distance kernel (which upcasts
    // to `f64` for high-precision accumulation) and apply the `f64` square root.
    //
    // The previous implementation duplicated the entire SIMD loop of
    // `nk_sqeuclidean_f32_haswell` verbatim; delegating matches the style of the
    // other `nk_euclidean_*_haswell` wrappers in this file and keeps one copy of
    // the accumulation logic to maintain. The floating-point operation sequence
    // is identical, so results are bit-for-bit unchanged.
    nk_f64_t distance_sq_f64;
    nk_sqeuclidean_f32_haswell(a, b, n, &distance_sq_f64);
    *result = nk_f64_sqrt_haswell(distance_sq_f64);
}
|
|
591
|
+
|
|
592
|
+
NK_PUBLIC void nk_angular_f32_haswell(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
    // Angular (cosine) distance for `f32` inputs, accumulating in `f64`.
    __m256d ab_acc_f64x4 = _mm256_setzero_pd(); // dot-product a·b
    __m256d aa_acc_f64x4 = _mm256_setzero_pd(); // squared norm of a
    __m256d bb_acc_f64x4 = _mm256_setzero_pd(); // squared norm of b
    nk_size_t i = 0;

    for (; i + 4 <= n; i += 4) {
        __m256d a_f64x4 = _mm256_cvtps_pd(_mm_loadu_ps(a + i));
        __m256d b_f64x4 = _mm256_cvtps_pd(_mm_loadu_ps(b + i));
        ab_acc_f64x4 = _mm256_fmadd_pd(a_f64x4, b_f64x4, ab_acc_f64x4);
        aa_acc_f64x4 = _mm256_fmadd_pd(a_f64x4, a_f64x4, aa_acc_f64x4);
        bb_acc_f64x4 = _mm256_fmadd_pd(b_f64x4, b_f64x4, bb_acc_f64x4);
    }

    // Horizontal reductions, then a scalar tail in `f64`
    nk_f64_t dot_f64 = nk_reduce_add_f64x4_haswell_(ab_acc_f64x4);
    nk_f64_t a_norm_sq_f64 = nk_reduce_add_f64x4_haswell_(aa_acc_f64x4);
    nk_f64_t b_norm_sq_f64 = nk_reduce_add_f64x4_haswell_(bb_acc_f64x4);
    for (; i < n; ++i) {
        nk_f64_t a_f64 = a[i], b_f64 = b[i];
        dot_f64 += a_f64 * b_f64;
        a_norm_sq_f64 += a_f64 * a_f64;
        b_norm_sq_f64 += b_f64 * b_f64;
    }

    *result = nk_angular_normalize_f64_haswell_(dot_f64, a_norm_sq_f64, b_norm_sq_f64);
}
|
|
619
|
+
|
|
620
|
+
NK_PUBLIC void nk_sqeuclidean_f64_haswell(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
    // Squared Euclidean distance for `f64` vectors, 4 lanes per iteration.
    __m256d acc_f64x4 = _mm256_setzero_pd();

    do {
        __m256d a_f64x4, b_f64x4;
        if (n >= 4) {
            a_f64x4 = _mm256_loadu_pd(a);
            b_f64x4 = _mm256_loadu_pd(b);
            a += 4, b += 4, n -= 4;
        }
        else {
            // Tail of fewer than 4 elements: zero-padded partial load,
            // so the padding contributes nothing to the sum.
            nk_b256_vec_t a_tail, b_tail;
            nk_partial_load_b64x4_serial_(a, &a_tail, n);
            nk_partial_load_b64x4_serial_(b, &b_tail, n);
            a_f64x4 = a_tail.ymm_pd;
            b_f64x4 = b_tail.ymm_pd;
            n = 0;
        }
        __m256d delta_f64x4 = _mm256_sub_pd(a_f64x4, b_f64x4);
        acc_f64x4 = _mm256_fmadd_pd(delta_f64x4, delta_f64x4, acc_f64x4);
    } while (n);

    *result = nk_reduce_add_f64x4_haswell_(acc_f64x4);
}
|
|
644
|
+
|
|
645
|
+
NK_PUBLIC void nk_euclidean_f64_haswell(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
    // Euclidean distance is the square root of the squared Euclidean distance.
    nk_f64_t distance_sq;
    nk_sqeuclidean_f64_haswell(a, b, n, &distance_sq);
    *result = nk_f64_sqrt_haswell(distance_sq);
}
|
|
649
|
+
|
|
650
|
+
NK_PUBLIC void nk_angular_f64_haswell(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
    // Angular (cosine) distance for `f64` vectors with a compensated dot-product.
    //
    // Dot2 (Ogita-Rump-Oishi 2005) for cross-product a × b only - it may have cancellation.
    // Self-products ‖a‖² and ‖b‖² use simple FMA - all terms are non-negative, no cancellation.
    // Note: For cross-product we use Knuth TwoSum (6 ops) rather than Neumaier with blends (10 ops)
    // since products can be signed and Knuth handles any operand ordering efficiently.
    //
    // NOTE(review): the statement order in the TwoProd/TwoSum sequence below is
    // load-bearing - reassociating or "simplifying" these operations destroys the
    // error-compensation property. Do not refactor without re-validating accuracy.
    __m256d dot_sum_f64x4 = _mm256_setzero_pd();
    __m256d dot_compensation_f64x4 = _mm256_setzero_pd();
    __m256d a_norm_sq_f64x4 = _mm256_setzero_pd();
    __m256d b_norm_sq_f64x4 = _mm256_setzero_pd();
    __m256d a_f64x4, b_f64x4;

nk_angular_f64_haswell_cycle:
    if (n < 4) {
        // Tail of fewer than 4 elements: zero-padded partial load; the zero lanes
        // add exact zeros to all accumulators, so the compensation stays valid.
        nk_b256_vec_t a_tail, b_tail;
        nk_partial_load_b64x4_serial_(a, &a_tail, n);
        nk_partial_load_b64x4_serial_(b, &b_tail, n);
        a_f64x4 = a_tail.ymm_pd;
        b_f64x4 = b_tail.ymm_pd;
        n = 0;
    }
    else {
        a_f64x4 = _mm256_loadu_pd(a);
        b_f64x4 = _mm256_loadu_pd(b);
        a += 4, b += 4, n -= 4;
    }
    // TwoProd: product = a × b, error = fma(a, b, -product)
    __m256d x_f64x4 = _mm256_mul_pd(a_f64x4, b_f64x4);
    __m256d product_error_f64x4 = _mm256_fmsub_pd(a_f64x4, b_f64x4, x_f64x4);
    // Knuth TwoSum: error = (sum - (t - z)) + (x - z) where z = t - sum
    __m256d tentative_sum_f64x4 = _mm256_add_pd(dot_sum_f64x4, x_f64x4);
    __m256d virtual_addend_f64x4 = _mm256_sub_pd(tentative_sum_f64x4, dot_sum_f64x4);
    __m256d sum_error_f64x4 = _mm256_add_pd(
        _mm256_sub_pd(dot_sum_f64x4, _mm256_sub_pd(tentative_sum_f64x4, virtual_addend_f64x4)),
        _mm256_sub_pd(x_f64x4, virtual_addend_f64x4));
    dot_sum_f64x4 = tentative_sum_f64x4;
    // Fold both the rounding error of the product and of the sum into one
    // running compensation term, added back during the final reduction.
    dot_compensation_f64x4 = _mm256_add_pd(dot_compensation_f64x4, _mm256_add_pd(sum_error_f64x4, product_error_f64x4));
    // Simple FMA for self-products (no cancellation possible)
    a_norm_sq_f64x4 = _mm256_fmadd_pd(a_f64x4, a_f64x4, a_norm_sq_f64x4);
    b_norm_sq_f64x4 = _mm256_fmadd_pd(b_f64x4, b_f64x4, b_norm_sq_f64x4);
    if (n) goto nk_angular_f64_haswell_cycle;

    *result = nk_angular_normalize_f64_haswell_( //
        nk_dot_stable_sum_f64x4_haswell_(dot_sum_f64x4, dot_compensation_f64x4),
        nk_reduce_add_f64x4_haswell_(a_norm_sq_f64x4), nk_reduce_add_f64x4_haswell_(b_norm_sq_f64x4));
}
|
|
695
|
+
|
|
696
|
+
#pragma endregion - Traditional Floats
|
|
697
|
+
#pragma region - Smaller Floats
|
|
698
|
+
|
|
699
|
+
NK_PUBLIC void nk_sqeuclidean_e2m3_haswell(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
    // Squared Euclidean distance for `e2m3` mini-floats, upcast to `f32` lanes.
    __m256 acc_f32x8 = _mm256_setzero_ps();

    // Full 8-element strides
    while (n >= 8) {
        __m256 a_f32x8 = nk_e2m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
        __m256 b_f32x8 = nk_e2m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
        __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
        acc_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, acc_f32x8);
        n -= 8, a += 8, b += 8;
    }

    // Tail of fewer than 8 elements, zero-padded by the partial load
    nk_b128_vec_t a_vec, b_vec;
    nk_partial_load_b8x16_serial_(a, &a_vec, n);
    nk_partial_load_b8x16_serial_(b, &b_vec, n);
    __m256 a_f32x8 = nk_e2m3x8_to_f32x8_haswell_(a_vec.xmm);
    __m256 b_f32x8 = nk_e2m3x8_to_f32x8_haswell_(b_vec.xmm);
    __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
    acc_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, acc_f32x8);

    *result = nk_reduce_add_f32x8_haswell_(acc_f32x8);
}
|
|
723
|
+
|
|
724
|
+
NK_PUBLIC void nk_euclidean_e2m3_haswell(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
    // Euclidean distance is the square root of the squared Euclidean distance.
    nk_f32_t distance_sq;
    nk_sqeuclidean_e2m3_haswell(a, b, n, &distance_sq);
    *result = nk_f32_sqrt_haswell(distance_sq);
}
|
|
728
|
+
|
|
729
|
+
NK_PUBLIC void nk_angular_e2m3_haswell(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
    // Angular (cosine) distance for `e2m3` mini-floats, upcast to `f32` lanes.
    __m256 ab_acc_f32x8 = _mm256_setzero_ps(); // dot-product a·b
    __m256 aa_acc_f32x8 = _mm256_setzero_ps(); // squared norm of a
    __m256 bb_acc_f32x8 = _mm256_setzero_ps(); // squared norm of b

    // Full 8-element strides
    while (n >= 8) {
        __m256 a_f32x8 = nk_e2m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
        __m256 b_f32x8 = nk_e2m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
        ab_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, ab_acc_f32x8);
        aa_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, aa_acc_f32x8);
        bb_acc_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, bb_acc_f32x8);
        n -= 8, a += 8, b += 8;
    }

    // Tail of fewer than 8 elements, zero-padded by the partial load
    nk_b128_vec_t a_vec, b_vec;
    nk_partial_load_b8x16_serial_(a, &a_vec, n);
    nk_partial_load_b8x16_serial_(b, &b_vec, n);
    __m256 a_f32x8 = nk_e2m3x8_to_f32x8_haswell_(a_vec.xmm);
    __m256 b_f32x8 = nk_e2m3x8_to_f32x8_haswell_(b_vec.xmm);
    ab_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, ab_acc_f32x8);
    aa_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, aa_acc_f32x8);
    bb_acc_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, bb_acc_f32x8);

    nk_f32_t dot_product_f32 = nk_reduce_add_f32x8_haswell_(ab_acc_f32x8);
    nk_f32_t a_norm_sq_f32 = nk_reduce_add_f32x8_haswell_(aa_acc_f32x8);
    nk_f32_t b_norm_sq_f32 = nk_reduce_add_f32x8_haswell_(bb_acc_f32x8);
    *result = nk_angular_normalize_f32_haswell_(dot_product_f32, a_norm_sq_f32, b_norm_sq_f32);
}
|
|
760
|
+
|
|
761
|
+
NK_PUBLIC void nk_sqeuclidean_e3m2_haswell(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
    // Squared Euclidean distance for `e3m2` mini-floats, upcast to `f32` lanes.
    __m256 acc_f32x8 = _mm256_setzero_ps();

    // Full 8-element strides
    while (n >= 8) {
        __m256 a_f32x8 = nk_e3m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
        __m256 b_f32x8 = nk_e3m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
        __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
        acc_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, acc_f32x8);
        n -= 8, a += 8, b += 8;
    }

    // Tail of fewer than 8 elements, zero-padded by the partial load
    nk_b128_vec_t a_vec, b_vec;
    nk_partial_load_b8x16_serial_(a, &a_vec, n);
    nk_partial_load_b8x16_serial_(b, &b_vec, n);
    __m256 a_f32x8 = nk_e3m2x8_to_f32x8_haswell_(a_vec.xmm);
    __m256 b_f32x8 = nk_e3m2x8_to_f32x8_haswell_(b_vec.xmm);
    __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
    acc_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, acc_f32x8);

    *result = nk_reduce_add_f32x8_haswell_(acc_f32x8);
}
|
|
785
|
+
|
|
786
|
+
NK_PUBLIC void nk_euclidean_e3m2_haswell(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
    // Euclidean distance is the square root of the squared Euclidean distance.
    nk_f32_t distance_sq;
    nk_sqeuclidean_e3m2_haswell(a, b, n, &distance_sq);
    *result = nk_f32_sqrt_haswell(distance_sq);
}
|
|
790
|
+
|
|
791
|
+
NK_PUBLIC void nk_angular_e3m2_haswell(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
    // Angular (cosine) distance for `e3m2` mini-floats, upcast to `f32` lanes.
    __m256 ab_acc_f32x8 = _mm256_setzero_ps(); // dot-product a·b
    __m256 aa_acc_f32x8 = _mm256_setzero_ps(); // squared norm of a
    __m256 bb_acc_f32x8 = _mm256_setzero_ps(); // squared norm of b

    // Full 8-element strides
    while (n >= 8) {
        __m256 a_f32x8 = nk_e3m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
        __m256 b_f32x8 = nk_e3m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
        ab_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, ab_acc_f32x8);
        aa_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, aa_acc_f32x8);
        bb_acc_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, bb_acc_f32x8);
        n -= 8, a += 8, b += 8;
    }

    // Tail of fewer than 8 elements, zero-padded by the partial load
    nk_b128_vec_t a_vec, b_vec;
    nk_partial_load_b8x16_serial_(a, &a_vec, n);
    nk_partial_load_b8x16_serial_(b, &b_vec, n);
    __m256 a_f32x8 = nk_e3m2x8_to_f32x8_haswell_(a_vec.xmm);
    __m256 b_f32x8 = nk_e3m2x8_to_f32x8_haswell_(b_vec.xmm);
    ab_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, ab_acc_f32x8);
    aa_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, aa_acc_f32x8);
    bb_acc_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, bb_acc_f32x8);

    nk_f32_t dot_product_f32 = nk_reduce_add_f32x8_haswell_(ab_acc_f32x8);
    nk_f32_t a_norm_sq_f32 = nk_reduce_add_f32x8_haswell_(aa_acc_f32x8);
    nk_f32_t b_norm_sq_f32 = nk_reduce_add_f32x8_haswell_(bb_acc_f32x8);
    *result = nk_angular_normalize_f32_haswell_(dot_product_f32, a_norm_sq_f32, b_norm_sq_f32);
}
|
|
822
|
+
|
|
823
|
+
NK_PUBLIC void nk_sqeuclidean_e4m3_haswell(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
    // Squared Euclidean distance for `e4m3` mini-floats, upcast to `f32` lanes.
    __m256 acc_f32x8 = _mm256_setzero_ps();

    // Full 8-element strides
    while (n >= 8) {
        __m256 a_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
        __m256 b_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
        __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
        acc_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, acc_f32x8);
        n -= 8, a += 8, b += 8;
    }

    // Tail of fewer than 8 elements, zero-padded by the partial load
    nk_b128_vec_t a_vec, b_vec;
    nk_partial_load_b8x16_serial_(a, &a_vec, n);
    nk_partial_load_b8x16_serial_(b, &b_vec, n);
    __m256 a_f32x8 = nk_e4m3x8_to_f32x8_haswell_(a_vec.xmm);
    __m256 b_f32x8 = nk_e4m3x8_to_f32x8_haswell_(b_vec.xmm);
    __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
    acc_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, acc_f32x8);

    *result = nk_reduce_add_f32x8_haswell_(acc_f32x8);
}
|
|
847
|
+
|
|
848
|
+
NK_PUBLIC void nk_euclidean_e4m3_haswell(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
    // Euclidean distance is the square root of the squared Euclidean distance.
    nk_f32_t distance_sq;
    nk_sqeuclidean_e4m3_haswell(a, b, n, &distance_sq);
    *result = nk_f32_sqrt_haswell(distance_sq);
}
|
|
852
|
+
|
|
853
|
+
NK_PUBLIC void nk_angular_e4m3_haswell(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
    // Angular (cosine) distance for `e4m3` mini-floats, upcast to `f32` lanes.
    __m256 ab_acc_f32x8 = _mm256_setzero_ps(); // dot-product a·b
    __m256 aa_acc_f32x8 = _mm256_setzero_ps(); // squared norm of a
    __m256 bb_acc_f32x8 = _mm256_setzero_ps(); // squared norm of b

    // Full 8-element strides
    while (n >= 8) {
        __m256 a_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
        __m256 b_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
        ab_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, ab_acc_f32x8);
        aa_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, aa_acc_f32x8);
        bb_acc_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, bb_acc_f32x8);
        n -= 8, a += 8, b += 8;
    }

    // Tail of fewer than 8 elements, zero-padded by the partial load
    nk_b128_vec_t a_vec, b_vec;
    nk_partial_load_b8x16_serial_(a, &a_vec, n);
    nk_partial_load_b8x16_serial_(b, &b_vec, n);
    __m256 a_f32x8 = nk_e4m3x8_to_f32x8_haswell_(a_vec.xmm);
    __m256 b_f32x8 = nk_e4m3x8_to_f32x8_haswell_(b_vec.xmm);
    ab_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, ab_acc_f32x8);
    aa_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, aa_acc_f32x8);
    bb_acc_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, bb_acc_f32x8);

    nk_f32_t dot_product_f32 = nk_reduce_add_f32x8_haswell_(ab_acc_f32x8);
    nk_f32_t a_norm_sq_f32 = nk_reduce_add_f32x8_haswell_(aa_acc_f32x8);
    nk_f32_t b_norm_sq_f32 = nk_reduce_add_f32x8_haswell_(bb_acc_f32x8);
    *result = nk_angular_normalize_f32_haswell_(dot_product_f32, a_norm_sq_f32, b_norm_sq_f32);
}
|
|
884
|
+
|
|
885
|
+
NK_PUBLIC void nk_sqeuclidean_e5m2_haswell(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
    // Squared Euclidean distance for `e5m2` mini-floats, upcast to `f32` lanes.
    __m256 acc_f32x8 = _mm256_setzero_ps();

    // Full 8-element strides
    while (n >= 8) {
        __m256 a_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
        __m256 b_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
        __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
        acc_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, acc_f32x8);
        n -= 8, a += 8, b += 8;
    }

    // Tail of fewer than 8 elements, zero-padded by the partial load
    nk_b128_vec_t a_vec, b_vec;
    nk_partial_load_b8x16_serial_(a, &a_vec, n);
    nk_partial_load_b8x16_serial_(b, &b_vec, n);
    __m256 a_f32x8 = nk_e5m2x8_to_f32x8_haswell_(a_vec.xmm);
    __m256 b_f32x8 = nk_e5m2x8_to_f32x8_haswell_(b_vec.xmm);
    __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
    acc_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, acc_f32x8);

    *result = nk_reduce_add_f32x8_haswell_(acc_f32x8);
}
|
|
909
|
+
|
|
910
|
+
NK_PUBLIC void nk_euclidean_e5m2_haswell(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
    // Euclidean distance is the square root of the squared Euclidean distance.
    nk_f32_t distance_sq;
    nk_sqeuclidean_e5m2_haswell(a, b, n, &distance_sq);
    *result = nk_f32_sqrt_haswell(distance_sq);
}
|
|
914
|
+
|
|
915
|
+
NK_PUBLIC void nk_angular_e5m2_haswell(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
    // Angular (cosine) distance for `e5m2` mini-floats, upcast to `f32` lanes.
    __m256 ab_acc_f32x8 = _mm256_setzero_ps(); // dot-product a·b
    __m256 aa_acc_f32x8 = _mm256_setzero_ps(); // squared norm of a
    __m256 bb_acc_f32x8 = _mm256_setzero_ps(); // squared norm of b

    // Full 8-element strides
    while (n >= 8) {
        __m256 a_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
        __m256 b_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
        ab_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, ab_acc_f32x8);
        aa_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, aa_acc_f32x8);
        bb_acc_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, bb_acc_f32x8);
        n -= 8, a += 8, b += 8;
    }

    // Tail of fewer than 8 elements, zero-padded by the partial load
    nk_b128_vec_t a_vec, b_vec;
    nk_partial_load_b8x16_serial_(a, &a_vec, n);
    nk_partial_load_b8x16_serial_(b, &b_vec, n);
    __m256 a_f32x8 = nk_e5m2x8_to_f32x8_haswell_(a_vec.xmm);
    __m256 b_f32x8 = nk_e5m2x8_to_f32x8_haswell_(b_vec.xmm);
    ab_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, ab_acc_f32x8);
    aa_acc_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, aa_acc_f32x8);
    bb_acc_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, bb_acc_f32x8);

    nk_f32_t dot_product_f32 = nk_reduce_add_f32x8_haswell_(ab_acc_f32x8);
    nk_f32_t a_norm_sq_f32 = nk_reduce_add_f32x8_haswell_(aa_acc_f32x8);
    nk_f32_t b_norm_sq_f32 = nk_reduce_add_f32x8_haswell_(bb_acc_f32x8);
    *result = nk_angular_normalize_f32_haswell_(dot_product_f32, a_norm_sq_f32, b_norm_sq_f32);
}
|
|
946
|
+
|
|
947
|
+
#if defined(__clang__)
|
|
948
|
+
#pragma clang attribute pop
|
|
949
|
+
#elif defined(__GNUC__)
|
|
950
|
+
#pragma GCC pop_options
|
|
951
|
+
#endif
|
|
952
|
+
|
|
953
|
+
#if defined(__cplusplus)
|
|
954
|
+
} // extern "C"
|
|
955
|
+
#endif
|
|
956
|
+
|
|
957
|
+
#pragma endregion - Smaller Floats
|
|
958
|
+
#endif // NK_TARGET_HASWELL
|
|
959
|
+
#endif // NK_TARGET_X86_
|
|
960
|
+
#endif // NK_SPATIAL_HASWELL_H
|