numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,586 @@
1
+ /**
2
+ * @brief SIMD-accelerated Spatial Similarity Measures for Ice Lake.
3
+ * @file include/numkong/spatial/icelake.h
4
+ * @author Ash Vardanian
5
+ * @date December 27, 2025
6
+ *
7
+ * @sa include/numkong/spatial.h
8
+ *
9
+ * @section spatial_icelake_instructions Key AVX-512 VNNI Spatial Instructions
10
+ *
11
+ * Intrinsic Instruction Ice Genoa
12
+ * _mm512_dpwssd_epi32 VPDPWSSD (ZMM, ZMM, ZMM) 5cy @ p0 4cy @ p01
13
+ * _mm512_cvtepi8_epi16 VPMOVSXBW (ZMM, YMM) 3cy @ p5 3cy @ p12
14
+ * _mm512_sub_epi16 VPSUBW (ZMM, ZMM, ZMM) 1cy @ p05 1cy @ p0123
15
+ * _mm512_reduce_add_epi32 (pseudo: shuffle chain) ~8cy ~8cy
16
+ *
17
+ * Ice Lake's VNNI enables efficient i8 distance computations via VPDPWSSD for squared differences.
18
+ * After widening i8 to i16, the same instruction computes both multiply and horizontal pair addition.
19
+ * This approach avoids the asymmetric VPDPBUSD issues with signed values like -128.
20
+ */
21
+ #ifndef NK_SPATIAL_ICELAKE_H
22
+ #define NK_SPATIAL_ICELAKE_H
23
+
24
+ #if NK_TARGET_X86_
25
+ #if NK_TARGET_ICELAKE
26
+
27
+ #include "numkong/types.h"
28
+
29
+ #if defined(__cplusplus)
30
+ extern "C" {
31
+ #endif
32
+
33
+ #if defined(__clang__)
34
+ #pragma clang attribute push( \
35
+ __attribute__((target("avx2,avx512f,avx512vl,avx512bw,avx512dq,avx512vnni,f16c,fma,bmi,bmi2"))), \
36
+ apply_to = function)
37
+ #elif defined(__GNUC__)
38
+ #pragma GCC push_options
39
+ #pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vnni", "f16c", "fma", "bmi", "bmi2")
40
+ #endif
41
+
42
+ NK_PUBLIC void nk_sqeuclidean_i8_icelake(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_u32_t *result) {
43
+ // Optimized i8 L2-squared using saturating subtract + DPWSSD
44
+ //
45
+ // Old approach (Haswell/Skylake):
46
+ // - Compute (a-b) as signed i8, then sign-extend i8→i16 using cvtepi8_epi16
47
+ // - Square using vpmaddwd on i16 values (32 elements/iteration)
48
+ // - Bottleneck: cvtepi8_epi16 (3cy latency @ p5) limits throughput
49
+ //
50
+ // New approach (Ice Lake+):
51
+ // - XOR with 0x80 to reinterpret signed i8 as unsigned u8
52
+ // - Compute |a-b| using unsigned saturating subtraction: diff = (a ⊖ b) | (b ⊖ a)
53
+ // - Zero-extend u8→u16 using unpacking (1cy latency @ p5)
54
+ // - Square using vpmaddwd on u16 values (64 elements/iteration)
55
+ // - Eliminates cvtepi8_epi16 bottleneck, doubles throughput
56
+ //
57
+ // Performance gain: 1.6-1.85× speedup
58
+ // - Processes 64 elements/iteration (2× improvement)
59
+ // - Faster zero-extension (unpack 1cy vs cvtepi8_epi16 3cy)
60
+ // - Correctness: |a-b|² = (a-b)², so unsigned absolute differences are valid
61
+ //
62
+ // The XOR bias is needed because subs_epu8 (unsigned) saturates to 0 when
63
+ // the result would be negative, so OR-ing both directions gives the true |a-b|.
64
+ // A naive subs_epi8 (signed) saturates to -128, corrupting the OR trick.
65
+ //
66
+ __m512i distance_sq_low_i32x16 = _mm512_setzero_si512();
67
+ __m512i distance_sq_high_i32x16 = _mm512_setzero_si512();
68
+ __m512i const zeros_i8x64 = _mm512_setzero_si512();
69
+ __m512i const bias_i8x64 = _mm512_set1_epi8((char)0x80);
70
+ __m512i diff_low_i16x32, diff_high_i16x32;
71
+ __m512i a_i8x64, b_i8x64, diff_u8x64;
72
+
73
+ nk_sqeuclidean_i8_icelake_cycle:
74
+ if (n < 64) {
75
+ __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n);
76
+ a_i8x64 = _mm512_maskz_loadu_epi8(mask, a);
77
+ b_i8x64 = _mm512_maskz_loadu_epi8(mask, b);
78
+ n = 0;
79
+ }
80
+ else {
81
+ a_i8x64 = _mm512_loadu_si512(a);
82
+ b_i8x64 = _mm512_loadu_si512(b);
83
+ a += 64, b += 64, n -= 64;
84
+ }
85
+
86
+ // Reinterpret signed i8 as unsigned u8 by flipping the sign bit
87
+ a_i8x64 = _mm512_xor_si512(a_i8x64, bias_i8x64);
88
+ b_i8x64 = _mm512_xor_si512(b_i8x64, bias_i8x64);
89
+
90
+ // Compute |a-b| using unsigned saturating subtraction
91
+ // subs_epu8 saturates to 0 if result would be negative
92
+ // OR-ing both directions gives absolute difference as unsigned
93
+ diff_u8x64 = _mm512_or_si512(_mm512_subs_epu8(a_i8x64, b_i8x64), _mm512_subs_epu8(b_i8x64, a_i8x64));
94
+
95
+ // Zero-extend to i16 using unpack (1cy @ p5, much faster than cvtepi8_epi16)
96
+ diff_low_i16x32 = _mm512_unpacklo_epi8(diff_u8x64, zeros_i8x64);
97
+ diff_high_i16x32 = _mm512_unpackhi_epi8(diff_u8x64, zeros_i8x64);
98
+
99
+ // Multiply and accumulate at i16 level, accumulate at i32 level
100
+ distance_sq_low_i32x16 = _mm512_dpwssd_epi32(distance_sq_low_i32x16, diff_low_i16x32, diff_low_i16x32);
101
+ distance_sq_high_i32x16 = _mm512_dpwssd_epi32(distance_sq_high_i32x16, diff_high_i16x32, diff_high_i16x32);
102
+ if (n) goto nk_sqeuclidean_i8_icelake_cycle;
103
+
104
+ *result = _mm512_reduce_add_epi32(_mm512_add_epi32(distance_sq_low_i32x16, distance_sq_high_i32x16));
105
+ }
106
+
107
+ NK_PUBLIC void nk_euclidean_i8_icelake(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result) {
108
+ nk_u32_t d2;
109
+ nk_sqeuclidean_i8_icelake(a, b, n, &d2);
110
+ *result = nk_f32_sqrt_haswell((nk_f32_t)d2);
111
+ }
112
+
113
NK_PUBLIC void nk_angular_i8_icelake(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result) {

    // Angular (cosine) distance over signed bytes: accumulate the dot product
    // and both squared norms in i32 lanes, then normalize in f32 at the end.
    __m512i dot_product_i32x16 = _mm512_setzero_si512();
    __m512i a_norm_sq_i32x16 = _mm512_setzero_si512();
    __m512i b_norm_sq_i32x16 = _mm512_setzero_si512();
    __m512i a_i16x32, b_i16x32;
nk_angular_i8_icelake_cycle:
    if (n < 32) {
        // Tail: masked 256-bit load of the remaining `n` bytes (zero-filled),
        // then sign-extend to 32 i16 lanes.
        __mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
        a_i16x32 = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(mask, a));
        b_i16x32 = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(mask, b));
        n = 0;
    }
    else {
        // Main path: sign-extend 32 signed bytes per input to i16 lanes.
        a_i16x32 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i const *)a));
        b_i16x32 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i const *)b));
        a += 32, b += 32, n -= 32;
    }

    // We can't directly use the `_mm512_dpbusd_epi32` intrinsic everywhere,
    // as it's asymmetric with respect to the sign of the input arguments:
    //
    //      Signed(ZeroExtend16(a.byte[4 × j]) × SignExtend16(b.byte[4 × j]))
    //
    // To compute the squares, we could just drop the sign bit of the second argument.
    // But this would lead to big problems on values like `-128`!
    // For dot-products we don't have the luxury of optimizing the sign bit away.
    // (An approximate kernel — with reciprocal square root approximations at
    // the end — could tolerate clamping the inputs to [-127, 127]; this
    // implementation does not clamp and stays exact.)
    //
    // VNNI instruction performance (Ice Lake vs Zen4 Genoa):
    //
    //      Instruction                 Ice         Genoa
    //   1. VPDPBUSDS (ZMM, ZMM, ZMM)   5cy @ p0    4cy @ p01
    //   2. VPDPWSSDS (ZMM, ZMM, ZMM)   5cy @ p0    4cy @ p01
    //   3. VPMADDWD (ZMM, ZMM, ZMM)    5cy @ p05   3cy @ p01
    //
    // On Ice Lake, VNNI bottlenecks on port 0. On Genoa, dual-issue on p01 is faster.
    //
    // The old solution was complex and relied on instructions 1. and 2. above:
    //
    //      a_i8_abs_vec = _mm512_abs_epi8(a_i8_vec);
    //      b_i8_abs_vec = _mm512_abs_epi8(b_i8_vec);
    //      a2_i32_vec = _mm512_dpbusds_epi32(a2_i32_vec, a_i8_abs_vec, a_i8_abs_vec);
    //      b2_i32_vec = _mm512_dpbusds_epi32(b2_i32_vec, b_i8_abs_vec, b_i8_abs_vec);
    //      ab_i32_low_vec = _mm512_dpwssds_epi32(                       //
    //          ab_i32_low_vec,                                          //
    //          _mm512_cvtepi8_epi16(_mm512_castsi512_si256(a_i8_vec)),  //
    //          _mm512_cvtepi8_epi16(_mm512_castsi512_si256(b_i8_vec)));
    //      ab_i32_high_vec = _mm512_dpwssds_epi32(                          //
    //          ab_i32_high_vec,                                             //
    //          _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(a_i8_vec, 1)), //
    //          _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(b_i8_vec, 1)));
    //
    // The new solution is simpler and relies on instruction 3. (VPMADDWD):
    dot_product_i32x16 = _mm512_add_epi32(dot_product_i32x16, _mm512_madd_epi16(a_i16x32, b_i16x32));
    a_norm_sq_i32x16 = _mm512_add_epi32(a_norm_sq_i32x16, _mm512_madd_epi16(a_i16x32, a_i16x32));
    b_norm_sq_i32x16 = _mm512_add_epi32(b_norm_sq_i32x16, _mm512_madd_epi16(b_i16x32, b_i16x32));
    if (n) goto nk_angular_i8_icelake_cycle;

    // Reduce all three accumulators to scalars and normalize in f32.
    nk_i32_t dot_product_i32 = _mm512_reduce_add_epi32(dot_product_i32x16);
    nk_i32_t a_norm_sq_i32 = _mm512_reduce_add_epi32(a_norm_sq_i32x16);
    nk_i32_t b_norm_sq_i32 = _mm512_reduce_add_epi32(b_norm_sq_i32x16);
    *result = nk_angular_normalize_f32_haswell_(dot_product_i32, a_norm_sq_i32, b_norm_sq_i32);
}
178
+ NK_PUBLIC void nk_sqeuclidean_u8_icelake(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
179
+ __m512i distance_sq_low_i32x16 = _mm512_setzero_si512();
180
+ __m512i distance_sq_high_i32x16 = _mm512_setzero_si512();
181
+ __m512i const zeros_i8x64 = _mm512_setzero_si512();
182
+ __m512i diff_low_i16x32, diff_high_i16x32;
183
+ __m512i a_u8x64, b_u8x64, diff_u8x64;
184
+
185
+ nk_sqeuclidean_u8_icelake_cycle:
186
+ if (n < 64) {
187
+ __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n);
188
+ a_u8x64 = _mm512_maskz_loadu_epi8(mask, a);
189
+ b_u8x64 = _mm512_maskz_loadu_epi8(mask, b);
190
+ n = 0;
191
+ }
192
+ else {
193
+ a_u8x64 = _mm512_loadu_si512(a);
194
+ b_u8x64 = _mm512_loadu_si512(b);
195
+ a += 64, b += 64, n -= 64;
196
+ }
197
+
198
+ // Substracting unsigned vectors in AVX-512 is done by saturating subtraction:
199
+ diff_u8x64 = _mm512_or_si512(_mm512_subs_epu8(a_u8x64, b_u8x64), _mm512_subs_epu8(b_u8x64, a_u8x64));
200
+ diff_low_i16x32 = _mm512_unpacklo_epi8(diff_u8x64, zeros_i8x64);
201
+ diff_high_i16x32 = _mm512_unpackhi_epi8(diff_u8x64, zeros_i8x64);
202
+
203
+ // Multiply and accumulate at `int16` level, accumulate at `int32` level:
204
+ distance_sq_low_i32x16 = _mm512_dpwssd_epi32(distance_sq_low_i32x16, diff_low_i16x32, diff_low_i16x32);
205
+ distance_sq_high_i32x16 = _mm512_dpwssd_epi32(distance_sq_high_i32x16, diff_high_i16x32, diff_high_i16x32);
206
+ if (n) goto nk_sqeuclidean_u8_icelake_cycle;
207
+
208
+ *result = _mm512_reduce_add_epi32(_mm512_add_epi32(distance_sq_low_i32x16, distance_sq_high_i32x16));
209
+ }
210
+ NK_PUBLIC void nk_euclidean_u8_icelake(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result) {
211
+ nk_u32_t d2;
212
+ nk_sqeuclidean_u8_icelake(a, b, n, &d2);
213
+ *result = nk_f32_sqrt_haswell((nk_f32_t)d2);
214
+ }
215
+
216
+ NK_PUBLIC void nk_angular_u8_icelake(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result) {
217
+
218
+ __m512i dot_product_low_i32x16 = _mm512_setzero_si512();
219
+ __m512i dot_product_high_i32x16 = _mm512_setzero_si512();
220
+ __m512i a_norm_sq_low_i32x16 = _mm512_setzero_si512();
221
+ __m512i a_norm_sq_high_i32x16 = _mm512_setzero_si512();
222
+ __m512i b_norm_sq_low_i32x16 = _mm512_setzero_si512();
223
+ __m512i b_norm_sq_high_i32x16 = _mm512_setzero_si512();
224
+ __m512i const zeros_i8x64 = _mm512_setzero_si512();
225
+ __m512i a_low_i16x32, a_high_i16x32, b_low_i16x32, b_high_i16x32;
226
+ __m512i a_u8x64, b_u8x64;
227
+
228
+ nk_angular_u8_icelake_cycle:
229
+ if (n < 64) {
230
+ __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n);
231
+ a_u8x64 = _mm512_maskz_loadu_epi8(mask, a);
232
+ b_u8x64 = _mm512_maskz_loadu_epi8(mask, b);
233
+ n = 0;
234
+ }
235
+ else {
236
+ a_u8x64 = _mm512_loadu_si512(a);
237
+ b_u8x64 = _mm512_loadu_si512(b);
238
+ a += 64, b += 64, n -= 64;
239
+ }
240
+
241
+ // Upcast `uint8` to `int16`. Unlike the signed version, we can use the unpacking
242
+ // instructions instead of extracts, as they are much faster and more efficient.
243
+ a_low_i16x32 = _mm512_unpacklo_epi8(a_u8x64, zeros_i8x64);
244
+ a_high_i16x32 = _mm512_unpackhi_epi8(a_u8x64, zeros_i8x64);
245
+ b_low_i16x32 = _mm512_unpacklo_epi8(b_u8x64, zeros_i8x64);
246
+ b_high_i16x32 = _mm512_unpackhi_epi8(b_u8x64, zeros_i8x64);
247
+
248
+ // Multiply and accumulate as `int16`, accumulate products as `int32`:
249
+ dot_product_low_i32x16 = _mm512_dpwssds_epi32(dot_product_low_i32x16, a_low_i16x32, b_low_i16x32);
250
+ dot_product_high_i32x16 = _mm512_dpwssds_epi32(dot_product_high_i32x16, a_high_i16x32, b_high_i16x32);
251
+ a_norm_sq_low_i32x16 = _mm512_dpwssds_epi32(a_norm_sq_low_i32x16, a_low_i16x32, a_low_i16x32);
252
+ a_norm_sq_high_i32x16 = _mm512_dpwssds_epi32(a_norm_sq_high_i32x16, a_high_i16x32, a_high_i16x32);
253
+ b_norm_sq_low_i32x16 = _mm512_dpwssds_epi32(b_norm_sq_low_i32x16, b_low_i16x32, b_low_i16x32);
254
+ b_norm_sq_high_i32x16 = _mm512_dpwssds_epi32(b_norm_sq_high_i32x16, b_high_i16x32, b_high_i16x32);
255
+ if (n) goto nk_angular_u8_icelake_cycle;
256
+
257
+ nk_i32_t dot_product_i32 = _mm512_reduce_add_epi32(
258
+ _mm512_add_epi32(dot_product_low_i32x16, dot_product_high_i32x16));
259
+ nk_i32_t a_norm_sq_i32 = _mm512_reduce_add_epi32(_mm512_add_epi32(a_norm_sq_low_i32x16, a_norm_sq_high_i32x16));
260
+ nk_i32_t b_norm_sq_i32 = _mm512_reduce_add_epi32(_mm512_add_epi32(b_norm_sq_low_i32x16, b_norm_sq_high_i32x16));
261
+ *result = nk_angular_normalize_f32_haswell_(dot_product_i32, a_norm_sq_i32, b_norm_sq_i32);
262
+ }
263
+
264
NK_PUBLIC void nk_sqeuclidean_i4_icelake(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_u32_t *result) {
    // i4 values are packed as nibbles: two 4-bit signed values per byte.
    // Parameter `n` is the number of 4-bit values (dimensions), not bytes.
    n = nk_size_round_up_to_multiple_(n, 2);
    nk_size_t n_bytes = n / 2;

    // While `int8_t` covers the range [-128, 127], `int4_t` covers only [-8, 7].
    // The absolute difference between two 4-bit integers is at most 15 and fits in `uint4_t`.
    // Moreover, its square is at most 225, which fits into `uint8_t`.
    //
    // Instead of using lookup tables for sign extension and squaring, we use arithmetic:
    //
    // 1. XOR trick for sign extension: `signed = (nibble ^ 8) - 8`
    //    Maps [0,7] → [0,7] (positive) and [8,15] → [-8,-1] (negative).
    //
    // 2. For L2 squared: |a-b|² = diff * diff, using `_mm512_dpbusd_epi32`.
    //    After computing signed difference and taking abs, the result fits ∈ [0,15].
    //    We can then use DPBUSD to compute diff² efficiently without lookup tables.
    //
    // This approach avoids 8x VPSHUFB operations per iteration, replacing them with
    // arithmetic operations that distribute better across execution ports.
    __m512i const nibble_mask_u8x64 = _mm512_set1_epi8(0x0F);
    __m512i const eight_i8x64 = _mm512_set1_epi8(8);

    __m512i a_i4_vec, b_i4_vec;
    __m512i a_low_u8x64, a_high_u8x64, b_low_u8x64, b_high_u8x64;
    __m512i a_low_i8x64, a_high_i8x64, b_low_i8x64, b_high_i8x64;
    __m512i diff_low_u8x64, diff_high_u8x64;
    __m512i d2_i32x16 = _mm512_setzero_si512();

nk_sqeuclidean_i4_icelake_cycle:
    if (n_bytes < 64) {
        // Tail: masked load of the remaining packed bytes, zero-filled lanes.
        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes);
        a_i4_vec = _mm512_maskz_loadu_epi8(mask, a);
        b_i4_vec = _mm512_maskz_loadu_epi8(mask, b);
        n_bytes = 0;
    }
    else {
        a_i4_vec = _mm512_loadu_epi8(a);
        b_i4_vec = _mm512_loadu_epi8(b);
        a += 64, b += 64, n_bytes -= 64;
    }

    // Extract nibbles as unsigned [0,15]. (A VPSHUFB-based lookup would ignore
    // the high index bits on its own, but with plain arithmetic we mask explicitly.)
    a_low_u8x64 = _mm512_and_si512(a_i4_vec, nibble_mask_u8x64);
    a_high_u8x64 = _mm512_and_si512(_mm512_srli_epi16(a_i4_vec, 4), nibble_mask_u8x64);
    b_low_u8x64 = _mm512_and_si512(b_i4_vec, nibble_mask_u8x64);
    b_high_u8x64 = _mm512_and_si512(_mm512_srli_epi16(b_i4_vec, 4), nibble_mask_u8x64);

    // Sign extend using XOR trick: signed = (nibble ^ 8) - 8
    a_low_i8x64 = _mm512_sub_epi8(_mm512_xor_si512(a_low_u8x64, eight_i8x64), eight_i8x64);
    a_high_i8x64 = _mm512_sub_epi8(_mm512_xor_si512(a_high_u8x64, eight_i8x64), eight_i8x64);
    b_low_i8x64 = _mm512_sub_epi8(_mm512_xor_si512(b_low_u8x64, eight_i8x64), eight_i8x64);
    b_high_i8x64 = _mm512_sub_epi8(_mm512_xor_si512(b_high_u8x64, eight_i8x64), eight_i8x64);

    // Compute |a - b| for each nibble pair. Result is unsigned ∈ [0, 15].
    diff_low_u8x64 = _mm512_abs_epi8(_mm512_sub_epi8(a_low_i8x64, b_low_i8x64));
    diff_high_u8x64 = _mm512_abs_epi8(_mm512_sub_epi8(a_high_i8x64, b_high_i8x64));

    // Square and accumulate using DPBUSD: diff² = diff * diff.
    // DPBUSD computes u8*i8 products and sums groups of 4 into i32.
    // Since diff is ∈ [0,15], it's safe for both u8 and i8 interpretation.
    d2_i32x16 = _mm512_dpbusd_epi32(d2_i32x16, diff_low_u8x64, diff_low_u8x64);
    d2_i32x16 = _mm512_dpbusd_epi32(d2_i32x16, diff_high_u8x64, diff_high_u8x64);
    if (n_bytes) goto nk_sqeuclidean_i4_icelake_cycle;

    *result = (nk_u32_t)_mm512_reduce_add_epi32(d2_i32x16);
}
333
+ NK_PUBLIC void nk_euclidean_i4_icelake(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_f32_t *result) {
334
+ nk_u32_t d2;
335
+ nk_sqeuclidean_i4_icelake(a, b, n, &d2);
336
+ *result = nk_f32_sqrt_haswell((nk_f32_t)d2);
337
+ }
338
NK_PUBLIC void nk_angular_i4_icelake(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_f32_t *result) {
    // i4 values are packed as nibbles: two 4-bit signed values per byte.
    // Parameter `n` is the number of 4-bit values (dimensions), not bytes.
    // Odd `n` is rounded up, so the upper nibble of the final byte is treated
    // as one extra dimension.
    n = nk_size_round_up_to_multiple_(n, 2);
    nk_size_t n_bytes = n / 2;

    // Angular distance for signed 4-bit integers requires computing:
    //  1. Dot product: ∑(aᵢ × bᵢ)
    //  2. Squared norms: ∑(aᵢ²) and ∑(bᵢ²)
    //
    // For signed i4 values in [-8, 7], we use DPBUSD for everything by leveraging
    // an algebraic identity. Define x = a ^ 8 (XOR with 8), which maps:
    //  [0,7] → [8,15] and [8,15] → [0,7]
    //
    // The signed value is: a_signed = x - 8
    //
    // For two signed values:
    //  a_signed × b_signed = (ax - 8)(bx - 8) = ax × bx - 8 × ax - 8 × bx + 64
    //
    // Therefore:
    //  dot(a_signed, b_signed) = DPBUSD(ax, bx) - 8 × (∑(ax) + ∑(bx)) + 64 × n
    //
    // This avoids all i8 → i16 upcasts and uses DPBUSD directly on byte values!
    // For norms, we use |x|² = x², computing abs then squaring with DPBUSD.
    __m512i const nibble_mask_u8x64 = _mm512_set1_epi8(0x0F);
    __m512i const eight_i8x64 = _mm512_set1_epi8(8);
    __m512i const zeros_i8x64 = _mm512_setzero_si512();

    __m512i a_i4_vec, b_i4_vec;
    __m512i a_low_u8x64, a_high_u8x64, b_low_u8x64, b_high_u8x64;
    __m512i ax_low_u8x64, ax_high_u8x64, bx_low_u8x64, bx_high_u8x64;
    __m512i a_low_i8x64, a_high_i8x64, b_low_i8x64, b_high_i8x64;

    // Accumulators for dot product (using biased values) and correction sums
    __m512i ab_i32x16 = zeros_i8x64;
    __m512i ax_sum_i64x8 = zeros_i8x64;
    __m512i bx_sum_i64x8 = zeros_i8x64;
    // Accumulators for squared norms
    __m512i a2_i32x16 = zeros_i8x64;
    __m512i b2_i32x16 = zeros_i8x64;

nk_angular_i4_icelake_cycle:
    if (n_bytes < 64) {
        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes);
        // Pad the tail with 0x88: both nibbles are 8, whose biased value
        // 8 ^ 8 = 0 contributes nothing to DPBUSD(ax, bx) or the ax/bx sums,
        // so padding is automatically excluded from the dot product.
        // Each padded nibble still adds |−8|² = 64 to the norms, which is
        // subtracted at the end via `norm_excess`.
        a_i4_vec = _mm512_mask_loadu_epi8(_mm512_set1_epi8((char)0x88), mask, a);
        b_i4_vec = _mm512_mask_loadu_epi8(_mm512_set1_epi8((char)0x88), mask, b);
        n_bytes = 0;
    }
    else {
        a_i4_vec = _mm512_loadu_epi8(a);
        b_i4_vec = _mm512_loadu_epi8(b);
        a += 64, b += 64, n_bytes -= 64;
    }

    // Extract nibbles as unsigned [0,15]
    a_low_u8x64 = _mm512_and_si512(a_i4_vec, nibble_mask_u8x64);
    a_high_u8x64 = _mm512_and_si512(_mm512_srli_epi16(a_i4_vec, 4), nibble_mask_u8x64);
    b_low_u8x64 = _mm512_and_si512(b_i4_vec, nibble_mask_u8x64);
    b_high_u8x64 = _mm512_and_si512(_mm512_srli_epi16(b_i4_vec, 4), nibble_mask_u8x64);

    // Compute biased values: ax = a ^ 8 (still ∈ [0,15], just reordered)
    ax_low_u8x64 = _mm512_xor_si512(a_low_u8x64, eight_i8x64);
    ax_high_u8x64 = _mm512_xor_si512(a_high_u8x64, eight_i8x64);
    bx_low_u8x64 = _mm512_xor_si512(b_low_u8x64, eight_i8x64);
    bx_high_u8x64 = _mm512_xor_si512(b_high_u8x64, eight_i8x64);

    // Dot product using DPBUSD on biased values (correction applied at end).
    // Both operands are ∈ [0,15], so the u8 × i8 semantics of DPBUSD are safe.
    ab_i32x16 = _mm512_dpbusd_epi32(ab_i32x16, ax_low_u8x64, bx_low_u8x64);
    ab_i32x16 = _mm512_dpbusd_epi32(ab_i32x16, ax_high_u8x64, bx_high_u8x64);

    // Track sums for correction using SAD (sum of absolute differences with zero):
    // SAD against zero sums each group of 8 unsigned bytes into a 64-bit lane,
    // so these accumulators cannot overflow regardless of vector length.
    ax_sum_i64x8 = _mm512_add_epi64(ax_sum_i64x8, _mm512_sad_epu8(ax_low_u8x64, zeros_i8x64));
    ax_sum_i64x8 = _mm512_add_epi64(ax_sum_i64x8, _mm512_sad_epu8(ax_high_u8x64, zeros_i8x64));
    bx_sum_i64x8 = _mm512_add_epi64(bx_sum_i64x8, _mm512_sad_epu8(bx_low_u8x64, zeros_i8x64));
    bx_sum_i64x8 = _mm512_add_epi64(bx_sum_i64x8, _mm512_sad_epu8(bx_high_u8x64, zeros_i8x64));

    // For norms: convert to signed, take abs, then square with DPBUSD
    a_low_i8x64 = _mm512_sub_epi8(ax_low_u8x64, eight_i8x64);
    a_high_i8x64 = _mm512_sub_epi8(ax_high_u8x64, eight_i8x64);
    b_low_i8x64 = _mm512_sub_epi8(bx_low_u8x64, eight_i8x64);
    b_high_i8x64 = _mm512_sub_epi8(bx_high_u8x64, eight_i8x64);

    __m512i a_low_abs_u8x64 = _mm512_abs_epi8(a_low_i8x64);
    __m512i a_high_abs_u8x64 = _mm512_abs_epi8(a_high_i8x64);
    __m512i b_low_abs_u8x64 = _mm512_abs_epi8(b_low_i8x64);
    __m512i b_high_abs_u8x64 = _mm512_abs_epi8(b_high_i8x64);

    // Squared norms: ‖x‖² = x², use DPBUSD for efficient squaring
    a2_i32x16 = _mm512_dpbusd_epi32(a2_i32x16, a_low_abs_u8x64, a_low_abs_u8x64);
    a2_i32x16 = _mm512_dpbusd_epi32(a2_i32x16, a_high_abs_u8x64, a_high_abs_u8x64);
    b2_i32x16 = _mm512_dpbusd_epi32(b2_i32x16, b_low_abs_u8x64, b_low_abs_u8x64);
    b2_i32x16 = _mm512_dpbusd_epi32(b2_i32x16, b_high_abs_u8x64, b_high_abs_u8x64);
    if (n_bytes) goto nk_angular_i4_icelake_cycle;

    // Apply algebraic correction for signed dot product:
    //  signed_dot = DPBUSD(ax, bx) - 8 × (∑(ax) + ∑(bx)) + 64 × n
    // NOTE(review): the correction is folded into i32; assumes 240 × n fits in
    // a signed 32-bit integer (n below ~8M dimensions) — confirm caller limits.
    nk_i64_t ax_sum = _mm512_reduce_add_epi64(ax_sum_i64x8);
    nk_i64_t bx_sum = _mm512_reduce_add_epi64(bx_sum_i64x8);
    nk_i32_t ab_raw = _mm512_reduce_add_epi32(ab_i32x16);
    nk_i32_t ab = ab_raw - 8 * (nk_i32_t)(ax_sum + bx_sum) + 64 * (nk_i32_t)n;

    // Remove the norm contribution of the 0x88 tail padding: each padded byte
    // added 2 × 64 = 128 to both squared norms.
    // NOTE(review): when n == 0 the single masked iteration pads all 64 bytes,
    // yet `norm_excess` evaluates to 0 here (round-up of 0 is 0), leaving the
    // norms at 8192 instead of 0 — confirm n == 0 is excluded upstream.
    nk_size_t n_bytes_total = nk_size_divide_round_up_(n, 2);
    nk_i32_t norm_excess = 128 * (nk_i32_t)(nk_size_round_up_to_multiple_(n_bytes_total, 64) - n_bytes_total);
    nk_i32_t a2 = _mm512_reduce_add_epi32(a2_i32x16) - norm_excess;
    nk_i32_t b2 = _mm512_reduce_add_epi32(b2_i32x16) - norm_excess;
    *result = nk_angular_normalize_f32_haswell_(ab, (nk_f32_t)a2, (nk_f32_t)b2);
}
445
+
446
+ NK_PUBLIC void nk_sqeuclidean_u4_icelake(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_u32_t *result) {
447
+ // u4 values are packed as nibbles: two 4-bit unsigned values per byte.
448
+ // Parameter `n` is the number of 4-bit values (dimensions), not bytes.
449
+ n = nk_size_round_up_to_multiple_(n, 2);
450
+ nk_size_t n_bytes = n / 2;
451
+
452
+ // For unsigned 4-bit integers ∈ [0, 15], the L2 squared distance is straightforward:
453
+ // 1. Extract nibbles as u8 values
454
+ // 2. Compute |a - b| using saturating subtraction: max(a,b) - min(a,b) = (a ⊖ b) | (b ⊖ a)
455
+ // 3. Square with DPBUSD: diff * diff
456
+ //
457
+ // No sign extension needed since values are unsigned.
458
+ __m512i const nibble_mask_u8x64 = _mm512_set1_epi8(0x0F);
459
+
460
+ __m512i a_u4_vec, b_u4_vec;
461
+ __m512i a_low_u8x64, a_high_u8x64, b_low_u8x64, b_high_u8x64;
462
+ __m512i diff_low_u8x64, diff_high_u8x64;
463
+ __m512i d2_i32x16 = _mm512_setzero_si512();
464
+
465
+ nk_sqeuclidean_u4_icelake_cycle:
466
+ if (n_bytes < 64) {
467
+ __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes);
468
+ a_u4_vec = _mm512_maskz_loadu_epi8(mask, a);
469
+ b_u4_vec = _mm512_maskz_loadu_epi8(mask, b);
470
+ n_bytes = 0;
471
+ }
472
+ else {
473
+ a_u4_vec = _mm512_loadu_epi8(a);
474
+ b_u4_vec = _mm512_loadu_epi8(b);
475
+ a += 64, b += 64, n_bytes -= 64;
476
+ }
477
+
478
+ // Extract nibbles as unsigned [0,15]
479
+ a_low_u8x64 = _mm512_and_si512(a_u4_vec, nibble_mask_u8x64);
480
+ a_high_u8x64 = _mm512_and_si512(_mm512_srli_epi16(a_u4_vec, 4), nibble_mask_u8x64);
481
+ b_low_u8x64 = _mm512_and_si512(b_u4_vec, nibble_mask_u8x64);
482
+ b_high_u8x64 = _mm512_and_si512(_mm512_srli_epi16(b_u4_vec, 4), nibble_mask_u8x64);
483
+
484
+ // Absolute difference for unsigned: |a-b| = (a ⊖ b) | (b ⊖ a) where ⊖ is saturating sub
485
+ diff_low_u8x64 = _mm512_or_si512(_mm512_subs_epu8(a_low_u8x64, b_low_u8x64),
486
+ _mm512_subs_epu8(b_low_u8x64, a_low_u8x64));
487
+ diff_high_u8x64 = _mm512_or_si512(_mm512_subs_epu8(a_high_u8x64, b_high_u8x64),
488
+ _mm512_subs_epu8(b_high_u8x64, a_high_u8x64));
489
+
490
+ // Square and accumulate using DPBUSD
491
+ d2_i32x16 = _mm512_dpbusd_epi32(d2_i32x16, diff_low_u8x64, diff_low_u8x64);
492
+ d2_i32x16 = _mm512_dpbusd_epi32(d2_i32x16, diff_high_u8x64, diff_high_u8x64);
493
+ if (n_bytes) goto nk_sqeuclidean_u4_icelake_cycle;
494
+
495
+ *result = (nk_u32_t)_mm512_reduce_add_epi32(d2_i32x16);
496
+ }
497
+ NK_PUBLIC void nk_euclidean_u4_icelake(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_f32_t *result) {
498
+ nk_u32_t d2;
499
+ nk_sqeuclidean_u4_icelake(a, b, n, &d2);
500
+ *result = nk_f32_sqrt_haswell((nk_f32_t)d2);
501
+ }
502
+
503
NK_PUBLIC void nk_angular_u4_icelake(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_f32_t *result) {
    // u4 values are packed as nibbles: two 4-bit unsigned values per byte.
    // Parameter `n` is the number of 4-bit values (dimensions), not bytes.
    // Odd `n` is rounded up, so the upper nibble of the final byte is treated
    // as one extra dimension.
    n = nk_size_round_up_to_multiple_(n, 2);
    nk_size_t n_bytes = n / 2;

    // Angular distance for unsigned 4-bit integers ∈ [0, 15].
    // Since values are unsigned and small, we can use DPBUSD directly for both
    // dot product and norms without any sign handling.
    //
    // DPBUSD computes: ZeroExtend(a) * SignExtend(b), but for values ∈ [0, 15],
    // sign extension is identity (no high bit set), so it works correctly.
    __m512i const nibble_mask_u8x64 = _mm512_set1_epi8(0x0F);
    __m512i const zeros_i8x64 = _mm512_setzero_si512();

    __m512i a_u4_vec, b_u4_vec;
    __m512i a_low_u8x64, a_high_u8x64, b_low_u8x64, b_high_u8x64;

    // Dot accumulates in i32 lanes; the norms accumulate in i64 lanes via SAD.
    __m512i ab_i32x16 = zeros_i8x64;
    __m512i a2_i64x8 = zeros_i8x64;
    __m512i b2_i64x8 = zeros_i8x64;

nk_angular_u4_icelake_cycle:
    if (n_bytes < 64) {
        // Tail: zero-fill past the end; zero lanes contribute nothing to the
        // dot product or the norms, so no end-of-loop correction is needed.
        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_bytes);
        a_u4_vec = _mm512_maskz_loadu_epi8(mask, a);
        b_u4_vec = _mm512_maskz_loadu_epi8(mask, b);
        n_bytes = 0;
    }
    else {
        a_u4_vec = _mm512_loadu_epi8(a);
        b_u4_vec = _mm512_loadu_epi8(b);
        a += 64, b += 64, n_bytes -= 64;
    }

    // Extract nibbles as unsigned [0,15]
    a_low_u8x64 = _mm512_and_si512(a_u4_vec, nibble_mask_u8x64);
    a_high_u8x64 = _mm512_and_si512(_mm512_srli_epi16(a_u4_vec, 4), nibble_mask_u8x64);
    b_low_u8x64 = _mm512_and_si512(b_u4_vec, nibble_mask_u8x64);
    b_high_u8x64 = _mm512_and_si512(_mm512_srli_epi16(b_u4_vec, 4), nibble_mask_u8x64);

    // Dot product with DPBUSD (safe for unsigned [0,15])
    ab_i32x16 = _mm512_dpbusd_epi32(ab_i32x16, a_low_u8x64, b_low_u8x64);
    ab_i32x16 = _mm512_dpbusd_epi32(ab_i32x16, a_high_u8x64, b_high_u8x64);

    // Squared norms: compute a² per nibble using lookup table for efficiency.
    // VPSHUFB indexes within each 128-bit lane by the low 4 bits of each byte,
    // so the 16-entry table is replicated into all four lanes.
    // Squares lookup: 0 → 0, 1 → 1, 2 → 4, ..., 15 → 225
    // (declared inside the loop body; expected to be hoisted by the compiler —
    // NOTE(review): confirm codegen if this loop shows up in profiles).
    __m512i const u4_squares_lookup_u8x64 = _mm512_set_epi8(
        (char)225, (char)196, (char)169, (char)144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
        (char)225, (char)196, (char)169, (char)144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
        (char)225, (char)196, (char)169, (char)144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
        (char)225, (char)196, (char)169, (char)144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1, 0);

    __m512i a2_lo_u8x64 = _mm512_shuffle_epi8(u4_squares_lookup_u8x64, a_low_u8x64);
    __m512i a2_hi_u8x64 = _mm512_shuffle_epi8(u4_squares_lookup_u8x64, a_high_u8x64);
    __m512i b2_lo_u8x64 = _mm512_shuffle_epi8(u4_squares_lookup_u8x64, b_low_u8x64);
    __m512i b2_hi_u8x64 = _mm512_shuffle_epi8(u4_squares_lookup_u8x64, b_high_u8x64);

    // Accumulate low and high squares separately using SAD to avoid u8 overflow:
    // SAD against zero sums each group of 8 bytes (each ≤ 225, so ≤ 1800 total)
    // into a 64-bit lane, which cannot overflow for any practical length.
    a2_i64x8 = _mm512_add_epi64(a2_i64x8, _mm512_sad_epu8(a2_lo_u8x64, zeros_i8x64));
    a2_i64x8 = _mm512_add_epi64(a2_i64x8, _mm512_sad_epu8(a2_hi_u8x64, zeros_i8x64));
    b2_i64x8 = _mm512_add_epi64(b2_i64x8, _mm512_sad_epu8(b2_lo_u8x64, zeros_i8x64));
    b2_i64x8 = _mm512_add_epi64(b2_i64x8, _mm512_sad_epu8(b2_hi_u8x64, zeros_i8x64));
    if (n_bytes) goto nk_angular_u4_icelake_cycle;

    // NOTE(review): `ab` is reduced into i32; 225 × n must fit in a signed
    // 32-bit integer (n below ~9.5M dimensions) — confirm caller limits.
    nk_i32_t ab = _mm512_reduce_add_epi32(ab_i32x16);
    nk_i64_t a2 = _mm512_reduce_add_epi64(a2_i64x8);
    nk_i64_t b2 = _mm512_reduce_add_epi64(b2_i64x8);
    *result = nk_angular_normalize_f32_haswell_(ab, (nk_f32_t)a2, (nk_f32_t)b2);
}
573
+
574
+ #if defined(__clang__)
575
+ #pragma clang attribute pop
576
+ #elif defined(__GNUC__)
577
+ #pragma GCC pop_options
578
+ #endif
579
+
580
+ #if defined(__cplusplus)
581
+ } // extern "C"
582
+ #endif
583
+
584
+ #endif // NK_TARGET_ICELAKE
585
+ #endif // NK_TARGET_X86_
586
+ #endif // NK_SPATIAL_ICELAKE_H