numkong 7.4.5 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. package/README.md +1 -0
  2. package/binding.gyp +99 -5
  3. package/c/dispatch_e5m2.c +23 -3
  4. package/c/dispatch_f16.c +23 -0
  5. package/c/numkong.c +0 -13
  6. package/include/numkong/attention/sme.h +34 -31
  7. package/include/numkong/capabilities.h +2 -15
  8. package/include/numkong/cast/README.md +3 -0
  9. package/include/numkong/cast/haswell.h +28 -64
  10. package/include/numkong/cast/neon.h +15 -0
  11. package/include/numkong/cast/serial.h +17 -0
  12. package/include/numkong/cast/skylake.h +67 -52
  13. package/include/numkong/cast.h +1 -0
  14. package/include/numkong/curved/smef64.h +82 -62
  15. package/include/numkong/dot/README.md +1 -0
  16. package/include/numkong/dot/haswell.h +92 -13
  17. package/include/numkong/dot/rvvbf16.h +1 -1
  18. package/include/numkong/dot/rvvhalf.h +1 -1
  19. package/include/numkong/dot/serial.h +15 -0
  20. package/include/numkong/dot/skylake.h +61 -14
  21. package/include/numkong/dot/sve.h +6 -5
  22. package/include/numkong/dot/svebfdot.h +2 -1
  23. package/include/numkong/dot/svehalf.h +6 -5
  24. package/include/numkong/dot/svesdot.h +3 -2
  25. package/include/numkong/dots/README.md +2 -0
  26. package/include/numkong/dots/graniteamx.h +1167 -0
  27. package/include/numkong/dots/haswell.h +28 -28
  28. package/include/numkong/dots/sapphireamx.h +1 -1
  29. package/include/numkong/dots/serial.h +33 -11
  30. package/include/numkong/dots/skylake.h +28 -23
  31. package/include/numkong/dots/sme.h +172 -140
  32. package/include/numkong/dots/smebi32.h +14 -11
  33. package/include/numkong/dots/smef64.h +31 -26
  34. package/include/numkong/dots.h +41 -3
  35. package/include/numkong/each/serial.h +39 -0
  36. package/include/numkong/geospatial/haswell.h +1 -1
  37. package/include/numkong/geospatial/neon.h +1 -1
  38. package/include/numkong/geospatial/serial.h +15 -4
  39. package/include/numkong/geospatial/skylake.h +1 -1
  40. package/include/numkong/maxsim/serial.h +15 -0
  41. package/include/numkong/maxsim/sme.h +34 -33
  42. package/include/numkong/mesh/README.md +50 -44
  43. package/include/numkong/mesh/genoa.h +462 -0
  44. package/include/numkong/mesh/haswell.h +806 -933
  45. package/include/numkong/mesh/neon.h +871 -943
  46. package/include/numkong/mesh/neonbfdot.h +382 -522
  47. package/include/numkong/mesh/neonfhm.h +676 -0
  48. package/include/numkong/mesh/rvv.h +404 -319
  49. package/include/numkong/mesh/serial.h +225 -161
  50. package/include/numkong/mesh/skylake.h +1029 -1585
  51. package/include/numkong/mesh/v128relaxed.h +403 -377
  52. package/include/numkong/mesh.h +38 -0
  53. package/include/numkong/reduce/neon.h +29 -0
  54. package/include/numkong/reduce/neonbfdot.h +2 -2
  55. package/include/numkong/reduce/neonfhm.h +4 -4
  56. package/include/numkong/reduce/serial.h +15 -1
  57. package/include/numkong/reduce/sve.h +52 -0
  58. package/include/numkong/reduce.h +4 -0
  59. package/include/numkong/set/sve.h +6 -5
  60. package/include/numkong/sets/smebi32.h +35 -30
  61. package/include/numkong/sparse/serial.h +17 -2
  62. package/include/numkong/sparse/sve2.h +3 -2
  63. package/include/numkong/spatial/genoa.h +0 -68
  64. package/include/numkong/spatial/haswell.h +98 -56
  65. package/include/numkong/spatial/serial.h +15 -0
  66. package/include/numkong/spatial/skylake.h +114 -54
  67. package/include/numkong/spatial/sve.h +7 -6
  68. package/include/numkong/spatial/svebfdot.h +7 -4
  69. package/include/numkong/spatial/svehalf.h +5 -4
  70. package/include/numkong/spatial/svesdot.h +9 -8
  71. package/include/numkong/spatial.h +0 -12
  72. package/include/numkong/spatials/graniteamx.h +301 -0
  73. package/include/numkong/spatials/serial.h +39 -0
  74. package/include/numkong/spatials/skylake.h +2 -2
  75. package/include/numkong/spatials/sme.h +391 -350
  76. package/include/numkong/spatials/smef64.h +79 -70
  77. package/include/numkong/spatials.h +54 -4
  78. package/include/numkong/tensor.hpp +107 -23
  79. package/include/numkong/types.h +59 -0
  80. package/javascript/dist/cjs/numkong.js +13 -0
  81. package/javascript/dist/esm/numkong.js +13 -0
  82. package/javascript/numkong.c +59 -14
  83. package/javascript/numkong.ts +13 -0
  84. package/package.json +7 -7
  85. package/probes/probe.js +2 -2
  86. package/wasm/numkong.wasm +0 -0
package/include/numkong/spatial/haswell.h
@@ -840,28 +840,37 @@ nk_angular_e3m2_haswell_cycle:
  }

  NK_PUBLIC void nk_sqeuclidean_e4m3_haswell(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
- __m256 distance_sq_f32x8 = _mm256_setzero_ps();
+ // E4M3 has no free widen shift, so we call the Giesen-based 8-lane cast helper
+ // twice per 16-lane iter and run with two F32 accumulators to break the FMA chain.
+ __m256 first_acc_f32x8 = _mm256_setzero_ps();
+ __m256 second_acc_f32x8 = _mm256_setzero_ps();
+ __m128i a_u8x16, b_u8x16;

  nk_sqeuclidean_e4m3_haswell_cycle:
- if (n < 8) {
+ if (n < 16) {
  nk_b128_vec_t a_vec, b_vec;
  nk_partial_load_b8x16_serial_(a, &a_vec, n);
  nk_partial_load_b8x16_serial_(b, &b_vec, n);
- __m256 a_f32x8 = nk_e4m3x8_to_f32x8_haswell_(a_vec.xmm);
- __m256 b_f32x8 = nk_e4m3x8_to_f32x8_haswell_(b_vec.xmm);
- __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
- distance_sq_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, distance_sq_f32x8);
+ a_u8x16 = a_vec.xmm;
+ b_u8x16 = b_vec.xmm;
+ n = 0;
  }
  else {
- __m256 a_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
- __m256 b_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
- __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
- distance_sq_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, distance_sq_f32x8);
- n -= 8, a += 8, b += 8;
- goto nk_sqeuclidean_e4m3_haswell_cycle;
+ a_u8x16 = _mm_loadu_si128((__m128i const *)a);
+ b_u8x16 = _mm_loadu_si128((__m128i const *)b);
+ a += 16, b += 16, n -= 16;
  }
+ __m256 a_low_f32x8 = nk_e4m3x8_to_f32x8_haswell_(a_u8x16);
+ __m256 a_high_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_unpackhi_epi64(a_u8x16, a_u8x16));
+ __m256 b_low_f32x8 = nk_e4m3x8_to_f32x8_haswell_(b_u8x16);
+ __m256 b_high_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_unpackhi_epi64(b_u8x16, b_u8x16));
+ __m256 diff_low_f32x8 = _mm256_sub_ps(a_low_f32x8, b_low_f32x8);
+ __m256 diff_high_f32x8 = _mm256_sub_ps(a_high_f32x8, b_high_f32x8);
+ first_acc_f32x8 = _mm256_fmadd_ps(diff_low_f32x8, diff_low_f32x8, first_acc_f32x8);
+ second_acc_f32x8 = _mm256_fmadd_ps(diff_high_f32x8, diff_high_f32x8, second_acc_f32x8);
+ if (n) goto nk_sqeuclidean_e4m3_haswell_cycle;

- *result = nk_reduce_add_f32x8_haswell_(distance_sq_f32x8);
+ *result = nk_reduce_add_f32x8_haswell_(_mm256_add_ps(first_acc_f32x8, second_acc_f32x8));
  }

  NK_PUBLIC void nk_euclidean_e4m3_haswell(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
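
The two-accumulator split above is the standard latency-hiding trick: with a single accumulator every FMA must wait for the previous one, so the loop runs at FMA latency rather than FMA throughput. A minimal standalone illustration of the same idea (hypothetical helper, not part of the package; compile with -mavx2 -mfma, n assumed to be a multiple of 16):

#include <immintrin.h>
#include <stddef.h>

/* Sum of squares with two independent FMA chains. One accumulator would
 * serialize the loop at FMA latency; two chains let iterations overlap. */
static float sum_squares_f32(float const *x, size_t n) {
    __m256 acc0 = _mm256_setzero_ps(), acc1 = _mm256_setzero_ps();
    for (size_t i = 0; i < n; i += 16) {
        __m256 lo = _mm256_loadu_ps(x + i);
        __m256 hi = _mm256_loadu_ps(x + i + 8);
        acc0 = _mm256_fmadd_ps(lo, lo, acc0); // chain 0
        acc1 = _mm256_fmadd_ps(hi, hi, acc1); // chain 1, independent of chain 0
    }
    __m256 acc = _mm256_add_ps(acc0, acc1);
    __m128 r = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
    r = _mm_hadd_ps(r, r);
    r = _mm_hadd_ps(r, r);
    return _mm_cvtss_f32(r);
}

On Haswell, FMA latency is 5 cycles with two FMA ports, so even more independent chains could be in flight; two already roughly doubles the throughput of the single-accumulator version.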
@@ -873,27 +882,33 @@ NK_PUBLIC void nk_angular_e4m3_haswell(nk_e4m3_t const *a, nk_e4m3_t const *b, n
  __m256 dot_product_f32x8 = _mm256_setzero_ps();
  __m256 a_norm_sq_f32x8 = _mm256_setzero_ps();
  __m256 b_norm_sq_f32x8 = _mm256_setzero_ps();
+ __m128i a_u8x16, b_u8x16;

  nk_angular_e4m3_haswell_cycle:
- if (n < 8) {
+ if (n < 16) {
  nk_b128_vec_t a_vec, b_vec;
  nk_partial_load_b8x16_serial_(a, &a_vec, n);
  nk_partial_load_b8x16_serial_(b, &b_vec, n);
- __m256 a_f32x8 = nk_e4m3x8_to_f32x8_haswell_(a_vec.xmm);
- __m256 b_f32x8 = nk_e4m3x8_to_f32x8_haswell_(b_vec.xmm);
- dot_product_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, dot_product_f32x8);
- a_norm_sq_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, a_norm_sq_f32x8);
- b_norm_sq_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, b_norm_sq_f32x8);
+ a_u8x16 = a_vec.xmm;
+ b_u8x16 = b_vec.xmm;
+ n = 0;
  }
  else {
- __m256 a_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
- __m256 b_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
- dot_product_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, dot_product_f32x8);
- a_norm_sq_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, a_norm_sq_f32x8);
- b_norm_sq_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, b_norm_sq_f32x8);
- n -= 8, a += 8, b += 8;
- goto nk_angular_e4m3_haswell_cycle;
- }
+ a_u8x16 = _mm_loadu_si128((__m128i const *)a);
+ b_u8x16 = _mm_loadu_si128((__m128i const *)b);
+ a += 16, b += 16, n -= 16;
+ }
+ __m256 a_low_f32x8 = nk_e4m3x8_to_f32x8_haswell_(a_u8x16);
+ __m256 a_high_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_unpackhi_epi64(a_u8x16, a_u8x16));
+ __m256 b_low_f32x8 = nk_e4m3x8_to_f32x8_haswell_(b_u8x16);
+ __m256 b_high_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_unpackhi_epi64(b_u8x16, b_u8x16));
+ dot_product_f32x8 = _mm256_fmadd_ps(a_low_f32x8, b_low_f32x8, dot_product_f32x8);
+ dot_product_f32x8 = _mm256_fmadd_ps(a_high_f32x8, b_high_f32x8, dot_product_f32x8);
+ a_norm_sq_f32x8 = _mm256_fmadd_ps(a_low_f32x8, a_low_f32x8, a_norm_sq_f32x8);
+ a_norm_sq_f32x8 = _mm256_fmadd_ps(a_high_f32x8, a_high_f32x8, a_norm_sq_f32x8);
+ b_norm_sq_f32x8 = _mm256_fmadd_ps(b_low_f32x8, b_low_f32x8, b_norm_sq_f32x8);
+ b_norm_sq_f32x8 = _mm256_fmadd_ps(b_high_f32x8, b_high_f32x8, b_norm_sq_f32x8);
+ if (n) goto nk_angular_e4m3_haswell_cycle;

  nk_f32_t dot_product_f32 = nk_reduce_add_f32x8_haswell_(dot_product_f32x8);
  nk_f32_t a_norm_sq_f32 = nk_reduce_add_f32x8_haswell_(a_norm_sq_f32x8);
@@ -902,28 +917,44 @@ nk_angular_e4m3_haswell_cycle:
  }

  NK_PUBLIC void nk_sqeuclidean_e5m2_haswell(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
- __m256 distance_sq_f32x8 = _mm256_setzero_ps();
+ // E5M2 shares F16's exponent bias (15): `byte << 8` equals the matching F16 encoding.
+ // `vpunpck*bw` against zero is the free widen+shift: zero byte in low half of each
+ // 16-bit lane, E5M2 byte in high half. Per-128-bit-lane scrambled; commutative sum
+ // reduction is invariant under that.
+ __m256 first_acc_f32x8 = _mm256_setzero_ps();
+ __m256 second_acc_f32x8 = _mm256_setzero_ps();
+ __m128i const zero_u8x16 = _mm_setzero_si128();
+ __m128i a_u8x16, b_u8x16;

  nk_sqeuclidean_e5m2_haswell_cycle:
- if (n < 8) {
+ if (n < 16) {
  nk_b128_vec_t a_vec, b_vec;
  nk_partial_load_b8x16_serial_(a, &a_vec, n);
  nk_partial_load_b8x16_serial_(b, &b_vec, n);
- __m256 a_f32x8 = nk_e5m2x8_to_f32x8_haswell_(a_vec.xmm);
- __m256 b_f32x8 = nk_e5m2x8_to_f32x8_haswell_(b_vec.xmm);
- __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
- distance_sq_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, distance_sq_f32x8);
+ a_u8x16 = a_vec.xmm;
+ b_u8x16 = b_vec.xmm;
+ n = 0;
  }
  else {
- __m256 a_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
- __m256 b_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
- __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
- distance_sq_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, distance_sq_f32x8);
- n -= 8, a += 8, b += 8;
- goto nk_sqeuclidean_e5m2_haswell_cycle;
- }
-
- *result = nk_reduce_add_f32x8_haswell_(distance_sq_f32x8);
+ a_u8x16 = _mm_loadu_si128((__m128i const *)a);
+ b_u8x16 = _mm_loadu_si128((__m128i const *)b);
+ a += 16, b += 16, n -= 16;
+ }
+ __m128i a_even_f16x8 = _mm_unpacklo_epi8(zero_u8x16, a_u8x16);
+ __m128i a_odd_f16x8 = _mm_unpackhi_epi8(zero_u8x16, a_u8x16);
+ __m128i b_even_f16x8 = _mm_unpacklo_epi8(zero_u8x16, b_u8x16);
+ __m128i b_odd_f16x8 = _mm_unpackhi_epi8(zero_u8x16, b_u8x16);
+ __m256 a_first_f32x8 = _mm256_cvtph_ps(a_even_f16x8);
+ __m256 a_second_f32x8 = _mm256_cvtph_ps(a_odd_f16x8);
+ __m256 b_first_f32x8 = _mm256_cvtph_ps(b_even_f16x8);
+ __m256 b_second_f32x8 = _mm256_cvtph_ps(b_odd_f16x8);
+ __m256 diff_first_f32x8 = _mm256_sub_ps(a_first_f32x8, b_first_f32x8);
+ __m256 diff_second_f32x8 = _mm256_sub_ps(a_second_f32x8, b_second_f32x8);
+ first_acc_f32x8 = _mm256_fmadd_ps(diff_first_f32x8, diff_first_f32x8, first_acc_f32x8);
+ second_acc_f32x8 = _mm256_fmadd_ps(diff_second_f32x8, diff_second_f32x8, second_acc_f32x8);
+ if (n) goto nk_sqeuclidean_e5m2_haswell_cycle;
+
+ *result = nk_reduce_add_f32x8_haswell_(_mm256_add_ps(first_acc_f32x8, second_acc_f32x8));
  }

  NK_PUBLIC void nk_euclidean_e5m2_haswell(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
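
The `byte << 8` identity the comment leans on is easy to verify exhaustively: E5M2 (1 sign, 5 exponent, 2 mantissa bits) and F16 (1-5-10) share the exponent width and the bias of 15, so shifting the byte left by 8 just pads the 2-bit mantissa with zeros. A self-contained check over all 256 byte patterns (illustrative code, not from the package):

#include <assert.h>
#include <math.h>
#include <stdint.h>

/* Decode an IEEE-style mini-float: 1 sign bit, `exp_bits` exponent bits,
 * `man_bits` mantissa bits, bias 2^(exp_bits-1) - 1. */
static double decode_minifloat(uint32_t bits, int exp_bits, int man_bits) {
    uint32_t sign = bits >> (exp_bits + man_bits);
    uint32_t exp = (bits >> man_bits) & ((1u << exp_bits) - 1);
    uint32_t man = bits & ((1u << man_bits) - 1);
    int bias = (1 << (exp_bits - 1)) - 1;
    double mag;
    if (exp == 0) mag = ldexp((double)man, 1 - bias - man_bits);            // subnormal
    else if (exp == ((1u << exp_bits) - 1)) mag = man ? NAN : INFINITY;     // Inf / NaN
    else mag = ldexp(1.0 + (double)man / (1 << man_bits), (int)exp - bias); // normal
    return sign ? -mag : mag;
}

int main(void) {
    for (uint32_t byte = 0; byte < 256; ++byte) {
        double e5m2 = decode_minifloat(byte, 5, 2);      // E5M2: 1-5-2, bias 15
        double f16 = decode_minifloat(byte << 8, 5, 10); // F16: 1-5-10, bias 15
        if (!isnan(e5m2)) assert(e5m2 == f16);           // NaNs compare unequal by design
    }
    return 0;
}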
@@ -935,27 +966,38 @@ NK_PUBLIC void nk_angular_e5m2_haswell(nk_e5m2_t const *a, nk_e5m2_t const *b, n
  __m256 dot_product_f32x8 = _mm256_setzero_ps();
  __m256 a_norm_sq_f32x8 = _mm256_setzero_ps();
  __m256 b_norm_sq_f32x8 = _mm256_setzero_ps();
+ __m128i const zero_u8x16 = _mm_setzero_si128();
+ __m128i a_u8x16, b_u8x16;

  nk_angular_e5m2_haswell_cycle:
- if (n < 8) {
+ if (n < 16) {
  nk_b128_vec_t a_vec, b_vec;
  nk_partial_load_b8x16_serial_(a, &a_vec, n);
  nk_partial_load_b8x16_serial_(b, &b_vec, n);
- __m256 a_f32x8 = nk_e5m2x8_to_f32x8_haswell_(a_vec.xmm);
- __m256 b_f32x8 = nk_e5m2x8_to_f32x8_haswell_(b_vec.xmm);
- dot_product_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, dot_product_f32x8);
- a_norm_sq_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, a_norm_sq_f32x8);
- b_norm_sq_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, b_norm_sq_f32x8);
+ a_u8x16 = a_vec.xmm;
+ b_u8x16 = b_vec.xmm;
+ n = 0;
  }
  else {
- __m256 a_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
- __m256 b_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
- dot_product_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, dot_product_f32x8);
- a_norm_sq_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, a_norm_sq_f32x8);
- b_norm_sq_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, b_norm_sq_f32x8);
- n -= 8, a += 8, b += 8;
- goto nk_angular_e5m2_haswell_cycle;
- }
+ a_u8x16 = _mm_loadu_si128((__m128i const *)a);
+ b_u8x16 = _mm_loadu_si128((__m128i const *)b);
+ a += 16, b += 16, n -= 16;
+ }
+ __m128i a_even_f16x8 = _mm_unpacklo_epi8(zero_u8x16, a_u8x16);
+ __m128i a_odd_f16x8 = _mm_unpackhi_epi8(zero_u8x16, a_u8x16);
+ __m128i b_even_f16x8 = _mm_unpacklo_epi8(zero_u8x16, b_u8x16);
+ __m128i b_odd_f16x8 = _mm_unpackhi_epi8(zero_u8x16, b_u8x16);
+ __m256 a_first_f32x8 = _mm256_cvtph_ps(a_even_f16x8);
+ __m256 a_second_f32x8 = _mm256_cvtph_ps(a_odd_f16x8);
+ __m256 b_first_f32x8 = _mm256_cvtph_ps(b_even_f16x8);
+ __m256 b_second_f32x8 = _mm256_cvtph_ps(b_odd_f16x8);
+ dot_product_f32x8 = _mm256_fmadd_ps(a_first_f32x8, b_first_f32x8, dot_product_f32x8);
+ dot_product_f32x8 = _mm256_fmadd_ps(a_second_f32x8, b_second_f32x8, dot_product_f32x8);
+ a_norm_sq_f32x8 = _mm256_fmadd_ps(a_first_f32x8, a_first_f32x8, a_norm_sq_f32x8);
+ a_norm_sq_f32x8 = _mm256_fmadd_ps(a_second_f32x8, a_second_f32x8, a_norm_sq_f32x8);
+ b_norm_sq_f32x8 = _mm256_fmadd_ps(b_first_f32x8, b_first_f32x8, b_norm_sq_f32x8);
+ b_norm_sq_f32x8 = _mm256_fmadd_ps(b_second_f32x8, b_second_f32x8, b_norm_sq_f32x8);
+ if (n) goto nk_angular_e5m2_haswell_cycle;

  nk_f32_t dot_product_f32 = nk_reduce_add_f32x8_haswell_(dot_product_f32x8);
  nk_f32_t a_norm_sq_f32 = nk_reduce_add_f32x8_haswell_(a_norm_sq_f32x8);
package/include/numkong/spatial/serial.h
@@ -108,6 +108,15 @@ extern "C" {
  } \
  }

+ /* Keep the serial instantiations below actually scalar, regardless of build type.
+  * See dots/serial.h for rationale. */
+ #if defined(__clang__)
+ #pragma clang attribute push(__attribute__((noinline)), apply_to = function)
+ #elif defined(__GNUC__)
+ #pragma GCC push_options
+ #pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
+ #endif
+
  nk_define_angular_(f64, f64, f64, nk_assign_from_to_, nk_f64_rsqrt_serial) // nk_angular_f64_serial
  nk_define_sqeuclidean_(f64, f64, f64, nk_assign_from_to_) // nk_sqeuclidean_f64_serial
  nk_define_euclidean_(f64, f64, f64, f64, nk_assign_from_to_, nk_f64_sqrt_serial) // nk_euclidean_f64_serial
@@ -340,6 +349,12 @@ NK_INTERNAL void nk_euclidean_through_u32_from_dot_serial_(nk_b128_vec_t dots, n
  }
  }

+ #if defined(__clang__)
+ #pragma clang attribute pop
+ #elif defined(__GNUC__)
+ #pragma GCC pop_options
+ #endif
+
  #if defined(__cplusplus)
  } // extern "C"
  #endif
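
The push/pop pair brackets every serial instantiation in the file, so the reference kernels stay scalar even at high optimization levels (the full rationale lives in dots/serial.h, which this diff doesn't include). In isolation the pattern looks like this (a minimal sketch, not package code):

#if defined(__clang__)
#pragma clang attribute push(__attribute__((noinline)), apply_to = function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-inline")
#endif

/* Under GCC the optimize pragma switches the loop/SLP vectorizers off for this
 * function; under Clang the pushed attribute pins it out-of-line. */
static float dot_f32_reference(float const *a, float const *b, unsigned long n) {
    float sum = 0.0f;
    for (unsigned long i = 0; i != n; ++i) sum += a[i] * b[i];
    return sum;
}

#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif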
package/include/numkong/spatial/skylake.h
@@ -346,28 +346,36 @@ nk_angular_f16_skylake_cycle:
  }

  NK_PUBLIC void nk_sqeuclidean_e4m3_skylake(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
- __m512 sum_f32x16 = _mm512_setzero_ps();
- __m128i a_e4m3_u8x16, b_e4m3_u8x16;
+ // E4M3 has no free widen shift (its 4-bit exponent doesn't line up with F16's 5-bit
+ // at bit 10), so we call the Giesen-based 16-lane cast helper twice per iter and
+ // run with two F32 accumulators to break the FMA dependency chain.
+ __m512 first_acc_f32x16 = _mm512_setzero_ps();
+ __m512 second_acc_f32x16 = _mm512_setzero_ps();
+ __m256i a_u8x32, b_u8x32;

  nk_sqeuclidean_e4m3_skylake_cycle:
- if (n < 16) {
- __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
- a_e4m3_u8x16 = _mm_maskz_loadu_epi8(mask, a);
- b_e4m3_u8x16 = _mm_maskz_loadu_epi8(mask, b);
+ if (n < 32) {
+ __mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, (unsigned int)n);
+ a_u8x32 = _mm256_maskz_loadu_epi8(mask, a);
+ b_u8x32 = _mm256_maskz_loadu_epi8(mask, b);
  n = 0;
  }
  else {
- a_e4m3_u8x16 = _mm_loadu_si128((__m128i const *)a);
- b_e4m3_u8x16 = _mm_loadu_si128((__m128i const *)b);
- a += 16, b += 16, n -= 16;
+ a_u8x32 = _mm256_loadu_si256((__m256i const *)a);
+ b_u8x32 = _mm256_loadu_si256((__m256i const *)b);
+ a += 32, b += 32, n -= 32;
  }
- __m512 a_f32x16 = nk_e4m3x16_to_f32x16_skylake_(a_e4m3_u8x16);
- __m512 b_f32x16 = nk_e4m3x16_to_f32x16_skylake_(b_e4m3_u8x16);
- __m512 diff_f32x16 = _mm512_sub_ps(a_f32x16, b_f32x16);
- sum_f32x16 = _mm512_fmadd_ps(diff_f32x16, diff_f32x16, sum_f32x16);
+ __m512 a_low_f32x16 = nk_e4m3x16_to_f32x16_skylake_(_mm256_castsi256_si128(a_u8x32));
+ __m512 a_high_f32x16 = nk_e4m3x16_to_f32x16_skylake_(_mm256_extracti128_si256(a_u8x32, 1));
+ __m512 b_low_f32x16 = nk_e4m3x16_to_f32x16_skylake_(_mm256_castsi256_si128(b_u8x32));
+ __m512 b_high_f32x16 = nk_e4m3x16_to_f32x16_skylake_(_mm256_extracti128_si256(b_u8x32, 1));
+ __m512 diff_low_f32x16 = _mm512_sub_ps(a_low_f32x16, b_low_f32x16);
+ __m512 diff_high_f32x16 = _mm512_sub_ps(a_high_f32x16, b_high_f32x16);
+ first_acc_f32x16 = _mm512_fmadd_ps(diff_low_f32x16, diff_low_f32x16, first_acc_f32x16);
+ second_acc_f32x16 = _mm512_fmadd_ps(diff_high_f32x16, diff_high_f32x16, second_acc_f32x16);
  if (n) goto nk_sqeuclidean_e4m3_skylake_cycle;

- *result = nk_reduce_add_f32x16_skylake_(sum_f32x16);
+ *result = nk_reduce_add_f32x16_skylake_(_mm512_add_ps(first_acc_f32x16, second_acc_f32x16));
  }

  NK_PUBLIC void nk_euclidean_e4m3_skylake(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
@@ -379,25 +387,30 @@ NK_PUBLIC void nk_angular_e4m3_skylake(nk_e4m3_t const *a, nk_e4m3_t const *b, n
  __m512 dot_f32x16 = _mm512_setzero_ps();
  __m512 a_norm_sq_f32x16 = _mm512_setzero_ps();
  __m512 b_norm_sq_f32x16 = _mm512_setzero_ps();
- __m128i a_e4m3_u8x16, b_e4m3_u8x16;
+ __m256i a_u8x32, b_u8x32;

  nk_angular_e4m3_skylake_cycle:
- if (n < 16) {
- __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
- a_e4m3_u8x16 = _mm_maskz_loadu_epi8(mask, a);
- b_e4m3_u8x16 = _mm_maskz_loadu_epi8(mask, b);
+ if (n < 32) {
+ __mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, (unsigned int)n);
+ a_u8x32 = _mm256_maskz_loadu_epi8(mask, a);
+ b_u8x32 = _mm256_maskz_loadu_epi8(mask, b);
  n = 0;
  }
  else {
- a_e4m3_u8x16 = _mm_loadu_si128((__m128i const *)a);
- b_e4m3_u8x16 = _mm_loadu_si128((__m128i const *)b);
- a += 16, b += 16, n -= 16;
+ a_u8x32 = _mm256_loadu_si256((__m256i const *)a);
+ b_u8x32 = _mm256_loadu_si256((__m256i const *)b);
+ a += 32, b += 32, n -= 32;
  }
- __m512 a_f32x16 = nk_e4m3x16_to_f32x16_skylake_(a_e4m3_u8x16);
- __m512 b_f32x16 = nk_e4m3x16_to_f32x16_skylake_(b_e4m3_u8x16);
- dot_f32x16 = _mm512_fmadd_ps(a_f32x16, b_f32x16, dot_f32x16);
- a_norm_sq_f32x16 = _mm512_fmadd_ps(a_f32x16, a_f32x16, a_norm_sq_f32x16);
- b_norm_sq_f32x16 = _mm512_fmadd_ps(b_f32x16, b_f32x16, b_norm_sq_f32x16);
+ __m512 a_low_f32x16 = nk_e4m3x16_to_f32x16_skylake_(_mm256_castsi256_si128(a_u8x32));
+ __m512 a_high_f32x16 = nk_e4m3x16_to_f32x16_skylake_(_mm256_extracti128_si256(a_u8x32, 1));
+ __m512 b_low_f32x16 = nk_e4m3x16_to_f32x16_skylake_(_mm256_castsi256_si128(b_u8x32));
+ __m512 b_high_f32x16 = nk_e4m3x16_to_f32x16_skylake_(_mm256_extracti128_si256(b_u8x32, 1));
+ dot_f32x16 = _mm512_fmadd_ps(a_low_f32x16, b_low_f32x16, dot_f32x16);
+ dot_f32x16 = _mm512_fmadd_ps(a_high_f32x16, b_high_f32x16, dot_f32x16);
+ a_norm_sq_f32x16 = _mm512_fmadd_ps(a_low_f32x16, a_low_f32x16, a_norm_sq_f32x16);
+ a_norm_sq_f32x16 = _mm512_fmadd_ps(a_high_f32x16, a_high_f32x16, a_norm_sq_f32x16);
+ b_norm_sq_f32x16 = _mm512_fmadd_ps(b_low_f32x16, b_low_f32x16, b_norm_sq_f32x16);
+ b_norm_sq_f32x16 = _mm512_fmadd_ps(b_high_f32x16, b_high_f32x16, b_norm_sq_f32x16);
  if (n) goto nk_angular_e4m3_skylake_cycle;

  nk_f32_t dot_f32 = nk_reduce_add_f32x16_skylake_(dot_f32x16);
@@ -407,28 +420,53 @@ nk_angular_e4m3_skylake_cycle:
  }

  NK_PUBLIC void nk_sqeuclidean_e5m2_skylake(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
- __m512 sum_f32x16 = _mm512_setzero_ps();
- __m128i a_e5m2_u8x16, b_e5m2_u8x16;
+ // E5M2 shares F16's exponent bias (15): `byte << 8` equals the matching F16 bit-pattern
+ // for normals, subnormals, zero, Inf, and NaN. We expose that shift for free by unpacking
+ // against zero — the zero byte lands in the low half of each 16-bit lane, the E5M2 byte
+ // in the high half. `vpunpck*bw` is per-128-bit-lane so the F32 outputs are lane-scrambled
+ // across 512 bits, but the commutative sum reduction is invariant under that.
+ __m512 first_acc_f32x16 = _mm512_setzero_ps();
+ __m512 second_acc_f32x16 = _mm512_setzero_ps();
+ __m512i const zero_u8x64 = _mm512_setzero_si512();
+ __m512i a_u8x64, b_u8x64;

  nk_sqeuclidean_e5m2_skylake_cycle:
- if (n < 16) {
- __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
- a_e5m2_u8x16 = _mm_maskz_loadu_epi8(mask, a);
- b_e5m2_u8x16 = _mm_maskz_loadu_epi8(mask, b);
+ if (n < 64) {
+ __mmask64 mask = _bzhi_u64(0xFFFFFFFFFFFFFFFFULL, (unsigned int)n);
+ a_u8x64 = _mm512_maskz_loadu_epi8(mask, a);
+ b_u8x64 = _mm512_maskz_loadu_epi8(mask, b);
  n = 0;
  }
  else {
- a_e5m2_u8x16 = _mm_loadu_si128((__m128i const *)a);
- b_e5m2_u8x16 = _mm_loadu_si128((__m128i const *)b);
- a += 16, b += 16, n -= 16;
+ a_u8x64 = _mm512_loadu_si512((__m512i const *)a);
+ b_u8x64 = _mm512_loadu_si512((__m512i const *)b);
+ a += 64, b += 64, n -= 64;
  }
- __m512 a_f32x16 = nk_e5m2x16_to_f32x16_skylake_(a_e5m2_u8x16);
- __m512 b_f32x16 = nk_e5m2x16_to_f32x16_skylake_(b_e5m2_u8x16);
- __m512 diff_f32x16 = _mm512_sub_ps(a_f32x16, b_f32x16);
- sum_f32x16 = _mm512_fmadd_ps(diff_f32x16, diff_f32x16, sum_f32x16);
+ __m512i a_even_f16x32 = _mm512_unpacklo_epi8(zero_u8x64, a_u8x64);
+ __m512i a_odd_f16x32 = _mm512_unpackhi_epi8(zero_u8x64, a_u8x64);
+ __m512i b_even_f16x32 = _mm512_unpacklo_epi8(zero_u8x64, b_u8x64);
+ __m512i b_odd_f16x32 = _mm512_unpackhi_epi8(zero_u8x64, b_u8x64);
+
+ __m512 a_first_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(a_even_f16x32));
+ __m512 a_second_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a_even_f16x32, 1));
+ __m512 a_third_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(a_odd_f16x32));
+ __m512 a_fourth_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a_odd_f16x32, 1));
+ __m512 b_first_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(b_even_f16x32));
+ __m512 b_second_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b_even_f16x32, 1));
+ __m512 b_third_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(b_odd_f16x32));
+ __m512 b_fourth_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b_odd_f16x32, 1));
+
+ __m512 diff_first_f32x16 = _mm512_sub_ps(a_first_f32x16, b_first_f32x16);
+ __m512 diff_second_f32x16 = _mm512_sub_ps(a_second_f32x16, b_second_f32x16);
+ __m512 diff_third_f32x16 = _mm512_sub_ps(a_third_f32x16, b_third_f32x16);
+ __m512 diff_fourth_f32x16 = _mm512_sub_ps(a_fourth_f32x16, b_fourth_f32x16);
+ first_acc_f32x16 = _mm512_fmadd_ps(diff_first_f32x16, diff_first_f32x16, first_acc_f32x16);
+ second_acc_f32x16 = _mm512_fmadd_ps(diff_second_f32x16, diff_second_f32x16, second_acc_f32x16);
+ first_acc_f32x16 = _mm512_fmadd_ps(diff_third_f32x16, diff_third_f32x16, first_acc_f32x16);
+ second_acc_f32x16 = _mm512_fmadd_ps(diff_fourth_f32x16, diff_fourth_f32x16, second_acc_f32x16);
  if (n) goto nk_sqeuclidean_e5m2_skylake_cycle;

- *result = nk_reduce_add_f32x16_skylake_(sum_f32x16);
+ *result = nk_reduce_add_f32x16_skylake_(_mm512_add_ps(first_acc_f32x16, second_acc_f32x16));
  }

  NK_PUBLIC void nk_euclidean_e5m2_skylake(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
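
Tail handling in these AVX-512 kernels never drops to scalar code: `_bzhi_u64(~0ull, n)` zeroes all mask bits at position `n` and above, and the zero-masked load then pulls in exactly `n` bytes. The same idiom in isolation (hypothetical helper, not from the package; assumes AVX-512F + AVX-512BW and BMI2):

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Sum n bytes with one masked AVX-512 loop; the tail reuses the main body. */
static uint64_t sum_u8_avx512(uint8_t const *data, size_t n) {
    __m512i acc = _mm512_setzero_si512();
    while (n) {
        __m512i chunk;
        if (n < 64) {
            // Lowest n bits set -> load n bytes, zero the rest
            __mmask64 mask = _bzhi_u64(0xFFFFFFFFFFFFFFFFULL, (unsigned int)n);
            chunk = _mm512_maskz_loadu_epi8(mask, data);
            n = 0;
        }
        else {
            chunk = _mm512_loadu_si512((__m512i const *)data);
            data += 64, n -= 64;
        }
        // Sum each 8-byte group against zero into the eight 64-bit lanes
        acc = _mm512_add_epi64(acc, _mm512_sad_epu8(chunk, _mm512_setzero_si512()));
    }
    return (uint64_t)_mm512_reduce_add_epi64(acc);
}

Because masked-out lanes read as zero, they contribute nothing to the accumulators — the same property the distance kernels above rely on.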
@@ -440,25 +478,47 @@ NK_PUBLIC void nk_angular_e5m2_skylake(nk_e5m2_t const *a, nk_e5m2_t const *b, n
  __m512 dot_f32x16 = _mm512_setzero_ps();
  __m512 a_norm_sq_f32x16 = _mm512_setzero_ps();
  __m512 b_norm_sq_f32x16 = _mm512_setzero_ps();
- __m128i a_e5m2_u8x16, b_e5m2_u8x16;
+ __m512i const zero_u8x64 = _mm512_setzero_si512();
+ __m512i a_u8x64, b_u8x64;

  nk_angular_e5m2_skylake_cycle:
- if (n < 16) {
- __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n);
- a_e5m2_u8x16 = _mm_maskz_loadu_epi8(mask, a);
- b_e5m2_u8x16 = _mm_maskz_loadu_epi8(mask, b);
+ if (n < 64) {
+ __mmask64 mask = _bzhi_u64(0xFFFFFFFFFFFFFFFFULL, (unsigned int)n);
+ a_u8x64 = _mm512_maskz_loadu_epi8(mask, a);
+ b_u8x64 = _mm512_maskz_loadu_epi8(mask, b);
  n = 0;
  }
  else {
- a_e5m2_u8x16 = _mm_loadu_si128((__m128i const *)a);
- b_e5m2_u8x16 = _mm_loadu_si128((__m128i const *)b);
- a += 16, b += 16, n -= 16;
+ a_u8x64 = _mm512_loadu_si512((__m512i const *)a);
+ b_u8x64 = _mm512_loadu_si512((__m512i const *)b);
+ a += 64, b += 64, n -= 64;
  }
- __m512 a_f32x16 = nk_e5m2x16_to_f32x16_skylake_(a_e5m2_u8x16);
- __m512 b_f32x16 = nk_e5m2x16_to_f32x16_skylake_(b_e5m2_u8x16);
- dot_f32x16 = _mm512_fmadd_ps(a_f32x16, b_f32x16, dot_f32x16);
- a_norm_sq_f32x16 = _mm512_fmadd_ps(a_f32x16, a_f32x16, a_norm_sq_f32x16);
- b_norm_sq_f32x16 = _mm512_fmadd_ps(b_f32x16, b_f32x16, b_norm_sq_f32x16);
+ __m512i a_even_f16x32 = _mm512_unpacklo_epi8(zero_u8x64, a_u8x64);
+ __m512i a_odd_f16x32 = _mm512_unpackhi_epi8(zero_u8x64, a_u8x64);
+ __m512i b_even_f16x32 = _mm512_unpacklo_epi8(zero_u8x64, b_u8x64);
+ __m512i b_odd_f16x32 = _mm512_unpackhi_epi8(zero_u8x64, b_u8x64);
+
+ __m512 a_first_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(a_even_f16x32));
+ __m512 a_second_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a_even_f16x32, 1));
+ __m512 a_third_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(a_odd_f16x32));
+ __m512 a_fourth_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a_odd_f16x32, 1));
+ __m512 b_first_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(b_even_f16x32));
+ __m512 b_second_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b_even_f16x32, 1));
+ __m512 b_third_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(b_odd_f16x32));
+ __m512 b_fourth_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b_odd_f16x32, 1));
+
+ dot_f32x16 = _mm512_fmadd_ps(a_first_f32x16, b_first_f32x16, dot_f32x16);
+ dot_f32x16 = _mm512_fmadd_ps(a_second_f32x16, b_second_f32x16, dot_f32x16);
+ dot_f32x16 = _mm512_fmadd_ps(a_third_f32x16, b_third_f32x16, dot_f32x16);
+ dot_f32x16 = _mm512_fmadd_ps(a_fourth_f32x16, b_fourth_f32x16, dot_f32x16);
+ a_norm_sq_f32x16 = _mm512_fmadd_ps(a_first_f32x16, a_first_f32x16, a_norm_sq_f32x16);
+ a_norm_sq_f32x16 = _mm512_fmadd_ps(a_second_f32x16, a_second_f32x16, a_norm_sq_f32x16);
+ a_norm_sq_f32x16 = _mm512_fmadd_ps(a_third_f32x16, a_third_f32x16, a_norm_sq_f32x16);
+ a_norm_sq_f32x16 = _mm512_fmadd_ps(a_fourth_f32x16, a_fourth_f32x16, a_norm_sq_f32x16);
+ b_norm_sq_f32x16 = _mm512_fmadd_ps(b_first_f32x16, b_first_f32x16, b_norm_sq_f32x16);
+ b_norm_sq_f32x16 = _mm512_fmadd_ps(b_second_f32x16, b_second_f32x16, b_norm_sq_f32x16);
+ b_norm_sq_f32x16 = _mm512_fmadd_ps(b_third_f32x16, b_third_f32x16, b_norm_sq_f32x16);
+ b_norm_sq_f32x16 = _mm512_fmadd_ps(b_fourth_f32x16, b_fourth_f32x16, b_norm_sq_f32x16);
  if (n) goto nk_angular_e5m2_skylake_cycle;

  nk_f32_t dot_f32 = nk_reduce_add_f32x16_skylake_(dot_f32x16);
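
The "lane-scrambled" caveat in the comments is worth seeing once: AVX-512 `vpunpcklbw`/`vpunpckhbw` interleave within each 128-bit lane, not across the full register, so the converted F32 values come out in a fixed permutation of the input order. A small demo (illustrative only, not package code):

#include <immintrin.h>
#include <stdio.h>

/* vpunpcklbw against zero picks bytes 0..7, 16..23, 32..39, 48..55 of the
 * source and puts each in the high half of a 16-bit lane -- a permutation
 * of the input, which the commutative sum reductions above don't notice. */
int main(void) {
    unsigned char src[64];
    for (int i = 0; i < 64; ++i) src[i] = (unsigned char)i;
    __m512i bytes = _mm512_loadu_si512(src);
    __m512i lo = _mm512_unpacklo_epi8(_mm512_setzero_si512(), bytes);
    unsigned short out[32];
    _mm512_storeu_si512(out, lo);
    for (int i = 0; i < 32; ++i) printf("%d ", out[i] >> 8); // 0..7 16..23 32..39 48..55
    printf("\n");
    return 0;
}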
package/include/numkong/spatial/sve.h
@@ -36,6 +36,7 @@
  #if NK_TARGET_SVE

  #include "numkong/types.h"
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
  #include "numkong/spatial/neon.h" // `nk_f64_sqrt_neon`
  #include "numkong/dot/sve.h" // `nk_dot_stable_sum_f64_sve_`

@@ -113,7 +114,7 @@ NK_PUBLIC void nk_sqeuclidean_f32_sve(nk_f32_t const *a, nk_f32_t const *b, nk_s
  svfloat64_t diff_odd_f64x = svsub_f64_x(pred_odd_b64x, a_odd_f64x, b_odd_f64x);
  dist_sq_f64x = svmla_f64_m(pred_odd_b64x, dist_sq_f64x, diff_odd_f64x, diff_odd_f64x);
  }
- nk_f64_t dist_sq_f64 = svaddv_f64(svptrue_b64(), dist_sq_f64x);
+ nk_f64_t dist_sq_f64 = nk_svaddv_f64_(svptrue_b64(), dist_sq_f64x);
  *result = dist_sq_f64;
  }

@@ -149,9 +150,9 @@ NK_PUBLIC void nk_angular_f32_sve(nk_f32_t const *a, nk_f32_t const *b, nk_size_
  b2_f64x = svmla_f64_m(pred_odd_b64x, b2_f64x, b_odd_f64x, b_odd_f64x);
  }

- nk_f64_t ab_f64 = svaddv_f64(svptrue_b64(), ab_f64x);
- nk_f64_t a2_f64 = svaddv_f64(svptrue_b64(), a2_f64x);
- nk_f64_t b2_f64 = svaddv_f64(svptrue_b64(), b2_f64x);
+ nk_f64_t ab_f64 = nk_svaddv_f64_(svptrue_b64(), ab_f64x);
+ nk_f64_t a2_f64 = nk_svaddv_f64_(svptrue_b64(), a2_f64x);
+ nk_f64_t b2_f64 = nk_svaddv_f64_(svptrue_b64(), b2_f64x);
  *result = nk_angular_normalize_f64_neon_(ab_f64, a2_f64, b2_f64);
  }

@@ -225,8 +226,8 @@ NK_PUBLIC void nk_angular_f64_sve(nk_f64_t const *a, nk_f64_t const *b, nk_size_
  } while (i < n);

  nk_f64_t ab_f64 = nk_dot_stable_sum_f64_sve_(predicate_all_b64x, ab_sum_f64x, ab_compensation_f64x);
- nk_f64_t a2_f64 = svaddv_f64(predicate_all_b64x, a2_f64x);
- nk_f64_t b2_f64 = svaddv_f64(predicate_all_b64x, b2_f64x);
+ nk_f64_t a2_f64 = nk_svaddv_f64_(predicate_all_b64x, a2_f64x);
+ nk_f64_t b2_f64 = nk_svaddv_f64_(predicate_all_b64x, b2_f64x);
  *result = nk_angular_normalize_f64_neon_(ab_f64, a2_f64, b2_f64);
  }

package/include/numkong/spatial/svebfdot.h
@@ -36,6 +36,7 @@
  #if NK_TARGET_SVEBFDOT

  #include "numkong/types.h"
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f32_`
  #include "numkong/spatial/neon.h" // `nk_f32_sqrt_neon`

  #if defined(__cplusplus)
@@ -75,7 +76,9 @@ NK_PUBLIC void nk_sqeuclidean_bf16_svebfdot(nk_bf16_t const *a_enum, nk_bf16_t c
  d2_high_f32x = svmla_f32_m(predicate_high_b32x, d2_high_f32x, a_minus_b_high_f32x, a_minus_b_high_f32x);
  i += svcnth();
  } while (i < n);
- nk_f32_t d2 = svaddv_f32(svptrue_b32(), d2_low_f32x) + svaddv_f32(svptrue_b32(), d2_high_f32x);
+ nk_f32_t d2_low = nk_svaddv_f32_(svptrue_b32(), d2_low_f32x);
+ nk_f32_t d2_high = nk_svaddv_f32_(svptrue_b32(), d2_high_f32x);
+ nk_f32_t d2 = d2_low + d2_high;
  *result = d2;
  }
  NK_PUBLIC void nk_euclidean_bf16_svebfdot(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
@@ -101,9 +104,9 @@ NK_PUBLIC void nk_angular_bf16_svebfdot(nk_bf16_t const *a_enum, nk_bf16_t const
  i += svcnth();
  } while (i < n);

- nk_f32_t ab = svaddv_f32(svptrue_b32(), ab_f32x);
- nk_f32_t a2 = svaddv_f32(svptrue_b32(), a2_f32x);
- nk_f32_t b2 = svaddv_f32(svptrue_b32(), b2_f32x);
+ nk_f32_t ab = nk_svaddv_f32_(svptrue_b32(), ab_f32x);
+ nk_f32_t a2 = nk_svaddv_f32_(svptrue_b32(), a2_f32x);
+ nk_f32_t b2 = nk_svaddv_f32_(svptrue_b32(), b2_f32x);
  *result = nk_angular_normalize_f32_neon_(ab, a2, b2);
  }

package/include/numkong/spatial/svehalf.h
@@ -32,6 +32,7 @@
  #if NK_TARGET_SVEHALF

  #include "numkong/types.h"
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f32_`
  #include "numkong/spatial/neon.h" // `nk_f32_sqrt_neon`

  #if defined(__cplusplus)
@@ -74,7 +75,7 @@ NK_PUBLIC void nk_sqeuclidean_f16_svehalf(nk_f16_t const *a_enum, nk_f16_t const

  i += svcnth();
  } while (i < n);
- *result = svaddv_f32(svptrue_b32(), d2_f32x);
+ *result = nk_svaddv_f32_(svptrue_b32(), d2_f32x);
  }

  NK_PUBLIC void nk_euclidean_f16_svehalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
@@ -114,9 +115,9 @@ NK_PUBLIC void nk_angular_f16_svehalf(nk_f16_t const *a_enum, nk_f16_t const *b_
  i += svcnth();
  } while (i < n);

- nk_f32_t ab_f32 = svaddv_f32(svptrue_b32(), ab_f32x);
- nk_f32_t a2_f32 = svaddv_f32(svptrue_b32(), a2_f32x);
- nk_f32_t b2_f32 = svaddv_f32(svptrue_b32(), b2_f32x);
+ nk_f32_t ab_f32 = nk_svaddv_f32_(svptrue_b32(), ab_f32x);
+ nk_f32_t a2_f32 = nk_svaddv_f32_(svptrue_b32(), a2_f32x);
+ nk_f32_t b2_f32 = nk_svaddv_f32_(svptrue_b32(), b2_f32x);
  *result = nk_angular_normalize_f32_neon_(ab_f32, a2_f32, b2_f32);
  }

package/include/numkong/spatial/svesdot.h
@@ -34,6 +34,7 @@
  #if NK_TARGET_SVESDOT

  #include "numkong/types.h"
+ #include "numkong/reduce/sve.h" // `nk_svaddv_u32_`, `nk_svaddv_s32_`
  #include "numkong/spatial/neon.h" // `nk_angular_normalize_f32_neon_`, `nk_f32_sqrt_neon`

  #if defined(__cplusplus)
@@ -58,7 +59,7 @@ NK_PUBLIC void nk_sqeuclidean_i8_svesdot(nk_i8_t const *a, nk_i8_t const *b, nk_
  distance_sq_u32x = svdot_u32(distance_sq_u32x, diff_u8x, diff_u8x);
  i += svcntb();
  } while (i < n);
- *result = (nk_u32_t)svaddv_u32(svptrue_b32(), distance_sq_u32x);
+ *result = (nk_u32_t)nk_svaddv_u32_(svptrue_b32(), distance_sq_u32x);
  }
  NK_PUBLIC void nk_euclidean_i8_svesdot(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result) {
  nk_u32_t distance_sq_u32;
@@ -81,9 +82,9 @@ NK_PUBLIC void nk_angular_i8_svesdot(nk_i8_t const *a, nk_i8_t const *b, nk_size
  i += svcntb();
  } while (i < n);

- nk_i32_t ab = (nk_i32_t)svaddv_s32(svptrue_b32(), ab_i32x);
- nk_i32_t a2 = (nk_i32_t)svaddv_s32(svptrue_b32(), a2_i32x);
- nk_i32_t b2 = (nk_i32_t)svaddv_s32(svptrue_b32(), b2_i32x);
+ nk_i32_t ab = (nk_i32_t)nk_svaddv_s32_(svptrue_b32(), ab_i32x);
+ nk_i32_t a2 = (nk_i32_t)nk_svaddv_s32_(svptrue_b32(), a2_i32x);
+ nk_i32_t b2 = (nk_i32_t)nk_svaddv_s32_(svptrue_b32(), b2_i32x);
  *result = nk_angular_normalize_f32_neon_((nk_f32_t)ab, (nk_f32_t)a2, (nk_f32_t)b2);
  }

@@ -98,7 +99,7 @@ NK_PUBLIC void nk_sqeuclidean_u8_svesdot(nk_u8_t const *a, nk_u8_t const *b, nk_
  distance_sq_u32x = svdot_u32(distance_sq_u32x, diff_u8x, diff_u8x);
  i += svcntb();
  } while (i < n);
- *result = (nk_u32_t)svaddv_u32(svptrue_b32(), distance_sq_u32x);
+ *result = (nk_u32_t)nk_svaddv_u32_(svptrue_b32(), distance_sq_u32x);
  }
  NK_PUBLIC void nk_euclidean_u8_svesdot(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result) {
  nk_u32_t distance_sq_u32;
@@ -121,9 +122,9 @@ NK_PUBLIC void nk_angular_u8_svesdot(nk_u8_t const *a, nk_u8_t const *b, nk_size
  i += svcntb();
  } while (i < n);

- nk_u32_t ab = (nk_u32_t)svaddv_u32(svptrue_b32(), ab_u32x);
- nk_u32_t a2 = (nk_u32_t)svaddv_u32(svptrue_b32(), a2_u32x);
- nk_u32_t b2 = (nk_u32_t)svaddv_u32(svptrue_b32(), b2_u32x);
+ nk_u32_t ab = (nk_u32_t)nk_svaddv_u32_(svptrue_b32(), ab_u32x);
+ nk_u32_t a2 = (nk_u32_t)nk_svaddv_u32_(svptrue_b32(), a2_u32x);
+ nk_u32_t b2 = (nk_u32_t)nk_svaddv_u32_(svptrue_b32(), b2_u32x);
  *result = nk_angular_normalize_f32_neon_((nk_f32_t)ab, (nk_f32_t)a2, (nk_f32_t)b2);
  }
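
All four SVE spatial headers now route horizontal reductions through `nk_svaddv_*_` wrappers from the new reduce/sve.h (+52 lines in the file list, not shown in this diff). The call sites pin down the signatures: each wrapper mirrors the `svaddv_*` intrinsic it replaces, including the widened 64-bit return of the integer forms. A pass-through sketch consistent with those call sites (hypothetical — the real wrappers presumably exist to control how the reduction is performed, which this diff doesn't reveal):

#include <arm_sve.h>
#include <stdint.h>

/* Hypothetical pass-through bodies: only the names and signatures are implied
 * by the call sites above; the package's actual definitions may differ. */
static inline float nk_svaddv_f32_(svbool_t pred, svfloat32_t x) { return svaddv_f32(pred, x); }
static inline double nk_svaddv_f64_(svbool_t pred, svfloat64_t x) { return svaddv_f64(pred, x); }
static inline uint64_t nk_svaddv_u32_(svbool_t pred, svuint32_t x) { return svaddv_u32(pred, x); }
static inline int64_t nk_svaddv_s32_(svbool_t pred, svint32_t x) { return svaddv_s32(pred, x); }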