numkong 7.5.0 → 7.6.0

Files changed (48)
  1. package/binding.gyp +18 -0
  2. package/c/dispatch_e5m2.c +23 -3
  3. package/include/numkong/capabilities.h +1 -1
  4. package/include/numkong/cast/README.md +3 -0
  5. package/include/numkong/cast/haswell.h +28 -64
  6. package/include/numkong/cast/serial.h +17 -0
  7. package/include/numkong/cast/skylake.h +67 -52
  8. package/include/numkong/cast.h +1 -0
  9. package/include/numkong/dot/README.md +1 -0
  10. package/include/numkong/dot/haswell.h +92 -13
  11. package/include/numkong/dot/serial.h +15 -0
  12. package/include/numkong/dot/skylake.h +61 -14
  13. package/include/numkong/dots/README.md +2 -0
  14. package/include/numkong/dots/graniteamx.h +434 -0
  15. package/include/numkong/dots/haswell.h +28 -28
  16. package/include/numkong/dots/sapphireamx.h +1 -1
  17. package/include/numkong/dots/serial.h +23 -8
  18. package/include/numkong/dots/skylake.h +28 -23
  19. package/include/numkong/dots.h +12 -0
  20. package/include/numkong/each/serial.h +18 -1
  21. package/include/numkong/geospatial/serial.h +14 -3
  22. package/include/numkong/maxsim/serial.h +15 -0
  23. package/include/numkong/mesh/README.md +50 -44
  24. package/include/numkong/mesh/genoa.h +462 -0
  25. package/include/numkong/mesh/haswell.h +806 -933
  26. package/include/numkong/mesh/neon.h +871 -943
  27. package/include/numkong/mesh/neonbfdot.h +382 -522
  28. package/include/numkong/mesh/neonfhm.h +676 -0
  29. package/include/numkong/mesh/rvv.h +404 -319
  30. package/include/numkong/mesh/serial.h +204 -162
  31. package/include/numkong/mesh/skylake.h +1029 -1585
  32. package/include/numkong/mesh/v128relaxed.h +403 -377
  33. package/include/numkong/mesh.h +38 -0
  34. package/include/numkong/reduce/serial.h +15 -1
  35. package/include/numkong/sparse/serial.h +17 -2
  36. package/include/numkong/spatial/genoa.h +0 -68
  37. package/include/numkong/spatial/haswell.h +98 -56
  38. package/include/numkong/spatial/serial.h +15 -0
  39. package/include/numkong/spatial/skylake.h +114 -54
  40. package/include/numkong/spatial.h +0 -12
  41. package/include/numkong/spatials/graniteamx.h +128 -0
  42. package/include/numkong/spatials/serial.h +18 -1
  43. package/include/numkong/spatials/skylake.h +2 -2
  44. package/include/numkong/spatials.h +17 -0
  45. package/include/numkong/tensor.hpp +107 -23
  46. package/javascript/numkong.c +3 -2
  47. package/package.json +7 -7
  48. package/wasm/numkong.wasm +0 -0
package/include/numkong/mesh.h
@@ -266,6 +266,20 @@ NK_PUBLIC void nk_umeyama_bf16_skylake(nk_bf16_t const *a, nk_bf16_t const *b, n
                                        nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
 #endif // NK_TARGET_SKYLAKE
 
+/* SIMD-powered backends for AVX512-BF16 CPUs of AMD Genoa / Intel Sapphire Rapids generation and newer.
+ */
+#if NK_TARGET_GENOA
+/** @copydoc nk_rmsd_bf16 */
+NK_PUBLIC void nk_rmsd_bf16_genoa(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+/** @copydoc nk_kabsch_bf16 */
+NK_PUBLIC void nk_kabsch_bf16_genoa(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                    nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+/** @copydoc nk_umeyama_bf16 */
+NK_PUBLIC void nk_umeyama_bf16_genoa(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                     nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+#endif // NK_TARGET_GENOA
+
 /* SIMD-powered backends for AVX2 CPUs of Haswell generation and newer.
  */
 #if NK_TARGET_HASWELL
@@ -357,6 +371,20 @@ NK_PUBLIC void nk_umeyama_bf16_neonbfdot(nk_bf16_t const *a, nk_bf16_t const *b,
                                          nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
 #endif // NK_TARGET_NEONBFDOT
 
+/* SIMD-powered backends for Arm NEON FHM (FP16 widening FMA) CPUs.
+ */
+#if NK_TARGET_NEONFHM
+/** @copydoc nk_rmsd_f16 */
+NK_PUBLIC void nk_rmsd_f16_neonfhm(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                   nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+/** @copydoc nk_kabsch_f16 */
+NK_PUBLIC void nk_kabsch_f16_neonfhm(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                     nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+/** @copydoc nk_umeyama_f16 */
+NK_PUBLIC void nk_umeyama_f16_neonfhm(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                      nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+#endif // NK_TARGET_NEONFHM
+
 #if NK_TARGET_RVV
 /** @copydoc nk_rmsd_f32 */
 NK_PUBLIC void nk_rmsd_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
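
For readers unfamiliar with the new NEONFHM target: "FHM" refers to Arm's FMLAL/FMLAL2 instructions, which multiply f16 lanes and accumulate straight into f32 lanes in a single step. A minimal illustrative sketch using standard ACLE intrinsics (compile with -march=armv8.2-a+fp16fml; the helper name is ours, not part of the package):

#include <arm_neon.h>

// Widening f16 dot-product step: products are formed and accumulated in f32,
// so intermediate results never round to f16 precision.
static inline float32x4_t dot_step_f16x8(float32x4_t acc, float16x8_t a, float16x8_t b) {
    acc = vfmlalq_low_f16(acc, a, b);  // acc += (f32)a[0..3] * (f32)b[0..3]
    acc = vfmlalq_high_f16(acc, a, b); // acc += (f32)a[4..7] * (f32)b[4..7]
    return acc;
}
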
@@ -454,8 +482,10 @@ NK_INTERNAL nk_dtype_t nk_mesh_transform_dtype(nk_dtype_t dtype) {
 #include "numkong/mesh/serial.h"
 #include "numkong/mesh/neon.h"
 #include "numkong/mesh/neonbfdot.h"
+#include "numkong/mesh/neonfhm.h"
 #include "numkong/mesh/haswell.h"
 #include "numkong/mesh/skylake.h"
+#include "numkong/mesh/genoa.h"
 #include "numkong/mesh/rvv.h"
 #include "numkong/mesh/v128relaxed.h"
 
@@ -505,6 +535,8 @@ NK_PUBLIC void nk_rmsd_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk
     nk_rmsd_f16_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_HASWELL
     nk_rmsd_f16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
+#elif NK_TARGET_NEONFHM
+    nk_rmsd_f16_neonfhm(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_NEON
     nk_rmsd_f16_neon(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_RVV
@@ -517,6 +549,8 @@ NK_PUBLIC void nk_rmsd_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk
 NK_PUBLIC void nk_rmsd_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
                             nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
 #if NK_TARGET_SKYLAKE
+    // Skylake f32-widen path wins on Intel where VDPBF16PS throughput matches FMA; on AMD Zen4+
+    // where VDPBF16PS is faster than FMA, users can call `nk_rmsd_bf16_genoa` directly.
     nk_rmsd_bf16_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_HASWELL
     nk_rmsd_bf16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
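
Since the compile-time dispatch above never prefers the Genoa kernel, callers targeting AMD Zen4+ can wrap it themselves, as the new comment suggests. A hedged sketch: only the nk_* symbols come from the headers in this diff, and the CPU-check helper is hypothetical:

#if NK_TARGET_GENOA
extern int my_cpu_has_fast_vdpbf16ps(void); // hypothetical runtime check, e.g. CPUID vendor/family
#endif

static void my_rmsd_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
                         nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
#if NK_TARGET_GENOA
    // Prefer VDPBF16PS where it out-runs FMA, i.e. on AMD Zen4 and newer
    if (my_cpu_has_fast_vdpbf16ps()) {
        nk_rmsd_bf16_genoa(a, b, n, a_centroid, b_centroid, rotation, scale, result);
        return;
    }
#endif
    nk_rmsd_bf16(a, b, n, a_centroid, b_centroid, rotation, scale, result); // library default
}
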
@@ -569,6 +603,8 @@ NK_PUBLIC void nk_kabsch_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
     nk_kabsch_f16_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_HASWELL
     nk_kabsch_f16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
+#elif NK_TARGET_NEONFHM
+    nk_kabsch_f16_neonfhm(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_NEON
     nk_kabsch_f16_neon(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_RVV
@@ -633,6 +669,8 @@ NK_PUBLIC void nk_umeyama_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
     nk_umeyama_f16_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_HASWELL
     nk_umeyama_f16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
+#elif NK_TARGET_NEONFHM
+    nk_umeyama_f16_neonfhm(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_NEON
     nk_umeyama_f16_neon(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_RVV
package/include/numkong/reduce/serial.h
@@ -14,7 +14,6 @@
 #define NK_REDUCE_SERIAL_H
 
 #include "numkong/types.h"
-#include "numkong/scalar/serial.h"
 #include "numkong/cast/serial.h"
 #include "numkong/scalar/serial.h"
 
@@ -22,6 +21,15 @@
 extern "C" {
 #endif
 
+/* Keep the serial instantiations below actually scalar, regardless of build type.
+ * See dots/serial.h for rationale. */
+#if defined(__clang__)
+#pragma clang attribute push(__attribute__((noinline)), apply_to = function)
+#elif defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
+#endif
+
 NK_INTERNAL nk_f64_t nk_reduce_sum_f64_serial_(nk_f64_t const *values, nk_f64_t const *compensations, int count) {
     nk_f64_t running_sum = 0, accumulated_error = 0;
     for (int i = 0; i < count; i++) {
@@ -746,6 +754,12 @@ NK_PUBLIC void nk_reduce_minmax_u1_serial( //
     *max_value_ptr = max_value, *max_index_ptr = max_idx;
 }
 
+#if defined(__clang__)
+#pragma clang attribute pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+
 #if defined(__cplusplus)
 } // extern "C"
 #endif
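
The push/pop pair added above (and repeated in sparse/serial.h and spatial/serial.h below) is a generic fence: GCC is told not to auto-vectorize or clone the reference loops, and Clang is told not to inline them into callers where they could be vectorized. A standalone sketch of the same fence around a toy kernel, assuming nothing beyond GCC/Clang:

#if defined(__clang__)
#pragma clang attribute push(__attribute__((noinline)), apply_to = function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize")
#endif

// Stays a genuine scalar loop, so SIMD backends get compared to a true serial baseline.
static float sum_f32_scalar(float const *x, int n) {
    float total = 0.0f;
    for (int i = 0; i < n; i++) total += x[i];
    return total;
}

#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
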
package/include/numkong/sparse/serial.h
@@ -17,7 +17,7 @@ extern "C" {
 #endif
 
 #define nk_define_sparse_intersect_(input_type) \
-    NK_PUBLIC nk_size_t nk_sparse_intersect_##input_type##_galloping_search_( \
+    NK_INTERNAL nk_size_t nk_sparse_intersect_##input_type##_galloping_search_( \
         nk_##input_type##_t const *array, nk_size_t start, nk_size_t length, nk_##input_type##_t val) { \
         nk_size_t low = start; \
         nk_size_t high = start + 1; \
@@ -32,7 +32,7 @@ extern "C" {
         } \
         return low; \
     } \
-    NK_PUBLIC nk_size_t nk_sparse_intersect_##input_type##_linear_scan_( \
+    NK_INTERNAL nk_size_t nk_sparse_intersect_##input_type##_linear_scan_( \
         nk_##input_type##_t const *a, nk_##input_type##_t const *b, nk_size_t a_length, nk_size_t b_length, \
         nk_##input_type##_t *result) { \
         nk_size_t intersection_size = 0; \
@@ -103,6 +103,15 @@ extern "C" {
         *product = weights_product; \
     }
 
+/* Keep the serial instantiations below actually scalar, regardless of build type.
+ * See dots/serial.h for rationale. */
+#if defined(__clang__)
+#pragma clang attribute push(__attribute__((noinline)), apply_to = function)
+#elif defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
+#endif
+
 nk_define_sparse_intersect_(u16) // nk_sparse_intersect_u16_serial
 nk_define_sparse_intersect_(u32) // nk_sparse_intersect_u32_serial
 nk_define_sparse_intersect_(u64) // nk_sparse_intersect_u64_serial
@@ -110,6 +119,12 @@ nk_define_sparse_intersect_(u64) // nk_sparse_intersect_u64_serial
 nk_define_sparse_dot_(u16, bf16, f32, nk_bf16_to_f32_serial) // nk_sparse_dot_u16bf16_serial
 nk_define_sparse_dot_(u32, f32, f64, nk_assign_from_to_)     // nk_sparse_dot_u32f32_serial
 
+#if defined(__clang__)
+#pragma clang attribute pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+
 #if defined(__cplusplus)
 } // extern "C"
 #endif
package/include/numkong/spatial/genoa.h
@@ -139,74 +139,6 @@ nk_angular_bf16_genoa_cycle:
     *result = nk_angular_normalize_f32_haswell_(dot_product_f32, a_norm_sq_f32, b_norm_sq_f32);
 }
 
-NK_PUBLIC void nk_sqeuclidean_e5m2_genoa(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
-    __m512 a_sq_f32x16 = _mm512_setzero_ps();
-    __m512 b_sq_f32x16 = _mm512_setzero_ps();
-    __m512 ab_f32x16 = _mm512_setzero_ps();
-    __m256i a_e5m2x32, b_e5m2x32;
-
-nk_sqeuclidean_e5m2_genoa_cycle:
-    if (n < 32) {
-        __mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
-        a_e5m2x32 = _mm256_maskz_loadu_epi8(mask, a);
-        b_e5m2x32 = _mm256_maskz_loadu_epi8(mask, b);
-        n = 0;
-    }
-    else {
-        a_e5m2x32 = _mm256_loadu_epi8(a);
-        b_e5m2x32 = _mm256_loadu_epi8(b);
-        a += 32, b += 32, n -= 32;
-    }
-    __m512i a_bf16x32 = nk_e5m2x32_to_bf16x32_icelake_(a_e5m2x32);
-    __m512i b_bf16x32 = nk_e5m2x32_to_bf16x32_icelake_(b_e5m2x32);
-    a_sq_f32x16 = _mm512_dpbf16_ps(a_sq_f32x16, nk_m512bh_from_m512i_(a_bf16x32), nk_m512bh_from_m512i_(a_bf16x32));
-    b_sq_f32x16 = _mm512_dpbf16_ps(b_sq_f32x16, nk_m512bh_from_m512i_(b_bf16x32), nk_m512bh_from_m512i_(b_bf16x32));
-    ab_f32x16 = _mm512_dpbf16_ps(ab_f32x16, nk_m512bh_from_m512i_(a_bf16x32), nk_m512bh_from_m512i_(b_bf16x32));
-    if (n) goto nk_sqeuclidean_e5m2_genoa_cycle;
-
-    // (a-b)² = a² + b² - 2ab
-    __m512 sum_sq_f32x16 = _mm512_add_ps(a_sq_f32x16, b_sq_f32x16);
-    *result = nk_reduce_add_f32x16_skylake_(_mm512_fnmadd_ps(_mm512_set1_ps(2.0f), ab_f32x16, sum_sq_f32x16));
-}
-
-NK_PUBLIC void nk_euclidean_e5m2_genoa(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
-    nk_sqeuclidean_e5m2_genoa(a, b, n, result);
-    *result = nk_f32_sqrt_haswell(*result);
-}
-
-NK_PUBLIC void nk_angular_e5m2_genoa(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
-    __m512 dot_f32x16 = _mm512_setzero_ps();
-    __m512 a_norm_sq_f32x16 = _mm512_setzero_ps();
-    __m512 b_norm_sq_f32x16 = _mm512_setzero_ps();
-    __m256i a_e5m2x32, b_e5m2x32;
-
-nk_angular_e5m2_genoa_cycle:
-    if (n < 32) {
-        __mmask32 mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
-        a_e5m2x32 = _mm256_maskz_loadu_epi8(mask, a);
-        b_e5m2x32 = _mm256_maskz_loadu_epi8(mask, b);
-        n = 0;
-    }
-    else {
-        a_e5m2x32 = _mm256_loadu_epi8(a);
-        b_e5m2x32 = _mm256_loadu_epi8(b);
-        a += 32, b += 32, n -= 32;
-    }
-    __m512i a_bf16x32 = nk_e5m2x32_to_bf16x32_icelake_(a_e5m2x32);
-    __m512i b_bf16x32 = nk_e5m2x32_to_bf16x32_icelake_(b_e5m2x32);
-    dot_f32x16 = _mm512_dpbf16_ps(dot_f32x16, nk_m512bh_from_m512i_(a_bf16x32), nk_m512bh_from_m512i_(b_bf16x32));
-    a_norm_sq_f32x16 = _mm512_dpbf16_ps(a_norm_sq_f32x16, nk_m512bh_from_m512i_(a_bf16x32),
-                                        nk_m512bh_from_m512i_(a_bf16x32));
-    b_norm_sq_f32x16 = _mm512_dpbf16_ps(b_norm_sq_f32x16, nk_m512bh_from_m512i_(b_bf16x32),
-                                        nk_m512bh_from_m512i_(b_bf16x32));
-    if (n) goto nk_angular_e5m2_genoa_cycle;
-
-    nk_f32_t dot_f32 = nk_reduce_add_f32x16_skylake_(dot_f32x16);
-    nk_f32_t a_norm_sq_f32 = nk_reduce_add_f32x16_skylake_(a_norm_sq_f32x16);
-    nk_f32_t b_norm_sq_f32 = nk_reduce_add_f32x16_skylake_(b_norm_sq_f32x16);
-    *result = nk_angular_normalize_f32_haswell_(dot_f32, a_norm_sq_f32, b_norm_sq_f32);
-}
-
 #if defined(__clang__)
 #pragma clang attribute pop
 #elif defined(__GNUC__)
package/include/numkong/spatial/haswell.h
@@ -840,28 +840,37 @@ nk_angular_e3m2_haswell_cycle:
 }
 
 NK_PUBLIC void nk_sqeuclidean_e4m3_haswell(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
-    __m256 distance_sq_f32x8 = _mm256_setzero_ps();
+    // E4M3 has no free widen shift, so we call the Giesen-based 8-lane cast helper
+    // twice per 16-lane iter and run with two F32 accumulators to break the FMA chain.
+    __m256 first_acc_f32x8 = _mm256_setzero_ps();
+    __m256 second_acc_f32x8 = _mm256_setzero_ps();
+    __m128i a_u8x16, b_u8x16;
 
 nk_sqeuclidean_e4m3_haswell_cycle:
-    if (n < 8) {
+    if (n < 16) {
         nk_b128_vec_t a_vec, b_vec;
         nk_partial_load_b8x16_serial_(a, &a_vec, n);
         nk_partial_load_b8x16_serial_(b, &b_vec, n);
-        __m256 a_f32x8 = nk_e4m3x8_to_f32x8_haswell_(a_vec.xmm);
-        __m256 b_f32x8 = nk_e4m3x8_to_f32x8_haswell_(b_vec.xmm);
-        __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
-        distance_sq_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, distance_sq_f32x8);
+        a_u8x16 = a_vec.xmm;
+        b_u8x16 = b_vec.xmm;
+        n = 0;
     }
     else {
-        __m256 a_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
-        __m256 b_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
-        __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
-        distance_sq_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, distance_sq_f32x8);
-        n -= 8, a += 8, b += 8;
-        goto nk_sqeuclidean_e4m3_haswell_cycle;
+        a_u8x16 = _mm_loadu_si128((__m128i const *)a);
+        b_u8x16 = _mm_loadu_si128((__m128i const *)b);
+        a += 16, b += 16, n -= 16;
     }
+    __m256 a_low_f32x8 = nk_e4m3x8_to_f32x8_haswell_(a_u8x16);
+    __m256 a_high_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_unpackhi_epi64(a_u8x16, a_u8x16));
+    __m256 b_low_f32x8 = nk_e4m3x8_to_f32x8_haswell_(b_u8x16);
+    __m256 b_high_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_unpackhi_epi64(b_u8x16, b_u8x16));
+    __m256 diff_low_f32x8 = _mm256_sub_ps(a_low_f32x8, b_low_f32x8);
+    __m256 diff_high_f32x8 = _mm256_sub_ps(a_high_f32x8, b_high_f32x8);
+    first_acc_f32x8 = _mm256_fmadd_ps(diff_low_f32x8, diff_low_f32x8, first_acc_f32x8);
+    second_acc_f32x8 = _mm256_fmadd_ps(diff_high_f32x8, diff_high_f32x8, second_acc_f32x8);
+    if (n) goto nk_sqeuclidean_e4m3_haswell_cycle;
 
-    *result = nk_reduce_add_f32x8_haswell_(distance_sq_f32x8);
+    *result = nk_reduce_add_f32x8_haswell_(_mm256_add_ps(first_acc_f32x8, second_acc_f32x8));
 }
 
 NK_PUBLIC void nk_euclidean_e4m3_haswell(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
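
The two-accumulator idea in the hunk above is independent of AVX2; a scalar illustration of why it helps (latency figures are approximate, and this toy function is not part of the package):

// One accumulator: every FMA waits on the previous value of its accumulator, so
// the loop runs at FMA *latency* (~5 cycles on Haswell). Two independent chains
// let consecutive steps' FMAs overlap, approaching FMA *throughput* instead.
static float sqeuclidean_two_chains(float const *a, float const *b, int n) { // n even, for brevity
    float acc0 = 0.0f, acc1 = 0.0f;
    for (int i = 0; i + 1 < n; i += 2) {
        float d0 = a[i] - b[i], d1 = a[i + 1] - b[i + 1];
        acc0 += d0 * d0; // chain 0
        acc1 += d1 * d1; // chain 1, no data dependency on chain 0
    }
    return acc0 + acc1;
}
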
@@ -873,27 +882,33 @@ NK_PUBLIC void nk_angular_e4m3_haswell(nk_e4m3_t const *a, nk_e4m3_t const *b, n
     __m256 dot_product_f32x8 = _mm256_setzero_ps();
     __m256 a_norm_sq_f32x8 = _mm256_setzero_ps();
     __m256 b_norm_sq_f32x8 = _mm256_setzero_ps();
+    __m128i a_u8x16, b_u8x16;
 
 nk_angular_e4m3_haswell_cycle:
-    if (n < 8) {
+    if (n < 16) {
         nk_b128_vec_t a_vec, b_vec;
         nk_partial_load_b8x16_serial_(a, &a_vec, n);
         nk_partial_load_b8x16_serial_(b, &b_vec, n);
-        __m256 a_f32x8 = nk_e4m3x8_to_f32x8_haswell_(a_vec.xmm);
-        __m256 b_f32x8 = nk_e4m3x8_to_f32x8_haswell_(b_vec.xmm);
-        dot_product_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, dot_product_f32x8);
-        a_norm_sq_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, a_norm_sq_f32x8);
-        b_norm_sq_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, b_norm_sq_f32x8);
+        a_u8x16 = a_vec.xmm;
+        b_u8x16 = b_vec.xmm;
+        n = 0;
     }
     else {
-        __m256 a_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
-        __m256 b_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
-        dot_product_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, dot_product_f32x8);
-        a_norm_sq_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, a_norm_sq_f32x8);
-        b_norm_sq_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, b_norm_sq_f32x8);
-        n -= 8, a += 8, b += 8;
-        goto nk_angular_e4m3_haswell_cycle;
-    }
+        a_u8x16 = _mm_loadu_si128((__m128i const *)a);
+        b_u8x16 = _mm_loadu_si128((__m128i const *)b);
+        a += 16, b += 16, n -= 16;
+    }
+    __m256 a_low_f32x8 = nk_e4m3x8_to_f32x8_haswell_(a_u8x16);
+    __m256 a_high_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_unpackhi_epi64(a_u8x16, a_u8x16));
+    __m256 b_low_f32x8 = nk_e4m3x8_to_f32x8_haswell_(b_u8x16);
+    __m256 b_high_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_unpackhi_epi64(b_u8x16, b_u8x16));
+    dot_product_f32x8 = _mm256_fmadd_ps(a_low_f32x8, b_low_f32x8, dot_product_f32x8);
+    dot_product_f32x8 = _mm256_fmadd_ps(a_high_f32x8, b_high_f32x8, dot_product_f32x8);
+    a_norm_sq_f32x8 = _mm256_fmadd_ps(a_low_f32x8, a_low_f32x8, a_norm_sq_f32x8);
+    a_norm_sq_f32x8 = _mm256_fmadd_ps(a_high_f32x8, a_high_f32x8, a_norm_sq_f32x8);
+    b_norm_sq_f32x8 = _mm256_fmadd_ps(b_low_f32x8, b_low_f32x8, b_norm_sq_f32x8);
+    b_norm_sq_f32x8 = _mm256_fmadd_ps(b_high_f32x8, b_high_f32x8, b_norm_sq_f32x8);
+    if (n) goto nk_angular_e4m3_haswell_cycle;
 
     nk_f32_t dot_product_f32 = nk_reduce_add_f32x8_haswell_(dot_product_f32x8);
     nk_f32_t a_norm_sq_f32 = nk_reduce_add_f32x8_haswell_(a_norm_sq_f32x8);
904
919
  NK_PUBLIC void nk_sqeuclidean_e5m2_haswell(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
905
- __m256 distance_sq_f32x8 = _mm256_setzero_ps();
920
+ // E5M2 shares F16's exponent bias (15): `byte << 8` equals the matching F16 encoding.
921
+ // `vpunpck*bw` against zero is the free widen+shift: zero byte in low half of each
922
+ // 16-bit lane, E5M2 byte in high half. Per-128-bit-lane scrambled; commutative sum
923
+ // reduction is invariant under that.
924
+ __m256 first_acc_f32x8 = _mm256_setzero_ps();
925
+ __m256 second_acc_f32x8 = _mm256_setzero_ps();
926
+ __m128i const zero_u8x16 = _mm_setzero_si128();
927
+ __m128i a_u8x16, b_u8x16;
906
928
 
907
929
  nk_sqeuclidean_e5m2_haswell_cycle:
908
- if (n < 8) {
930
+ if (n < 16) {
909
931
  nk_b128_vec_t a_vec, b_vec;
910
932
  nk_partial_load_b8x16_serial_(a, &a_vec, n);
911
933
  nk_partial_load_b8x16_serial_(b, &b_vec, n);
912
- __m256 a_f32x8 = nk_e5m2x8_to_f32x8_haswell_(a_vec.xmm);
913
- __m256 b_f32x8 = nk_e5m2x8_to_f32x8_haswell_(b_vec.xmm);
914
- __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
915
- distance_sq_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, distance_sq_f32x8);
934
+ a_u8x16 = a_vec.xmm;
935
+ b_u8x16 = b_vec.xmm;
936
+ n = 0;
916
937
  }
917
938
  else {
918
- __m256 a_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
919
- __m256 b_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
920
- __m256 diff_f32x8 = _mm256_sub_ps(a_f32x8, b_f32x8);
921
- distance_sq_f32x8 = _mm256_fmadd_ps(diff_f32x8, diff_f32x8, distance_sq_f32x8);
922
- n -= 8, a += 8, b += 8;
923
- goto nk_sqeuclidean_e5m2_haswell_cycle;
924
- }
925
-
926
- *result = nk_reduce_add_f32x8_haswell_(distance_sq_f32x8);
939
+ a_u8x16 = _mm_loadu_si128((__m128i const *)a);
940
+ b_u8x16 = _mm_loadu_si128((__m128i const *)b);
941
+ a += 16, b += 16, n -= 16;
942
+ }
943
+ __m128i a_even_f16x8 = _mm_unpacklo_epi8(zero_u8x16, a_u8x16);
944
+ __m128i a_odd_f16x8 = _mm_unpackhi_epi8(zero_u8x16, a_u8x16);
945
+ __m128i b_even_f16x8 = _mm_unpacklo_epi8(zero_u8x16, b_u8x16);
946
+ __m128i b_odd_f16x8 = _mm_unpackhi_epi8(zero_u8x16, b_u8x16);
947
+ __m256 a_first_f32x8 = _mm256_cvtph_ps(a_even_f16x8);
948
+ __m256 a_second_f32x8 = _mm256_cvtph_ps(a_odd_f16x8);
949
+ __m256 b_first_f32x8 = _mm256_cvtph_ps(b_even_f16x8);
950
+ __m256 b_second_f32x8 = _mm256_cvtph_ps(b_odd_f16x8);
951
+ __m256 diff_first_f32x8 = _mm256_sub_ps(a_first_f32x8, b_first_f32x8);
952
+ __m256 diff_second_f32x8 = _mm256_sub_ps(a_second_f32x8, b_second_f32x8);
953
+ first_acc_f32x8 = _mm256_fmadd_ps(diff_first_f32x8, diff_first_f32x8, first_acc_f32x8);
954
+ second_acc_f32x8 = _mm256_fmadd_ps(diff_second_f32x8, diff_second_f32x8, second_acc_f32x8);
955
+ if (n) goto nk_sqeuclidean_e5m2_haswell_cycle;
956
+
957
+ *result = nk_reduce_add_f32x8_haswell_(_mm256_add_ps(first_acc_f32x8, second_acc_f32x8));
927
958
  }
928
959
 
929
960
  NK_PUBLIC void nk_euclidean_e5m2_haswell(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
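
The bit trick the new comment relies on can be verified in scalar code: an E5M2 byte is exactly the high byte of the IEEE binary16 with the same value (same sign, 5-bit exponent at bias 15, top two mantissa bits). A sketch, assuming `_Float16` support as in recent GCC/Clang; the function is ours, for illustration only:

#include <stdint.h>
#include <string.h>

// Shifting the E5M2 byte into the high half of a 16-bit lane *is* the f16 cast;
// _mm_unpacklo_epi8 / _mm_unpackhi_epi8 against zero perform it for 8 lanes at once.
static float e5m2_to_f32(uint8_t e5m2_bits) {
    uint16_t f16_bits = (uint16_t)e5m2_bits << 8; // zero byte below, E5M2 byte above
    _Float16 h;
    memcpy(&h, &f16_bits, sizeof h);
    return (float)h;
}
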
@@ -935,27 +966,38 @@ NK_PUBLIC void nk_angular_e5m2_haswell(nk_e5m2_t const *a, nk_e5m2_t const *b, n
     __m256 dot_product_f32x8 = _mm256_setzero_ps();
     __m256 a_norm_sq_f32x8 = _mm256_setzero_ps();
     __m256 b_norm_sq_f32x8 = _mm256_setzero_ps();
+    __m128i const zero_u8x16 = _mm_setzero_si128();
+    __m128i a_u8x16, b_u8x16;
 
 nk_angular_e5m2_haswell_cycle:
-    if (n < 8) {
+    if (n < 16) {
         nk_b128_vec_t a_vec, b_vec;
         nk_partial_load_b8x16_serial_(a, &a_vec, n);
         nk_partial_load_b8x16_serial_(b, &b_vec, n);
-        __m256 a_f32x8 = nk_e5m2x8_to_f32x8_haswell_(a_vec.xmm);
-        __m256 b_f32x8 = nk_e5m2x8_to_f32x8_haswell_(b_vec.xmm);
-        dot_product_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, dot_product_f32x8);
-        a_norm_sq_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, a_norm_sq_f32x8);
-        b_norm_sq_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, b_norm_sq_f32x8);
+        a_u8x16 = a_vec.xmm;
+        b_u8x16 = b_vec.xmm;
+        n = 0;
     }
     else {
-        __m256 a_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)a));
-        __m256 b_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadu_si128((__m128i const *)b));
-        dot_product_f32x8 = _mm256_fmadd_ps(a_f32x8, b_f32x8, dot_product_f32x8);
-        a_norm_sq_f32x8 = _mm256_fmadd_ps(a_f32x8, a_f32x8, a_norm_sq_f32x8);
-        b_norm_sq_f32x8 = _mm256_fmadd_ps(b_f32x8, b_f32x8, b_norm_sq_f32x8);
-        n -= 8, a += 8, b += 8;
-        goto nk_angular_e5m2_haswell_cycle;
-    }
+        a_u8x16 = _mm_loadu_si128((__m128i const *)a);
+        b_u8x16 = _mm_loadu_si128((__m128i const *)b);
+        a += 16, b += 16, n -= 16;
+    }
+    __m128i a_even_f16x8 = _mm_unpacklo_epi8(zero_u8x16, a_u8x16);
+    __m128i a_odd_f16x8 = _mm_unpackhi_epi8(zero_u8x16, a_u8x16);
+    __m128i b_even_f16x8 = _mm_unpacklo_epi8(zero_u8x16, b_u8x16);
+    __m128i b_odd_f16x8 = _mm_unpackhi_epi8(zero_u8x16, b_u8x16);
+    __m256 a_first_f32x8 = _mm256_cvtph_ps(a_even_f16x8);
+    __m256 a_second_f32x8 = _mm256_cvtph_ps(a_odd_f16x8);
+    __m256 b_first_f32x8 = _mm256_cvtph_ps(b_even_f16x8);
+    __m256 b_second_f32x8 = _mm256_cvtph_ps(b_odd_f16x8);
+    dot_product_f32x8 = _mm256_fmadd_ps(a_first_f32x8, b_first_f32x8, dot_product_f32x8);
+    dot_product_f32x8 = _mm256_fmadd_ps(a_second_f32x8, b_second_f32x8, dot_product_f32x8);
+    a_norm_sq_f32x8 = _mm256_fmadd_ps(a_first_f32x8, a_first_f32x8, a_norm_sq_f32x8);
+    a_norm_sq_f32x8 = _mm256_fmadd_ps(a_second_f32x8, a_second_f32x8, a_norm_sq_f32x8);
+    b_norm_sq_f32x8 = _mm256_fmadd_ps(b_first_f32x8, b_first_f32x8, b_norm_sq_f32x8);
+    b_norm_sq_f32x8 = _mm256_fmadd_ps(b_second_f32x8, b_second_f32x8, b_norm_sq_f32x8);
+    if (n) goto nk_angular_e5m2_haswell_cycle;
 
     nk_f32_t dot_product_f32 = nk_reduce_add_f32x8_haswell_(dot_product_f32x8);
     nk_f32_t a_norm_sq_f32 = nk_reduce_add_f32x8_haswell_(a_norm_sq_f32x8);
package/include/numkong/spatial/serial.h
@@ -108,6 +108,15 @@ extern "C" {
         } \
     }
 
+/* Keep the serial instantiations below actually scalar, regardless of build type.
+ * See dots/serial.h for rationale. */
+#if defined(__clang__)
+#pragma clang attribute push(__attribute__((noinline)), apply_to = function)
+#elif defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
+#endif
+
 nk_define_angular_(f64, f64, f64, nk_assign_from_to_, nk_f64_rsqrt_serial)       // nk_angular_f64_serial
 nk_define_sqeuclidean_(f64, f64, f64, nk_assign_from_to_)                        // nk_sqeuclidean_f64_serial
 nk_define_euclidean_(f64, f64, f64, f64, nk_assign_from_to_, nk_f64_sqrt_serial) // nk_euclidean_f64_serial
@@ -340,6 +349,12 @@ NK_INTERNAL void nk_euclidean_through_u32_from_dot_serial_(nk_b128_vec_t dots, n
     }
 }
 
+#if defined(__clang__)
+#pragma clang attribute pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+
 #if defined(__cplusplus)
 } // extern "C"
 #endif