numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -1,157 +0,0 @@
1
- /**
2
- * @brief NEON FP16 implementations for the redesigned reduction API (moments + minmax).
3
- * @file include/numkong/reduce/neonhalf.h
4
- * @author Ash Vardanian
5
- * @date February 13, 2026
6
- *
7
- * @sa include/numkong/reduce.h
8
- *
9
- * @section reduce_neonhalf_new_design Design Notes
10
- *
11
- * Moments (sum + sum-of-squares) accumulate in f32 via vcvt_f32_f16 widening, giving
12
- * full f32 precision. The contiguous path processes 8 f16 elements per iteration, widening
13
- * to two f32x4 halves and using vfmaq_f32 for fused multiply-accumulate of squares.
14
- *
15
- * Minmax tracks min/max values as native f16x8 with u16x8 iteration counters (same width
16
- * as f16). The u16 counters wrap at 65536, so the dispatcher splits arrays larger than
17
- * 65536 * 8 = 524288 elements via recursive halving.
18
- */
19
- #ifndef NK_REDUCE_NEONHALF_H
20
- #define NK_REDUCE_NEONHALF_H
21
-
22
- #if NK_TARGET_ARM_
23
- #if NK_TARGET_NEONHALF
24
-
25
- #include "numkong/types.h"
26
- #include "numkong/cast/neon.h"
27
- #include "numkong/cast/serial.h"
28
- #include "numkong/reduce/serial.h"
29
-
30
- #if defined(__cplusplus)
31
- extern "C" {
32
- #endif
33
-
34
- #if defined(__clang__)
35
- #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function)
36
- #elif defined(__GNUC__)
37
- #pragma GCC push_options
38
- #pragma GCC target("arch=armv8.2-a+simd+fp16")
39
- #endif
40
-
41
- NK_INTERNAL void nk_reduce_moments_f16_neonhalf_contiguous_( //
42
- nk_f16_t const *data_ptr, nk_size_t count, //
43
- nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr) {
44
- float32x4_t sum_f32x4 = vdupq_n_f32(0);
45
- float32x4_t sumsq_f32x4 = vdupq_n_f32(0);
46
- nk_size_t idx = 0;
47
-
48
- for (; idx + 8 <= count; idx += 8) {
49
- float16x8_t data_f16x8 = vld1q_f16((nk_f16_for_arm_simd_t const *)(data_ptr + idx));
50
- float32x4_t low_f32x4 = vcvt_f32_f16(vget_low_f16(data_f16x8));
51
- float32x4_t high_f32x4 = vcvt_f32_f16(vget_high_f16(data_f16x8));
52
- sum_f32x4 = vaddq_f32(sum_f32x4, low_f32x4);
53
- sum_f32x4 = vaddq_f32(sum_f32x4, high_f32x4);
54
- sumsq_f32x4 = vfmaq_f32(sumsq_f32x4, low_f32x4, low_f32x4);
55
- sumsq_f32x4 = vfmaq_f32(sumsq_f32x4, high_f32x4, high_f32x4);
56
- }
57
-
58
- // Scalar tail
59
- nk_f32_t sum = vaddvq_f32(sum_f32x4);
60
- nk_f32_t sumsq = vaddvq_f32(sumsq_f32x4);
61
- for (; idx < count; ++idx) {
62
- nk_f32_t value_f32;
63
- nk_f16_to_f32_serial(data_ptr + idx, &value_f32);
64
- sum += value_f32, sumsq += value_f32 * value_f32;
65
- }
66
- *sum_ptr = sum, *sumsq_ptr = sumsq;
67
- }
68
-
69
- NK_INTERNAL void nk_reduce_moments_f16_neonhalf_strided_( //
70
- nk_f16_t const *data_ptr, nk_size_t count, nk_size_t stride_elements, //
71
- nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr) {
72
- float32x4_t sum_f32x4 = vdupq_n_f32(0);
73
- float32x4_t sumsq_f32x4 = vdupq_n_f32(0);
74
- nk_size_t idx = 0;
75
-
76
- if (stride_elements == 2) {
77
- for (; idx + 8 <= count; idx += 8) {
78
- uint16x8x2_t loaded_u16x8x2 = vld2q_u16((uint16_t const *)(data_ptr + idx * 2));
79
- float16x8_t data_f16x8 = vreinterpretq_f16_u16(loaded_u16x8x2.val[0]);
80
- float32x4_t low_f32x4 = vcvt_f32_f16(vget_low_f16(data_f16x8));
81
- float32x4_t high_f32x4 = vcvt_f32_f16(vget_high_f16(data_f16x8));
82
- sum_f32x4 = vaddq_f32(sum_f32x4, low_f32x4);
83
- sum_f32x4 = vaddq_f32(sum_f32x4, high_f32x4);
84
- sumsq_f32x4 = vfmaq_f32(sumsq_f32x4, low_f32x4, low_f32x4);
85
- sumsq_f32x4 = vfmaq_f32(sumsq_f32x4, high_f32x4, high_f32x4);
86
- }
87
- }
88
- else if (stride_elements == 3) {
89
- for (; idx + 8 <= count; idx += 8) {
90
- uint16x8x3_t loaded_u16x8x3 = vld3q_u16((uint16_t const *)(data_ptr + idx * 3));
91
- float16x8_t data_f16x8 = vreinterpretq_f16_u16(loaded_u16x8x3.val[0]);
92
- float32x4_t low_f32x4 = vcvt_f32_f16(vget_low_f16(data_f16x8));
93
- float32x4_t high_f32x4 = vcvt_f32_f16(vget_high_f16(data_f16x8));
94
- sum_f32x4 = vaddq_f32(sum_f32x4, low_f32x4);
95
- sum_f32x4 = vaddq_f32(sum_f32x4, high_f32x4);
96
- sumsq_f32x4 = vfmaq_f32(sumsq_f32x4, low_f32x4, low_f32x4);
97
- sumsq_f32x4 = vfmaq_f32(sumsq_f32x4, high_f32x4, high_f32x4);
98
- }
99
- }
100
- else if (stride_elements == 4) {
101
- for (; idx + 8 <= count; idx += 8) {
102
- uint16x8x4_t loaded_u16x8x4 = vld4q_u16((uint16_t const *)(data_ptr + idx * 4));
103
- float16x8_t data_f16x8 = vreinterpretq_f16_u16(loaded_u16x8x4.val[0]);
104
- float32x4_t low_f32x4 = vcvt_f32_f16(vget_low_f16(data_f16x8));
105
- float32x4_t high_f32x4 = vcvt_f32_f16(vget_high_f16(data_f16x8));
106
- sum_f32x4 = vaddq_f32(sum_f32x4, low_f32x4);
107
- sum_f32x4 = vaddq_f32(sum_f32x4, high_f32x4);
108
- sumsq_f32x4 = vfmaq_f32(sumsq_f32x4, low_f32x4, low_f32x4);
109
- sumsq_f32x4 = vfmaq_f32(sumsq_f32x4, high_f32x4, high_f32x4);
110
- }
111
- }
112
-
113
- // Scalar tail for remaining elements
114
- nk_f32_t sum = vaddvq_f32(sum_f32x4);
115
- nk_f32_t sumsq = vaddvq_f32(sumsq_f32x4);
116
- for (; idx < count; ++idx) {
117
- nk_f32_t value_f32;
118
- nk_f16_to_f32_serial((nk_f16_t const *)(data_ptr + idx * stride_elements), &value_f32);
119
- sum += value_f32, sumsq += value_f32 * value_f32;
120
- }
121
- *sum_ptr = sum, *sumsq_ptr = sumsq;
122
- }
123
-
124
- NK_PUBLIC void nk_reduce_moments_f16_neonhalf( //
125
- nk_f16_t const *data_ptr, nk_size_t count, nk_size_t stride_bytes, //
126
- nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr) {
127
- nk_size_t stride_elements = stride_bytes / sizeof(nk_f16_t);
128
- int aligned = (stride_bytes % sizeof(nk_f16_t) == 0);
129
- if (count == 0) *sum_ptr = 0, *sumsq_ptr = 0;
130
- else if (!aligned) nk_reduce_moments_f16_serial(data_ptr, count, stride_bytes, sum_ptr, sumsq_ptr);
131
- else if (count > (nk_size_t)(NK_U16_MAX + 1) * 8) {
132
- nk_size_t left_count = count / 2;
133
- nk_f32_t left_sum_value, left_sumsq_value, right_sum_value, right_sumsq_value;
134
- nk_reduce_moments_f16_neonhalf(data_ptr, left_count, stride_bytes, &left_sum_value, &left_sumsq_value);
135
- nk_reduce_moments_f16_neonhalf(data_ptr + left_count * stride_elements, count - left_count, stride_bytes,
136
- &right_sum_value, &right_sumsq_value);
137
- *sum_ptr = left_sum_value + right_sum_value, *sumsq_ptr = left_sumsq_value + right_sumsq_value;
138
- }
139
- else if (stride_elements == 1) nk_reduce_moments_f16_neonhalf_contiguous_(data_ptr, count, sum_ptr, sumsq_ptr);
140
- else if (stride_elements <= 4)
141
- nk_reduce_moments_f16_neonhalf_strided_(data_ptr, count, stride_elements, sum_ptr, sumsq_ptr);
142
- else nk_reduce_moments_f16_serial(data_ptr, count, stride_bytes, sum_ptr, sumsq_ptr);
143
- }
144
-
145
- #if defined(__clang__)
146
- #pragma clang attribute pop
147
- #elif defined(__GNUC__)
148
- #pragma GCC pop_options
149
- #endif
150
-
151
- #if defined(__cplusplus)
152
- } // extern "C"
153
- #endif
154
-
155
- #endif // NK_TARGET_NEONHALF
156
- #endif // NK_TARGET_ARM_
157
- #endif // NK_REDUCE_NEONHALF_H
@@ -1,118 +0,0 @@
1
- /**
2
- * @brief SIMD-accelerated Spatial Similarity Measures for NEON FP16.
3
- * @file include/numkong/spatial/neonhalf.h
4
- * @author Ash Vardanian
5
- * @date December 27, 2025
6
- *
7
- * @sa include/numkong/spatial.h
8
- *
9
- * @section spatial_neonhalf_instructions ARM NEON FP16 Instructions (ARMv8.2-FP16)
10
- *
11
- * Intrinsic Instruction Latency Throughput
12
- * A76 M4+/V1+/Oryon
13
- * vfmaq_f16 FMLA (V.8H, V.8H, V.8H) 4cy 2/cy 4/cy
14
- * vcvt_f32_f16 FCVTL (V.4S, V.4H) 3cy 2/cy 4/cy
15
- * vld1q_f16 LD1 (V.8H) 4cy 2/cy 3/cy
16
- * vsubq_f16 FSUB (V.8H, V.8H, V.8H) 2cy 2/cy 4/cy
17
- * vaddvq_f32 FADDP+FADDP (V.4S) 4cy 1/cy 2/cy
18
- *
19
- * The ARMv8.2-FP16 extension enables native half-precision arithmetic, doubling the element count
20
- * per vector register (8x F16 vs 4x F32). For spatial distance computations like L2 and angular
21
- * distance, this halves memory bandwidth requirements.
22
- *
23
- * Inputs are widened from F16 to F32 for accumulation via FCVTL to preserve numerical precision
24
- * during the squared difference summation. The subtraction and FMA operations use F32 precision
25
- * in the accumulator to avoid catastrophic cancellation in distance computations.
26
- */
27
- #ifndef NK_SPATIAL_NEONHALF_H
28
- #define NK_SPATIAL_NEONHALF_H
29
-
30
- #if NK_TARGET_ARM_
31
- #if NK_TARGET_NEONHALF
32
-
33
- #include "numkong/types.h"
34
- #include "numkong/cast/serial.h" // `nk_partial_load_b16x4_serial_`
35
- #include "numkong/spatial/neon.h" // `nk_angular_normalize_f32_neon_`, `nk_f32_sqrt_neon`
36
-
37
- #if defined(__cplusplus)
38
- extern "C" {
39
- #endif
40
-
41
- #if defined(__clang__)
42
- #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function)
43
- #elif defined(__GNUC__)
44
- #pragma GCC push_options
45
- #pragma GCC target("arch=armv8.2-a+simd+fp16")
46
- #endif
47
-
48
- NK_PUBLIC void nk_sqeuclidean_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
49
- float32x4_t a_f32x4, b_f32x4;
50
- float32x4_t distance_sq_f32x4 = vdupq_n_f32(0);
51
-
52
- nk_sqeuclidean_f16_neonhalf_cycle:
53
- if (n < 4) {
54
- nk_b64_vec_t a_vec, b_vec;
55
- nk_partial_load_b16x4_serial_(a, &a_vec, n);
56
- nk_partial_load_b16x4_serial_(b, &b_vec, n);
57
- a_f32x4 = vcvt_f32_f16(vreinterpret_f16_u16(a_vec.u16x4));
58
- b_f32x4 = vcvt_f32_f16(vreinterpret_f16_u16(b_vec.u16x4));
59
- n = 0;
60
- }
61
- else {
62
- a_f32x4 = vcvt_f32_f16(vld1_f16((nk_f16_for_arm_simd_t const *)a));
63
- b_f32x4 = vcvt_f32_f16(vld1_f16((nk_f16_for_arm_simd_t const *)b));
64
- n -= 4, a += 4, b += 4;
65
- }
66
- float32x4_t diff_f32x4 = vsubq_f32(a_f32x4, b_f32x4);
67
- distance_sq_f32x4 = vfmaq_f32(distance_sq_f32x4, diff_f32x4, diff_f32x4);
68
- if (n) goto nk_sqeuclidean_f16_neonhalf_cycle;
69
-
70
- *result = vaddvq_f32(distance_sq_f32x4);
71
- }
72
- NK_PUBLIC void nk_euclidean_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
73
- nk_sqeuclidean_f16_neonhalf(a, b, n, result);
74
- *result = nk_f32_sqrt_neon(*result);
75
- }
76
-
77
- NK_PUBLIC void nk_angular_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
78
- float32x4_t dot_product_f32x4 = vdupq_n_f32(0), a_norm_sq_f32x4 = vdupq_n_f32(0), b_norm_sq_f32x4 = vdupq_n_f32(0);
79
- float32x4_t a_f32x4, b_f32x4;
80
-
81
- nk_angular_f16_neonhalf_cycle:
82
- if (n < 4) {
83
- nk_b64_vec_t a_vec, b_vec;
84
- nk_partial_load_b16x4_serial_(a, &a_vec, n);
85
- nk_partial_load_b16x4_serial_(b, &b_vec, n);
86
- a_f32x4 = vcvt_f32_f16(vreinterpret_f16_u16(a_vec.u16x4));
87
- b_f32x4 = vcvt_f32_f16(vreinterpret_f16_u16(b_vec.u16x4));
88
- n = 0;
89
- }
90
- else {
91
- a_f32x4 = vcvt_f32_f16(vld1_f16((nk_f16_for_arm_simd_t const *)a));
92
- b_f32x4 = vcvt_f32_f16(vld1_f16((nk_f16_for_arm_simd_t const *)b));
93
- n -= 4, a += 4, b += 4;
94
- }
95
- dot_product_f32x4 = vfmaq_f32(dot_product_f32x4, a_f32x4, b_f32x4);
96
- a_norm_sq_f32x4 = vfmaq_f32(a_norm_sq_f32x4, a_f32x4, a_f32x4);
97
- b_norm_sq_f32x4 = vfmaq_f32(b_norm_sq_f32x4, b_f32x4, b_f32x4);
98
- if (n) goto nk_angular_f16_neonhalf_cycle;
99
-
100
- nk_f32_t dot_product_f32 = vaddvq_f32(dot_product_f32x4);
101
- nk_f32_t a_norm_sq_f32 = vaddvq_f32(a_norm_sq_f32x4);
102
- nk_f32_t b_norm_sq_f32 = vaddvq_f32(b_norm_sq_f32x4);
103
- *result = nk_angular_normalize_f32_neon_(dot_product_f32, a_norm_sq_f32, b_norm_sq_f32);
104
- }
105
-
106
- #if defined(__clang__)
107
- #pragma clang attribute pop
108
- #elif defined(__GNUC__)
109
- #pragma GCC pop_options
110
- #endif
111
-
112
- #if defined(__cplusplus)
113
- } // extern "C"
114
- #endif
115
-
116
- #endif // NK_TARGET_NEONHALF
117
- #endif // NK_TARGET_ARM_
118
- #endif // NK_SPATIAL_NEONHALF_H
@@ -1,343 +0,0 @@
1
- /**
2
- * @brief SIMD-accelerated Spatial Similarity Measures for Sapphire Rapids.
3
- * @file include/numkong/spatial/sapphire.h
4
- * @author Ash Vardanian
5
- * @date December 27, 2025
6
- *
7
- * @sa include/numkong/spatial.h
8
- *
9
- * Sapphire Rapids adds native FP16 support via AVX-512 FP16 extension.
10
- * For e4m3 L2 distance, we can leverage F16 for the subtraction step:
11
- * - e4m3 differences fit in F16 (max |a−b| = 896 < 65504)
12
- * - But squared differences overflow F16 (896² = 802816 > 65504)
13
- * - So: subtract in F16, convert to F32, then square and accumulate
14
- *
15
- * For e2m3/e3m2 L2 distance, squared differences fit in FP16:
16
- * - E2M3: max |a−b| = 15, max (a−b)² = 225 < 65504, flush cadence = 4 (conservative for uniformity)
17
- * - E3M2: max |a−b| = 56, max (a−b)² = 3136 < 65504, flush cadence = 4
18
- * So the entire sub+square+accumulate stays in FP16 with periodic F32 flush.
19
- *
20
- * @section spatial_sapphire_instructions Relevant Instructions
21
- *
22
- * Intrinsic Instruction Sapphire Genoa
23
- * _mm256_sub_ph VSUBPH (YMM, YMM, YMM) 4cy @ p05 3cy @ p01
24
- * _mm512_cvtph_ps VCVTPH2PS (ZMM, YMM) 5cy @ p05 5cy @ p01
25
- * _mm512_fmadd_ps VFMADD (ZMM, ZMM, ZMM) 4cy @ p05 4cy @ p01
26
- * _mm512_reduce_add_ps (pseudo: VHADDPS chain) ~8cy ~8cy
27
- * _mm_maskz_loadu_epi8 VMOVDQU8 (XMM {K}, M128) 7cy @ p23 7cy @ p23
28
- */
29
- #ifndef NK_SPATIAL_SAPPHIRE_H
30
- #define NK_SPATIAL_SAPPHIRE_H
31
-
32
- #if NK_TARGET_X86_
33
- #if NK_TARGET_SAPPHIRE
34
-
35
- #include "numkong/types.h"
36
- #include "numkong/cast/sapphire.h" // `nk_e4m3x16_to_f16x16_sapphire_`
37
- #include "numkong/dot/sapphire.h" // `nk_e2m3x32_to_f16x32_sapphire_`, `nk_flush_f16_to_f32_sapphire_`
38
- #include "numkong/spatial/haswell.h" // `nk_angular_normalize_f32_haswell_`, `nk_f32_sqrt_haswell`
39
-
40
- #if defined(__cplusplus)
41
- extern "C" {
42
- #endif
43
-
44
- #if defined(__clang__)
45
- #pragma clang attribute push( \
46
- __attribute__((target("avx2,avx512f,avx512vl,avx512bw,avx512dq,avx512fp16,f16c,fma,bmi,bmi2"))), \
47
- apply_to = function)
48
- #elif defined(__GNUC__)
49
- #pragma GCC push_options
50
- #pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512fp16", "f16c", "fma", "bmi", "bmi2")
51
- #endif
52
-
53
- NK_PUBLIC void nk_sqeuclidean_e4m3_sapphire(nk_e4m3_t const *a_scalars, nk_e4m3_t const *b_scalars,
54
- nk_size_t count_scalars, nk_f32_t *result) {
55
- __m512 sum_f32x16 = _mm512_setzero_ps();
56
-
57
- while (count_scalars > 0) {
58
- nk_size_t const n = count_scalars < 16 ? count_scalars : 16;
59
- __mmask16 const mask = (__mmask16)_bzhi_u32(0xFFFF, n);
60
- __m128i a_e4m3x16 = _mm_maskz_loadu_epi8(mask, a_scalars);
61
- __m128i b_e4m3x16 = _mm_maskz_loadu_epi8(mask, b_scalars);
62
-
63
- // Convert e4m3 → f16
64
- __m256h a_f16x16 = nk_e4m3x16_to_f16x16_sapphire_(a_e4m3x16);
65
- __m256h b_f16x16 = nk_e4m3x16_to_f16x16_sapphire_(b_e4m3x16);
66
-
67
- // Subtract in F16 − differences fit (max 896 < 65504)
68
- __m256h diff_f16x16 = _mm256_sub_ph(a_f16x16, b_f16x16);
69
-
70
- // Convert to F32 before squaring (896² = 802816 overflows F16!)
71
- __m512 diff_f32x16 = _mm512_cvtph_ps(_mm256_castph_si256(diff_f16x16));
72
-
73
- // Square and accumulate in F32
74
- sum_f32x16 = _mm512_fmadd_ps(diff_f32x16, diff_f32x16, sum_f32x16);
75
- a_scalars += n, b_scalars += n, count_scalars -= n;
76
- }
77
-
78
- *result = _mm512_reduce_add_ps(sum_f32x16);
79
- }
80
-
81
- NK_PUBLIC void nk_euclidean_e4m3_sapphire(nk_e4m3_t const *a_scalars, nk_e4m3_t const *b_scalars,
82
- nk_size_t count_scalars, nk_f32_t *result) {
83
- nk_sqeuclidean_e4m3_sapphire(a_scalars, b_scalars, count_scalars, result);
84
- *result = nk_f32_sqrt_haswell(*result);
85
- }
86
-
87
- NK_PUBLIC void nk_sqeuclidean_e2m3_sapphire(nk_e2m3_t const *a_scalars, nk_e2m3_t const *b_scalars,
88
- nk_size_t count_scalars, nk_f32_t *result) {
89
- __m512 sum_f32x16 = _mm512_setzero_ps();
90
-
91
- // Main loop: 4-way unrolled, 128 elements per flush
92
- while (count_scalars >= 128) {
93
- __m512h acc_f16x32 = _mm512_setzero_ph();
94
- __m512h a_f16x32, b_f16x32, diff_f16x32;
95
- // Iteration 1
96
- a_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars));
97
- b_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars));
98
- diff_f16x32 = _mm512_sub_ph(a_f16x32, b_f16x32);
99
- acc_f16x32 = _mm512_fmadd_ph(diff_f16x32, diff_f16x32, acc_f16x32);
100
- // Iteration 2
101
- a_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars + 32));
102
- b_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars + 32));
103
- diff_f16x32 = _mm512_sub_ph(a_f16x32, b_f16x32);
104
- acc_f16x32 = _mm512_fmadd_ph(diff_f16x32, diff_f16x32, acc_f16x32);
105
- // Iteration 3
106
- a_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars + 64));
107
- b_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars + 64));
108
- diff_f16x32 = _mm512_sub_ph(a_f16x32, b_f16x32);
109
- acc_f16x32 = _mm512_fmadd_ph(diff_f16x32, diff_f16x32, acc_f16x32);
110
- // Iteration 4
111
- a_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars + 96));
112
- b_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars + 96));
113
- diff_f16x32 = _mm512_sub_ph(a_f16x32, b_f16x32);
114
- acc_f16x32 = _mm512_fmadd_ph(diff_f16x32, diff_f16x32, acc_f16x32);
115
- // Flush to F32
116
- sum_f32x16 = nk_flush_f16_to_f32_sapphire_(acc_f16x32, sum_f32x16);
117
- a_scalars += 128, b_scalars += 128, count_scalars -= 128;
118
- }
119
-
120
- // Tail: remaining 0–127 elements, 32 at a time via masked loads
121
- __m512h acc_f16x32 = _mm512_setzero_ph();
122
- while (count_scalars > 0) {
123
- nk_size_t const n = count_scalars < 32 ? count_scalars : 32;
124
- __mmask32 const mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
125
- __m512h a_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_maskz_loadu_epi8(mask, a_scalars));
126
- __m512h b_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_maskz_loadu_epi8(mask, b_scalars));
127
- __m512h diff_f16x32 = _mm512_sub_ph(a_f16x32, b_f16x32);
128
- acc_f16x32 = _mm512_fmadd_ph(diff_f16x32, diff_f16x32, acc_f16x32);
129
- a_scalars += n, b_scalars += n, count_scalars -= n;
130
- }
131
- sum_f32x16 = nk_flush_f16_to_f32_sapphire_(acc_f16x32, sum_f32x16);
132
-
133
- *result = nk_reduce_add_f32x16_skylake_(sum_f32x16);
134
- }
135
-
136
- NK_PUBLIC void nk_sqeuclidean_e3m2_sapphire(nk_e3m2_t const *a_scalars, nk_e3m2_t const *b_scalars,
137
- nk_size_t count_scalars, nk_f32_t *result) {
138
- __m512 sum_f32x16 = _mm512_setzero_ps();
139
-
140
- // Main loop: 4-way unrolled, 128 elements per flush
141
- while (count_scalars >= 128) {
142
- __m512h acc_f16x32 = _mm512_setzero_ph();
143
- __m512h a_f16x32, b_f16x32, diff_f16x32;
144
- // Iteration 1
145
- a_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars));
146
- b_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars));
147
- diff_f16x32 = _mm512_sub_ph(a_f16x32, b_f16x32);
148
- acc_f16x32 = _mm512_fmadd_ph(diff_f16x32, diff_f16x32, acc_f16x32);
149
- // Iteration 2
150
- a_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars + 32));
151
- b_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars + 32));
152
- diff_f16x32 = _mm512_sub_ph(a_f16x32, b_f16x32);
153
- acc_f16x32 = _mm512_fmadd_ph(diff_f16x32, diff_f16x32, acc_f16x32);
154
- // Iteration 3
155
- a_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars + 64));
156
- b_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars + 64));
157
- diff_f16x32 = _mm512_sub_ph(a_f16x32, b_f16x32);
158
- acc_f16x32 = _mm512_fmadd_ph(diff_f16x32, diff_f16x32, acc_f16x32);
159
- // Iteration 4
160
- a_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars + 96));
161
- b_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars + 96));
162
- diff_f16x32 = _mm512_sub_ph(a_f16x32, b_f16x32);
163
- acc_f16x32 = _mm512_fmadd_ph(diff_f16x32, diff_f16x32, acc_f16x32);
164
- // Flush to F32
165
- sum_f32x16 = nk_flush_f16_to_f32_sapphire_(acc_f16x32, sum_f32x16);
166
- a_scalars += 128, b_scalars += 128, count_scalars -= 128;
167
- }
168
-
169
- // Tail: remaining 0–127 elements, 32 at a time via masked loads
170
- __m512h acc_f16x32 = _mm512_setzero_ph();
171
- while (count_scalars > 0) {
172
- nk_size_t const n = count_scalars < 32 ? count_scalars : 32;
173
- __mmask32 const mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
174
- __m512h a_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_maskz_loadu_epi8(mask, a_scalars));
175
- __m512h b_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_maskz_loadu_epi8(mask, b_scalars));
176
- __m512h diff_f16x32 = _mm512_sub_ph(a_f16x32, b_f16x32);
177
- acc_f16x32 = _mm512_fmadd_ph(diff_f16x32, diff_f16x32, acc_f16x32);
178
- a_scalars += n, b_scalars += n, count_scalars -= n;
179
- }
180
- sum_f32x16 = nk_flush_f16_to_f32_sapphire_(acc_f16x32, sum_f32x16);
181
-
182
- *result = nk_reduce_add_f32x16_skylake_(sum_f32x16);
183
- }
184
-
185
- NK_PUBLIC void nk_euclidean_e2m3_sapphire(nk_e2m3_t const *a_scalars, nk_e2m3_t const *b_scalars,
186
- nk_size_t count_scalars, nk_f32_t *result) {
187
- nk_sqeuclidean_e2m3_sapphire(a_scalars, b_scalars, count_scalars, result);
188
- *result = nk_f32_sqrt_haswell(*result);
189
- }
190
-
191
- NK_PUBLIC void nk_euclidean_e3m2_sapphire(nk_e3m2_t const *a_scalars, nk_e3m2_t const *b_scalars,
192
- nk_size_t count_scalars, nk_f32_t *result) {
193
- nk_sqeuclidean_e3m2_sapphire(a_scalars, b_scalars, count_scalars, result);
194
- *result = nk_f32_sqrt_haswell(*result);
195
- }
196
-
197
- NK_PUBLIC void nk_angular_e2m3_sapphire(nk_e2m3_t const *a_scalars, nk_e2m3_t const *b_scalars, nk_size_t count_scalars,
198
- nk_f32_t *result) {
199
- __m512 sum_dot_f32x16 = _mm512_setzero_ps();
200
- __m512 sum_a_f32x16 = _mm512_setzero_ps();
201
- __m512 sum_b_f32x16 = _mm512_setzero_ps();
202
-
203
- // Main loop: 4-way unrolled, 128 elements per flush
204
- while (count_scalars >= 128) {
205
- __m512h dot_acc = _mm512_setzero_ph();
206
- __m512h a_norm_acc = _mm512_setzero_ph();
207
- __m512h b_norm_acc = _mm512_setzero_ph();
208
- __m512h a_f16x32, b_f16x32;
209
- // Iteration 1
210
- a_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars));
211
- b_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars));
212
- dot_acc = _mm512_fmadd_ph(a_f16x32, b_f16x32, dot_acc);
213
- a_norm_acc = _mm512_fmadd_ph(a_f16x32, a_f16x32, a_norm_acc);
214
- b_norm_acc = _mm512_fmadd_ph(b_f16x32, b_f16x32, b_norm_acc);
215
- // Iteration 2
216
- a_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars + 32));
217
- b_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars + 32));
218
- dot_acc = _mm512_fmadd_ph(a_f16x32, b_f16x32, dot_acc);
219
- a_norm_acc = _mm512_fmadd_ph(a_f16x32, a_f16x32, a_norm_acc);
220
- b_norm_acc = _mm512_fmadd_ph(b_f16x32, b_f16x32, b_norm_acc);
221
- // Iteration 3
222
- a_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars + 64));
223
- b_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars + 64));
224
- dot_acc = _mm512_fmadd_ph(a_f16x32, b_f16x32, dot_acc);
225
- a_norm_acc = _mm512_fmadd_ph(a_f16x32, a_f16x32, a_norm_acc);
226
- b_norm_acc = _mm512_fmadd_ph(b_f16x32, b_f16x32, b_norm_acc);
227
- // Iteration 4
228
- a_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars + 96));
229
- b_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars + 96));
230
- dot_acc = _mm512_fmadd_ph(a_f16x32, b_f16x32, dot_acc);
231
- a_norm_acc = _mm512_fmadd_ph(a_f16x32, a_f16x32, a_norm_acc);
232
- b_norm_acc = _mm512_fmadd_ph(b_f16x32, b_f16x32, b_norm_acc);
233
- // Flush to F32
234
- sum_dot_f32x16 = nk_flush_f16_to_f32_sapphire_(dot_acc, sum_dot_f32x16);
235
- sum_a_f32x16 = nk_flush_f16_to_f32_sapphire_(a_norm_acc, sum_a_f32x16);
236
- sum_b_f32x16 = nk_flush_f16_to_f32_sapphire_(b_norm_acc, sum_b_f32x16);
237
- a_scalars += 128, b_scalars += 128, count_scalars -= 128;
238
- }
239
-
240
- // Tail: remaining 0–127 elements, 32 at a time via masked loads
241
- __m512h dot_acc = _mm512_setzero_ph();
242
- __m512h a_norm_acc = _mm512_setzero_ph();
243
- __m512h b_norm_acc = _mm512_setzero_ph();
244
- while (count_scalars > 0) {
245
- nk_size_t const n = count_scalars < 32 ? count_scalars : 32;
246
- __mmask32 const mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
247
- __m512h a_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_maskz_loadu_epi8(mask, a_scalars));
248
- __m512h b_f16x32 = nk_e2m3x32_to_f16x32_sapphire_(_mm256_maskz_loadu_epi8(mask, b_scalars));
249
- dot_acc = _mm512_fmadd_ph(a_f16x32, b_f16x32, dot_acc);
250
- a_norm_acc = _mm512_fmadd_ph(a_f16x32, a_f16x32, a_norm_acc);
251
- b_norm_acc = _mm512_fmadd_ph(b_f16x32, b_f16x32, b_norm_acc);
252
- a_scalars += n, b_scalars += n, count_scalars -= n;
253
- }
254
- sum_dot_f32x16 = nk_flush_f16_to_f32_sapphire_(dot_acc, sum_dot_f32x16);
255
- sum_a_f32x16 = nk_flush_f16_to_f32_sapphire_(a_norm_acc, sum_a_f32x16);
256
- sum_b_f32x16 = nk_flush_f16_to_f32_sapphire_(b_norm_acc, sum_b_f32x16);
257
-
258
- nk_f32_t dot_f32 = nk_reduce_add_f32x16_skylake_(sum_dot_f32x16);
259
- nk_f32_t a_norm_sq_f32 = nk_reduce_add_f32x16_skylake_(sum_a_f32x16);
260
- nk_f32_t b_norm_sq_f32 = nk_reduce_add_f32x16_skylake_(sum_b_f32x16);
261
- *result = nk_angular_normalize_f32_haswell_(dot_f32, a_norm_sq_f32, b_norm_sq_f32);
262
- }
263
-
264
- NK_PUBLIC void nk_angular_e3m2_sapphire(nk_e3m2_t const *a_scalars, nk_e3m2_t const *b_scalars, nk_size_t count_scalars,
265
- nk_f32_t *result) {
266
- __m512 sum_dot_f32x16 = _mm512_setzero_ps();
267
- __m512 sum_a_f32x16 = _mm512_setzero_ps();
268
- __m512 sum_b_f32x16 = _mm512_setzero_ps();
269
-
270
- // Main loop: 4-way unrolled, 128 elements per flush
271
- while (count_scalars >= 128) {
272
- __m512h dot_acc = _mm512_setzero_ph();
273
- __m512h a_norm_acc = _mm512_setzero_ph();
274
- __m512h b_norm_acc = _mm512_setzero_ph();
275
- __m512h a_f16x32, b_f16x32;
276
- // Iteration 1
277
- a_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars));
278
- b_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars));
279
- dot_acc = _mm512_fmadd_ph(a_f16x32, b_f16x32, dot_acc);
280
- a_norm_acc = _mm512_fmadd_ph(a_f16x32, a_f16x32, a_norm_acc);
281
- b_norm_acc = _mm512_fmadd_ph(b_f16x32, b_f16x32, b_norm_acc);
282
- // Iteration 2
283
- a_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars + 32));
284
- b_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars + 32));
285
- dot_acc = _mm512_fmadd_ph(a_f16x32, b_f16x32, dot_acc);
286
- a_norm_acc = _mm512_fmadd_ph(a_f16x32, a_f16x32, a_norm_acc);
287
- b_norm_acc = _mm512_fmadd_ph(b_f16x32, b_f16x32, b_norm_acc);
288
- // Iteration 3
289
- a_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars + 64));
290
- b_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars + 64));
291
- dot_acc = _mm512_fmadd_ph(a_f16x32, b_f16x32, dot_acc);
292
- a_norm_acc = _mm512_fmadd_ph(a_f16x32, a_f16x32, a_norm_acc);
293
- b_norm_acc = _mm512_fmadd_ph(b_f16x32, b_f16x32, b_norm_acc);
294
- // Iteration 4
295
- a_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(a_scalars + 96));
296
- b_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_loadu_epi8(b_scalars + 96));
297
- dot_acc = _mm512_fmadd_ph(a_f16x32, b_f16x32, dot_acc);
298
- a_norm_acc = _mm512_fmadd_ph(a_f16x32, a_f16x32, a_norm_acc);
299
- b_norm_acc = _mm512_fmadd_ph(b_f16x32, b_f16x32, b_norm_acc);
300
- // Flush to F32
301
- sum_dot_f32x16 = nk_flush_f16_to_f32_sapphire_(dot_acc, sum_dot_f32x16);
302
- sum_a_f32x16 = nk_flush_f16_to_f32_sapphire_(a_norm_acc, sum_a_f32x16);
303
- sum_b_f32x16 = nk_flush_f16_to_f32_sapphire_(b_norm_acc, sum_b_f32x16);
304
- a_scalars += 128, b_scalars += 128, count_scalars -= 128;
305
- }
306
-
307
- // Tail: remaining 0–127 elements, 32 at a time via masked loads
308
- __m512h dot_acc = _mm512_setzero_ph();
309
- __m512h a_norm_acc = _mm512_setzero_ph();
310
- __m512h b_norm_acc = _mm512_setzero_ph();
311
- while (count_scalars > 0) {
312
- nk_size_t const n = count_scalars < 32 ? count_scalars : 32;
313
- __mmask32 const mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
314
- __m512h a_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_maskz_loadu_epi8(mask, a_scalars));
315
- __m512h b_f16x32 = nk_e3m2x32_to_f16x32_sapphire_(_mm256_maskz_loadu_epi8(mask, b_scalars));
316
- dot_acc = _mm512_fmadd_ph(a_f16x32, b_f16x32, dot_acc);
317
- a_norm_acc = _mm512_fmadd_ph(a_f16x32, a_f16x32, a_norm_acc);
318
- b_norm_acc = _mm512_fmadd_ph(b_f16x32, b_f16x32, b_norm_acc);
319
- a_scalars += n, b_scalars += n, count_scalars -= n;
320
- }
321
- sum_dot_f32x16 = nk_flush_f16_to_f32_sapphire_(dot_acc, sum_dot_f32x16);
322
- sum_a_f32x16 = nk_flush_f16_to_f32_sapphire_(a_norm_acc, sum_a_f32x16);
323
- sum_b_f32x16 = nk_flush_f16_to_f32_sapphire_(b_norm_acc, sum_b_f32x16);
324
-
325
- nk_f32_t dot_f32 = nk_reduce_add_f32x16_skylake_(sum_dot_f32x16);
326
- nk_f32_t a_norm_sq_f32 = nk_reduce_add_f32x16_skylake_(sum_a_f32x16);
327
- nk_f32_t b_norm_sq_f32 = nk_reduce_add_f32x16_skylake_(sum_b_f32x16);
328
- *result = nk_angular_normalize_f32_haswell_(dot_f32, a_norm_sq_f32, b_norm_sq_f32);
329
- }
330
-
331
- #if defined(__clang__)
332
- #pragma clang attribute pop
333
- #elif defined(__GNUC__)
334
- #pragma GCC pop_options
335
- #endif
336
-
337
- #if defined(__cplusplus)
338
- } // extern "C"
339
- #endif
340
-
341
- #endif // NK_TARGET_SAPPHIRE
342
- #endif // NK_TARGET_X86_
343
- #endif // NK_SPATIAL_SAPPHIRE_H
@@ -1,58 +0,0 @@
1
- /**
2
- * @brief Batched Spatial Distances for NEON FP16 (Half-Precision).
3
- * @file include/numkong/spatials/neonhalf.h
4
- * @author Ash Vardanian
5
- * @date February 23, 2026
6
- *
7
- * @sa include/numkong/spatials.h
8
- */
9
- #ifndef NK_SPATIALS_NEONHALF_H
10
- #define NK_SPATIALS_NEONHALF_H
11
-
12
- #if NK_TARGET_ARM_
13
- #if NK_TARGET_NEONHALF
14
-
15
- #include "numkong/spatial/neon.h"
16
- #include "numkong/dots/neonhalf.h"
17
-
18
- #if defined(__cplusplus)
19
- extern "C" {
20
- #endif
21
-
22
- #if defined(__clang__)
23
- #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function)
24
- #elif defined(__GNUC__)
25
- #pragma GCC push_options
26
- #pragma GCC target("arch=armv8.2-a+simd+fp16")
27
- #endif
28
-
29
- nk_define_cross_normalized_packed_(angular, f16, neonhalf, f16, f16, f32, /*norm_value_type=*/f32, f32, nk_b128_vec_t,
30
- nk_dots_packed_f16_neonhalf, nk_angular_through_f32_from_dot_neon_,
31
- nk_dots_reduce_sumsq_f16_, nk_load_b128_neon_, nk_partial_load_b32x4_serial_,
32
- nk_store_b128_neon_, nk_partial_store_b32x4_serial_, 1)
33
- nk_define_cross_normalized_packed_(euclidean, f16, neonhalf, f16, f16, f32, /*norm_value_type=*/f32, f32, nk_b128_vec_t,
34
- nk_dots_packed_f16_neonhalf, nk_euclidean_through_f32_from_dot_neon_,
35
- nk_dots_reduce_sumsq_f16_, nk_load_b128_neon_, nk_partial_load_b32x4_serial_,
36
- nk_store_b128_neon_, nk_partial_store_b32x4_serial_, 1)
37
- nk_define_cross_normalized_symmetric_(angular, f16, neonhalf, f16, f32, /*norm_value_type=*/f32, f32, nk_b128_vec_t,
38
- nk_dots_symmetric_f16_neonhalf, nk_angular_through_f32_from_dot_neon_,
39
- nk_dots_reduce_sumsq_f16_, nk_load_b128_neon_, nk_partial_load_b32x4_serial_,
40
- nk_store_b128_neon_, nk_partial_store_b32x4_serial_, 1)
41
- nk_define_cross_normalized_symmetric_(euclidean, f16, neonhalf, f16, f32, /*norm_value_type=*/f32, f32, nk_b128_vec_t,
42
- nk_dots_symmetric_f16_neonhalf, nk_euclidean_through_f32_from_dot_neon_,
43
- nk_dots_reduce_sumsq_f16_, nk_load_b128_neon_, nk_partial_load_b32x4_serial_,
44
- nk_store_b128_neon_, nk_partial_store_b32x4_serial_, 1)
45
-
46
- #if defined(__clang__)
47
- #pragma clang attribute pop
48
- #elif defined(__GNUC__)
49
- #pragma GCC pop_options
50
- #endif
51
-
52
- #if defined(__cplusplus)
53
- } // extern "C"
54
- #endif
55
-
56
- #endif // NK_TARGET_NEONHALF
57
- #endif // NK_TARGET_ARM_
58
- #endif // NK_SPATIALS_NEONHALF_H