npm - numkong - Versions diffs - 7.0.0 → 7.4.2 - Mend

numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (315)

package/README.md +197 -124
package/binding.gyp +34 -484
package/c/dispatch_bf16.c +59 -1
package/c/dispatch_e2m3.c +41 -8
package/c/dispatch_e3m2.c +49 -8
package/c/dispatch_e4m3.c +51 -9
package/c/dispatch_e5m2.c +45 -1
package/c/dispatch_f16.c +79 -26
package/c/dispatch_f16c.c +5 -5
package/c/dispatch_f32.c +56 -0
package/c/dispatch_f64.c +52 -0
package/c/dispatch_i4.c +3 -0
package/c/dispatch_i8.c +62 -3
package/c/dispatch_other.c +18 -0
package/c/dispatch_u1.c +54 -9
package/c/dispatch_u4.c +3 -0
package/c/dispatch_u8.c +64 -3
package/c/numkong.c +3 -0
package/include/README.md +79 -9
package/include/numkong/attention/sapphireamx.h +278 -276
package/include/numkong/attention/sme.h +983 -977
package/include/numkong/attention.h +1 -1
package/include/numkong/capabilities.h +289 -94
package/include/numkong/cast/README.md +40 -40
package/include/numkong/cast/diamond.h +64 -0
package/include/numkong/cast/haswell.h +42 -194
package/include/numkong/cast/icelake.h +42 -37
package/include/numkong/cast/loongsonasx.h +252 -0
package/include/numkong/cast/neon.h +216 -249
package/include/numkong/cast/powervsx.h +449 -0
package/include/numkong/cast/rvv.h +223 -274
package/include/numkong/cast/sapphire.h +18 -18
package/include/numkong/cast/serial.h +1018 -944
package/include/numkong/cast/skylake.h +82 -23
package/include/numkong/cast/v128relaxed.h +462 -105
package/include/numkong/cast.h +24 -0
package/include/numkong/cast.hpp +44 -0
package/include/numkong/curved/README.md +17 -17
package/include/numkong/curved/neon.h +131 -7
package/include/numkong/curved/neonbfdot.h +6 -7
package/include/numkong/curved/rvv.h +26 -26
package/include/numkong/curved/smef64.h +186 -182
package/include/numkong/curved.h +14 -18
package/include/numkong/dot/README.md +154 -137
package/include/numkong/dot/alder.h +43 -43
package/include/numkong/dot/diamond.h +158 -0
package/include/numkong/dot/genoa.h +4 -30
package/include/numkong/dot/haswell.h +215 -180
package/include/numkong/dot/icelake.h +190 -76
package/include/numkong/dot/loongsonasx.h +671 -0
package/include/numkong/dot/neon.h +124 -73
package/include/numkong/dot/neonbfdot.h +11 -12
package/include/numkong/dot/neonfhm.h +44 -46
package/include/numkong/dot/neonfp8.h +323 -0
package/include/numkong/dot/neonsdot.h +190 -76
package/include/numkong/dot/powervsx.h +752 -0
package/include/numkong/dot/rvv.h +92 -84
package/include/numkong/dot/rvvbf16.h +12 -12
package/include/numkong/dot/rvvhalf.h +12 -12
package/include/numkong/dot/sapphire.h +4 -4
package/include/numkong/dot/serial.h +66 -30
package/include/numkong/dot/sierra.h +31 -31
package/include/numkong/dot/skylake.h +142 -110
package/include/numkong/dot/sve.h +217 -177
package/include/numkong/dot/svebfdot.h +10 -10
package/include/numkong/dot/svehalf.h +85 -41
package/include/numkong/dot/svesdot.h +89 -0
package/include/numkong/dot/v128relaxed.h +124 -89
package/include/numkong/dot.h +114 -48
package/include/numkong/dots/README.md +203 -203
package/include/numkong/dots/alder.h +12 -9
package/include/numkong/dots/diamond.h +86 -0
package/include/numkong/dots/genoa.h +10 -4
package/include/numkong/dots/haswell.h +63 -48
package/include/numkong/dots/icelake.h +27 -18
package/include/numkong/dots/loongsonasx.h +176 -0
package/include/numkong/dots/neon.h +14 -11
package/include/numkong/dots/neonbfdot.h +4 -3
package/include/numkong/dots/neonfhm.h +11 -9
package/include/numkong/dots/neonfp8.h +99 -0
package/include/numkong/dots/neonsdot.h +48 -12
package/include/numkong/dots/powervsx.h +194 -0
package/include/numkong/dots/rvv.h +451 -344
package/include/numkong/dots/sapphireamx.h +1028 -984
package/include/numkong/dots/serial.h +213 -197
package/include/numkong/dots/sierra.h +10 -7
package/include/numkong/dots/skylake.h +47 -36
package/include/numkong/dots/sme.h +2001 -2364
package/include/numkong/dots/smebi32.h +175 -162
package/include/numkong/dots/smef64.h +328 -323
package/include/numkong/dots/v128relaxed.h +64 -41
package/include/numkong/dots.h +573 -293
package/include/numkong/dots.hpp +45 -43
package/include/numkong/each/README.md +133 -137
package/include/numkong/each/haswell.h +6 -6
package/include/numkong/each/icelake.h +7 -7
package/include/numkong/each/neon.h +76 -42
package/include/numkong/each/neonbfdot.h +11 -12
package/include/numkong/each/neonhalf.h +24 -116
package/include/numkong/each/rvv.h +28 -28
package/include/numkong/each/sapphire.h +27 -161
package/include/numkong/each/serial.h +6 -6
package/include/numkong/each/skylake.h +7 -7
package/include/numkong/each/v128relaxed.h +562 -0
package/include/numkong/each.h +148 -62
package/include/numkong/each.hpp +2 -2
package/include/numkong/geospatial/README.md +18 -18
package/include/numkong/geospatial/haswell.h +365 -325
package/include/numkong/geospatial/neon.h +350 -306
package/include/numkong/geospatial/rvv.h +4 -4
package/include/numkong/geospatial/skylake.h +376 -340
package/include/numkong/geospatial/v128relaxed.h +366 -327
package/include/numkong/geospatial.h +17 -17
package/include/numkong/matrix.hpp +4 -4
package/include/numkong/maxsim/README.md +14 -14
package/include/numkong/maxsim/alder.h +6 -6
package/include/numkong/maxsim/genoa.h +4 -4
package/include/numkong/maxsim/haswell.h +6 -6
package/include/numkong/maxsim/icelake.h +18 -18
package/include/numkong/maxsim/neonsdot.h +21 -21
package/include/numkong/maxsim/sapphireamx.h +14 -14
package/include/numkong/maxsim/serial.h +6 -6
package/include/numkong/maxsim/sme.h +221 -196
package/include/numkong/maxsim/v128relaxed.h +6 -6
package/include/numkong/mesh/README.md +62 -56
package/include/numkong/mesh/haswell.h +339 -464
package/include/numkong/mesh/neon.h +1100 -519
package/include/numkong/mesh/neonbfdot.h +36 -68
package/include/numkong/mesh/rvv.h +530 -435
package/include/numkong/mesh/serial.h +75 -91
package/include/numkong/mesh/skylake.h +1627 -302
package/include/numkong/mesh/v128relaxed.h +443 -330
package/include/numkong/mesh.h +63 -49
package/include/numkong/mesh.hpp +4 -4
package/include/numkong/numkong.h +3 -3
package/include/numkong/numkong.hpp +1 -0
package/include/numkong/probability/README.md +23 -19
package/include/numkong/probability/neon.h +82 -52
package/include/numkong/probability/rvv.h +28 -23
package/include/numkong/probability/serial.h +51 -39
package/include/numkong/probability.h +20 -23
package/include/numkong/random.h +1 -1
package/include/numkong/reduce/README.md +143 -138
package/include/numkong/reduce/alder.h +81 -77
package/include/numkong/reduce/haswell.h +222 -220
package/include/numkong/reduce/neon.h +629 -519
package/include/numkong/reduce/neonbfdot.h +7 -218
package/include/numkong/reduce/neonfhm.h +9 -381
package/include/numkong/reduce/neonsdot.h +9 -9
package/include/numkong/reduce/rvv.h +928 -802
package/include/numkong/reduce/serial.h +23 -27
package/include/numkong/reduce/sierra.h +20 -20
package/include/numkong/reduce/skylake.h +326 -324
package/include/numkong/reduce/v128relaxed.h +52 -52
package/include/numkong/reduce.h +4 -23
package/include/numkong/reduce.hpp +156 -11
package/include/numkong/scalar/README.md +6 -6
package/include/numkong/scalar/haswell.h +26 -17
package/include/numkong/scalar/loongsonasx.h +74 -0
package/include/numkong/scalar/neon.h +9 -9
package/include/numkong/scalar/powervsx.h +96 -0
package/include/numkong/scalar/rvv.h +2 -2
package/include/numkong/scalar/sapphire.h +21 -10
package/include/numkong/scalar/serial.h +21 -21
package/include/numkong/scalar.h +13 -0
package/include/numkong/set/README.md +28 -28
package/include/numkong/set/haswell.h +12 -12
package/include/numkong/set/icelake.h +14 -14
package/include/numkong/set/loongsonasx.h +181 -0
package/include/numkong/set/neon.h +17 -18
package/include/numkong/set/powervsx.h +326 -0
package/include/numkong/set/rvv.h +4 -4
package/include/numkong/set/serial.h +6 -6
package/include/numkong/set/sve.h +60 -59
package/include/numkong/set/v128relaxed.h +6 -6
package/include/numkong/set.h +21 -7
package/include/numkong/sets/README.md +26 -26
package/include/numkong/sets/loongsonasx.h +52 -0
package/include/numkong/sets/powervsx.h +65 -0
package/include/numkong/sets/smebi32.h +395 -364
package/include/numkong/sets.h +83 -40
package/include/numkong/sparse/README.md +4 -4
package/include/numkong/sparse/icelake.h +101 -101
package/include/numkong/sparse/serial.h +1 -1
package/include/numkong/sparse/sve2.h +137 -141
package/include/numkong/sparse/turin.h +12 -12
package/include/numkong/sparse.h +10 -10
package/include/numkong/spatial/README.md +230 -226
package/include/numkong/spatial/alder.h +113 -116
package/include/numkong/spatial/diamond.h +240 -0
package/include/numkong/spatial/genoa.h +0 -68
package/include/numkong/spatial/haswell.h +74 -55
package/include/numkong/spatial/icelake.h +539 -58
package/include/numkong/spatial/loongsonasx.h +483 -0
package/include/numkong/spatial/neon.h +125 -52
package/include/numkong/spatial/neonbfdot.h +8 -9
package/include/numkong/spatial/neonfp8.h +258 -0
package/include/numkong/spatial/neonsdot.h +180 -12
package/include/numkong/spatial/powervsx.h +738 -0
package/include/numkong/spatial/rvv.h +146 -139
package/include/numkong/spatial/rvvbf16.h +17 -12
package/include/numkong/spatial/rvvhalf.h +13 -10
package/include/numkong/spatial/serial.h +13 -12
package/include/numkong/spatial/sierra.h +232 -39
package/include/numkong/spatial/skylake.h +73 -74
package/include/numkong/spatial/sve.h +93 -72
package/include/numkong/spatial/svebfdot.h +29 -29
package/include/numkong/spatial/svehalf.h +52 -26
package/include/numkong/spatial/svesdot.h +142 -0
package/include/numkong/spatial/v128relaxed.h +293 -41
package/include/numkong/spatial.h +338 -82
package/include/numkong/spatials/README.md +194 -194
package/include/numkong/spatials/diamond.h +82 -0
package/include/numkong/spatials/haswell.h +2 -2
package/include/numkong/spatials/loongsonasx.h +153 -0
package/include/numkong/spatials/neonfp8.h +111 -0
package/include/numkong/spatials/neonsdot.h +34 -0
package/include/numkong/spatials/powervsx.h +153 -0
package/include/numkong/spatials/rvv.h +259 -243
package/include/numkong/spatials/sapphireamx.h +173 -173
package/include/numkong/spatials/serial.h +2 -2
package/include/numkong/spatials/skylake.h +2 -2
package/include/numkong/spatials/sme.h +590 -605
package/include/numkong/spatials/smef64.h +139 -130
package/include/numkong/spatials/v128relaxed.h +2 -2
package/include/numkong/spatials.h +820 -500
package/include/numkong/spatials.hpp +49 -48
package/include/numkong/tensor.hpp +406 -17
package/include/numkong/trigonometry/README.md +19 -19
package/include/numkong/trigonometry/haswell.h +402 -401
package/include/numkong/trigonometry/neon.h +386 -387
package/include/numkong/trigonometry/rvv.h +52 -51
package/include/numkong/trigonometry/serial.h +13 -13
package/include/numkong/trigonometry/skylake.h +373 -369
package/include/numkong/trigonometry/v128relaxed.h +375 -374
package/include/numkong/trigonometry.h +13 -13
package/include/numkong/trigonometry.hpp +2 -2
package/include/numkong/types.h +287 -49
package/include/numkong/types.hpp +436 -12
package/include/numkong/vector.hpp +82 -14
package/javascript/dist/cjs/numkong-wasm.js +6 -12
package/javascript/dist/cjs/numkong.d.ts +7 -1
package/javascript/dist/cjs/numkong.js +37 -11
package/javascript/dist/cjs/types.d.ts +9 -0
package/javascript/dist/cjs/types.js +96 -0
package/javascript/dist/esm/numkong-browser.d.ts +14 -0
package/javascript/dist/esm/numkong-browser.js +23 -0
package/javascript/dist/esm/numkong-wasm.js +6 -12
package/javascript/dist/esm/numkong.d.ts +7 -1
package/javascript/dist/esm/numkong.js +37 -11
package/javascript/dist/esm/types.d.ts +9 -0
package/javascript/dist/esm/types.js +96 -0
package/javascript/node-gyp-build.d.ts +4 -1
package/javascript/numkong-browser.ts +40 -0
package/javascript/numkong-wasm.ts +7 -13
package/javascript/numkong.c +5 -26
package/javascript/numkong.ts +36 -11
package/javascript/tsconfig-base.json +1 -0
package/javascript/tsconfig-cjs.json +6 -1
package/javascript/types.ts +110 -0
package/numkong.gypi +101 -0
package/package.json +34 -13
package/probes/arm_neon.c +8 -0
package/probes/arm_neon_bfdot.c +9 -0
package/probes/arm_neon_fhm.c +9 -0
package/probes/arm_neon_half.c +8 -0
package/probes/arm_neon_sdot.c +9 -0
package/probes/arm_neonfp8.c +9 -0
package/probes/arm_sme.c +16 -0
package/probes/arm_sme2.c +16 -0
package/probes/arm_sme2p1.c +16 -0
package/probes/arm_sme_bf16.c +16 -0
package/probes/arm_sme_bi32.c +16 -0
package/probes/arm_sme_f64.c +16 -0
package/probes/arm_sme_fa64.c +14 -0
package/probes/arm_sme_half.c +16 -0
package/probes/arm_sme_lut2.c +15 -0
package/probes/arm_sve.c +18 -0
package/probes/arm_sve2.c +20 -0
package/probes/arm_sve2p1.c +18 -0
package/probes/arm_sve_bfdot.c +20 -0
package/probes/arm_sve_half.c +18 -0
package/probes/arm_sve_sdot.c +21 -0
package/probes/loongarch_lasx.c +12 -0
package/probes/power_vsx.c +12 -0
package/probes/probe.js +127 -0
package/probes/riscv_rvv.c +14 -0
package/probes/riscv_rvv_bb.c +15 -0
package/probes/riscv_rvv_bf16.c +17 -0
package/probes/riscv_rvv_half.c +14 -0
package/probes/wasm_v128relaxed.c +11 -0
package/probes/x86_alder.c +17 -0
package/probes/x86_diamond.c +17 -0
package/probes/x86_genoa.c +17 -0
package/probes/x86_graniteamx.c +19 -0
package/probes/x86_haswell.c +11 -0
package/probes/x86_icelake.c +17 -0
package/probes/x86_sapphire.c +16 -0
package/probes/x86_sapphireamx.c +18 -0
package/probes/x86_sierra.c +17 -0
package/probes/x86_skylake.c +15 -0
package/probes/x86_turin.c +17 -0
package/wasm/numkong-emscripten.js +2 -0
package/wasm/numkong.d.ts +14 -0
package/wasm/numkong.js +1124 -0
package/wasm/numkong.wasm +0 -0
package/include/numkong/curved/neonhalf.h +0 -212
package/include/numkong/dot/neonhalf.h +0 -198
package/include/numkong/dots/neonhalf.h +0 -57
package/include/numkong/mesh/neonhalf.h +0 -616
package/include/numkong/reduce/neonhalf.h +0 -157
package/include/numkong/spatial/neonhalf.h +0 -118
package/include/numkong/spatial/sapphire.h +0 -343
package/include/numkong/spatials/neonhalf.h +0 -58
package/javascript/README.md +0 -246

package/include/numkong/dot/neon.h (changed)

@@ -10,15 +10,18 @@
  *
  *  Key NEON instructions for dot products:
  *
- *      Intrinsic         Instruction                   Latency     Throughput
- *                                                                  A76     M4+/V1+/Oryon
- *      vfmaq_f32         FMLA (V.4S, V.4S, V.4S)       4cy         2/cy    4/cy
- *      vfmaq_f64         FMLA (V.2D, V.2D, V.2D)       4cy         2/cy    4/cy
- *      vmulq_f32         FMUL (V.4S, V.4S, V.4S)       3cy         2/cy    4/cy
- *      vaddvq_f32        FADDP+FADDP (reduce)          5cy         1/cy    1/cy
- *      vaddvq_f64        FADDP (V.2D to scalar)        3cy         1/cy    1/cy
- *      vcvt_f64_f32      FCVTL (V.2D, V.2S)            3cy         2/cy    2/cy
- *      vld2_f32          LD2 ({Vt.2S, Vt2.2S}, [Xn])   4cy         1/cy    1/cy
+ *      Intrinsic     Instruction                  A76       M5
+ *      vfmaq_f32     FMLA (V.4S, V.4S, V.4S)      4cy @ 2p  3cy @ 4p
+ *      vfmaq_f64     FMLA (V.2D, V.2D, V.2D)      4cy @ 2p  4cy @ 4p
+ *      vfmsq_f64     FMLS (V.2D, V.2D, V.2D)      4cy @ 2p  4cy @ 4p
+ *      vmulq_f32     FMUL (V.4S, V.4S, V.4S)      3cy @ 2p  3cy @ 4p
+ *      vmulq_f64     FMUL (V.2D, V.2D, V.2D)      3cy @ 2p  3cy @ 4p
+ *      vaddvq_f32    FADDP+FADDP (reduce)         5cy @ 1p  8cy @ 1p
+ *      vaddvq_f64    FADDP (V.2D to scalar)       3cy @ 1p  3cy @ 1p
+ *      vpaddq_f32    FADDP (V.4S, V.4S, V.4S)     2cy @ 2p  3cy @ 4p
+ *      vpaddq_f64    FADDP (V.2D, V.2D, V.2D)     2cy @ 2p  3cy @ 4p
+ *      vcvt_f64_f32  FCVTL (V.2D, V.2S)           3cy @ 2p  3cy @ 2p
+ *      vld2_f32      LD2 ({Vt.2S, Vt2.2S}, [Xn])  4cy @ 1p  4cy @ 1p
  *
  *  FMA throughput doubles on cores with 4 SIMD pipes (Apple M4+, Graviton3+, Oryon), but
  *  horizontal reductions remain at 1/cy on all cores and become the main bottleneck.
@@ -118,21 +121,25 @@ NK_INTERNAL nk_f64_t nk_dot_stable_sum_f64x2_neon_(float64x2_t sum_f64x2, float6
     return tentative_sum + (lower_error + upper_error + rounding_error);
 }
-#pragma region - Traditional Floats
+#pragma region F32 and F64 Floats
 NK_PUBLIC void nk_dot_f32_neon(nk_f32_t const *a_scalars, nk_f32_t const *b_scalars, nk_size_t count_scalars,
                                nk_f64_t *result) {
-    // Upcast f32 to f64 for accumulation (2 f32s per iteration, avoids slow vget_low/high)
-    float64x2_t sum_f64x2 = vdupq_n_f64(0);
+    // Upcast f32 to f64 via FCVTL/FCVTL2, two independent FMA chains for ILP
+    float64x2_t sum_low_f64x2 = vdupq_n_f64(0);
+    float64x2_t sum_high_f64x2 = vdupq_n_f64(0);
     nk_size_t idx_scalars = 0;
-    for (; idx_scalars + 2 <= count_scalars; idx_scalars += 2) {
-        float32x2_t a_f32x2 = vld1_f32(a_scalars + idx_scalars);
-        float32x2_t b_f32x2 = vld1_f32(b_scalars + idx_scalars);
-        float64x2_t a_f64x2 = vcvt_f64_f32(a_f32x2);
-        float64x2_t b_f64x2 = vcvt_f64_f32(b_f32x2);
-        sum_f64x2 = vfmaq_f64(sum_f64x2, a_f64x2, b_f64x2);
+    for (; idx_scalars + 4 <= count_scalars; idx_scalars += 4) {
+        float32x4_t a_f32x4 = vld1q_f32(a_scalars + idx_scalars);
+        float32x4_t b_f32x4 = vld1q_f32(b_scalars + idx_scalars);
+        float64x2_t a_low_f64x2 = vcvt_f64_f32(vget_low_f32(a_f32x4));
+        float64x2_t a_high_f64x2 = vcvt_high_f64_f32(a_f32x4);
+        float64x2_t b_low_f64x2 = vcvt_f64_f32(vget_low_f32(b_f32x4));
+        float64x2_t b_high_f64x2 = vcvt_high_f64_f32(b_f32x4);
+        sum_low_f64x2 = vfmaq_f64(sum_low_f64x2, a_low_f64x2, b_low_f64x2);
+        sum_high_f64x2 = vfmaq_f64(sum_high_f64x2, a_high_f64x2, b_high_f64x2);
     }
-    nk_f64_t sum_f64 = vaddvq_f64(sum_f64x2);
+    nk_f64_t sum_f64 = vaddvq_f64(vaddq_f64(sum_low_f64x2, sum_high_f64x2));
     for (; idx_scalars < count_scalars; ++idx_scalars)
         sum_f64 += (nk_f64_t)a_scalars[idx_scalars] * (nk_f64_t)b_scalars[idx_scalars];
     *result = sum_f64;
@@ -243,10 +250,10 @@ NK_INTERNAL void nk_dot_f32x2_finalize_neon(
     nk_dot_f32x2_state_neon_t const *state_c, nk_dot_f32x2_state_neon_t const *state_d, //
     nk_size_t total_dimensions, nk_b256_vec_t *result) {
     nk_unused_(total_dimensions);
-    result->f64s[0] = vaddvq_f64(state_a->sum_f64x2);
-    result->f64s[1] = vaddvq_f64(state_b->sum_f64x2);
-    result->f64s[2] = vaddvq_f64(state_c->sum_f64x2);
-    result->f64s[3] = vaddvq_f64(state_d->sum_f64x2);
+    float64x2_t ab_f64x2 = vpaddq_f64(state_a->sum_f64x2, state_b->sum_f64x2);
+    float64x2_t cd_f64x2 = vpaddq_f64(state_c->sum_f64x2, state_d->sum_f64x2);
+    vst1q_f64(&result->f64s[0], ab_f64x2);
+    vst1q_f64(&result->f64s[2], cd_f64x2);
 }
 NK_PUBLIC void nk_dot_f64_neon(nk_f64_t const *a_scalars, nk_f64_t const *b_scalars, nk_size_t count_scalars,
@@ -302,11 +309,11 @@ nk_dot_f64c_neon_cycle:
         nk_b128_vec_t a_tail, b_tail;
         nk_partial_load_b64x2_serial_(a_pairs, &a_tail, count_pairs * 2);
         nk_partial_load_b64x2_serial_(b_pairs, &b_tail, count_pairs * 2);
-        float64x2_t zeros = vdupq_n_f64(0);
-        a_real_f64x2 = vzip1q_f64(a_tail.f64x2, zeros);
-        a_imag_f64x2 = vzip2q_f64(a_tail.f64x2, zeros);
-        b_real_f64x2 = vzip1q_f64(b_tail.f64x2, zeros);
-        b_imag_f64x2 = vzip2q_f64(b_tail.f64x2, zeros);
+        float64x2_t zeros_f64x2 = vdupq_n_f64(0);
+        a_real_f64x2 = vzip1q_f64(a_tail.f64x2, zeros_f64x2);
+        a_imag_f64x2 = vzip2q_f64(a_tail.f64x2, zeros_f64x2);
+        b_real_f64x2 = vzip1q_f64(b_tail.f64x2, zeros_f64x2);
+        b_imag_f64x2 = vzip2q_f64(b_tail.f64x2, zeros_f64x2);
         count_pairs = 0;
     }
     else {
@@ -385,11 +392,11 @@ nk_vdot_f64c_neon_cycle:
         nk_b128_vec_t a_tail, b_tail;
         nk_partial_load_b64x2_serial_(a_pairs, &a_tail, count_pairs * 2);
         nk_partial_load_b64x2_serial_(b_pairs, &b_tail, count_pairs * 2);
-        float64x2_t zeros = vdupq_n_f64(0);
-        a_real_f64x2 = vzip1q_f64(a_tail.f64x2, zeros);
-        a_imag_f64x2 = vzip2q_f64(a_tail.f64x2, zeros);
-        b_real_f64x2 = vzip1q_f64(b_tail.f64x2, zeros);
-        b_imag_f64x2 = vzip2q_f64(b_tail.f64x2, zeros);
+        float64x2_t zeros_f64x2 = vdupq_n_f64(0);
+        a_real_f64x2 = vzip1q_f64(a_tail.f64x2, zeros_f64x2);
+        a_imag_f64x2 = vzip2q_f64(a_tail.f64x2, zeros_f64x2);
+        b_real_f64x2 = vzip1q_f64(b_tail.f64x2, zeros_f64x2);
+        b_imag_f64x2 = vzip2q_f64(b_tail.f64x2, zeros_f64x2);
         count_pairs = 0;
     }
     else {
@@ -505,9 +512,9 @@ NK_INTERNAL void nk_dot_f64x2_finalize_neon(
     result->f64s[3] = nk_dot_stable_sum_f64x2_neon_(state_d->sum_f64x2, state_d->compensation_f64x2);
 }
-#pragma endregion - Traditional Floats
+#pragma endregion F32 and F64 Floats
-#pragma region - Smaller Floats
+#pragma region F16 and BF16 Floats
 NK_PUBLIC void nk_dot_bf16_neon(nk_bf16_t const *a_scalars, nk_bf16_t const *b_scalars, nk_size_t count_scalars,
                                 nk_f32_t *result) {
@@ -528,9 +535,9 @@ nk_dot_bf16_neon_cycle:
         a_scalars += 8, b_scalars += 8, count_scalars -= 8;
     }
     float32x4_t a_low_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(a_u16x8), 16));
-    float32x4_t a_high_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(a_u16x8), 16));
+    float32x4_t a_high_f32x4 = vreinterpretq_f32_u32(vshll_high_n_u16(a_u16x8, 16));
     float32x4_t b_low_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(b_u16x8), 16));
-    float32x4_t b_high_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(b_u16x8), 16));
+    float32x4_t b_high_f32x4 = vreinterpretq_f32_u32(vshll_high_n_u16(b_u16x8, 16));
     sum_f32x4 = vfmaq_f32(sum_f32x4, a_low_f32x4, b_low_f32x4);
     sum_f32x4 = vfmaq_f32(sum_f32x4, a_high_f32x4, b_high_f32x4);
     if (count_scalars) goto nk_dot_bf16_neon_cycle;
@@ -555,9 +562,9 @@ NK_INTERNAL void nk_dot_bf16x8_update_neon(nk_dot_bf16x8_state_neon_t *state, nk
     nk_unused_(active_dimensions);
     // Convert bf16 to f32 via USHLL shift-16 (low and high halves)
     float32x4_t a_low_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(a.u16x8), 16));
-    float32x4_t a_high_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(a.u16x8), 16));
+    float32x4_t a_high_f32x4 = vreinterpretq_f32_u32(vshll_high_n_u16(a.u16x8, 16));
     float32x4_t b_low_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(b.u16x8), 16));
-    float32x4_t b_high_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(b.u16x8), 16));
+    float32x4_t b_high_f32x4 = vreinterpretq_f32_u32(vshll_high_n_u16(b.u16x8, 16));
     state->sum_f32x4 = vfmaq_f32(state->sum_f32x4, a_low_f32x4, b_low_f32x4);
     state->sum_f32x4 = vfmaq_f32(state->sum_f32x4, a_high_f32x4, b_high_f32x4);
 }
@@ -567,10 +574,9 @@ NK_INTERNAL void nk_dot_bf16x8_finalize_neon(
     nk_dot_bf16x8_state_neon_t const *state_c, nk_dot_bf16x8_state_neon_t const *state_d, //
     nk_size_t total_dimensions, nk_b128_vec_t *result) {
     nk_unused_(total_dimensions);
-    result->f32s[0] = vaddvq_f32(state_a->sum_f32x4);
-    result->f32s[1] = vaddvq_f32(state_b->sum_f32x4);
-    result->f32s[2] = vaddvq_f32(state_c->sum_f32x4);
-    result->f32s[3] = vaddvq_f32(state_d->sum_f32x4);
+    float32x4_t ab_f32x4 = vpaddq_f32(state_a->sum_f32x4, state_b->sum_f32x4);
+    float32x4_t cd_f32x4 = vpaddq_f32(state_c->sum_f32x4, state_d->sum_f32x4);
+    result->f32x4 = vpaddq_f32(ab_f32x4, cd_f32x4);
 }
 NK_PUBLIC void nk_dot_f16_neon(nk_f16_t const *a_scalars, nk_f16_t const *b_scalars, nk_size_t count_scalars,
@@ -591,10 +597,12 @@ nk_dot_f16_neon_cycle:
         b_u16x8 = vld1q_u16((nk_u16_t const *)b_scalars);
         a_scalars += 8, b_scalars += 8, count_scalars -= 8;
     }
-    float32x4_t a_low_f32x4 = nk_f16x4_to_f32x4_neon_(vget_low_u16(a_u16x8));
-    float32x4_t a_high_f32x4 = nk_f16x4_to_f32x4_neon_(vget_high_u16(a_u16x8));
-    float32x4_t b_low_f32x4 = nk_f16x4_to_f32x4_neon_(vget_low_u16(b_u16x8));
-    float32x4_t b_high_f32x4 = nk_f16x4_to_f32x4_neon_(vget_high_u16(b_u16x8));
+    float16x8_t a_f16x8 = vreinterpretq_f16_u16(a_u16x8);
+    float16x8_t b_f16x8 = vreinterpretq_f16_u16(b_u16x8);
+    float32x4_t a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
+    float32x4_t a_high_f32x4 = vcvt_high_f32_f16(a_f16x8);
+    float32x4_t b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
+    float32x4_t b_high_f32x4 = vcvt_high_f32_f16(b_f16x8);
     sum_f32x4 = vfmaq_f32(sum_f32x4, a_low_f32x4, b_low_f32x4);
     sum_f32x4 = vfmaq_f32(sum_f32x4, a_high_f32x4, b_high_f32x4);
     if (count_scalars) goto nk_dot_f16_neon_cycle;
@@ -604,8 +612,8 @@ nk_dot_f16_neon_cycle:
 /**
  *  @brief Running state for 128-bit dot accumulation over f16 scalars on plain NEON.
  *
- *  Processes 8 f16 values at a time (128 bits), converting to f32 via integer bit
- *  manipulation for accumulation without requiring the ARMv8.2-A FP16 extension.
+ *  Processes 8 f16 values at a time (128 bits), converting to f32 via FCVTL
+ *  for accumulation without requiring the ARMv8.2-A FP16 arithmetic extension.
  */
 typedef struct nk_dot_f16x8_state_neon_t {
     float32x4_t sum_f32x4;
@@ -617,11 +625,13 @@ NK_INTERNAL void nk_dot_f16x8_update_neon(nk_dot_f16x8_state_neon_t *state, nk_b
                                           nk_size_t depth_offset, nk_size_t active_dimensions) {
     nk_unused_(depth_offset);
     nk_unused_(active_dimensions);
-    // Convert f16 to f32 via integer bit manipulation (low and high halves)
-    float32x4_t a_low_f32x4 = nk_f16x4_to_f32x4_neon_(vget_low_u16(a.u16x8));
-    float32x4_t a_high_f32x4 = nk_f16x4_to_f32x4_neon_(vget_high_u16(a.u16x8));
-    float32x4_t b_low_f32x4 = nk_f16x4_to_f32x4_neon_(vget_low_u16(b.u16x8));
-    float32x4_t b_high_f32x4 = nk_f16x4_to_f32x4_neon_(vget_high_u16(b.u16x8));
+    // Convert f16 to f32 via FCVTL / FCVTL2 (low and high halves)
+    float16x8_t a_f16x8 = vreinterpretq_f16_u16(a.u16x8);
+    float16x8_t b_f16x8 = vreinterpretq_f16_u16(b.u16x8);
+    float32x4_t a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
+    float32x4_t a_high_f32x4 = vcvt_high_f32_f16(a_f16x8);
+    float32x4_t b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
+    float32x4_t b_high_f32x4 = vcvt_high_f32_f16(b_f16x8);
     state->sum_f32x4 = vfmaq_f32(state->sum_f32x4, a_low_f32x4, b_low_f32x4);
     state->sum_f32x4 = vfmaq_f32(state->sum_f32x4, a_high_f32x4, b_high_f32x4);
 }
@@ -631,10 +641,9 @@ NK_INTERNAL void nk_dot_f16x8_finalize_neon(
     nk_dot_f16x8_state_neon_t const *state_c, nk_dot_f16x8_state_neon_t const *state_d, //
     nk_size_t total_dimensions, nk_b128_vec_t *result) {
     nk_unused_(total_dimensions);
-    result->f32s[0] = vaddvq_f32(state_a->sum_f32x4);
-    result->f32s[1] = vaddvq_f32(state_b->sum_f32x4);
-    result->f32s[2] = vaddvq_f32(state_c->sum_f32x4);
-    result->f32s[3] = vaddvq_f32(state_d->sum_f32x4);
+    float32x4_t ab_f32x4 = vpaddq_f32(state_a->sum_f32x4, state_b->sum_f32x4);
+    float32x4_t cd_f32x4 = vpaddq_f32(state_c->sum_f32x4, state_d->sum_f32x4);
+    result->f32x4 = vpaddq_f32(ab_f32x4, cd_f32x4);
 }
 NK_PUBLIC void nk_dot_e4m3_neon(nk_e4m3_t const *a_scalars, nk_e4m3_t const *b_scalars, nk_size_t count_scalars,
@@ -656,9 +665,9 @@ nk_dot_e4m3_neon_cycle:
         a_scalars += 8, b_scalars += 8, count_scalars -= 8;
     }
     float32x4_t a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
-    float32x4_t a_high_f32x4 = vcvt_f32_f16(vget_high_f16(a_f16x8));
+    float32x4_t a_high_f32x4 = vcvt_high_f32_f16(a_f16x8);
     float32x4_t b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
-    float32x4_t b_high_f32x4 = vcvt_f32_f16(vget_high_f16(b_f16x8));
+    float32x4_t b_high_f32x4 = vcvt_high_f32_f16(b_f16x8);
     sum_f32x4 = vfmaq_f32(sum_f32x4, a_low_f32x4, b_low_f32x4);
     sum_f32x4 = vfmaq_f32(sum_f32x4, a_high_f32x4, b_high_f32x4);
     if (count_scalars) goto nk_dot_e4m3_neon_cycle;
@@ -684,9 +693,9 @@ nk_dot_e5m2_neon_cycle:
         a_scalars += 8, b_scalars += 8, count_scalars -= 8;
     }
     float32x4_t a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
-    float32x4_t a_high_f32x4 = vcvt_f32_f16(vget_high_f16(a_f16x8));
+    float32x4_t a_high_f32x4 = vcvt_high_f32_f16(a_f16x8);
     float32x4_t b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
-    float32x4_t b_high_f32x4 = vcvt_f32_f16(vget_high_f16(b_f16x8));
+    float32x4_t b_high_f32x4 = vcvt_high_f32_f16(b_f16x8);
     sum_f32x4 = vfmaq_f32(sum_f32x4, a_low_f32x4, b_low_f32x4);
     sum_f32x4 = vfmaq_f32(sum_f32x4, a_high_f32x4, b_high_f32x4);
     if (count_scalars) goto nk_dot_e5m2_neon_cycle;
@@ -713,12 +722,10 @@ nk_dot_e2m3_neon_cycle:
         a_scalars += 16, b_scalars += 16, count_scalars -= 16;
     }
     sum_f32x4 = vfmaq_f32(sum_f32x4, vcvt_f32_f16(vget_low_f16(a_low_f16x8)), vcvt_f32_f16(vget_low_f16(b_low_f16x8)));
-    sum_f32x4 = vfmaq_f32(sum_f32x4, vcvt_f32_f16(vget_high_f16(a_low_f16x8)),
-                          vcvt_f32_f16(vget_high_f16(b_low_f16x8)));
+    sum_f32x4 = vfmaq_f32(sum_f32x4, vcvt_high_f32_f16(a_low_f16x8), vcvt_high_f32_f16(b_low_f16x8));
     sum_f32x4 = vfmaq_f32(sum_f32x4, vcvt_f32_f16(vget_low_f16(a_high_f16x8)),
                           vcvt_f32_f16(vget_low_f16(b_high_f16x8)));
-    sum_f32x4 = vfmaq_f32(sum_f32x4, vcvt_f32_f16(vget_high_f16(a_high_f16x8)),
-                          vcvt_f32_f16(vget_high_f16(b_high_f16x8)));
+    sum_f32x4 = vfmaq_f32(sum_f32x4, vcvt_high_f32_f16(a_high_f16x8), vcvt_high_f32_f16(b_high_f16x8));
     if (count_scalars) goto nk_dot_e2m3_neon_cycle;
     *result = vaddvq_f32(sum_f32x4);
 }
@@ -743,19 +750,17 @@ nk_dot_e3m2_neon_cycle:
         a_scalars += 16, b_scalars += 16, count_scalars -= 16;
     }
     sum_f32x4 = vfmaq_f32(sum_f32x4, vcvt_f32_f16(vget_low_f16(a_low_f16x8)), vcvt_f32_f16(vget_low_f16(b_low_f16x8)));
-    sum_f32x4 = vfmaq_f32(sum_f32x4, vcvt_f32_f16(vget_high_f16(a_low_f16x8)),
-                          vcvt_f32_f16(vget_high_f16(b_low_f16x8)));
+    sum_f32x4 = vfmaq_f32(sum_f32x4, vcvt_high_f32_f16(a_low_f16x8), vcvt_high_f32_f16(b_low_f16x8));
     sum_f32x4 = vfmaq_f32(sum_f32x4, vcvt_f32_f16(vget_low_f16(a_high_f16x8)),
                           vcvt_f32_f16(vget_low_f16(b_high_f16x8)));
-    sum_f32x4 = vfmaq_f32(sum_f32x4, vcvt_f32_f16(vget_high_f16(a_high_f16x8)),
-                          vcvt_f32_f16(vget_high_f16(b_high_f16x8)));
+    sum_f32x4 = vfmaq_f32(sum_f32x4, vcvt_high_f32_f16(a_high_f16x8), vcvt_high_f32_f16(b_high_f16x8));
     if (count_scalars) goto nk_dot_e3m2_neon_cycle;
     *result = vaddvq_f32(sum_f32x4);
 }
-#pragma endregion - Smaller Floats
+#pragma endregion F16 and BF16 Floats
-#pragma region - Binary
+#pragma region Binary
 NK_PUBLIC void nk_dot_u1_neon(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n_bits, nk_u32_t *result) {
     nk_size_t n_bytes = nk_size_divide_round_up_(n_bits, NK_BITS_PER_BYTE);
@@ -801,7 +806,53 @@ NK_INTERNAL void nk_dot_u1x128_finalize_neon( //
     result->u32x4 = vpaddq_u32(ab_sum_u32x4, cd_sum_u32x4);
 }
-#pragma endregion - Binary
+#pragma endregion Binary
+NK_PUBLIC void nk_dot_f16c_neon(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_size_t count_pairs,
+                                nk_f32c_t *result) {
+    float32x4_t sum_real_f32x4 = vdupq_n_f32(0);
+    float32x4_t sum_imag_f32x4 = vdupq_n_f32(0);
+    while (count_pairs >= 4) {
+        int16x4x2_t a_i16x4x2 = vld2_s16((short *)a_pairs);
+        int16x4x2_t b_i16x4x2 = vld2_s16((short *)b_pairs);
+        float32x4_t a_real_f32x4 = vcvt_f32_f16(vreinterpret_f16_s16(a_i16x4x2.val[0]));
+        float32x4_t a_imag_f32x4 = vcvt_f32_f16(vreinterpret_f16_s16(a_i16x4x2.val[1]));
+        float32x4_t b_real_f32x4 = vcvt_f32_f16(vreinterpret_f16_s16(b_i16x4x2.val[0]));
+        float32x4_t b_imag_f32x4 = vcvt_f32_f16(vreinterpret_f16_s16(b_i16x4x2.val[1]));
+        sum_real_f32x4 = vfmaq_f32(sum_real_f32x4, a_real_f32x4, b_real_f32x4);
+        sum_real_f32x4 = vfmsq_f32(sum_real_f32x4, a_imag_f32x4, b_imag_f32x4);
+        sum_imag_f32x4 = vfmaq_f32(sum_imag_f32x4, a_real_f32x4, b_imag_f32x4);
+        sum_imag_f32x4 = vfmaq_f32(sum_imag_f32x4, a_imag_f32x4, b_real_f32x4);
+        count_pairs -= 4, a_pairs += 4, b_pairs += 4;
+    }
+    nk_f32c_t tail_result;
+    nk_dot_f16c_serial(a_pairs, b_pairs, count_pairs, &tail_result);
+    result->real = tail_result.real + vaddvq_f32(sum_real_f32x4);
+    result->imag = tail_result.imag + vaddvq_f32(sum_imag_f32x4);
+}
+NK_PUBLIC void nk_vdot_f16c_neon(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_size_t count_pairs,
+                                 nk_f32c_t *result) {
+    float32x4_t sum_real_f32x4 = vdupq_n_f32(0);
+    float32x4_t sum_imag_f32x4 = vdupq_n_f32(0);
+    while (count_pairs >= 4) {
+        int16x4x2_t a_i16x4x2 = vld2_s16((short *)a_pairs);
+        int16x4x2_t b_i16x4x2 = vld2_s16((short *)b_pairs);
+        float32x4_t a_real_f32x4 = vcvt_f32_f16(vreinterpret_f16_s16(a_i16x4x2.val[0]));
+        float32x4_t a_imag_f32x4 = vcvt_f32_f16(vreinterpret_f16_s16(a_i16x4x2.val[1]));
+        float32x4_t b_real_f32x4 = vcvt_f32_f16(vreinterpret_f16_s16(b_i16x4x2.val[0]));
+        float32x4_t b_imag_f32x4 = vcvt_f32_f16(vreinterpret_f16_s16(b_i16x4x2.val[1]));
+        sum_real_f32x4 = vfmaq_f32(sum_real_f32x4, a_real_f32x4, b_real_f32x4);
+        sum_real_f32x4 = vfmaq_f32(sum_real_f32x4, a_imag_f32x4, b_imag_f32x4);
+        sum_imag_f32x4 = vfmaq_f32(sum_imag_f32x4, a_real_f32x4, b_imag_f32x4);
+        sum_imag_f32x4 = vfmsq_f32(sum_imag_f32x4, a_imag_f32x4, b_real_f32x4);
+        count_pairs -= 4, a_pairs += 4, b_pairs += 4;
+    }
+    nk_f32c_t tail_result;
+    nk_vdot_f16c_serial(a_pairs, b_pairs, count_pairs, &tail_result);
+    result->real = tail_result.real + vaddvq_f32(sum_real_f32x4);
+    result->imag = tail_result.imag + vaddvq_f32(sum_imag_f32x4);
+}
 #if defined(__clang__)
 #pragma clang attribute pop

package/include/numkong/dot/neonbfdot.h CHANGED Viewed

@@ -8,14 +8,14 @@
  *
  *  @section dot_neonbfdot_instructions ARM NEON BF16 Instructions (ARMv8.6-BF16)
  *
- *      Intrinsic                   Instruction                     Latency     Throughput
- *                                                                              A76         M4+/V1+/Oryon
- *      vbfdotq_f32                 BFDOT (V.4S, V.8H, V.8H)        3cy         2/cy        4/cy
- *      vcvt_f32_bf16               BFCVTN (V.4H, V.4S)             3cy         2/cy        4/cy
- *      vld1q_bf16                  LD1 (V.8H)                      4cy         2/cy        3/cy
- *      vaddvq_f32                  FADDP+FADDP (V.4S)              4cy         1/cy        2/cy
- *      vfmaq_f32                   FMLA (V.4S, V.4S, V.4S)         4cy         2/cy        4/cy
- *      vfmsq_f32                   FMLS (V.4S, V.4S, V.4S)         4cy         2/cy        4/cy
+ *      Intrinsic      Instruction               A76       M5
+ *      vbfdotq_f32    BFDOT (V.4S, V.8H, V.8H)  3cy @ 2p  2cy @ 1p
+ *      vcvt_f32_bf16  BFCVTN (V.4H, V.4S)       3cy @ 2p  3cy @ 4p
+ *      vld1q_bf16     LD1 (V.8H)                4cy @ 2p  4cy @ 3p
+ *      vaddvq_f32     FADDP+FADDP (V.4S)        4cy @ 1p  8cy @ 1p
+ *      vpaddq_f32     FADDP (V.4S, V.4S, V.4S)  2cy @ 2p  3cy @ 4p
+ *      vfmaq_f32      FMLA (V.4S, V.4S, V.4S)   4cy @ 2p  3cy @ 4p
+ *      vfmsq_f32      FMLS (V.4S, V.4S, V.4S)   4cy @ 2p  3cy @ 4p
  *
  *  The ARMv8.6-BF16 extension provides the BFDOT instruction for accelerated BF16 dot products,
  *  targeting machine learning inference workloads. BF16 trades mantissa precision (7 bits vs 10 in
@@ -223,10 +223,9 @@ NK_INTERNAL void nk_dot_bf16x8_finalize_neonbfdot(
     nk_dot_bf16x8_state_neonbfdot_t const *state_c, nk_dot_bf16x8_state_neonbfdot_t const *state_d, //
     nk_size_t total_dimensions, nk_b128_vec_t *result) {
     nk_unused_(total_dimensions);
-    result->f32s[0] = vaddvq_f32(state_a->sum_f32x4);
-    result->f32s[1] = vaddvq_f32(state_b->sum_f32x4);
-    result->f32s[2] = vaddvq_f32(state_c->sum_f32x4);
-    result->f32s[3] = vaddvq_f32(state_d->sum_f32x4);
+    float32x4_t ab_f32x4 = vpaddq_f32(state_a->sum_f32x4, state_b->sum_f32x4);
+    float32x4_t cd_f32x4 = vpaddq_f32(state_c->sum_f32x4, state_d->sum_f32x4);
+    result->f32x4 = vpaddq_f32(ab_f32x4, cd_f32x4);
 }
 #if defined(__clang__)

package/include/numkong/dot/neonfhm.h CHANGED Viewed

@@ -8,14 +8,15 @@
  *
  *  @section dot_neonfhm_instructions ARM NEON FP16 Matrix Instructions (ARMv8.4-FHM)
  *
- *      Intrinsic                   Instruction                     Latency     Throughput
- *                                                                              A76         M4+/V1+/Oryon
- *      vfmlalq_low_f16             FMLAL (V.4S, V.8H, V.8H)        4cy         2/cy        4/cy
- *      vfmlalq_high_f16            FMLAL2 (V.4S, V.8H, V.8H)       4cy         2/cy        4/cy
- *      vfmlslq_low_f16             FMLSL (V.4S, V.8H, V.8H)        4cy         2/cy        4/cy
- *      vfmlslq_high_f16            FMLSL2 (V.4S, V.8H, V.8H)       4cy         2/cy        4/cy
- *      vld1q_f16                   LD1 (V.8H)                      4cy         2/cy        3/cy
- *      vaddvq_f32                  FADDP+FADDP (V.4S)              4cy         1/cy        2/cy
+ *      Intrinsic         Instruction                A76       M5
+ *      vfmlalq_low_f16   FMLAL (V.4S, V.8H, V.8H)   4cy @ 2p  4cy @ 4p
+ *      vfmlalq_high_f16  FMLAL2 (V.4S, V.8H, V.8H)  4cy @ 2p  4cy @ 4p
+ *      vfmlslq_low_f16   FMLSL (V.4S, V.8H, V.8H)   4cy @ 2p  4cy @ 4p
+ *      vfmlslq_high_f16  FMLSL2 (V.4S, V.8H, V.8H)  4cy @ 2p  4cy @ 4p
+ *      vld1q_f16         LD1 (V.8H)                 4cy @ 2p  4cy @ 3p
+ *      vaddvq_f32        FADDP+FADDP (V.4S)         4cy @ 1p  8cy @ 1p
+ *      vpaddq_f32        FADDP (V.4S, V.4S, V.4S)   2cy @ 2p  3cy @ 4p
+ *      vshll_n_u8        SHLL (V.8H, V.8B, #8)      2cy @ 2p  2cy @ 4p
  *
  *  The ARMv8.4-FHM extension (FEAT_FHM) provides FMLAL/FMLSL instructions that fuse FP16 to FP32
  *  widening with multiply-accumulate in a single operation. FMLAL executes as a single fused op
@@ -90,8 +91,8 @@ nk_dot_f16_neonfhm_cycle:
         count_scalars = 0;
     }
     else {
-        a_f16x8 = vld1q_f16((nk_f16_for_arm_simd_t const *)(a_scalars));
-        b_f16x8 = vld1q_f16((nk_f16_for_arm_simd_t const *)(b_scalars));
+        a_f16x8 = vreinterpretq_f16_u16(vld1q_u16((nk_u16_t const *)(a_scalars)));
+        b_f16x8 = vreinterpretq_f16_u16(vld1q_u16((nk_u16_t const *)(b_scalars)));
         a_scalars += 8, b_scalars += 8, count_scalars -= 8;
     }
     // FMLAL: widening multiply-accumulate fp16 → f32
@@ -124,10 +125,9 @@ NK_INTERNAL void nk_dot_f16x8_finalize_neonfhm(
     nk_dot_f16x8_state_neonfhm_t const *state_c, nk_dot_f16x8_state_neonfhm_t const *state_d, //
     nk_size_t total_dimensions, nk_b128_vec_t *result) {
     nk_unused_(total_dimensions);
-    result->f32s[0] = vaddvq_f32(state_a->sum_f32x4);
-    result->f32s[1] = vaddvq_f32(state_b->sum_f32x4);
-    result->f32s[2] = vaddvq_f32(state_c->sum_f32x4);
-    result->f32s[3] = vaddvq_f32(state_d->sum_f32x4);
+    float32x4_t ab_f32x4 = vpaddq_f32(state_a->sum_f32x4, state_b->sum_f32x4);
+    float32x4_t cd_f32x4 = vpaddq_f32(state_c->sum_f32x4, state_d->sum_f32x4);
+    result->f32x4 = vpaddq_f32(ab_f32x4, cd_f32x4);
 }
 NK_PUBLIC void nk_dot_f16c_neonfhm(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_size_t count_pairs,
@@ -220,58 +220,58 @@ NK_PUBLIC void nk_vdot_f16c_neonfhm(nk_f16c_t const *a_pairs, nk_f16c_t const *b
 NK_PUBLIC void nk_dot_e4m3_neonfhm(nk_e4m3_t const *a_scalars, nk_e4m3_t const *b_scalars, nk_size_t count_scalars,
                                    nk_f32_t *result) {
-    float16x8_t a_low, a_high, b_low, b_high;
+    float16x8_t a_low_f16x8, a_high_f16x8, b_low_f16x8, b_high_f16x8;
     float32x4_t sum_f32x4 = vdupq_n_f32(0);
 nk_dot_e4m3_neonfhm_cycle:
     if (count_scalars < 16) {
         nk_b128_vec_t a_vec, b_vec;
         nk_partial_load_b8x16_serial_(a_scalars, &a_vec, count_scalars);
         nk_partial_load_b8x16_serial_(b_scalars, &b_vec, count_scalars);
-        nk_e4m3x16_to_f16x8x2_neon_(a_vec.u8x16, &a_low, &a_high);
-        nk_e4m3x16_to_f16x8x2_neon_(b_vec.u8x16, &b_low, &b_high);
+        nk_e4m3x16_to_f16x8x2_neon_(a_vec.u8x16, &a_low_f16x8, &a_high_f16x8);
+        nk_e4m3x16_to_f16x8x2_neon_(b_vec.u8x16, &b_low_f16x8, &b_high_f16x8);
         count_scalars = 0;
     }
     else {
-        nk_e4m3x16_to_f16x8x2_neon_(vld1q_u8(a_scalars), &a_low, &a_high);
-        nk_e4m3x16_to_f16x8x2_neon_(vld1q_u8(b_scalars), &b_low, &b_high);
+        nk_e4m3x16_to_f16x8x2_neon_(vld1q_u8(a_scalars), &a_low_f16x8, &a_high_f16x8);
+        nk_e4m3x16_to_f16x8x2_neon_(vld1q_u8(b_scalars), &b_low_f16x8, &b_high_f16x8);
         a_scalars += 16, b_scalars += 16, count_scalars -= 16;
     }
-    sum_f32x4 = vfmlalq_low_f16(sum_f32x4, a_low, b_low);
-    sum_f32x4 = vfmlalq_high_f16(sum_f32x4, a_low, b_low);
-    sum_f32x4 = vfmlalq_low_f16(sum_f32x4, a_high, b_high);
-    sum_f32x4 = vfmlalq_high_f16(sum_f32x4, a_high, b_high);
+    sum_f32x4 = vfmlalq_low_f16(sum_f32x4, a_low_f16x8, b_low_f16x8);
+    sum_f32x4 = vfmlalq_high_f16(sum_f32x4, a_low_f16x8, b_low_f16x8);
+    sum_f32x4 = vfmlalq_low_f16(sum_f32x4, a_high_f16x8, b_high_f16x8);
+    sum_f32x4 = vfmlalq_high_f16(sum_f32x4, a_high_f16x8, b_high_f16x8);
     if (count_scalars) goto nk_dot_e4m3_neonfhm_cycle;
     *result = vaddvq_f32(sum_f32x4);
 }
 NK_PUBLIC void nk_dot_e5m2_neonfhm(nk_e5m2_t const *a_scalars, nk_e5m2_t const *b_scalars, nk_size_t count_scalars,
                                    nk_f32_t *result) {
-    float16x8_t a_low, a_high, b_low, b_high;
+    float16x8_t a_low_f16x8, a_high_f16x8, b_low_f16x8, b_high_f16x8;
     float32x4_t sum_f32x4 = vdupq_n_f32(0);
 nk_dot_e5m2_neonfhm_cycle:
     if (count_scalars < 16) {
         nk_b128_vec_t a_vec, b_vec;
         nk_partial_load_b8x16_serial_(a_scalars, &a_vec, count_scalars);
         nk_partial_load_b8x16_serial_(b_scalars, &b_vec, count_scalars);
-        a_low = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(a_vec.u8x16), 8));
-        a_high = vreinterpretq_f16_u16(vshll_n_u8(vget_high_u8(a_vec.u8x16), 8));
-        b_low = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(b_vec.u8x16), 8));
-        b_high = vreinterpretq_f16_u16(vshll_n_u8(vget_high_u8(b_vec.u8x16), 8));
+        a_low_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(a_vec.u8x16), 8));
+        a_high_f16x8 = vreinterpretq_f16_u16(vshll_high_n_u8(a_vec.u8x16, 8));
+        b_low_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(b_vec.u8x16), 8));
+        b_high_f16x8 = vreinterpretq_f16_u16(vshll_high_n_u8(b_vec.u8x16, 8));
         count_scalars = 0;
     }
     else {
         uint8x16_t a_u8x16 = vld1q_u8(a_scalars);
         uint8x16_t b_u8x16 = vld1q_u8(b_scalars);
-        a_low = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(a_u8x16), 8));
-        a_high = vreinterpretq_f16_u16(vshll_n_u8(vget_high_u8(a_u8x16), 8));
-        b_low = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(b_u8x16), 8));
-        b_high = vreinterpretq_f16_u16(vshll_n_u8(vget_high_u8(b_u8x16), 8));
+        a_low_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(a_u8x16), 8));
+        a_high_f16x8 = vreinterpretq_f16_u16(vshll_high_n_u8(a_u8x16, 8));
+        b_low_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(b_u8x16), 8));
+        b_high_f16x8 = vreinterpretq_f16_u16(vshll_high_n_u8(b_u8x16, 8));
         a_scalars += 16, b_scalars += 16, count_scalars -= 16;
     }
-    sum_f32x4 = vfmlalq_low_f16(sum_f32x4, a_low, b_low);
-    sum_f32x4 = vfmlalq_high_f16(sum_f32x4, a_low, b_low);
-    sum_f32x4 = vfmlalq_low_f16(sum_f32x4, a_high, b_high);
-    sum_f32x4 = vfmlalq_high_f16(sum_f32x4, a_high, b_high);
+    sum_f32x4 = vfmlalq_low_f16(sum_f32x4, a_low_f16x8, b_low_f16x8);
+    sum_f32x4 = vfmlalq_high_f16(sum_f32x4, a_low_f16x8, b_low_f16x8);
+    sum_f32x4 = vfmlalq_low_f16(sum_f32x4, a_high_f16x8, b_high_f16x8);
+    sum_f32x4 = vfmlalq_high_f16(sum_f32x4, a_high_f16x8, b_high_f16x8);
     if (count_scalars) goto nk_dot_e5m2_neonfhm_cycle;
     *result = vaddvq_f32(sum_f32x4);
 }
@@ -304,10 +304,9 @@ NK_INTERNAL void nk_dot_e4m3x16_finalize_neonfhm(
     nk_dot_e4m3x16_state_neonfhm_t const *state_c, nk_dot_e4m3x16_state_neonfhm_t const *state_d, //
     nk_size_t total_dimensions, nk_b128_vec_t *result) {
     nk_unused_(total_dimensions);
-    result->f32s[0] = vaddvq_f32(state_a->sum_f32x4);
-    result->f32s[1] = vaddvq_f32(state_b->sum_f32x4);
-    result->f32s[2] = vaddvq_f32(state_c->sum_f32x4);
-    result->f32s[3] = vaddvq_f32(state_d->sum_f32x4);
+    float32x4_t ab_f32x4 = vpaddq_f32(state_a->sum_f32x4, state_b->sum_f32x4);
+    float32x4_t cd_f32x4 = vpaddq_f32(state_c->sum_f32x4, state_d->sum_f32x4);
+    result->f32x4 = vpaddq_f32(ab_f32x4, cd_f32x4);
 }
 typedef struct nk_dot_e5m2x16_state_neonfhm_t {
@@ -324,9 +323,9 @@ NK_INTERNAL void nk_dot_e5m2x16_update_neonfhm(nk_dot_e5m2x16_state_neonfhm_t *s
     nk_unused_(active_dimensions);
     // Convert e5m2 → f16 via SHLL: widen u8→u16 and shift left 8 in one instruction
     float16x8_t a_low_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(a.u8x16), 8));
-    float16x8_t a_high_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vget_high_u8(a.u8x16), 8));
+    float16x8_t a_high_f16x8 = vreinterpretq_f16_u16(vshll_high_n_u8(a.u8x16, 8));
     float16x8_t b_low_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(b.u8x16), 8));
-    float16x8_t b_high_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vget_high_u8(b.u8x16), 8));
+    float16x8_t b_high_f16x8 = vreinterpretq_f16_u16(vshll_high_n_u8(b.u8x16, 8));
     // FMLAL: widening multiply-accumulate fp16 → f32
     state->sum_f32x4 = vfmlalq_low_f16(state->sum_f32x4, a_low_f16x8, b_low_f16x8);
     state->sum_f32x4 = vfmlalq_high_f16(state->sum_f32x4, a_low_f16x8, b_low_f16x8);
@@ -339,10 +338,9 @@ NK_INTERNAL void nk_dot_e5m2x16_finalize_neonfhm(
     nk_dot_e5m2x16_state_neonfhm_t const *state_c, nk_dot_e5m2x16_state_neonfhm_t const *state_d, //
     nk_size_t total_dimensions, nk_b128_vec_t *result) {
     nk_unused_(total_dimensions);
-    result->f32s[0] = vaddvq_f32(state_a->sum_f32x4);
-    result->f32s[1] = vaddvq_f32(state_b->sum_f32x4);
-    result->f32s[2] = vaddvq_f32(state_c->sum_f32x4);
-    result->f32s[3] = vaddvq_f32(state_d->sum_f32x4);
+    float32x4_t ab_f32x4 = vpaddq_f32(state_a->sum_f32x4, state_b->sum_f32x4);
+    float32x4_t cd_f32x4 = vpaddq_f32(state_c->sum_f32x4, state_d->sum_f32x4);
+    result->f32x4 = vpaddq_f32(ab_f32x4, cd_f32x4);
 }
 #if defined(__clang__)