numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294)
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,244 @@
1
+ /**
2
+ * @brief SIMD-accelerated Dot Products for NEON BF16.
3
+ * @file include/numkong/dot/neonbfdot.h
4
+ * @author Ash Vardanian
5
+ * @date December 27, 2025
6
+ *
7
+ * @sa include/numkong/dot.h
8
+ *
9
+ * @section dot_neonbfdot_instructions ARM NEON BF16 Instructions (ARMv8.6-BF16)
10
+ *
11
+ * Intrinsic Instruction Latency Throughput
12
+ * A76 M4+/V1+/Oryon
13
+ * vbfdotq_f32 BFDOT (V.4S, V.8H, V.8H) 3cy 2/cy 4/cy
14
+ * vcvt_f32_bf16 BFCVTN (V.4H, V.4S) 3cy 2/cy 4/cy
15
+ * vld1q_bf16 LD1 (V.8H) 4cy 2/cy 3/cy
16
+ * vaddvq_f32 FADDP+FADDP (V.4S) 4cy 1/cy 2/cy
17
+ * vfmaq_f32 FMLA (V.4S, V.4S, V.4S) 4cy 2/cy 4/cy
18
+ * vfmsq_f32 FMLS (V.4S, V.4S, V.4S) 4cy 2/cy 4/cy
19
+ *
20
+ * The ARMv8.6-BF16 extension provides the BFDOT instruction for accelerated BF16 dot products,
21
+ * targeting machine learning inference workloads. BF16 trades mantissa precision (7 bits vs 10 in
22
+ * FP16) for a larger exponent range matching FP32, eliminating overflow concerns during training.
23
+ *
24
+ * BFDOT computes two BF16 dot products per lane, accumulating directly into FP32 without explicit
25
+ * conversion. This provides higher throughput than FP16 convert-then-FMA sequences for ML inference
26
+ * where the reduced precision is acceptable.
27
+ *
28
+ * @section dot_neonbfdot_stateful Stateful Streaming Logic
29
+ *
30
+ * To build memory-optimal tiled algorithms, this file defines following structures and force-inlined
31
+ * `NK_INTERNAL` functions:
32
+ *
33
+ * - nk_dot_bf16x8 state with native BFDOT bf16 dot-products.
34
+ *
35
+ * @code{c}
36
+ * nk_dot_bf16x8_state_neonbfdot_t state_first, state_second, state_third, state_fourth;
37
+ * bfloat16x8_t query_bf16x8, target_first_bf16x8, target_second_bf16x8, target_third_bf16x8, target_fourth_bf16x8;
38
+ * nk_dot_bf16x8_init_neonbfdot(&state_first);
39
+ * nk_dot_bf16x8_init_neonbfdot(&state_second);
40
+ * nk_dot_bf16x8_init_neonbfdot(&state_third);
41
+ * nk_dot_bf16x8_init_neonbfdot(&state_fourth);
42
+ * for (nk_size_t idx = 0; idx + 8 <= depth; idx += 8) {
43
+ * query_bf16x8 = vld1q_bf16(query_ptr + idx);
44
+ * target_first_bf16x8 = vld1q_bf16(target_first_ptr + idx);
45
+ * target_second_bf16x8 = vld1q_bf16(target_second_ptr + idx);
46
+ * target_third_bf16x8 = vld1q_bf16(target_third_ptr + idx);
47
+ * target_fourth_bf16x8 = vld1q_bf16(target_fourth_ptr + idx);
48
+ * nk_dot_bf16x8_update_neonbfdot(&state_first, query_bf16x8, target_first_bf16x8, idx, 8);
49
+ * nk_dot_bf16x8_update_neonbfdot(&state_second, query_bf16x8, target_second_bf16x8, idx, 8);
50
+ * nk_dot_bf16x8_update_neonbfdot(&state_third, query_bf16x8, target_third_bf16x8, idx, 8);
51
+ * nk_dot_bf16x8_update_neonbfdot(&state_fourth, query_bf16x8, target_fourth_bf16x8, idx, 8);
52
+ * }
53
+ * float32x4_t results_f32x4;
54
+ * nk_dot_bf16x8_finalize_neonbfdot(&state_first, &state_second, &state_third, &state_fourth, depth, &results_f32x4);
55
+ * @endcode
56
+ */
57
+ #ifndef NK_DOT_NEONBFDOT_H
58
+ #define NK_DOT_NEONBFDOT_H
59
+
60
+ #if NK_TARGET_ARM_
61
+ #if NK_TARGET_NEONBFDOT
62
+
63
+ #include "numkong/types.h"
64
+ #include "numkong/cast/serial.h" // `nk_partial_load_b8x8_serial_`
65
+ #include "numkong/cast/neon.h" // `nk_e4m3x8_to_bf16x8_neon_`
66
+
67
+ #if defined(__cplusplus)
68
+ extern "C" {
69
+ #endif
70
+
71
+ #if defined(__clang__)
72
+ #pragma clang attribute push(__attribute__((target("arch=armv8.6-a+simd+bf16"))), apply_to = function)
73
+ #elif defined(__GNUC__)
74
+ #pragma GCC push_options
75
+ #pragma GCC target("arch=armv8.6-a+simd+bf16")
76
+ #endif
77
+
78
/**
 *  @brief Dot product of two bf16 vectors, accumulated in f32 via the BFDOT instruction.
 *  @param a_scalars First input vector of bf16 scalars.
 *  @param b_scalars Second input vector of bf16 scalars.
 *  @param count_scalars Number of scalars in each input vector.
 *  @param result Output scalar receiving the f32 dot product.
 */
NK_PUBLIC void nk_dot_bf16_neonbfdot(nk_bf16_t const *a_scalars, nk_bf16_t const *b_scalars, nk_size_t count_scalars,
                                     nk_f32_t *result) {
    float32x4_t accumulator_f32x4 = vdupq_n_f32(0);
    bfloat16x8_t lhs_bf16x8, rhs_bf16x8;
    do {
        if (count_scalars < 8) {
            // Tail: zero-pad through a serial partial load so a full BFDOT can still run.
            nk_b128_vec_t lhs_vec, rhs_vec;
            nk_partial_load_b16x8_serial_(a_scalars, &lhs_vec, count_scalars);
            nk_partial_load_b16x8_serial_(b_scalars, &rhs_vec, count_scalars);
            lhs_bf16x8 = vreinterpretq_bf16_u16(lhs_vec.u16x8);
            rhs_bf16x8 = vreinterpretq_bf16_u16(rhs_vec.u16x8);
            count_scalars = 0;
        }
        else {
            lhs_bf16x8 = vld1q_bf16((nk_bf16_for_arm_simd_t const *)a_scalars);
            rhs_bf16x8 = vld1q_bf16((nk_bf16_for_arm_simd_t const *)b_scalars);
            a_scalars += 8, b_scalars += 8, count_scalars -= 8;
        }
        // BFDOT accumulates pairwise bf16 products directly into f32 lanes.
        accumulator_f32x4 = vbfdotq_f32(accumulator_f32x4, lhs_bf16x8, rhs_bf16x8);
    } while (count_scalars);
    *result = vaddvq_f32(accumulator_f32x4);
}
100
+
101
/**
 *  @brief Dot product of two complex bf16 vectors, accumulated in f32.
 *
 *  Computes sum of (a_r + i*a_i) * (b_r + i*b_i), so:
 *  real += a_r*b_r - a_i*b_i, imag += a_r*b_i + a_i*b_r.
 *
 *  @param a_pairs First input vector of (real, imag) bf16 pairs.
 *  @param b_pairs Second input vector of (real, imag) bf16 pairs.
 *  @param count_pairs Number of complex pairs in each input vector.
 *  @param result Output complex f32 scalar.
 */
NK_PUBLIC void nk_dot_bf16c_neonbfdot(nk_bf16c_t const *a_pairs, nk_bf16c_t const *b_pairs, nk_size_t count_pairs,
                                      nk_f32c_t *result) {
    float32x4_t acc_real_f32x4 = vdupq_n_f32(0);
    float32x4_t acc_imag_f32x4 = vdupq_n_f32(0);
    for (; count_pairs >= 4; count_pairs -= 4, a_pairs += 4, b_pairs += 4) {
        // De-interleave real/imaginary parts. MSVC doesn't provide `vld2_bf16`,
        // so load as same-width signed integers and `vreinterpret_bf16_s16` after.
        int16x4x2_t lhs_i16x4x2 = vld2_s16((short const *)a_pairs);
        int16x4x2_t rhs_i16x4x2 = vld2_s16((short const *)b_pairs);
        float32x4_t lhs_real_f32x4 = vcvt_f32_bf16(vreinterpret_bf16_s16(lhs_i16x4x2.val[0]));
        float32x4_t lhs_imag_f32x4 = vcvt_f32_bf16(vreinterpret_bf16_s16(lhs_i16x4x2.val[1]));
        float32x4_t rhs_real_f32x4 = vcvt_f32_bf16(vreinterpret_bf16_s16(rhs_i16x4x2.val[0]));
        float32x4_t rhs_imag_f32x4 = vcvt_f32_bf16(vreinterpret_bf16_s16(rhs_i16x4x2.val[1]));
        // real: + a_r*b_r - a_i*b_i
        acc_real_f32x4 = vfmaq_f32(acc_real_f32x4, lhs_real_f32x4, rhs_real_f32x4);
        acc_real_f32x4 = vfmsq_f32(acc_real_f32x4, lhs_imag_f32x4, rhs_imag_f32x4);
        // imag: + a_r*b_i + a_i*b_r
        acc_imag_f32x4 = vfmaq_f32(acc_imag_f32x4, lhs_real_f32x4, rhs_imag_f32x4);
        acc_imag_f32x4 = vfmaq_f32(acc_imag_f32x4, lhs_imag_f32x4, rhs_real_f32x4);
    }
    // Handle the remaining (< 4) pairs serially, then merge with horizontal sums:
    nk_f32c_t tail_result;
    nk_dot_bf16c_serial(a_pairs, b_pairs, count_pairs, &tail_result);
    result->real = tail_result.real + vaddvq_f32(acc_real_f32x4);
    result->imag = tail_result.imag + vaddvq_f32(acc_imag_f32x4);
}
127
+
128
/**
 *  @brief Conjugated dot product of two complex bf16 vectors, accumulated in f32.
 *
 *  The FMA/FMS placement implements sum of conj(a) * b:
 *  real += a_r*b_r + a_i*b_i, imag += a_r*b_i - a_i*b_r.
 *  Note the sign flip relative to `nk_dot_bf16c_neonbfdot`.
 *
 *  @param a_pairs First input vector of (real, imag) bf16 pairs (conjugated operand).
 *  @param b_pairs Second input vector of (real, imag) bf16 pairs.
 *  @param count_pairs Number of complex pairs in each input vector.
 *  @param result Output complex f32 scalar.
 */
NK_PUBLIC void nk_vdot_bf16c_neonbfdot(nk_bf16c_t const *a_pairs, nk_bf16c_t const *b_pairs, nk_size_t count_pairs,
                                       nk_f32c_t *result) {
    float32x4_t sum_real_f32x4 = vdupq_n_f32(0);
    float32x4_t sum_imag_f32x4 = vdupq_n_f32(0);
    while (count_pairs >= 4) {
        // Unpack the input arrays into real and imaginary parts.
        // MSVC sadly doesn't recognize the `vld2_bf16`, so we load the data as signed
        // integers of the same size and reinterpret with `vreinterpret_bf16_s16` afterwards.
        int16x4x2_t a_i16x4x2 = vld2_s16((short const *)a_pairs);
        int16x4x2_t b_i16x4x2 = vld2_s16((short const *)b_pairs);
        float32x4_t a_real_f32x4 = vcvt_f32_bf16(vreinterpret_bf16_s16(a_i16x4x2.val[0]));
        float32x4_t a_imag_f32x4 = vcvt_f32_bf16(vreinterpret_bf16_s16(a_i16x4x2.val[1]));
        float32x4_t b_real_f32x4 = vcvt_f32_bf16(vreinterpret_bf16_s16(b_i16x4x2.val[0]));
        float32x4_t b_imag_f32x4 = vcvt_f32_bf16(vreinterpret_bf16_s16(b_i16x4x2.val[1]));
        // real: + a_r*b_r + a_i*b_i (conjugation turns the subtraction into addition)
        sum_real_f32x4 = vfmaq_f32(sum_real_f32x4, a_real_f32x4, b_real_f32x4);
        sum_real_f32x4 = vfmaq_f32(sum_real_f32x4, a_imag_f32x4, b_imag_f32x4);
        // imag: + a_r*b_i - a_i*b_r
        sum_imag_f32x4 = vfmaq_f32(sum_imag_f32x4, a_real_f32x4, b_imag_f32x4);
        sum_imag_f32x4 = vfmsq_f32(sum_imag_f32x4, a_imag_f32x4, b_real_f32x4);
        count_pairs -= 4, a_pairs += 4, b_pairs += 4;
    }
    // Reduce horizontal sums and aggregate with the tail:
    nk_f32c_t tail_result;
    nk_vdot_bf16c_serial(a_pairs, b_pairs, count_pairs, &tail_result);
    result->real = tail_result.real + vaddvq_f32(sum_real_f32x4);
    result->imag = tail_result.imag + vaddvq_f32(sum_imag_f32x4);
}
154
+
155
/**
 *  @brief Dot product of two e4m3 (FP8) vectors, upcast to bf16 and accumulated in f32 via BFDOT.
 *  @param a_scalars First input vector of e4m3 scalars.
 *  @param b_scalars Second input vector of e4m3 scalars.
 *  @param count_scalars Number of scalars in each input vector.
 *  @param result Output scalar receiving the f32 dot product.
 */
NK_PUBLIC void nk_dot_e4m3_neonbfdot(nk_e4m3_t const *a_scalars, nk_e4m3_t const *b_scalars, nk_size_t count_scalars,
                                     nk_f32_t *result) {
    bfloat16x8_t a_bf16x8, b_bf16x8;
    float32x4_t sum_f32x4 = vdupq_n_f32(0);
nk_dot_e4m3_neonbfdot_cycle:
    if (count_scalars < 8) {
        // Tail: zero-pad 8-bit scalars through a serial partial load before widening.
        nk_b64_vec_t a_vec, b_vec;
        nk_partial_load_b8x8_serial_(a_scalars, &a_vec, count_scalars);
        nk_partial_load_b8x8_serial_(b_scalars, &b_vec, count_scalars);
        a_bf16x8 = vreinterpretq_bf16_u16(nk_e4m3x8_to_bf16x8_neon_(a_vec.u8x8));
        b_bf16x8 = vreinterpretq_bf16_u16(nk_e4m3x8_to_bf16x8_neon_(b_vec.u8x8));
        count_scalars = 0;
    }
    else {
        // NOTE(review): `vld1_u8` is passed `nk_e4m3_t const *` without a cast —
        // presumably `nk_e4m3_t` is a `uint8_t` typedef; confirm against types.h.
        a_bf16x8 = vreinterpretq_bf16_u16(nk_e4m3x8_to_bf16x8_neon_(vld1_u8(a_scalars)));
        b_bf16x8 = vreinterpretq_bf16_u16(nk_e4m3x8_to_bf16x8_neon_(vld1_u8(b_scalars)));
        a_scalars += 8, b_scalars += 8, count_scalars -= 8;
    }
    sum_f32x4 = vbfdotq_f32(sum_f32x4, a_bf16x8, b_bf16x8);
    if (count_scalars) goto nk_dot_e4m3_neonbfdot_cycle;
    *result = vaddvq_f32(sum_f32x4);
}
177
+
178
/**
 *  @brief Dot product of two e5m2 (FP8) vectors, upcast to bf16 and accumulated in f32 via BFDOT.
 *  @param a_scalars First input vector of e5m2 scalars.
 *  @param b_scalars Second input vector of e5m2 scalars.
 *  @param count_scalars Number of scalars in each input vector.
 *  @param result Output scalar receiving the f32 dot product.
 */
NK_PUBLIC void nk_dot_e5m2_neonbfdot(nk_e5m2_t const *a_scalars, nk_e5m2_t const *b_scalars, nk_size_t count_scalars,
                                     nk_f32_t *result) {
    float32x4_t accumulator_f32x4 = vdupq_n_f32(0);
    bfloat16x8_t lhs_bf16x8, rhs_bf16x8;
    do {
        if (count_scalars < 8) {
            // Tail: zero-pad 8-bit scalars through a serial partial load before widening.
            nk_b64_vec_t lhs_vec, rhs_vec;
            nk_partial_load_b8x8_serial_(a_scalars, &lhs_vec, count_scalars);
            nk_partial_load_b8x8_serial_(b_scalars, &rhs_vec, count_scalars);
            lhs_bf16x8 = vreinterpretq_bf16_u16(nk_e5m2x8_to_bf16x8_neon_(lhs_vec.u8x8));
            rhs_bf16x8 = vreinterpretq_bf16_u16(nk_e5m2x8_to_bf16x8_neon_(rhs_vec.u8x8));
            count_scalars = 0;
        }
        else {
            lhs_bf16x8 = vreinterpretq_bf16_u16(nk_e5m2x8_to_bf16x8_neon_(vld1_u8(a_scalars)));
            rhs_bf16x8 = vreinterpretq_bf16_u16(nk_e5m2x8_to_bf16x8_neon_(vld1_u8(b_scalars)));
            a_scalars += 8, b_scalars += 8, count_scalars -= 8;
        }
        accumulator_f32x4 = vbfdotq_f32(accumulator_f32x4, lhs_bf16x8, rhs_bf16x8);
    } while (count_scalars);
    *result = vaddvq_f32(accumulator_f32x4);
}
200
+
201
/**
 *  @brief Running state for 128-bit dot accumulation over bf16 scalars on NEON.
 *
 *  Holds a single f32x4 partial-sum register fed by BFDOT; initialize with
 *  `nk_dot_bf16x8_init_neonbfdot`, stream with `nk_dot_bf16x8_update_neonbfdot`,
 *  and reduce with `nk_dot_bf16x8_finalize_neonbfdot`.
 */
typedef struct nk_dot_bf16x8_state_neonbfdot_t {
    float32x4_t sum_f32x4; // four independent f32 partial sums, reduced at finalize
} nk_dot_bf16x8_state_neonbfdot_t;
207
+
208
/** @brief Resets the streaming bf16 dot-product state to a zero accumulator. */
NK_INTERNAL void nk_dot_bf16x8_init_neonbfdot(nk_dot_bf16x8_state_neonbfdot_t *state) {
    // `vmovq_n_f32` is the NEON broadcast alias for `vdupq_n_f32` — same DUP instruction.
    state->sum_f32x4 = vmovq_n_f32(0.0f);
}
211
+
212
/**
 *  @brief Accumulates one 8-element bf16 chunk of each operand into the running state.
 *  @param state Streaming accumulator to update.
 *  @param a,b 128-bit chunks reinterpreted as bf16x8 lanes.
 *  @param depth_offset,active_dimensions Unused here; kept for a uniform streaming API.
 */
NK_INTERNAL void nk_dot_bf16x8_update_neonbfdot(nk_dot_bf16x8_state_neonbfdot_t *state, nk_b128_vec_t a,
                                                nk_b128_vec_t b, nk_size_t depth_offset, nk_size_t active_dimensions) {
    nk_unused_(depth_offset);
    nk_unused_(active_dimensions);
    bfloat16x8_t lhs_bf16x8 = vreinterpretq_bf16_u16(a.u16x8);
    bfloat16x8_t rhs_bf16x8 = vreinterpretq_bf16_u16(b.u16x8);
    state->sum_f32x4 = vbfdotq_f32(state->sum_f32x4, lhs_bf16x8, rhs_bf16x8);
}
220
+
221
/**
 *  @brief Horizontally reduces four streaming states into four f32 results.
 *  @param state_a,state_b,state_c,state_d Accumulators for four target vectors.
 *  @param total_dimensions Unused here; kept for a uniform streaming API.
 *  @param result Receives one f32 dot product per state in `f32s[0..3]`.
 */
NK_INTERNAL void nk_dot_bf16x8_finalize_neonbfdot( //
    nk_dot_bf16x8_state_neonbfdot_t const *state_a, nk_dot_bf16x8_state_neonbfdot_t const *state_b, //
    nk_dot_bf16x8_state_neonbfdot_t const *state_c, nk_dot_bf16x8_state_neonbfdot_t const *state_d, //
    nk_size_t total_dimensions, nk_b128_vec_t *result) {
    nk_unused_(total_dimensions);
    nk_dot_bf16x8_state_neonbfdot_t const *states[4] = {state_a, state_b, state_c, state_d};
    for (int lane = 0; lane < 4; ++lane) result->f32s[lane] = vaddvq_f32(states[lane]->sum_f32x4);
}
231
+
232
+ #if defined(__clang__)
233
+ #pragma clang attribute pop
234
+ #elif defined(__GNUC__)
235
+ #pragma GCC pop_options
236
+ #endif
237
+
238
+ #if defined(__cplusplus)
239
+ } // extern "C"
240
+ #endif
241
+
242
+ #endif // NK_TARGET_NEONBFDOT
243
+ #endif // NK_TARGET_ARM_
244
+ #endif // NK_DOT_NEONBFDOT_H
@@ -0,0 +1,360 @@
1
+ /**
2
+ * @brief SIMD-accelerated Dot Products for NEON FHM.
3
+ * @file include/numkong/dot/neonfhm.h
4
+ * @author Ash Vardanian
5
+ * @date December 28, 2025
6
+ *
7
+ * @sa include/numkong/dot.h
8
+ *
9
+ * @section dot_neonfhm_instructions ARM NEON FP16 Matrix Instructions (ARMv8.4-FHM)
10
+ *
11
+ * Intrinsic Instruction Latency Throughput
12
+ * A76 M4+/V1+/Oryon
13
+ * vfmlalq_low_f16 FMLAL (V.4S, V.8H, V.8H) 4cy 2/cy 4/cy
14
+ * vfmlalq_high_f16 FMLAL2 (V.4S, V.8H, V.8H) 4cy 2/cy 4/cy
15
+ * vfmlslq_low_f16 FMLSL (V.4S, V.8H, V.8H) 4cy 2/cy 4/cy
16
+ * vfmlslq_high_f16 FMLSL2 (V.4S, V.8H, V.8H) 4cy 2/cy 4/cy
17
+ * vld1q_f16 LD1 (V.8H) 4cy 2/cy 3/cy
18
+ * vaddvq_f32 FADDP+FADDP (V.4S) 4cy 1/cy 2/cy
19
+ *
20
+ * The ARMv8.4-FHM extension (FEAT_FHM) provides FMLAL/FMLSL instructions that fuse FP16 to FP32
21
+ * widening with multiply-accumulate in a single operation. FMLAL executes as a single fused op
22
+ * (4cy latency, 2/cy throughput on A76, 4/cy on M4+/V1+/Oryon) rather than separate FCVTL + FMLA.
23
+ *
24
+ * FMLAL preserves FP32 accumulator precision while accepting FP16 inputs, ideal for mixed-precision
25
+ * workloads. The _low variants process elements 0-3, _high variants process elements 4-7, enabling
26
+ * processing of 8 FP16 elements per iteration with full precision accumulation.
27
+ *
28
+ * @section dot_neonfhm_stateful Stateful Streaming Logic
29
+ *
30
+ * To build memory-optimal tiled algorithms, this file defines following structures and force-inlined
31
+ * `NK_INTERNAL` functions:
32
+ *
33
+ * - nk_dot_f16x8 state with native FMLAL f16 dot-products.
34
+ *
35
+ * @code{c}
36
+ * nk_dot_f16x8_state_neonfhm_t state_first, state_second, state_third, state_fourth;
37
+ * float16x8_t query_f16x8, target_first_f16x8, target_second_f16x8, target_third_f16x8, target_fourth_f16x8;
38
+ * nk_dot_f16x8_init_neonfhm(&state_first);
39
+ * nk_dot_f16x8_init_neonfhm(&state_second);
40
+ * nk_dot_f16x8_init_neonfhm(&state_third);
41
+ * nk_dot_f16x8_init_neonfhm(&state_fourth);
42
+ * for (nk_size_t idx = 0; idx + 8 <= depth; idx += 8) {
43
+ * query_f16x8 = vld1q_f16(query_ptr + idx);
44
+ * target_first_f16x8 = vld1q_f16(target_first_ptr + idx);
45
+ * target_second_f16x8 = vld1q_f16(target_second_ptr + idx);
46
+ * target_third_f16x8 = vld1q_f16(target_third_ptr + idx);
47
+ * target_fourth_f16x8 = vld1q_f16(target_fourth_ptr + idx);
48
+ * nk_dot_f16x8_update_neonfhm(&state_first, query_f16x8, target_first_f16x8, idx, 8);
49
+ * nk_dot_f16x8_update_neonfhm(&state_second, query_f16x8, target_second_f16x8, idx, 8);
50
+ * nk_dot_f16x8_update_neonfhm(&state_third, query_f16x8, target_third_f16x8, idx, 8);
51
+ * nk_dot_f16x8_update_neonfhm(&state_fourth, query_f16x8, target_fourth_f16x8, idx, 8);
52
+ * }
53
+ * float32x4_t results_f32x4;
54
+ * nk_dot_f16x8_finalize_neonfhm(&state_first, &state_second, &state_third, &state_fourth, depth, &results_f32x4);
55
+ * @endcode
56
+ *
57
+ */
58
+ #ifndef NK_DOT_NEONFHM_H
59
+ #define NK_DOT_NEONFHM_H
60
+
61
+ #if NK_TARGET_ARM_
62
+ #if NK_TARGET_NEONFHM
63
+
64
+ #include "numkong/types.h"
65
+ #include "numkong/cast/serial.h" // `nk_partial_load_b8x8_serial_`
66
+ #include "numkong/cast/neon.h" // `nk_e4m3x8_to_f16x8_neon_`
67
+
68
+ #if defined(__cplusplus)
69
+ extern "C" {
70
+ #endif
71
+
72
+ #if defined(__clang__)
73
+ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16+fp16fml"))), apply_to = function)
74
+ #elif defined(__GNUC__)
75
+ #pragma GCC push_options
76
+ #pragma GCC target("arch=armv8.2-a+simd+fp16+fp16fml")
77
+ #endif
78
+
79
+ NK_PUBLIC void nk_dot_f16_neonfhm(nk_f16_t const *a_scalars, nk_f16_t const *b_scalars, nk_size_t count_scalars,
80
+ nk_f32_t *result) {
81
+ float16x8_t a_f16x8, b_f16x8;
82
+ float32x4_t sum_f32x4 = vdupq_n_f32(0);
83
+ nk_dot_f16_neonfhm_cycle:
84
+ if (count_scalars < 8) {
85
+ nk_b128_vec_t a_vec, b_vec;
86
+ nk_partial_load_b16x8_serial_(a_scalars, &a_vec, count_scalars);
87
+ nk_partial_load_b16x8_serial_(b_scalars, &b_vec, count_scalars);
88
+ a_f16x8 = vreinterpretq_f16_u16(a_vec.u16x8);
89
+ b_f16x8 = vreinterpretq_f16_u16(b_vec.u16x8);
90
+ count_scalars = 0;
91
+ }
92
+ else {
93
+ a_f16x8 = vld1q_f16((nk_f16_for_arm_simd_t const *)(a_scalars));
94
+ b_f16x8 = vld1q_f16((nk_f16_for_arm_simd_t const *)(b_scalars));
95
+ a_scalars += 8, b_scalars += 8, count_scalars -= 8;
96
+ }
97
+ // FMLAL: widening multiply-accumulate fp16 → f32
98
+ // low: processes elements 0-3, high: processes elements 4-7
99
+ sum_f32x4 = vfmlalq_low_f16(sum_f32x4, a_f16x8, b_f16x8);
100
+ sum_f32x4 = vfmlalq_high_f16(sum_f32x4, a_f16x8, b_f16x8);
101
+ if (count_scalars) goto nk_dot_f16_neonfhm_cycle;
102
+ *result = vaddvq_f32(sum_f32x4);
103
+ }
104
+
105
+ typedef struct nk_dot_f16x8_state_neonfhm_t {
106
+ float32x4_t sum_f32x4;
107
+ } nk_dot_f16x8_state_neonfhm_t;
108
+
109
+ NK_INTERNAL void nk_dot_f16x8_init_neonfhm(nk_dot_f16x8_state_neonfhm_t *state) { state->sum_f32x4 = vdupq_n_f32(0); }
110
+
111
+ NK_INTERNAL void nk_dot_f16x8_update_neonfhm(nk_dot_f16x8_state_neonfhm_t *state, nk_b128_vec_t a, nk_b128_vec_t b,
112
+ nk_size_t depth_offset, nk_size_t active_dimensions) {
113
+ nk_unused_(depth_offset);
114
+ nk_unused_(active_dimensions);
115
+ float16x8_t a_f16x8 = vreinterpretq_f16_u16(a.u16x8);
116
+ float16x8_t b_f16x8 = vreinterpretq_f16_u16(b.u16x8);
117
+ // FMLAL: widening multiply-accumulate fp16 → f32
118
+ state->sum_f32x4 = vfmlalq_low_f16(state->sum_f32x4, a_f16x8, b_f16x8);
119
+ state->sum_f32x4 = vfmlalq_high_f16(state->sum_f32x4, a_f16x8, b_f16x8);
120
+ }
121
+
122
+ NK_INTERNAL void nk_dot_f16x8_finalize_neonfhm( //
123
+ nk_dot_f16x8_state_neonfhm_t const *state_a, nk_dot_f16x8_state_neonfhm_t const *state_b, //
124
+ nk_dot_f16x8_state_neonfhm_t const *state_c, nk_dot_f16x8_state_neonfhm_t const *state_d, //
125
+ nk_size_t total_dimensions, nk_b128_vec_t *result) {
126
+ nk_unused_(total_dimensions);
127
+ result->f32s[0] = vaddvq_f32(state_a->sum_f32x4);
128
+ result->f32s[1] = vaddvq_f32(state_b->sum_f32x4);
129
+ result->f32s[2] = vaddvq_f32(state_c->sum_f32x4);
130
+ result->f32s[3] = vaddvq_f32(state_d->sum_f32x4);
131
+ }
132
+
133
+ NK_PUBLIC void nk_dot_f16c_neonfhm(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_size_t count_pairs,
134
+ nk_f32c_t *result) {
135
+ // Accumulate into 4 float32x2_t vectors (low/high for real/imag)
136
+ float32x2_t sum_real_low_f32x2 = vdup_n_f32(0);
137
+ float32x2_t sum_real_high_f32x2 = vdup_n_f32(0);
138
+ float32x2_t sum_imag_low_f32x2 = vdup_n_f32(0);
139
+ float32x2_t sum_imag_high_f32x2 = vdup_n_f32(0);
140
+
141
+ while (count_pairs >= 4) {
142
+ // Load and deinterleave: vld2 loads 4 complex pairs as 2 x float16x4_t
143
+ int16x4x2_t a_i16x4x2 = vld2_s16((short const *)a_pairs);
144
+ int16x4x2_t b_i16x4x2 = vld2_s16((short const *)b_pairs);
145
+
146
+ float16x4_t a_real_f16x4 = vreinterpret_f16_s16(a_i16x4x2.val[0]);
147
+ float16x4_t a_imag_f16x4 = vreinterpret_f16_s16(a_i16x4x2.val[1]);
148
+ float16x4_t b_real_f16x4 = vreinterpret_f16_s16(b_i16x4x2.val[0]);
149
+ float16x4_t b_imag_f16x4 = vreinterpret_f16_s16(b_i16x4x2.val[1]);
150
+
151
+ // Real: aᵣ × bᵣ - aᵢ × bᵢ (FMLAL then FMLSL)
152
+ sum_real_low_f32x2 = vfmlal_low_f16(sum_real_low_f32x2, a_real_f16x4, b_real_f16x4);
153
+ sum_real_low_f32x2 = vfmlsl_low_f16(sum_real_low_f32x2, a_imag_f16x4, b_imag_f16x4);
154
+ sum_real_high_f32x2 = vfmlal_high_f16(sum_real_high_f32x2, a_real_f16x4, b_real_f16x4);
155
+ sum_real_high_f32x2 = vfmlsl_high_f16(sum_real_high_f32x2, a_imag_f16x4, b_imag_f16x4);
156
+
157
+ // Imag: aᵣ × bᵢ + aᵢ × bᵣ (FMLAL for both)
158
+ sum_imag_low_f32x2 = vfmlal_low_f16(sum_imag_low_f32x2, a_real_f16x4, b_imag_f16x4);
159
+ sum_imag_low_f32x2 = vfmlal_low_f16(sum_imag_low_f32x2, a_imag_f16x4, b_real_f16x4);
160
+ sum_imag_high_f32x2 = vfmlal_high_f16(sum_imag_high_f32x2, a_real_f16x4, b_imag_f16x4);
161
+ sum_imag_high_f32x2 = vfmlal_high_f16(sum_imag_high_f32x2, a_imag_f16x4, b_real_f16x4);
162
+
163
+ count_pairs -= 4, a_pairs += 4, b_pairs += 4;
164
+ }
165
+
166
+ // Combine and reduce
167
+ float32x4_t sum_real_f32x4 = vcombine_f32(sum_real_low_f32x2, sum_real_high_f32x2);
168
+ float32x4_t sum_imag_f32x4 = vcombine_f32(sum_imag_low_f32x2, sum_imag_high_f32x2);
169
+
170
+ // Handle tail with serial fallback
171
+ nk_f32c_t tail_result;
172
+ nk_dot_f16c_serial(a_pairs, b_pairs, count_pairs, &tail_result);
173
+ result->real = vaddvq_f32(sum_real_f32x4) + tail_result.real;
174
+ result->imag = vaddvq_f32(sum_imag_f32x4) + tail_result.imag;
175
+ }
176
+
177
+ NK_PUBLIC void nk_vdot_f16c_neonfhm(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_size_t count_pairs,
178
+ nk_f32c_t *result) {
179
+ // Accumulate into 4 float32x2_t vectors (low/high for real/imag)
180
+ float32x2_t sum_real_low_f32x2 = vdup_n_f32(0);
181
+ float32x2_t sum_real_high_f32x2 = vdup_n_f32(0);
182
+ float32x2_t sum_imag_low_f32x2 = vdup_n_f32(0);
183
+ float32x2_t sum_imag_high_f32x2 = vdup_n_f32(0);
184
+
185
+ while (count_pairs >= 4) {
186
+ // Load and deinterleave: vld2 loads 4 complex pairs as 2 x float16x4_t
187
+ int16x4x2_t a_i16x4x2 = vld2_s16((short const *)a_pairs);
188
+ int16x4x2_t b_i16x4x2 = vld2_s16((short const *)b_pairs);
189
+
190
+ float16x4_t a_real_f16x4 = vreinterpret_f16_s16(a_i16x4x2.val[0]);
191
+ float16x4_t a_imag_f16x4 = vreinterpret_f16_s16(a_i16x4x2.val[1]);
192
+ float16x4_t b_real_f16x4 = vreinterpret_f16_s16(b_i16x4x2.val[0]);
193
+ float16x4_t b_imag_f16x4 = vreinterpret_f16_s16(b_i16x4x2.val[1]);
194
+
195
+ // Real: aᵣ × bᵣ + aᵢ × bᵢ (FMLAL for both)
196
+ sum_real_low_f32x2 = vfmlal_low_f16(sum_real_low_f32x2, a_real_f16x4, b_real_f16x4);
197
+ sum_real_low_f32x2 = vfmlal_low_f16(sum_real_low_f32x2, a_imag_f16x4, b_imag_f16x4);
198
+ sum_real_high_f32x2 = vfmlal_high_f16(sum_real_high_f32x2, a_real_f16x4, b_real_f16x4);
199
+ sum_real_high_f32x2 = vfmlal_high_f16(sum_real_high_f32x2, a_imag_f16x4, b_imag_f16x4);
200
+
201
+ // Imag: aᵣ × bᵢ - aᵢ × bᵣ (FMLAL then FMLSL)
202
+ sum_imag_low_f32x2 = vfmlal_low_f16(sum_imag_low_f32x2, a_real_f16x4, b_imag_f16x4);
203
+ sum_imag_low_f32x2 = vfmlsl_low_f16(sum_imag_low_f32x2, a_imag_f16x4, b_real_f16x4);
204
+ sum_imag_high_f32x2 = vfmlal_high_f16(sum_imag_high_f32x2, a_real_f16x4, b_imag_f16x4);
205
+ sum_imag_high_f32x2 = vfmlsl_high_f16(sum_imag_high_f32x2, a_imag_f16x4, b_real_f16x4);
206
+
207
+ count_pairs -= 4, a_pairs += 4, b_pairs += 4;
208
+ }
209
+
210
+ // Combine and reduce
211
+ float32x4_t sum_real_f32x4 = vcombine_f32(sum_real_low_f32x2, sum_real_high_f32x2);
212
+ float32x4_t sum_imag_f32x4 = vcombine_f32(sum_imag_low_f32x2, sum_imag_high_f32x2);
213
+
214
+ // Handle tail with serial fallback
215
+ nk_f32c_t tail_result;
216
+ nk_vdot_f16c_serial(a_pairs, b_pairs, count_pairs, &tail_result);
217
+ result->real = vaddvq_f32(sum_real_f32x4) + tail_result.real;
218
+ result->imag = vaddvq_f32(sum_imag_f32x4) + tail_result.imag;
219
+ }
220
+
221
+ NK_PUBLIC void nk_dot_e4m3_neonfhm(nk_e4m3_t const *a_scalars, nk_e4m3_t const *b_scalars, nk_size_t count_scalars,
222
+ nk_f32_t *result) {
223
+ float16x8_t a_low, a_high, b_low, b_high;
224
+ float32x4_t sum_f32x4 = vdupq_n_f32(0);
225
+ nk_dot_e4m3_neonfhm_cycle:
226
+ if (count_scalars < 16) {
227
+ nk_b128_vec_t a_vec, b_vec;
228
+ nk_partial_load_b8x16_serial_(a_scalars, &a_vec, count_scalars);
229
+ nk_partial_load_b8x16_serial_(b_scalars, &b_vec, count_scalars);
230
+ nk_e4m3x16_to_f16x8x2_neon_(a_vec.u8x16, &a_low, &a_high);
231
+ nk_e4m3x16_to_f16x8x2_neon_(b_vec.u8x16, &b_low, &b_high);
232
+ count_scalars = 0;
233
+ }
234
+ else {
235
+ nk_e4m3x16_to_f16x8x2_neon_(vld1q_u8(a_scalars), &a_low, &a_high);
236
+ nk_e4m3x16_to_f16x8x2_neon_(vld1q_u8(b_scalars), &b_low, &b_high);
237
+ a_scalars += 16, b_scalars += 16, count_scalars -= 16;
238
+ }
239
+ sum_f32x4 = vfmlalq_low_f16(sum_f32x4, a_low, b_low);
240
+ sum_f32x4 = vfmlalq_high_f16(sum_f32x4, a_low, b_low);
241
+ sum_f32x4 = vfmlalq_low_f16(sum_f32x4, a_high, b_high);
242
+ sum_f32x4 = vfmlalq_high_f16(sum_f32x4, a_high, b_high);
243
+ if (count_scalars) goto nk_dot_e4m3_neonfhm_cycle;
244
+ *result = vaddvq_f32(sum_f32x4);
245
+ }
246
+
247
+ NK_PUBLIC void nk_dot_e5m2_neonfhm(nk_e5m2_t const *a_scalars, nk_e5m2_t const *b_scalars, nk_size_t count_scalars,
248
+ nk_f32_t *result) {
249
+ float16x8_t a_low, a_high, b_low, b_high;
250
+ float32x4_t sum_f32x4 = vdupq_n_f32(0);
251
+ nk_dot_e5m2_neonfhm_cycle:
252
+ if (count_scalars < 16) {
253
+ nk_b128_vec_t a_vec, b_vec;
254
+ nk_partial_load_b8x16_serial_(a_scalars, &a_vec, count_scalars);
255
+ nk_partial_load_b8x16_serial_(b_scalars, &b_vec, count_scalars);
256
+ a_low = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(a_vec.u8x16), 8));
257
+ a_high = vreinterpretq_f16_u16(vshll_n_u8(vget_high_u8(a_vec.u8x16), 8));
258
+ b_low = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(b_vec.u8x16), 8));
259
+ b_high = vreinterpretq_f16_u16(vshll_n_u8(vget_high_u8(b_vec.u8x16), 8));
260
+ count_scalars = 0;
261
+ }
262
+ else {
263
+ uint8x16_t a_u8x16 = vld1q_u8(a_scalars);
264
+ uint8x16_t b_u8x16 = vld1q_u8(b_scalars);
265
+ a_low = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(a_u8x16), 8));
266
+ a_high = vreinterpretq_f16_u16(vshll_n_u8(vget_high_u8(a_u8x16), 8));
267
+ b_low = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(b_u8x16), 8));
268
+ b_high = vreinterpretq_f16_u16(vshll_n_u8(vget_high_u8(b_u8x16), 8));
269
+ a_scalars += 16, b_scalars += 16, count_scalars -= 16;
270
+ }
271
+ sum_f32x4 = vfmlalq_low_f16(sum_f32x4, a_low, b_low);
272
+ sum_f32x4 = vfmlalq_high_f16(sum_f32x4, a_low, b_low);
273
+ sum_f32x4 = vfmlalq_low_f16(sum_f32x4, a_high, b_high);
274
+ sum_f32x4 = vfmlalq_high_f16(sum_f32x4, a_high, b_high);
275
+ if (count_scalars) goto nk_dot_e5m2_neonfhm_cycle;
276
+ *result = vaddvq_f32(sum_f32x4);
277
+ }
278
+
279
+ typedef struct nk_dot_e4m3x16_state_neonfhm_t {
280
+ float32x4_t sum_f32x4;
281
+ } nk_dot_e4m3x16_state_neonfhm_t;
282
+
283
+ NK_INTERNAL void nk_dot_e4m3x16_init_neonfhm(nk_dot_e4m3x16_state_neonfhm_t *state) {
284
+ state->sum_f32x4 = vdupq_n_f32(0);
285
+ }
286
+
287
+ NK_INTERNAL void nk_dot_e4m3x16_update_neonfhm(nk_dot_e4m3x16_state_neonfhm_t *state, nk_b128_vec_t a, nk_b128_vec_t b,
288
+ nk_size_t depth_offset, nk_size_t active_dimensions) {
289
+ nk_unused_(depth_offset);
290
+ nk_unused_(active_dimensions);
291
+ // Convert e4m3 → f16 using 16-element LUT path (4× VQTBL4)
292
+ float16x8_t a_low_f16x8, a_high_f16x8, b_low_f16x8, b_high_f16x8;
293
+ nk_e4m3x16_to_f16x8x2_neon_(a.u8x16, &a_low_f16x8, &a_high_f16x8);
294
+ nk_e4m3x16_to_f16x8x2_neon_(b.u8x16, &b_low_f16x8, &b_high_f16x8);
295
+ // FMLAL: widening multiply-accumulate fp16 → f32
296
+ state->sum_f32x4 = vfmlalq_low_f16(state->sum_f32x4, a_low_f16x8, b_low_f16x8);
297
+ state->sum_f32x4 = vfmlalq_high_f16(state->sum_f32x4, a_low_f16x8, b_low_f16x8);
298
+ state->sum_f32x4 = vfmlalq_low_f16(state->sum_f32x4, a_high_f16x8, b_high_f16x8);
299
+ state->sum_f32x4 = vfmlalq_high_f16(state->sum_f32x4, a_high_f16x8, b_high_f16x8);
300
+ }
301
+
302
+ NK_INTERNAL void nk_dot_e4m3x16_finalize_neonfhm( //
303
+ nk_dot_e4m3x16_state_neonfhm_t const *state_a, nk_dot_e4m3x16_state_neonfhm_t const *state_b, //
304
+ nk_dot_e4m3x16_state_neonfhm_t const *state_c, nk_dot_e4m3x16_state_neonfhm_t const *state_d, //
305
+ nk_size_t total_dimensions, nk_b128_vec_t *result) {
306
+ nk_unused_(total_dimensions);
307
+ result->f32s[0] = vaddvq_f32(state_a->sum_f32x4);
308
+ result->f32s[1] = vaddvq_f32(state_b->sum_f32x4);
309
+ result->f32s[2] = vaddvq_f32(state_c->sum_f32x4);
310
+ result->f32s[3] = vaddvq_f32(state_d->sum_f32x4);
311
+ }
312
+
313
+ typedef struct nk_dot_e5m2x16_state_neonfhm_t {
314
+ float32x4_t sum_f32x4;
315
+ } nk_dot_e5m2x16_state_neonfhm_t;
316
+
317
+ NK_INTERNAL void nk_dot_e5m2x16_init_neonfhm(nk_dot_e5m2x16_state_neonfhm_t *state) {
318
+ state->sum_f32x4 = vdupq_n_f32(0);
319
+ }
320
+
321
+ NK_INTERNAL void nk_dot_e5m2x16_update_neonfhm(nk_dot_e5m2x16_state_neonfhm_t *state, nk_b128_vec_t a, nk_b128_vec_t b,
322
+ nk_size_t depth_offset, nk_size_t active_dimensions) {
323
+ nk_unused_(depth_offset);
324
+ nk_unused_(active_dimensions);
325
+ // Convert e5m2 → f16 via SHLL: widen u8→u16 and shift left 8 in one instruction
326
+ float16x8_t a_low_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(a.u8x16), 8));
327
+ float16x8_t a_high_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vget_high_u8(a.u8x16), 8));
328
+ float16x8_t b_low_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vget_low_u8(b.u8x16), 8));
329
+ float16x8_t b_high_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vget_high_u8(b.u8x16), 8));
330
+ // FMLAL: widening multiply-accumulate fp16 → f32
331
+ state->sum_f32x4 = vfmlalq_low_f16(state->sum_f32x4, a_low_f16x8, b_low_f16x8);
332
+ state->sum_f32x4 = vfmlalq_high_f16(state->sum_f32x4, a_low_f16x8, b_low_f16x8);
333
+ state->sum_f32x4 = vfmlalq_low_f16(state->sum_f32x4, a_high_f16x8, b_high_f16x8);
334
+ state->sum_f32x4 = vfmlalq_high_f16(state->sum_f32x4, a_high_f16x8, b_high_f16x8);
335
+ }
336
+
337
+ NK_INTERNAL void nk_dot_e5m2x16_finalize_neonfhm( //
338
+ nk_dot_e5m2x16_state_neonfhm_t const *state_a, nk_dot_e5m2x16_state_neonfhm_t const *state_b, //
339
+ nk_dot_e5m2x16_state_neonfhm_t const *state_c, nk_dot_e5m2x16_state_neonfhm_t const *state_d, //
340
+ nk_size_t total_dimensions, nk_b128_vec_t *result) {
341
+ nk_unused_(total_dimensions);
342
+ result->f32s[0] = vaddvq_f32(state_a->sum_f32x4);
343
+ result->f32s[1] = vaddvq_f32(state_b->sum_f32x4);
344
+ result->f32s[2] = vaddvq_f32(state_c->sum_f32x4);
345
+ result->f32s[3] = vaddvq_f32(state_d->sum_f32x4);
346
+ }
347
+
348
+ #if defined(__clang__)
349
+ #pragma clang attribute pop
350
+ #elif defined(__GNUC__)
351
+ #pragma GCC pop_options
352
+ #endif
353
+
354
+ #if defined(__cplusplus)
355
+ } // extern "C"
356
+ #endif
357
+
358
+ #endif // NK_TARGET_NEONFHM
359
+ #endif // NK_TARGET_ARM_
360
+ #endif // NK_DOT_NEONFHM_H