numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,818 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Dot Products for NEON.
|
|
3
|
+
* @file include/numkong/dot/neon.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date December 27, 2025
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/dot.h
|
|
8
|
+
*
|
|
9
|
+
* @section dot_neon_instructions NEON Dot Product Instructions
|
|
10
|
+
*
|
|
11
|
+
* Key NEON instructions for dot products:
|
|
12
|
+
*
|
|
13
|
+
* Intrinsic Instruction Latency Throughput
|
|
14
|
+
* A76 M4+/V1+/Oryon
|
|
15
|
+
* vfmaq_f32 FMLA (V.4S, V.4S, V.4S) 4cy 2/cy 4/cy
|
|
16
|
+
* vfmaq_f64 FMLA (V.2D, V.2D, V.2D) 4cy 2/cy 4/cy
|
|
17
|
+
* vmulq_f32 FMUL (V.4S, V.4S, V.4S) 3cy 2/cy 4/cy
|
|
18
|
+
* vaddvq_f32 FADDP+FADDP (reduce) 5cy 1/cy 1/cy
|
|
19
|
+
* vaddvq_f64 FADDP (V.2D to scalar) 3cy 1/cy 1/cy
|
|
20
|
+
* vcvt_f64_f32 FCVTL (V.2D, V.2S) 3cy 2/cy 2/cy
|
|
21
|
+
* vld2_f32 LD2 ({Vt.2S, Vt2.2S}, [Xn]) 4cy 1/cy 1/cy
|
|
22
|
+
*
|
|
23
|
+
* FMA throughput doubles on cores with 4 SIMD pipes (Apple M4+, Graviton3+, Oryon), but
|
|
24
|
+
* horizontal reductions remain at 1/cy on all cores and become the main bottleneck.
|
|
25
|
+
*
|
|
26
|
+
* For f32 dot products, we upcast to f64 for accumulation to preserve precision and
|
|
27
|
+
* avoid catastrophic cancellation in large-magnitude sums.
|
|
28
|
+
*
|
|
29
|
+
* @section dot_neon_stateful Stateful Streaming Logic
|
|
30
|
+
*
|
|
31
|
+
* To build memory-optimal tiled algorithms, this file defines following structures and force-inlined
|
|
32
|
+
* `NK_INTERNAL` functions:
|
|
33
|
+
*
|
|
34
|
+
* - nk_dot_f32x2 state for f32 inputs with double-precision accumulation,
|
|
35
|
+
* - nk_dot_f64x2 state with Dot2 stable dot-products for f64 inputs.
|
|
36
|
+
*
|
|
37
|
+
* @code{c}
|
|
38
|
+
* nk_dot_f32x2_state_neon_t state_first, state_second, state_third, state_fourth;
|
|
39
|
+
* float32x2_t query_f32x2, target_first_f32x2, target_second_f32x2, target_third_f32x2, target_fourth_f32x2;
|
|
40
|
+
* nk_dot_f32x2_init_neon(&state_first);
|
|
41
|
+
* nk_dot_f32x2_init_neon(&state_second);
|
|
42
|
+
* nk_dot_f32x2_init_neon(&state_third);
|
|
43
|
+
* nk_dot_f32x2_init_neon(&state_fourth);
|
|
44
|
+
* for (nk_size_t idx = 0; idx + 2 <= depth; idx += 2) {
|
|
45
|
+
* query_f32x2 = vld1_f32(query_ptr + idx);
|
|
46
|
+
* target_first_f32x2 = vld1_f32(target_first_ptr + idx);
|
|
47
|
+
* target_second_f32x2 = vld1_f32(target_second_ptr + idx);
|
|
48
|
+
* target_third_f32x2 = vld1_f32(target_third_ptr + idx);
|
|
49
|
+
* target_fourth_f32x2 = vld1_f32(target_fourth_ptr + idx);
|
|
50
|
+
* nk_dot_f32x2_update_neon(&state_first, query_f32x2, target_first_f32x2, idx, 2);
|
|
51
|
+
* nk_dot_f32x2_update_neon(&state_second, query_f32x2, target_second_f32x2, idx, 2);
|
|
52
|
+
* nk_dot_f32x2_update_neon(&state_third, query_f32x2, target_third_f32x2, idx, 2);
|
|
53
|
+
* nk_dot_f32x2_update_neon(&state_fourth, query_f32x2, target_fourth_f32x2, idx, 2);
|
|
54
|
+
* }
|
|
55
|
+
* float32x4_t results_f32x4;
|
|
56
|
+
* nk_dot_f32x2_finalize_neon(&state_first, &state_second, &state_third, &state_fourth, depth, &results_f32x4);
|
|
57
|
+
* @endcode
|
|
58
|
+
*
|
|
59
|
+
* For f64 inputs, Dot2 compensated summation provides numerical stability:
|
|
60
|
+
*
|
|
61
|
+
* @code{c}
|
|
62
|
+
* nk_dot_f64x2_state_neon_t state_first, state_second, state_third, state_fourth;
|
|
63
|
+
* float64x2_t query_f64x2, target_first_f64x2, target_second_f64x2, target_third_f64x2, target_fourth_f64x2;
|
|
64
|
+
* nk_dot_f64x2_init_neon(&state_first);
|
|
65
|
+
* nk_dot_f64x2_init_neon(&state_second);
|
|
66
|
+
* nk_dot_f64x2_init_neon(&state_third);
|
|
67
|
+
* nk_dot_f64x2_init_neon(&state_fourth);
|
|
68
|
+
* for (nk_size_t idx = 0; idx + 2 <= depth; idx += 2) {
|
|
69
|
+
* query_f64x2 = vld1q_f64(query_ptr + idx);
|
|
70
|
+
* target_first_f64x2 = vld1q_f64(target_first_ptr + idx);
|
|
71
|
+
* target_second_f64x2 = vld1q_f64(target_second_ptr + idx);
|
|
72
|
+
* target_third_f64x2 = vld1q_f64(target_third_ptr + idx);
|
|
73
|
+
* target_fourth_f64x2 = vld1q_f64(target_fourth_ptr + idx);
|
|
74
|
+
* nk_dot_f64x2_update_neon(&state_first, query_f64x2, target_first_f64x2, idx, 2);
|
|
75
|
+
* nk_dot_f64x2_update_neon(&state_second, query_f64x2, target_second_f64x2, idx, 2);
|
|
76
|
+
* nk_dot_f64x2_update_neon(&state_third, query_f64x2, target_third_f64x2, idx, 2);
|
|
77
|
+
* nk_dot_f64x2_update_neon(&state_fourth, query_f64x2, target_fourth_f64x2, idx, 2);
|
|
78
|
+
* }
|
|
79
|
+
* float64x4_t results_f64x4;
|
|
80
|
+
* nk_dot_f64x2_finalize_neon(&state_first, &state_second, &state_third, &state_fourth, depth, &results_f64x4);
|
|
81
|
+
* @endcode
|
|
82
|
+
*/
|
|
83
|
+
#ifndef NK_DOT_NEON_H
|
|
84
|
+
#define NK_DOT_NEON_H
|
|
85
|
+
|
|
86
|
+
#if NK_TARGET_ARM_
|
|
87
|
+
#if NK_TARGET_NEON
|
|
88
|
+
|
|
89
|
+
#include "numkong/cast/neon.h" // `nk_e4m3x8_to_f16x8_neon_`
|
|
90
|
+
|
|
91
|
+
#if defined(__cplusplus)
|
|
92
|
+
extern "C" {
|
|
93
|
+
#endif
|
|
94
|
+
|
|
95
|
+
#if defined(__clang__)
|
|
96
|
+
#pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
|
|
97
|
+
#elif defined(__GNUC__)
|
|
98
|
+
#pragma GCC push_options
|
|
99
|
+
#pragma GCC target("arch=armv8-a+simd")
|
|
100
|
+
#endif
|
|
101
|
+
|
|
102
|
+
/**
 *  @brief Compensated horizontal sum of 2 f64 lanes via TwoSum.
 *
 *  Merges a running vector sum with its accumulated compensation (error) vector,
 *  then collapses both lanes into a single scalar without discarding the rounding
 *  errors along the way. Serves as the final reduction step of the Dot2 kernels
 *  in this file. The exact sequence of adds/subtracts is the error-free TwoSum
 *  transformation — do not reassociate or "simplify" these expressions.
 *
 *  @param sum_f64x2 Per-lane high-order partial sums.
 *  @param compensation_f64x2 Per-lane accumulated rounding errors.
 *  @return Scalar sum with the tracked error folded back in.
 */
NK_INTERNAL nk_f64_t nk_dot_stable_sum_f64x2_neon_(float64x2_t sum_f64x2, float64x2_t compensation_f64x2) {
    // Vector TwoSum: tentative = round(sum + comp); recover each lane's exact error.
    float64x2_t tentative_sum_f64x2 = vaddq_f64(sum_f64x2, compensation_f64x2);
    float64x2_t virtual_addend_f64x2 = vsubq_f64(tentative_sum_f64x2, sum_f64x2);
    float64x2_t rounding_error_f64x2 = vaddq_f64(
        vsubq_f64(sum_f64x2, vsubq_f64(tentative_sum_f64x2, virtual_addend_f64x2)),
        vsubq_f64(compensation_f64x2, virtual_addend_f64x2));
    // Scalar TwoSum collapsing 2 lanes into 1, again capturing the exact error term.
    nk_f64_t lower_sum = vgetq_lane_f64(tentative_sum_f64x2, 0);
    nk_f64_t upper_sum = vgetq_lane_f64(tentative_sum_f64x2, 1);
    nk_f64_t lower_error = vgetq_lane_f64(rounding_error_f64x2, 0);
    nk_f64_t upper_error = vgetq_lane_f64(rounding_error_f64x2, 1);
    nk_f64_t tentative_sum = lower_sum + upper_sum;
    nk_f64_t virtual_addend = tentative_sum - lower_sum;
    nk_f64_t rounding_error = (lower_sum - (tentative_sum - virtual_addend)) + (upper_sum - virtual_addend);
    // Fold every captured error back into the result with one final addition.
    return tentative_sum + (lower_error + upper_error + rounding_error);
}
|
|
120
|
+
|
|
121
|
+
#pragma region - Traditional Floats
|
|
122
|
+
|
|
123
|
+
/**
 *  @brief Dot product of two f32 vectors with f64 accumulation on NEON.
 *
 *  Each step widens one pair of f32 lanes to f64 before the FMA, preserving
 *  precision over long reductions while avoiding the slow vget_low/high split.
 *
 *  @param a_scalars First operand, `count_scalars` entries.
 *  @param b_scalars Second operand, `count_scalars` entries.
 *  @param count_scalars Number of scalars in each operand.
 *  @param result Output: the f64 dot product.
 */
NK_PUBLIC void nk_dot_f32_neon(nk_f32_t const *a_scalars, nk_f32_t const *b_scalars, nk_size_t count_scalars,
                               nk_f64_t *result) {
    nk_size_t progress = 0;
    float64x2_t accumulator_f64x2 = vdupq_n_f64(0);
    // Main loop: widen 2 f32 lanes per operand, then one fused multiply-add.
    while (progress + 2 <= count_scalars) {
        float64x2_t a_wide_f64x2 = vcvt_f64_f32(vld1_f32(a_scalars + progress));
        float64x2_t b_wide_f64x2 = vcvt_f64_f32(vld1_f32(b_scalars + progress));
        accumulator_f64x2 = vfmaq_f64(accumulator_f64x2, a_wide_f64x2, b_wide_f64x2);
        progress += 2;
    }
    nk_f64_t total_f64 = vaddvq_f64(accumulator_f64x2);
    // Scalar tail: at most one remaining element.
    while (progress < count_scalars) {
        total_f64 += (nk_f64_t)a_scalars[progress] * (nk_f64_t)b_scalars[progress];
        ++progress;
    }
    *result = total_f64;
}
|
|
140
|
+
|
|
141
|
+
/**
 *  @brief Complex dot product of two f32c vectors with f64 accumulation on NEON.
 *
 *  Consumes 2 complex pairs per step, de-interleaving with `vld2_f32` and widening
 *  to f64 before the FMAs.
 *
 *  NOTE: ARMv8.3-A FCMLA (`vcmlaq_rot0/rot90_f32`) was benchmarked against this
 *  deinterleave + 4-FMA scheme and lost: FCMLA keeps operands interleaved and issues
 *  only 2 instructions over 2 pairs, while `vld2_f32` feeds 4 independent FMAs that
 *  saturate all 4 SIMD pipes. Measured on Apple M4 at n=4096: manual f32 39.7 GiB/s
 *  vs FCMLA 17.1 GiB/s (2.3x slower). Since we upcast to f64 for precision anyway,
 *  FCMLA offers neither speed nor accuracy here.
 */
NK_PUBLIC void nk_dot_f32c_neon(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
                                nk_f64c_t *result) {
    float64x2_t real_acc_f64x2 = vdupq_n_f64(0);
    float64x2_t imag_acc_f64x2 = vdupq_n_f64(0);
    nk_size_t progress = 0;
    for (; progress + 2 <= count_pairs; progress += 2) {
        // De-interleave 2 complex pairs into real/imag half-vectors and widen:
        float32x2x2_t a_parts = vld2_f32((nk_f32_t const *)(a_pairs + progress));
        float32x2x2_t b_parts = vld2_f32((nk_f32_t const *)(b_pairs + progress));
        float64x2_t a_real = vcvt_f64_f32(a_parts.val[0]);
        float64x2_t a_imag = vcvt_f64_f32(a_parts.val[1]);
        float64x2_t b_real = vcvt_f64_f32(b_parts.val[0]);
        float64x2_t b_imag = vcvt_f64_f32(b_parts.val[1]);
        // real += aᵣ·bᵣ - aᵢ·bᵢ, imag += aᵣ·bᵢ + aᵢ·bᵣ (FMA order kept for identical rounding)
        real_acc_f64x2 = vfmaq_f64(real_acc_f64x2, a_real, b_real);
        real_acc_f64x2 = vfmsq_f64(real_acc_f64x2, a_imag, b_imag);
        imag_acc_f64x2 = vfmaq_f64(imag_acc_f64x2, a_real, b_imag);
        imag_acc_f64x2 = vfmaq_f64(imag_acc_f64x2, a_imag, b_real);
    }
    // Horizontal reductions:
    nk_f64_t real_total = vaddvq_f64(real_acc_f64x2);
    nk_f64_t imag_total = vaddvq_f64(imag_acc_f64x2);
    // Scalar tail: at most one remaining pair.
    for (; progress != count_pairs; ++progress) {
        nk_f32c_t a_pair = a_pairs[progress], b_pair = b_pairs[progress];
        nk_f64_t ar = a_pair.real, ai = a_pair.imag, br = b_pair.real, bi = b_pair.imag;
        real_total += ar * br - ai * bi;
        imag_total += ar * bi + ai * br;
    }
    result->real = real_total;
    result->imag = imag_total;
}
|
|
181
|
+
|
|
182
|
+
/**
 *  @brief Conjugate (Hermitian) dot product conj(a)·b for f32c vectors on NEON.
 *
 *  Same structure as `nk_dot_f32c_neon`: 2 complex pairs per step, de-interleaved
 *  with `vld2_f32` and widened to f64 for accumulation; only the sign pattern of
 *  the imaginary contributions differs.
 */
NK_PUBLIC void nk_vdot_f32c_neon(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
                                 nk_f64c_t *result) {
    float64x2_t real_acc_f64x2 = vdupq_n_f64(0);
    float64x2_t imag_acc_f64x2 = vdupq_n_f64(0);
    nk_size_t progress = 0;
    for (; progress + 2 <= count_pairs; progress += 2) {
        // De-interleave 2 complex pairs into real/imag half-vectors and widen:
        float32x2x2_t a_parts = vld2_f32((nk_f32_t const *)(a_pairs + progress));
        float32x2x2_t b_parts = vld2_f32((nk_f32_t const *)(b_pairs + progress));
        float64x2_t a_real = vcvt_f64_f32(a_parts.val[0]);
        float64x2_t a_imag = vcvt_f64_f32(a_parts.val[1]);
        float64x2_t b_real = vcvt_f64_f32(b_parts.val[0]);
        float64x2_t b_imag = vcvt_f64_f32(b_parts.val[1]);
        // real += aᵣ·bᵣ + aᵢ·bᵢ, imag += aᵣ·bᵢ - aᵢ·bᵣ (FMA order kept for identical rounding)
        real_acc_f64x2 = vfmaq_f64(real_acc_f64x2, a_real, b_real);
        real_acc_f64x2 = vfmaq_f64(real_acc_f64x2, a_imag, b_imag);
        imag_acc_f64x2 = vfmaq_f64(imag_acc_f64x2, a_real, b_imag);
        imag_acc_f64x2 = vfmsq_f64(imag_acc_f64x2, a_imag, b_real);
    }
    // Horizontal reductions:
    nk_f64_t real_total = vaddvq_f64(real_acc_f64x2);
    nk_f64_t imag_total = vaddvq_f64(imag_acc_f64x2);
    // Scalar tail: at most one remaining pair.
    for (; progress != count_pairs; ++progress) {
        nk_f32c_t a_pair = a_pairs[progress], b_pair = b_pairs[progress];
        nk_f64_t ar = a_pair.real, ai = a_pair.imag, br = b_pair.real, bi = b_pair.imag;
        real_total += ar * br + ai * bi;
        imag_total += ar * bi - ai * br;
    }
    result->real = real_total;
    result->imag = imag_total;
}
|
|
216
|
+
|
|
217
|
+
/**
 *  @brief Running state for 64-bit dot accumulation over f32 scalars on NEON.
 *
 *  Processes 2 f32 values at a time, upcasting to f64 for accumulation to avoid
 *  catastrophic cancellation in long reductions.
 *
 *  Lifecycle: `nk_dot_f32x2_init_neon` → repeated `nk_dot_f32x2_update_neon` →
 *  `nk_dot_f32x2_finalize_neon` (which reduces four such states at once).
 */
typedef struct nk_dot_f32x2_state_neon_t {
    float64x2_t sum_f64x2; // 2 independent f64 partial sums, horizontally reduced at finalize
} nk_dot_f32x2_state_neon_t;
|
|
226
|
+
|
|
227
|
+
/** @brief Zeroes the running f64 accumulator before a new reduction. */
NK_INTERNAL void nk_dot_f32x2_init_neon(nk_dot_f32x2_state_neon_t *state) {
    state->sum_f64x2 = vdupq_n_f64(0);
}
|
|
228
|
+
|
|
229
|
+
/**
 *  @brief Widens 2 f32 lanes from each operand to f64 and fuses them into the running sum.
 *
 *  @param state Accumulation state produced by `nk_dot_f32x2_init_neon`.
 *  @param a Raw 64-bit payload holding 2 f32 scalars of the first operand.
 *  @param b Raw 64-bit payload holding 2 f32 scalars of the second operand.
 *  @param depth_offset Unused: the accumulator is positionless.
 *  @param active_dimensions Unused: callers always supply 2 full lanes.
 */
NK_INTERNAL void nk_dot_f32x2_update_neon(nk_dot_f32x2_state_neon_t *state, nk_b64_vec_t a, nk_b64_vec_t b,
                                          nk_size_t depth_offset, nk_size_t active_dimensions) {
    nk_unused_(depth_offset);
    nk_unused_(active_dimensions);
    // Reinterpret the raw payloads as f32 pairs and widen before the FMA.
    float64x2_t widened_a_f64x2 = vcvt_f64_f32(vreinterpret_f32_u32(a.u32x2));
    float64x2_t widened_b_f64x2 = vcvt_f64_f32(vreinterpret_f32_u32(b.u32x2));
    state->sum_f64x2 = vfmaq_f64(state->sum_f64x2, widened_a_f64x2, widened_b_f64x2);
}
|
|
240
|
+
|
|
241
|
+
/**
 *  @brief Horizontally reduces four independent accumulation states into 4 f64 results.
 *
 *  @param state_a,state_b,state_c,state_d Per-target accumulation states.
 *  @param total_dimensions Unused: no tail correction is needed for plain sums.
 *  @param result Output: `f64s[0..3]` receive the reduced dot products in order.
 */
NK_INTERNAL void nk_dot_f32x2_finalize_neon(                                          //
    nk_dot_f32x2_state_neon_t const *state_a, nk_dot_f32x2_state_neon_t const *state_b, //
    nk_dot_f32x2_state_neon_t const *state_c, nk_dot_f32x2_state_neon_t const *state_d, //
    nk_size_t total_dimensions, nk_b256_vec_t *result) {
    nk_unused_(total_dimensions);
    nk_dot_f32x2_state_neon_t const *states[4] = {state_a, state_b, state_c, state_d};
    for (int lane = 0; lane < 4; ++lane) result->f64s[lane] = vaddvq_f64(states[lane]->sum_f64x2);
}
|
|
251
|
+
|
|
252
|
+
/**
 *  @brief Compensated dot product of two f64 vectors on NEON.
 *
 *  Implements the Dot2 algorithm (Ogita-Rump-Oishi 2005): every product and every
 *  addition keeps its exact rounding error in a separate compensation vector,
 *  yielding results as accurate as if computed in twice the working precision.
 *
 *  The original goto-label loop is replaced with an equivalent structured `do/while`:
 *  the body always runs at least once, so inputs shorter than one full vector
 *  (including empty ones) flow through the same zero-padded tail path.
 *
 *  @param a_scalars First operand, `count_scalars` entries.
 *  @param b_scalars Second operand, `count_scalars` entries.
 *  @param count_scalars Number of scalars in each operand.
 *  @param result Output: the compensated f64 dot product.
 */
NK_PUBLIC void nk_dot_f64_neon(nk_f64_t const *a_scalars, nk_f64_t const *b_scalars, nk_size_t count_scalars,
                               nk_f64_t *result) {
    float64x2_t sum_f64x2 = vdupq_n_f64(0);
    float64x2_t compensation_f64x2 = vdupq_n_f64(0);
    float64x2_t a_f64x2, b_f64x2;

    do {
        if (count_scalars < 2) {
            // Tail: load the 0..1 remaining scalars zero-padded; the padding lanes
            // contribute exact zeros to both the sum and the compensation.
            nk_b128_vec_t a_tail, b_tail;
            nk_partial_load_b64x2_serial_(a_scalars, &a_tail, count_scalars);
            nk_partial_load_b64x2_serial_(b_scalars, &b_tail, count_scalars);
            a_f64x2 = a_tail.f64x2;
            b_f64x2 = b_tail.f64x2;
            count_scalars = 0;
        }
        else {
            a_f64x2 = vld1q_f64(a_scalars);
            b_f64x2 = vld1q_f64(b_scalars);
            a_scalars += 2, b_scalars += 2, count_scalars -= 2;
        }

        // TwoProd: h = a × b, r = fma(a, b, -h) captures the product's rounding error
        float64x2_t product_f64x2 = vmulq_f64(a_f64x2, b_f64x2);
        float64x2_t product_error_f64x2 = vnegq_f64(vfmsq_f64(product_f64x2, a_f64x2, b_f64x2));
        // TwoSum: (t, q) = TwoSum(sum, h) where t = sum + h rounded, q = exact error
        float64x2_t tentative_sum_f64x2 = vaddq_f64(sum_f64x2, product_f64x2);
        float64x2_t virtual_addend_f64x2 = vsubq_f64(tentative_sum_f64x2, sum_f64x2);
        float64x2_t sum_error_f64x2 =
            vaddq_f64(vsubq_f64(sum_f64x2, vsubq_f64(tentative_sum_f64x2, virtual_addend_f64x2)),
                      vsubq_f64(product_f64x2, virtual_addend_f64x2));
        // Update: sum = t, compensation += q + r
        sum_f64x2 = tentative_sum_f64x2;
        compensation_f64x2 = vaddq_f64(compensation_f64x2, vaddq_f64(sum_error_f64x2, product_error_f64x2));
    } while (count_scalars);

    // Compensated horizontal reduction preserving the Dot2 error tracking
    *result = nk_dot_stable_sum_f64x2_neon_(sum_f64x2, compensation_f64x2);
}
|
|
290
|
+
|
|
291
|
+
NK_PUBLIC void nk_dot_f64c_neon(nk_f64c_t const *a_pairs, nk_f64c_t const *b_pairs, nk_size_t count_pairs,
                                nk_f64c_t *result) {
    // Dot2 algorithm (Ogita-Rump-Oishi 2005) for compensated complex dot product.
    // Real and imaginary accumulators each carry their own compensation vector that
    // collects the exact rounding error of every TwoProd and TwoSum step below.
    // NOTE: the statement order inside each TwoSum is load-bearing; reassociating
    // these adds/subs destroys the error-free transformation.
    float64x2_t sum_real_f64x2 = vdupq_n_f64(0);
    float64x2_t sum_imag_f64x2 = vdupq_n_f64(0);
    float64x2_t compensation_real_f64x2 = vdupq_n_f64(0);
    float64x2_t compensation_imag_f64x2 = vdupq_n_f64(0);
    float64x2_t a_real_f64x2, a_imag_f64x2, b_real_f64x2, b_imag_f64x2;

nk_dot_f64c_neon_cycle:
    if (count_pairs < 2) {
        // Tail of 0 or 1 pair: zero-pad, then split interleaved (re, im) into planes.
        // For one pair, lane 0 of each plane holds the value and lane 1 is zero.
        nk_b128_vec_t a_tail, b_tail;
        nk_partial_load_b64x2_serial_(a_pairs, &a_tail, count_pairs * 2);
        nk_partial_load_b64x2_serial_(b_pairs, &b_tail, count_pairs * 2);
        float64x2_t zeros = vdupq_n_f64(0);
        a_real_f64x2 = vzip1q_f64(a_tail.f64x2, zeros);
        a_imag_f64x2 = vzip2q_f64(a_tail.f64x2, zeros);
        b_real_f64x2 = vzip1q_f64(b_tail.f64x2, zeros);
        b_imag_f64x2 = vzip2q_f64(b_tail.f64x2, zeros);
        count_pairs = 0;
    }
    else {
        // De-interleaving load: two complex numbers split into real/imag planes.
        float64x2x2_t a_f64x2x2 = vld2q_f64((nk_f64_t const *)a_pairs);
        float64x2x2_t b_f64x2x2 = vld2q_f64((nk_f64_t const *)b_pairs);
        a_real_f64x2 = a_f64x2x2.val[0];
        a_imag_f64x2 = a_f64x2x2.val[1];
        b_real_f64x2 = b_f64x2x2.val[0];
        b_imag_f64x2 = b_f64x2x2.val[1];
        a_pairs += 2, b_pairs += 2, count_pairs -= 2;
    }

    // Real part: aᵣ × bᵣ - aᵢ × bᵢ (using TwoProd and TwoSum)
    // First term: +aᵣ × bᵣ
    // TwoProd: vfmsq(h, a, b) = h - a*b, so negating yields the exact product error.
    float64x2_t product_rr_f64x2 = vmulq_f64(a_real_f64x2, b_real_f64x2);
    float64x2_t error_rr_f64x2 = vnegq_f64(vfmsq_f64(product_rr_f64x2, a_real_f64x2, b_real_f64x2));
    float64x2_t tentative_sum_real_f64x2 = vaddq_f64(sum_real_f64x2, product_rr_f64x2);
    float64x2_t virtual_addend_real_f64x2 = vsubq_f64(tentative_sum_real_f64x2, sum_real_f64x2);
    float64x2_t error_sum_real_f64x2 = vaddq_f64(
        vsubq_f64(sum_real_f64x2, vsubq_f64(tentative_sum_real_f64x2, virtual_addend_real_f64x2)),
        vsubq_f64(product_rr_f64x2, virtual_addend_real_f64x2));
    sum_real_f64x2 = tentative_sum_real_f64x2;
    compensation_real_f64x2 = vaddq_f64(compensation_real_f64x2, vaddq_f64(error_sum_real_f64x2, error_rr_f64x2));
    // Second term: -aᵢ × bᵢ (negate product and error, then standard TwoSum)
    float64x2_t product_ii_f64x2 = vmulq_f64(a_imag_f64x2, b_imag_f64x2);
    float64x2_t error_ii_f64x2 = vnegq_f64(vfmsq_f64(product_ii_f64x2, a_imag_f64x2, b_imag_f64x2));
    float64x2_t neg_product_ii_f64x2 = vnegq_f64(product_ii_f64x2);
    float64x2_t neg_error_ii_f64x2 = vnegq_f64(error_ii_f64x2);
    tentative_sum_real_f64x2 = vaddq_f64(sum_real_f64x2, neg_product_ii_f64x2);
    virtual_addend_real_f64x2 = vsubq_f64(tentative_sum_real_f64x2, sum_real_f64x2);
    error_sum_real_f64x2 = vaddq_f64(
        vsubq_f64(sum_real_f64x2, vsubq_f64(tentative_sum_real_f64x2, virtual_addend_real_f64x2)),
        vsubq_f64(neg_product_ii_f64x2, virtual_addend_real_f64x2));
    sum_real_f64x2 = tentative_sum_real_f64x2;
    compensation_real_f64x2 = vaddq_f64(compensation_real_f64x2, vaddq_f64(error_sum_real_f64x2, neg_error_ii_f64x2));

    // Imag part: aᵣ × bᵢ + aᵢ × bᵣ (using TwoProd and TwoSum)
    // First term: +aᵣ × bᵢ
    float64x2_t product_ri_f64x2 = vmulq_f64(a_real_f64x2, b_imag_f64x2);
    float64x2_t error_ri_f64x2 = vnegq_f64(vfmsq_f64(product_ri_f64x2, a_real_f64x2, b_imag_f64x2));
    float64x2_t tentative_sum_imag_f64x2 = vaddq_f64(sum_imag_f64x2, product_ri_f64x2);
    float64x2_t virtual_addend_imag_f64x2 = vsubq_f64(tentative_sum_imag_f64x2, sum_imag_f64x2);
    float64x2_t error_sum_imag_f64x2 = vaddq_f64(
        vsubq_f64(sum_imag_f64x2, vsubq_f64(tentative_sum_imag_f64x2, virtual_addend_imag_f64x2)),
        vsubq_f64(product_ri_f64x2, virtual_addend_imag_f64x2));
    sum_imag_f64x2 = tentative_sum_imag_f64x2;
    compensation_imag_f64x2 = vaddq_f64(compensation_imag_f64x2, vaddq_f64(error_sum_imag_f64x2, error_ri_f64x2));
    // Second term: +aᵢ × bᵣ
    float64x2_t product_ir_f64x2 = vmulq_f64(a_imag_f64x2, b_real_f64x2);
    float64x2_t error_ir_f64x2 = vnegq_f64(vfmsq_f64(product_ir_f64x2, a_imag_f64x2, b_real_f64x2));
    tentative_sum_imag_f64x2 = vaddq_f64(sum_imag_f64x2, product_ir_f64x2);
    virtual_addend_imag_f64x2 = vsubq_f64(tentative_sum_imag_f64x2, sum_imag_f64x2);
    error_sum_imag_f64x2 = vaddq_f64(
        vsubq_f64(sum_imag_f64x2, vsubq_f64(tentative_sum_imag_f64x2, virtual_addend_imag_f64x2)),
        vsubq_f64(product_ir_f64x2, virtual_addend_imag_f64x2));
    sum_imag_f64x2 = tentative_sum_imag_f64x2;
    compensation_imag_f64x2 = vaddq_f64(compensation_imag_f64x2, vaddq_f64(error_sum_imag_f64x2, error_ir_f64x2));

    if (count_pairs) goto nk_dot_f64c_neon_cycle;
    // Compensated horizontal reduction preserving Dot2 error tracking
    result->real = nk_dot_stable_sum_f64x2_neon_(sum_real_f64x2, compensation_real_f64x2);
    result->imag = nk_dot_stable_sum_f64x2_neon_(sum_imag_f64x2, compensation_imag_f64x2);
}
|
|
373
|
+
|
|
374
|
+
NK_PUBLIC void nk_vdot_f64c_neon(nk_f64c_t const *a_pairs, nk_f64c_t const *b_pairs, nk_size_t count_pairs,
                                 nk_f64c_t *result) {
    // Dot2 algorithm (Ogita-Rump-Oishi 2005) for compensated conjugate dot product.
    // Differs from nk_dot_f64c_neon only in the signs: real accumulates +aᵢbᵢ and
    // imag accumulates -aᵢbᵣ, i.e. conj(a)·b. Statement order inside each TwoSum
    // is load-bearing; do not reassociate.
    float64x2_t sum_real_f64x2 = vdupq_n_f64(0);
    float64x2_t sum_imag_f64x2 = vdupq_n_f64(0);
    float64x2_t compensation_real_f64x2 = vdupq_n_f64(0);
    float64x2_t compensation_imag_f64x2 = vdupq_n_f64(0);
    float64x2_t a_real_f64x2, a_imag_f64x2, b_real_f64x2, b_imag_f64x2;

nk_vdot_f64c_neon_cycle:
    if (count_pairs < 2) {
        // Tail of 0 or 1 pair: zero-pad, then split interleaved (re, im) into planes.
        nk_b128_vec_t a_tail, b_tail;
        nk_partial_load_b64x2_serial_(a_pairs, &a_tail, count_pairs * 2);
        nk_partial_load_b64x2_serial_(b_pairs, &b_tail, count_pairs * 2);
        float64x2_t zeros = vdupq_n_f64(0);
        a_real_f64x2 = vzip1q_f64(a_tail.f64x2, zeros);
        a_imag_f64x2 = vzip2q_f64(a_tail.f64x2, zeros);
        b_real_f64x2 = vzip1q_f64(b_tail.f64x2, zeros);
        b_imag_f64x2 = vzip2q_f64(b_tail.f64x2, zeros);
        count_pairs = 0;
    }
    else {
        // De-interleaving load: two complex numbers split into real/imag planes.
        float64x2x2_t a_f64x2x2 = vld2q_f64((nk_f64_t const *)a_pairs);
        float64x2x2_t b_f64x2x2 = vld2q_f64((nk_f64_t const *)b_pairs);
        a_real_f64x2 = a_f64x2x2.val[0];
        a_imag_f64x2 = a_f64x2x2.val[1];
        b_real_f64x2 = b_f64x2x2.val[0];
        b_imag_f64x2 = b_f64x2x2.val[1];
        a_pairs += 2, b_pairs += 2, count_pairs -= 2;
    }

    // Real part: aᵣ × bᵣ + aᵢ × bᵢ (using TwoProd and TwoSum)
    // First term: +aᵣ × bᵣ
    float64x2_t product_rr_f64x2 = vmulq_f64(a_real_f64x2, b_real_f64x2);
    float64x2_t error_rr_f64x2 = vnegq_f64(vfmsq_f64(product_rr_f64x2, a_real_f64x2, b_real_f64x2));
    float64x2_t tentative_sum_real_f64x2 = vaddq_f64(sum_real_f64x2, product_rr_f64x2);
    float64x2_t virtual_addend_real_f64x2 = vsubq_f64(tentative_sum_real_f64x2, sum_real_f64x2);
    float64x2_t error_sum_real_f64x2 = vaddq_f64(
        vsubq_f64(sum_real_f64x2, vsubq_f64(tentative_sum_real_f64x2, virtual_addend_real_f64x2)),
        vsubq_f64(product_rr_f64x2, virtual_addend_real_f64x2));
    sum_real_f64x2 = tentative_sum_real_f64x2;
    compensation_real_f64x2 = vaddq_f64(compensation_real_f64x2, vaddq_f64(error_sum_real_f64x2, error_rr_f64x2));
    // Second term: +aᵢ × bᵢ (conjugate: add instead of subtract)
    float64x2_t product_ii_f64x2 = vmulq_f64(a_imag_f64x2, b_imag_f64x2);
    float64x2_t error_ii_f64x2 = vnegq_f64(vfmsq_f64(product_ii_f64x2, a_imag_f64x2, b_imag_f64x2));
    tentative_sum_real_f64x2 = vaddq_f64(sum_real_f64x2, product_ii_f64x2);
    virtual_addend_real_f64x2 = vsubq_f64(tentative_sum_real_f64x2, sum_real_f64x2);
    error_sum_real_f64x2 = vaddq_f64(
        vsubq_f64(sum_real_f64x2, vsubq_f64(tentative_sum_real_f64x2, virtual_addend_real_f64x2)),
        vsubq_f64(product_ii_f64x2, virtual_addend_real_f64x2));
    sum_real_f64x2 = tentative_sum_real_f64x2;
    compensation_real_f64x2 = vaddq_f64(compensation_real_f64x2, vaddq_f64(error_sum_real_f64x2, error_ii_f64x2));

    // Imag part: aᵣ × bᵢ - aᵢ × bᵣ (using TwoProd and TwoSum)
    // First term: +aᵣ × bᵢ
    float64x2_t product_ri_f64x2 = vmulq_f64(a_real_f64x2, b_imag_f64x2);
    float64x2_t error_ri_f64x2 = vnegq_f64(vfmsq_f64(product_ri_f64x2, a_real_f64x2, b_imag_f64x2));
    float64x2_t tentative_sum_imag_f64x2 = vaddq_f64(sum_imag_f64x2, product_ri_f64x2);
    float64x2_t virtual_addend_imag_f64x2 = vsubq_f64(tentative_sum_imag_f64x2, sum_imag_f64x2);
    float64x2_t error_sum_imag_f64x2 = vaddq_f64(
        vsubq_f64(sum_imag_f64x2, vsubq_f64(tentative_sum_imag_f64x2, virtual_addend_imag_f64x2)),
        vsubq_f64(product_ri_f64x2, virtual_addend_imag_f64x2));
    sum_imag_f64x2 = tentative_sum_imag_f64x2;
    compensation_imag_f64x2 = vaddq_f64(compensation_imag_f64x2, vaddq_f64(error_sum_imag_f64x2, error_ri_f64x2));
    // Second term: -aᵢ × bᵣ (conjugate: negate product and error, then standard TwoSum)
    float64x2_t product_ir_f64x2 = vmulq_f64(a_imag_f64x2, b_real_f64x2);
    float64x2_t error_ir_f64x2 = vnegq_f64(vfmsq_f64(product_ir_f64x2, a_imag_f64x2, b_real_f64x2));
    float64x2_t neg_product_ir_f64x2 = vnegq_f64(product_ir_f64x2);
    float64x2_t neg_error_ir_f64x2 = vnegq_f64(error_ir_f64x2);
    tentative_sum_imag_f64x2 = vaddq_f64(sum_imag_f64x2, neg_product_ir_f64x2);
    virtual_addend_imag_f64x2 = vsubq_f64(tentative_sum_imag_f64x2, sum_imag_f64x2);
    error_sum_imag_f64x2 = vaddq_f64(
        vsubq_f64(sum_imag_f64x2, vsubq_f64(tentative_sum_imag_f64x2, virtual_addend_imag_f64x2)),
        vsubq_f64(neg_product_ir_f64x2, virtual_addend_imag_f64x2));
    sum_imag_f64x2 = tentative_sum_imag_f64x2;
    compensation_imag_f64x2 = vaddq_f64(compensation_imag_f64x2, vaddq_f64(error_sum_imag_f64x2, neg_error_ir_f64x2));

    if (count_pairs) goto nk_vdot_f64c_neon_cycle;
    // Compensated horizontal reduction preserving Dot2 error tracking
    result->real = nk_dot_stable_sum_f64x2_neon_(sum_real_f64x2, compensation_real_f64x2);
    result->imag = nk_dot_stable_sum_f64x2_neon_(sum_imag_f64x2, compensation_imag_f64x2);
}
|
|
456
|
+
|
|
457
|
+
/**
 *  @brief Running state for 128-bit dot accumulation over f64 scalars on NEON.
 *
 *  Uses the Dot2 algorithm (Ogita-Rump-Oishi 2005) for compensated dot product:
 *  the high-order partial sums and the accumulated rounding errors are tracked
 *  in separate vectors and combined only at finalization.
 */
typedef struct nk_dot_f64x2_state_neon_t {
    float64x2_t sum_f64x2;          // Per-lane running high-order sum
    float64x2_t compensation_f64x2; // Per-lane accumulated rounding error
} nk_dot_f64x2_state_neon_t;
|
|
466
|
+
|
|
467
|
+
// Reset a Dot2 f64 accumulation state: both the running sum and the
// compensation (error) vector start at zero.
NK_INTERNAL void nk_dot_f64x2_init_neon(nk_dot_f64x2_state_neon_t *state) {
    float64x2_t const zeros_f64x2 = vdupq_n_f64(0);
    state->sum_f64x2 = zeros_f64x2;
    state->compensation_f64x2 = zeros_f64x2;
}
|
|
471
|
+
|
|
472
|
+
NK_INTERNAL void nk_dot_f64x2_update_neon(nk_dot_f64x2_state_neon_t *state, nk_b128_vec_t a, nk_b128_vec_t b,
                                          nk_size_t depth_offset, nk_size_t active_dimensions) {
    // One Dot2 step: an error-free product (TwoProd) followed by an error-free
    // sum (TwoSum), with both exact rounding errors folded into the compensation
    // vector. The add/sub ordering below is load-bearing; do not reassociate.
    nk_unused_(depth_offset);
    nk_unused_(active_dimensions);
    float64x2_t sum_f64x2 = state->sum_f64x2;
    float64x2_t compensation_f64x2 = state->compensation_f64x2;
    float64x2_t a_f64x2 = vreinterpretq_f64_u64(a.u64x2);
    float64x2_t b_f64x2 = vreinterpretq_f64_u64(b.u64x2);

    // TwoProd: h = a × b, r = fma(a, b, -h) captures the rounding error
    // (vfmsq(h, a, b) computes h - a*b, hence the negation).
    float64x2_t product_f64x2 = vmulq_f64(a_f64x2, b_f64x2);
    float64x2_t product_error_f64x2 = vnegq_f64(vfmsq_f64(product_f64x2, a_f64x2, b_f64x2));

    // TwoSum: (t, q) = TwoSum(sum, h) where t = sum + h rounded, q = error
    float64x2_t tentative_sum_f64x2 = vaddq_f64(sum_f64x2, product_f64x2);
    float64x2_t virtual_addend_f64x2 = vsubq_f64(tentative_sum_f64x2, sum_f64x2);
    float64x2_t sum_error_f64x2 = vaddq_f64(vsubq_f64(sum_f64x2, vsubq_f64(tentative_sum_f64x2, virtual_addend_f64x2)),
                                            vsubq_f64(product_f64x2, virtual_addend_f64x2));

    // Update: sum = t, compensation += q + r
    state->sum_f64x2 = tentative_sum_f64x2;
    state->compensation_f64x2 = vaddq_f64(compensation_f64x2, vaddq_f64(sum_error_f64x2, product_error_f64x2));
}
|
|
495
|
+
|
|
496
|
+
// Reduce four independent Dot2 states to four f64 results, folding each state's
// compensation term into its horizontal sum; one state feeds one output slot.
NK_INTERNAL void nk_dot_f64x2_finalize_neon( //
    nk_dot_f64x2_state_neon_t const *state_a, nk_dot_f64x2_state_neon_t const *state_b, //
    nk_dot_f64x2_state_neon_t const *state_c, nk_dot_f64x2_state_neon_t const *state_d, //
    nk_size_t total_dimensions, nk_b256_vec_t *result) {
    nk_unused_(total_dimensions);
    nk_dot_f64x2_state_neon_t const *states[4] = {state_a, state_b, state_c, state_d};
    for (int slot = 0; slot < 4; ++slot)
        result->f64s[slot] = nk_dot_stable_sum_f64x2_neon_(states[slot]->sum_f64x2, states[slot]->compensation_f64x2);
}
|
|
507
|
+
|
|
508
|
+
#pragma endregion - Traditional Floats
|
|
509
|
+
|
|
510
|
+
#pragma region - Smaller Floats
|
|
511
|
+
|
|
512
|
+
NK_PUBLIC void nk_dot_bf16_neon(nk_bf16_t const *a_scalars, nk_bf16_t const *b_scalars, nk_size_t count_scalars,
                                nk_f32_t *result) {
    // Dot product over bf16 inputs, accumulated in f32. Each bf16 lane becomes an
    // f32 by a 16-bit left shift, so no BF16 hardware extension is required.
    float32x4_t accumulator_f32x4 = vdupq_n_f32(0);
    uint16x8_t lhs_u16x8, rhs_u16x8;
    do {
        if (count_scalars >= 8) {
            // Full 8-lane step.
            lhs_u16x8 = vld1q_u16((nk_u16_t const *)a_scalars);
            rhs_u16x8 = vld1q_u16((nk_u16_t const *)b_scalars);
            a_scalars += 8, b_scalars += 8, count_scalars -= 8;
        }
        else {
            // Tail: zero-padded partial load, then terminate after this pass.
            nk_b128_vec_t lhs_tail, rhs_tail;
            nk_partial_load_b16x8_serial_(a_scalars, &lhs_tail, count_scalars);
            nk_partial_load_b16x8_serial_(b_scalars, &rhs_tail, count_scalars);
            lhs_u16x8 = lhs_tail.u16x8;
            rhs_u16x8 = rhs_tail.u16x8;
            count_scalars = 0;
        }
        float32x4_t lhs_low_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(lhs_u16x8), 16));
        float32x4_t lhs_high_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(lhs_u16x8), 16));
        float32x4_t rhs_low_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(rhs_u16x8), 16));
        float32x4_t rhs_high_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(rhs_u16x8), 16));
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, lhs_low_f32x4, rhs_low_f32x4);
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, lhs_high_f32x4, rhs_high_f32x4);
    } while (count_scalars);
    *result = vaddvq_f32(accumulator_f32x4);
}
|
|
539
|
+
|
|
540
|
+
/**
 *  @brief Running state for 128-bit dot accumulation over bf16 scalars on plain NEON.
 *
 *  Processes 8 bf16 values at a time (128 bits), converting to f32 via USHLL shift-16
 *  for accumulation without requiring the ARMv8.6-BF16 extension.
 */
typedef struct nk_dot_bf16x8_state_neon_t {
    float32x4_t sum_f32x4; // Running f32 partial sums of the widened products
} nk_dot_bf16x8_state_neon_t;
|
|
549
|
+
|
|
550
|
+
// Reset a bf16 dot accumulation state to an all-zero partial sum.
NK_INTERNAL void nk_dot_bf16x8_init_neon(nk_dot_bf16x8_state_neon_t *state) {
    state->sum_f32x4 = vdupq_n_f32(0);
}
|
|
551
|
+
|
|
552
|
+
// Fold one 8-lane bf16 block of each operand into the running f32 partial sums.
// bf16 lanes widen to f32 by shifting the 16 payload bits into the high half.
NK_INTERNAL void nk_dot_bf16x8_update_neon(nk_dot_bf16x8_state_neon_t *state, nk_b128_vec_t a, nk_b128_vec_t b,
                                           nk_size_t depth_offset, nk_size_t active_dimensions) {
    nk_unused_(depth_offset);
    nk_unused_(active_dimensions);
    float32x4_t lhs_low_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(a.u16x8), 16));
    float32x4_t rhs_low_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(b.u16x8), 16));
    float32x4_t lhs_high_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(a.u16x8), 16));
    float32x4_t rhs_high_f32x4 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(b.u16x8), 16));
    float32x4_t accumulator_f32x4 = state->sum_f32x4;
    accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, lhs_low_f32x4, rhs_low_f32x4);
    accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, lhs_high_f32x4, rhs_high_f32x4);
    state->sum_f32x4 = accumulator_f32x4;
}
|
|
564
|
+
|
|
565
|
+
// Horizontally reduce four bf16 accumulation states into four f32 results,
// one state per output slot.
NK_INTERNAL void nk_dot_bf16x8_finalize_neon( //
    nk_dot_bf16x8_state_neon_t const *state_a, nk_dot_bf16x8_state_neon_t const *state_b, //
    nk_dot_bf16x8_state_neon_t const *state_c, nk_dot_bf16x8_state_neon_t const *state_d, //
    nk_size_t total_dimensions, nk_b128_vec_t *result) {
    nk_unused_(total_dimensions);
    nk_dot_bf16x8_state_neon_t const *states[4] = {state_a, state_b, state_c, state_d};
    for (int slot = 0; slot < 4; ++slot) result->f32s[slot] = vaddvq_f32(states[slot]->sum_f32x4);
}
|
|
575
|
+
|
|
576
|
+
NK_PUBLIC void nk_dot_f16_neon(nk_f16_t const *a_scalars, nk_f16_t const *b_scalars, nk_size_t count_scalars,
                               nk_f32_t *result) {
    // Dot product over f16 inputs, accumulated in f32. Halves are widened through
    // the bit-manipulating helper, so the ARMv8.2-A FP16 extension is not needed.
    float32x4_t accumulator_f32x4 = vdupq_n_f32(0);
    uint16x8_t lhs_u16x8, rhs_u16x8;
    do {
        if (count_scalars >= 8) {
            // Full 8-lane step.
            lhs_u16x8 = vld1q_u16((nk_u16_t const *)a_scalars);
            rhs_u16x8 = vld1q_u16((nk_u16_t const *)b_scalars);
            a_scalars += 8, b_scalars += 8, count_scalars -= 8;
        }
        else {
            // Tail: zero-padded partial load, then terminate after this pass.
            nk_b128_vec_t lhs_tail, rhs_tail;
            nk_partial_load_b16x8_serial_(a_scalars, &lhs_tail, count_scalars);
            nk_partial_load_b16x8_serial_(b_scalars, &rhs_tail, count_scalars);
            lhs_u16x8 = lhs_tail.u16x8;
            rhs_u16x8 = rhs_tail.u16x8;
            count_scalars = 0;
        }
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, nk_f16x4_to_f32x4_neon_(vget_low_u16(lhs_u16x8)),
                                      nk_f16x4_to_f32x4_neon_(vget_low_u16(rhs_u16x8)));
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, nk_f16x4_to_f32x4_neon_(vget_high_u16(lhs_u16x8)),
                                      nk_f16x4_to_f32x4_neon_(vget_high_u16(rhs_u16x8)));
    } while (count_scalars);
    *result = vaddvq_f32(accumulator_f32x4);
}
|
|
603
|
+
|
|
604
|
+
/**
 *  @brief Running state for 128-bit dot accumulation over f16 scalars on plain NEON.
 *
 *  Processes 8 f16 values at a time (128 bits), converting to f32 via integer bit
 *  manipulation for accumulation without requiring the ARMv8.2-A FP16 extension.
 */
typedef struct nk_dot_f16x8_state_neon_t {
    float32x4_t sum_f32x4; // Running f32 partial sums of the widened products
} nk_dot_f16x8_state_neon_t;
|
|
613
|
+
|
|
614
|
+
// Reset an f16 dot accumulation state to an all-zero partial sum.
NK_INTERNAL void nk_dot_f16x8_init_neon(nk_dot_f16x8_state_neon_t *state) {
    state->sum_f32x4 = vdupq_n_f32(0);
}
|
|
615
|
+
|
|
616
|
+
// Fold one 8-lane f16 block of each operand into the running f32 partial sums,
// widening via the integer bit-manipulation helper (no FP16 extension needed).
NK_INTERNAL void nk_dot_f16x8_update_neon(nk_dot_f16x8_state_neon_t *state, nk_b128_vec_t a, nk_b128_vec_t b,
                                          nk_size_t depth_offset, nk_size_t active_dimensions) {
    nk_unused_(depth_offset);
    nk_unused_(active_dimensions);
    float32x4_t lhs_low_f32x4 = nk_f16x4_to_f32x4_neon_(vget_low_u16(a.u16x8));
    float32x4_t rhs_low_f32x4 = nk_f16x4_to_f32x4_neon_(vget_low_u16(b.u16x8));
    float32x4_t lhs_high_f32x4 = nk_f16x4_to_f32x4_neon_(vget_high_u16(a.u16x8));
    float32x4_t rhs_high_f32x4 = nk_f16x4_to_f32x4_neon_(vget_high_u16(b.u16x8));
    float32x4_t accumulator_f32x4 = state->sum_f32x4;
    accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, lhs_low_f32x4, rhs_low_f32x4);
    accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, lhs_high_f32x4, rhs_high_f32x4);
    state->sum_f32x4 = accumulator_f32x4;
}
|
|
628
|
+
|
|
629
|
+
// Horizontally reduce four f16 accumulation states into four f32 results,
// one state per output slot.
NK_INTERNAL void nk_dot_f16x8_finalize_neon( //
    nk_dot_f16x8_state_neon_t const *state_a, nk_dot_f16x8_state_neon_t const *state_b, //
    nk_dot_f16x8_state_neon_t const *state_c, nk_dot_f16x8_state_neon_t const *state_d, //
    nk_size_t total_dimensions, nk_b128_vec_t *result) {
    nk_unused_(total_dimensions);
    nk_dot_f16x8_state_neon_t const *states[4] = {state_a, state_b, state_c, state_d};
    for (int slot = 0; slot < 4; ++slot) result->f32s[slot] = vaddvq_f32(states[slot]->sum_f32x4);
}
|
|
639
|
+
|
|
640
|
+
NK_PUBLIC void nk_dot_e4m3_neon(nk_e4m3_t const *a_scalars, nk_e4m3_t const *b_scalars, nk_size_t count_scalars,
                                nk_f32_t *result) {
    // Dot product over e4m3 (FP8) inputs: bytes are upcast to f16 through the
    // helper, then products are accumulated in f32.
    float32x4_t accumulator_f32x4 = vdupq_n_f32(0);
    float16x8_t lhs_f16x8, rhs_f16x8;
    do {
        if (count_scalars >= 8) {
            // Full 8-lane step.
            lhs_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(a_scalars));
            rhs_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(b_scalars));
            a_scalars += 8, b_scalars += 8, count_scalars -= 8;
        }
        else {
            // Tail: zero-padded partial load, then terminate after this pass.
            nk_b64_vec_t lhs_tail, rhs_tail;
            nk_partial_load_b8x8_serial_(a_scalars, &lhs_tail, count_scalars);
            nk_partial_load_b8x8_serial_(b_scalars, &rhs_tail, count_scalars);
            lhs_f16x8 = nk_e4m3x8_to_f16x8_neon_(lhs_tail.u8x8);
            rhs_f16x8 = nk_e4m3x8_to_f16x8_neon_(rhs_tail.u8x8);
            count_scalars = 0;
        }
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, vcvt_f32_f16(vget_low_f16(lhs_f16x8)),
                                      vcvt_f32_f16(vget_low_f16(rhs_f16x8)));
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, vcvt_f32_f16(vget_high_f16(lhs_f16x8)),
                                      vcvt_f32_f16(vget_high_f16(rhs_f16x8)));
    } while (count_scalars);
    *result = vaddvq_f32(accumulator_f32x4);
}
|
|
667
|
+
|
|
668
|
+
NK_PUBLIC void nk_dot_e5m2_neon(nk_e5m2_t const *a_scalars, nk_e5m2_t const *b_scalars, nk_size_t count_scalars,
                                nk_f32_t *result) {
    // Dot product over e5m2 (FP8) inputs. Each byte becomes the top byte of an
    // f16 lane via an 8-bit widening shift; products accumulate in f32.
    float32x4_t accumulator_f32x4 = vdupq_n_f32(0);
    float16x8_t lhs_f16x8, rhs_f16x8;
    do {
        if (count_scalars >= 8) {
            // Full 8-lane step.
            lhs_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vld1_u8(a_scalars), 8));
            rhs_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(vld1_u8(b_scalars), 8));
            a_scalars += 8, b_scalars += 8, count_scalars -= 8;
        }
        else {
            // Tail: zero-padded partial load, then terminate after this pass.
            nk_b64_vec_t lhs_tail, rhs_tail;
            nk_partial_load_b8x8_serial_(a_scalars, &lhs_tail, count_scalars);
            nk_partial_load_b8x8_serial_(b_scalars, &rhs_tail, count_scalars);
            lhs_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(lhs_tail.u8x8, 8));
            rhs_f16x8 = vreinterpretq_f16_u16(vshll_n_u8(rhs_tail.u8x8, 8));
            count_scalars = 0;
        }
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, vcvt_f32_f16(vget_low_f16(lhs_f16x8)),
                                      vcvt_f32_f16(vget_low_f16(rhs_f16x8)));
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, vcvt_f32_f16(vget_high_f16(lhs_f16x8)),
                                      vcvt_f32_f16(vget_high_f16(rhs_f16x8)));
    } while (count_scalars);
    *result = vaddvq_f32(accumulator_f32x4);
}
|
|
695
|
+
|
|
696
|
+
NK_PUBLIC void nk_dot_e2m3_neon(nk_e2m3_t const *a_scalars, nk_e2m3_t const *b_scalars, nk_size_t count_scalars,
                                nk_f32_t *result) {
    // x16 TBL path: a lookup-table upcast turns sixteen e2m3 bytes into two f16
    // vectors per operand; products accumulate in f32.
    float32x4_t accumulator_f32x4 = vdupq_n_f32(0);
    float16x8_t lhs_low_f16x8, lhs_high_f16x8, rhs_low_f16x8, rhs_high_f16x8;
    do {
        if (count_scalars >= 16) {
            // Full 16-lane step.
            nk_e2m3x16_to_f16x8x2_neon_(vld1q_u8(a_scalars), &lhs_low_f16x8, &lhs_high_f16x8);
            nk_e2m3x16_to_f16x8x2_neon_(vld1q_u8(b_scalars), &rhs_low_f16x8, &rhs_high_f16x8);
            a_scalars += 16, b_scalars += 16, count_scalars -= 16;
        }
        else {
            // Tail: zero-padded partial load, then terminate after this pass.
            nk_b128_vec_t lhs_tail, rhs_tail;
            nk_partial_load_b8x16_serial_(a_scalars, &lhs_tail, count_scalars);
            nk_partial_load_b8x16_serial_(b_scalars, &rhs_tail, count_scalars);
            nk_e2m3x16_to_f16x8x2_neon_(lhs_tail.u8x16, &lhs_low_f16x8, &lhs_high_f16x8);
            nk_e2m3x16_to_f16x8x2_neon_(rhs_tail.u8x16, &rhs_low_f16x8, &rhs_high_f16x8);
            count_scalars = 0;
        }
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, vcvt_f32_f16(vget_low_f16(lhs_low_f16x8)),
                                      vcvt_f32_f16(vget_low_f16(rhs_low_f16x8)));
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, vcvt_f32_f16(vget_high_f16(lhs_low_f16x8)),
                                      vcvt_f32_f16(vget_high_f16(rhs_low_f16x8)));
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, vcvt_f32_f16(vget_low_f16(lhs_high_f16x8)),
                                      vcvt_f32_f16(vget_low_f16(rhs_high_f16x8)));
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, vcvt_f32_f16(vget_high_f16(lhs_high_f16x8)),
                                      vcvt_f32_f16(vget_high_f16(rhs_high_f16x8)));
    } while (count_scalars);
    *result = vaddvq_f32(accumulator_f32x4);
}
|
|
725
|
+
|
|
726
|
+
NK_PUBLIC void nk_dot_e3m2_neon(nk_e3m2_t const *a_scalars, nk_e3m2_t const *b_scalars, nk_size_t count_scalars,
                                nk_f32_t *result) {
    // x16 TBL path: a lookup-table upcast turns sixteen e3m2 bytes into two f16
    // vectors per operand; products accumulate in f32.
    float32x4_t accumulator_f32x4 = vdupq_n_f32(0);
    float16x8_t lhs_low_f16x8, lhs_high_f16x8, rhs_low_f16x8, rhs_high_f16x8;
    do {
        if (count_scalars >= 16) {
            // Full 16-lane step.
            nk_e3m2x16_to_f16x8x2_neon_(vld1q_u8(a_scalars), &lhs_low_f16x8, &lhs_high_f16x8);
            nk_e3m2x16_to_f16x8x2_neon_(vld1q_u8(b_scalars), &rhs_low_f16x8, &rhs_high_f16x8);
            a_scalars += 16, b_scalars += 16, count_scalars -= 16;
        }
        else {
            // Tail: zero-padded partial load, then terminate after this pass.
            nk_b128_vec_t lhs_tail, rhs_tail;
            nk_partial_load_b8x16_serial_(a_scalars, &lhs_tail, count_scalars);
            nk_partial_load_b8x16_serial_(b_scalars, &rhs_tail, count_scalars);
            nk_e3m2x16_to_f16x8x2_neon_(lhs_tail.u8x16, &lhs_low_f16x8, &lhs_high_f16x8);
            nk_e3m2x16_to_f16x8x2_neon_(rhs_tail.u8x16, &rhs_low_f16x8, &rhs_high_f16x8);
            count_scalars = 0;
        }
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, vcvt_f32_f16(vget_low_f16(lhs_low_f16x8)),
                                      vcvt_f32_f16(vget_low_f16(rhs_low_f16x8)));
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, vcvt_f32_f16(vget_high_f16(lhs_low_f16x8)),
                                      vcvt_f32_f16(vget_high_f16(rhs_low_f16x8)));
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, vcvt_f32_f16(vget_low_f16(lhs_high_f16x8)),
                                      vcvt_f32_f16(vget_low_f16(rhs_high_f16x8)));
        accumulator_f32x4 = vfmaq_f32(accumulator_f32x4, vcvt_f32_f16(vget_high_f16(lhs_high_f16x8)),
                                      vcvt_f32_f16(vget_high_f16(rhs_high_f16x8)));
    } while (count_scalars);
    *result = vaddvq_f32(accumulator_f32x4);
}
|
|
755
|
+
|
|
756
|
+
#pragma endregion - Smaller Floats
|
|
757
|
+
|
|
758
|
+
#pragma region - Binary
|
|
759
|
+
|
|
760
|
+
NK_PUBLIC void nk_dot_u1_neon(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n_bits, nk_u32_t *result) {
    // Binary dot product: population count of `a AND b`, 16 bytes per SIMD step.
    nk_size_t const n_bytes = nk_size_divide_round_up_(n_bits, NK_BITS_PER_BYTE);
    nk_u32_t dot = 0;
    nk_size_t byte_index = 0;
    while (byte_index + 16 <= n_bytes) {
        // Byte-wide counters hold at most 8 per load; flush to 32 bits every
        // 31 blocks so a lane can never exceed 248 and overflow.
        uint8x16_t block_counts_u8x16 = vdupq_n_u8(0);
        for (nk_size_t block = 0; block < 31 && byte_index + 16 <= n_bytes; ++block, byte_index += 16) {
            uint8x16_t a_u8x16 = vld1q_u8(a + byte_index);
            uint8x16_t b_u8x16 = vld1q_u8(b + byte_index);
            block_counts_u8x16 = vaddq_u8(block_counts_u8x16, vcntq_u8(vandq_u8(a_u8x16, b_u8x16)));
        }
        dot += (nk_u32_t)vaddlvq_u8(block_counts_u8x16);
    }
    // Scalar tail for the remaining (< 16) bytes.
    for (; byte_index != n_bytes; ++byte_index) dot += nk_u1x8_popcount_(a[byte_index] & b[byte_index]);
    *result = dot;
}
|
|
776
|
+
|
|
777
|
+
/**
 *  @brief Running state for 128-bit binary dot accumulation on NEON.
 *
 *  Holds four 32-bit lane counters of set bits in the AND of the input blocks.
 */
typedef struct nk_dot_u1x128_state_neon_t {
    uint32x4_t dot_count_u32x4; // Per-lane running popcount of a AND b
} nk_dot_u1x128_state_neon_t;
|
|
780
|
+
|
|
781
|
+
// Reset a binary dot accumulation state to all-zero lane counters.
NK_INTERNAL void nk_dot_u1x128_init_neon(nk_dot_u1x128_state_neon_t *state) {
    state->dot_count_u32x4 = vdupq_n_u32(0);
}
|
|
782
|
+
|
|
783
|
+
// Fold one 128-bit block into the running counters: popcount of the byte-wise
// AND, widened pairwise to 32-bit lanes before accumulation.
NK_INTERNAL void nk_dot_u1x128_update_neon(nk_dot_u1x128_state_neon_t *state, nk_b128_vec_t a, nk_b128_vec_t b,
                                           nk_size_t depth_offset, nk_size_t active_dimensions) {
    nk_unused_(depth_offset);
    nk_unused_(active_dimensions);
    uint8x16_t intersection_u8x16 = vandq_u8(a.u8x16, b.u8x16);
    uint16x8_t halfword_counts_u16x8 = vpaddlq_u8(vcntq_u8(intersection_u8x16));
    uint32x4_t word_counts_u32x4 = vpaddlq_u16(halfword_counts_u16x8);
    state->dot_count_u32x4 = vaddq_u32(state->dot_count_u32x4, word_counts_u32x4);
}
|
|
793
|
+
|
|
794
|
+
// Collapse four binary dot states into four u32 results via two rounds of
// pairwise adds: lane order of the output matches the state order a, b, c, d.
NK_INTERNAL void nk_dot_u1x128_finalize_neon( //
    nk_dot_u1x128_state_neon_t const *state_a, nk_dot_u1x128_state_neon_t const *state_b,
    nk_dot_u1x128_state_neon_t const *state_c, nk_dot_u1x128_state_neon_t const *state_d, nk_size_t total_dimensions,
    nk_b128_vec_t *result) {
    nk_unused_(total_dimensions);
    uint32x4_t first_pair_u32x4 = vpaddq_u32(state_a->dot_count_u32x4, state_b->dot_count_u32x4);
    uint32x4_t second_pair_u32x4 = vpaddq_u32(state_c->dot_count_u32x4, state_d->dot_count_u32x4);
    result->u32x4 = vpaddq_u32(first_pair_u32x4, second_pair_u32x4);
}
|
|
803
|
+
|
|
804
|
+
#pragma endregion - Binary
|
|
805
|
+
|
|
806
|
+
#if defined(__clang__)
|
|
807
|
+
#pragma clang attribute pop
|
|
808
|
+
#elif defined(__GNUC__)
|
|
809
|
+
#pragma GCC pop_options
|
|
810
|
+
#endif
|
|
811
|
+
|
|
812
|
+
#if defined(__cplusplus)
|
|
813
|
+
} // extern "C"
|
|
814
|
+
#endif
|
|
815
|
+
|
|
816
|
+
#endif // NK_TARGET_NEON
|
|
817
|
+
#endif // NK_TARGET_ARM_
|
|
818
|
+
#endif // NK_DOT_NEON_H
|