numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,1104 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Elementwise Arithmetic for NEON.
|
|
3
|
+
* @file include/numkong/each/neon.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date December 27, 2025
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/each.h
|
|
8
|
+
*
|
|
9
|
+
* @section elementwise_neon_instructions ARM NEON Instructions
|
|
10
|
+
*
|
|
11
|
+
* Intrinsic Instruction Latency Throughput
|
|
12
|
+
* A76 M4+/V1+/Oryon
|
|
13
|
+
* vld1q_f32 LD1 (V.4S) 4cy 2/cy 2/cy
|
|
14
|
+
* vst1q_f32 ST1 (V.4S) 2cy 2/cy 2/cy
|
|
15
|
+
* vaddq_f32 FADD (V.4S, V.4S, V.4S) 2cy 2/cy 4/cy
|
|
16
|
+
* vmulq_f32 FMUL (V.4S, V.4S, V.4S) 3cy 2/cy 4/cy
|
|
17
|
+
* vfmaq_f32 FMLA (V.4S, V.4S, V.4S) 4cy 2/cy 4/cy
|
|
18
|
+
* vaddq_f64 FADD (V.2D, V.2D, V.2D) 2cy 2/cy 4/cy
|
|
19
|
+
* vmulq_f64 FMUL (V.2D, V.2D, V.2D) 3cy 2/cy 4/cy
|
|
20
|
+
* vfmaq_f64 FMLA (V.2D, V.2D, V.2D) 4cy 2/cy 4/cy
|
|
21
|
+
* vqaddq_s16 SQADD (V.8H, V.8H, V.8H) 2cy 2/cy 4/cy
|
|
22
|
+
* vcvtq_f32_s32 SCVTF (V.4S, V.4S) 3cy 2/cy 2/cy
|
|
23
|
+
* vcvtnq_s32_f32 FCVTNS (V.4S, V.4S) 3cy 2/cy 2/cy
|
|
24
|
+
* vqmovn_s32 SQXTN (V.4H, V.4S) 3cy 2/cy 2/cy
|
|
25
|
+
*
|
|
26
|
+
* Elementwise operations are throughput-bound rather than latency-bound. FP arithmetic
|
|
27
|
+
* throughput doubles on 4-pipe cores (Apple M4+, Graviton3+, Oryon) from 2/cy to 4/cy.
|
|
28
|
+
*
|
|
29
|
+
* Memory bandwidth (LD1/ST1) typically becomes the bottleneck for large arrays, as load/store
|
|
30
|
+
* throughput remains at 2/cy across all cores.
|
|
31
|
+
*/
|
|
32
|
+
#ifndef NK_EACH_NEON_H
|
|
33
|
+
#define NK_EACH_NEON_H
|
|
34
|
+
|
|
35
|
+
#if NK_TARGET_ARM_
|
|
36
|
+
#if NK_TARGET_NEON
|
|
37
|
+
|
|
38
|
+
#include "numkong/types.h"
|
|
39
|
+
#include "numkong/cast/neon.h"
|
|
40
|
+
|
|
41
|
+
#if defined(__cplusplus)
|
|
42
|
+
extern "C" {
|
|
43
|
+
#endif
|
|
44
|
+
|
|
45
|
+
#if defined(__clang__)
|
|
46
|
+
#pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
|
|
47
|
+
#elif defined(__GNUC__)
|
|
48
|
+
#pragma GCC push_options
|
|
49
|
+
#pragma GCC target("arch=armv8-a+simd")
|
|
50
|
+
#endif
|
|
51
|
+
|
|
52
|
+
NK_PUBLIC void nk_each_sum_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *result) {
    // Elementwise `result[i] = a[i] + b[i]` over `n` single-precision floats.
    nk_size_t idx = 0;
    // Vectorized body, 4 lanes per iteration:
    while (idx + 4 <= n) {
        float32x4_t lhs_f32x4 = vld1q_f32(a + idx);
        float32x4_t rhs_f32x4 = vld1q_f32(b + idx);
        vst1q_f32(result + idx, vaddq_f32(lhs_f32x4, rhs_f32x4));
        idx += 4;
    }

    // Scalar epilogue for the remaining 0-3 entries:
    while (idx < n) {
        result[idx] = a[idx] + b[idx];
        ++idx;
    }
}
|
|
65
|
+
|
|
66
|
+
NK_PUBLIC void nk_each_scale_f32_neon(nk_f32_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                      nk_f32_t *result) {
    // Elementwise affine map: `result[i] = (*alpha) * a[i] + (*beta)`.
    nk_f32_t const scale = *alpha;
    nk_f32_t const shift = *beta;
    float32x4_t const shift_f32x4 = vdupq_n_f32(shift);

    // Vectorized body, 4 lanes per iteration:
    nk_size_t idx = 0;
    while (idx + 4 <= n) {
        float32x4_t values_f32x4 = vld1q_f32(a + idx);
        // FMLA computes shift + values * scale with a single rounding step:
        vst1q_f32(result + idx, vfmaq_n_f32(shift_f32x4, values_f32x4, scale));
        idx += 4;
    }

    // Scalar epilogue:
    while (idx < n) {
        result[idx] = scale * a[idx] + shift;
        ++idx;
    }
}
|
|
83
|
+
|
|
84
|
+
NK_PUBLIC void nk_each_blend_f32_neon( //
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
    nk_f32_t const *alpha, nk_f32_t const *beta, nk_f32_t *result) {

    nk_f32_t alpha_val = *alpha;
    nk_f32_t beta_val = *beta;

    // Fast path 1: both weights equal to one - plain addition, skipping all multiplies.
    if (alpha_val == 1 && beta_val == 1) {
        nk_each_sum_f32_neon(a, b, n, result);
        return;
    }
    // Fast path 2: one weight is zero - degenerates into scaling a single array,
    // which halves the number of load instructions.
    if (alpha_val == 0 || beta_val == 0) {
        nk_f32_t zero = 0;
        if (beta_val == 0) { nk_each_scale_f32_neon(a, n, alpha, &zero, result); }
        else { nk_each_scale_f32_neon(b, n, beta, &zero, result); }
        return;
    }

    // General case: result[i] = alpha * a[i] + beta * b[i].
    // Vectorized body, 4 lanes per iteration:
    nk_size_t idx = 0;
    while (idx + 4 <= n) {
        float32x4_t lhs_f32x4 = vld1q_f32(a + idx);
        float32x4_t rhs_f32x4 = vld1q_f32(b + idx);
        float32x4_t weighted_lhs_f32x4 = vmulq_n_f32(lhs_f32x4, alpha_val);
        vst1q_f32(result + idx, vfmaq_n_f32(weighted_lhs_f32x4, rhs_f32x4, beta_val));
        idx += 4;
    }

    // Scalar epilogue:
    while (idx < n) {
        result[idx] = alpha_val * a[idx] + beta_val * b[idx];
        ++idx;
    }
}
|
|
121
|
+
|
|
122
|
+
NK_PUBLIC void nk_each_fma_f32_neon( //
    nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, //
    nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta, nk_f32_t *result) {
    // Scaled fused multiply-add: result[i] = alpha * a[i] * b[i] + beta * c[i].
    nk_f32_t const scale_ab = *alpha;
    nk_f32_t const scale_c = *beta;

    // Vectorized body, 4 lanes per iteration:
    nk_size_t idx = 0;
    while (idx + 4 <= n) {
        float32x4_t lhs_f32x4 = vld1q_f32(a + idx);
        float32x4_t rhs_f32x4 = vld1q_f32(b + idx);
        float32x4_t addend_f32x4 = vld1q_f32(c + idx);
        float32x4_t product_f32x4 = vmulq_f32(lhs_f32x4, rhs_f32x4);
        float32x4_t scaled_f32x4 = vmulq_n_f32(product_f32x4, scale_ab);
        vst1q_f32(result + idx, vfmaq_n_f32(scaled_f32x4, addend_f32x4, scale_c));
        idx += 4;
    }

    // Scalar epilogue:
    while (idx < n) {
        result[idx] = scale_ab * a[idx] * b[idx] + scale_c * c[idx];
        ++idx;
    }
}
|
|
143
|
+
|
|
144
|
+
NK_PUBLIC void nk_each_sum_i16_neon(nk_i16_t const *a, nk_i16_t const *b, nk_size_t n, nk_i16_t *result) {
    // Saturating elementwise sum of two i16 arrays: result[i] = saturate(a[i] + b[i]).
    nk_size_t idx = 0;
    // SQADD processes 8 lanes per iteration with hardware saturation:
    while (idx + 8 <= n) {
        int16x8_t lhs_s16x8 = vld1q_s16(a + idx);
        int16x8_t rhs_s16x8 = vld1q_s16(b + idx);
        vst1q_s16(result + idx, vqaddq_s16(lhs_s16x8, rhs_s16x8));
        idx += 8;
    }

    // Scalar epilogue mirrors the saturating semantics:
    while (idx < n) {
        result[idx] = nk_i16_saturating_add_serial(a[idx], b[idx]);
        ++idx;
    }
}
|
|
157
|
+
|
|
158
|
+
/**
 * @brief Affine scaling of i16 values in f32 arithmetic: result[i] = alpha * a[i] + beta,
 *        rounded to nearest and clamped to the i16 range before narrowing.
 */
NK_PUBLIC void nk_each_scale_i16_neon(nk_i16_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                      nk_i16_t *result) {
    float32_t alpha_f32 = *alpha;
    float32_t beta_f32 = *beta;
    float32x4_t alpha_f32x4 = vdupq_n_f32(alpha_f32);
    float32x4_t beta_f32x4 = vdupq_n_f32(beta_f32);
    // i16 saturation bounds; both are exactly representable in f32:
    float32x4_t min_f32x4 = vdupq_n_f32(-32768.0f);
    float32x4_t max_f32x4 = vdupq_n_f32(32767.0f);

    // The main loop: 4 lanes per iteration, limited by the i16 -> f32 widening.
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        int16x4_t a_i16x4 = vld1_s16(a + i);
        // Widen i16 -> i32 -> f32; exact for every i16 value:
        float32x4_t a_f32x4 = vcvtq_f32_s32(vmovl_s16(a_i16x4));
        // FMLA: beta + a * alpha in a single rounding step:
        float32x4_t result_f32x4 = vfmaq_f32(beta_f32x4, a_f32x4, alpha_f32x4);
        // Clamp before conversion so the rounded value always fits in i16:
        result_f32x4 = vmaxq_f32(vminq_f32(result_f32x4, max_f32x4), min_f32x4);
        // FCVTNS rounds to nearest; SQXTN narrows with saturation (a no-op after the clamp):
        int16x4_t result_i16x4 = vqmovn_s32(vcvtnq_s32_f32(result_f32x4));
        vst1_s16(result + i, result_i16x4);
    }

    // The tail: scalar fallback; nk_f32_to_i16_serial presumably applies the same
    // round-and-saturate semantics - defined in cast/serial.h (not visible here).
    for (; i < n; ++i) {
        nk_f32_t sum = alpha_f32 * a[i] + beta_f32;
        nk_f32_to_i16_serial(&sum, result + i);
    }
}
|
|
184
|
+
|
|
185
|
+
/**
 * @brief Scaled fused multiply-add over i16 arrays in f32 arithmetic:
 *        result[i] = alpha * a[i] * b[i] + beta * c[i], rounded and clamped to i16.
 * NOTE(review): |a[i] * b[i]| can reach 2^30, beyond f32's 24-bit mantissa, so the
 * product may lose low bits for large operands - presumably an accepted tradeoff.
 */
NK_PUBLIC void nk_each_fma_i16_neon( //
    nk_i16_t const *a, nk_i16_t const *b, nk_i16_t const *c, //
    nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta, nk_i16_t *result) {
    float32_t alpha_f32 = *alpha;
    float32_t beta_f32 = *beta;
    // i16 saturation bounds; both are exactly representable in f32:
    float32x4_t min_f32x4 = vdupq_n_f32(-32768.0f);
    float32x4_t max_f32x4 = vdupq_n_f32(32767.0f);

    // The main loop: 4 lanes per iteration, limited by the i16 -> f32 widening.
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        int16x4_t a_i16x4 = vld1_s16(a + i);
        int16x4_t b_i16x4 = vld1_s16(b + i);
        int16x4_t c_i16x4 = vld1_s16(c + i);
        // Widen i16 -> i32 -> f32; exact for every i16 value:
        float32x4_t a_f32x4 = vcvtq_f32_s32(vmovl_s16(a_i16x4));
        float32x4_t b_f32x4 = vcvtq_f32_s32(vmovl_s16(b_i16x4));
        float32x4_t c_f32x4 = vcvtq_f32_s32(vmovl_s16(c_i16x4));
        float32x4_t ab_f32x4 = vmulq_f32(a_f32x4, b_f32x4);
        float32x4_t ab_scaled_f32x4 = vmulq_n_f32(ab_f32x4, alpha_f32);
        float32x4_t result_f32x4 = vfmaq_n_f32(ab_scaled_f32x4, c_f32x4, beta_f32);
        // Clamp before conversion so the rounded value always fits in i16:
        result_f32x4 = vmaxq_f32(vminq_f32(result_f32x4, max_f32x4), min_f32x4);
        // FCVTNS rounds to nearest; SQXTN narrows with saturation (a no-op after the clamp):
        int16x4_t result_i16x4 = vqmovn_s32(vcvtnq_s32_f32(result_f32x4));
        vst1_s16(result + i, result_i16x4);
    }

    // The tail: scalar fallback; nk_f32_to_i16_serial presumably applies the same
    // round-and-saturate semantics - defined in cast/serial.h (not visible here).
    for (; i < n; ++i) {
        nk_f32_t sum = alpha_f32 * a[i] * b[i] + beta_f32 * c[i];
        nk_f32_to_i16_serial(&sum, result + i);
    }
}
|
|
216
|
+
|
|
217
|
+
NK_PUBLIC void nk_each_sum_u16_neon(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_u16_t *result) {
    // Saturating elementwise sum of two u16 arrays: result[i] = saturate(a[i] + b[i]).
    nk_size_t idx = 0;
    // UQADD processes 8 lanes per iteration with hardware saturation:
    while (idx + 8 <= n) {
        uint16x8_t lhs_u16x8 = vld1q_u16(a + idx);
        uint16x8_t rhs_u16x8 = vld1q_u16(b + idx);
        vst1q_u16(result + idx, vqaddq_u16(lhs_u16x8, rhs_u16x8));
        idx += 8;
    }

    // Scalar epilogue mirrors the saturating semantics:
    while (idx < n) {
        result[idx] = nk_u16_saturating_add_serial(a[idx], b[idx]);
        ++idx;
    }
}
|
|
230
|
+
|
|
231
|
+
/**
 * @brief Affine scaling of u16 values in f32 arithmetic: result[i] = alpha * a[i] + beta,
 *        rounded to nearest and clamped to the u16 range before narrowing.
 */
NK_PUBLIC void nk_each_scale_u16_neon(nk_u16_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                      nk_u16_t *result) {
    float32_t alpha_f32 = *alpha;
    float32_t beta_f32 = *beta;
    float32x4_t alpha_f32x4 = vdupq_n_f32(alpha_f32);
    float32x4_t beta_f32x4 = vdupq_n_f32(beta_f32);
    // u16 saturation bounds; 65535 is exactly representable in f32:
    float32x4_t min_f32x4 = vdupq_n_f32(0.0f);
    float32x4_t max_f32x4 = vdupq_n_f32(65535.0f);

    // The main loop: 4 lanes per iteration, limited by the u16 -> f32 widening.
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        uint16x4_t a_u16x4 = vld1_u16(a + i);
        // Widen u16 -> u32 -> f32; exact for every u16 value:
        float32x4_t a_f32x4 = vcvtq_f32_u32(vmovl_u16(a_u16x4));
        // FMLA: beta + a * alpha in a single rounding step:
        float32x4_t result_f32x4 = vfmaq_f32(beta_f32x4, a_f32x4, alpha_f32x4);
        // Clamp before conversion so the rounded value always fits in u16:
        result_f32x4 = vmaxq_f32(vminq_f32(result_f32x4, max_f32x4), min_f32x4);
        // FCVTNU rounds to nearest unsigned; UQXTN narrows with saturation (no-op after the clamp):
        uint16x4_t result_u16x4 = vqmovn_u32(vcvtnq_u32_f32(result_f32x4));
        vst1_u16(result + i, result_u16x4);
    }

    // The tail: scalar fallback; nk_f32_to_u16_serial presumably applies the same
    // round-and-saturate semantics - defined in cast/serial.h (not visible here).
    for (; i < n; ++i) {
        nk_f32_t sum = alpha_f32 * a[i] + beta_f32;
        nk_f32_to_u16_serial(&sum, result + i);
    }
}
|
|
257
|
+
|
|
258
|
+
/**
 * @brief Scaled fused multiply-add over u16 arrays in f32 arithmetic:
 *        result[i] = alpha * a[i] * b[i] + beta * c[i], rounded and clamped to u16.
 * NOTE(review): a[i] * b[i] can reach ~2^32, beyond f32's 24-bit mantissa, so the
 * product may lose low bits for large operands - presumably an accepted tradeoff.
 */
NK_PUBLIC void nk_each_fma_u16_neon( //
    nk_u16_t const *a, nk_u16_t const *b, nk_u16_t const *c, //
    nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta, nk_u16_t *result) {
    float32_t alpha_f32 = *alpha;
    float32_t beta_f32 = *beta;
    // u16 saturation bounds; 65535 is exactly representable in f32:
    float32x4_t min_f32x4 = vdupq_n_f32(0.0f);
    float32x4_t max_f32x4 = vdupq_n_f32(65535.0f);

    // The main loop: 4 lanes per iteration, limited by the u16 -> f32 widening.
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        uint16x4_t a_u16x4 = vld1_u16(a + i);
        uint16x4_t b_u16x4 = vld1_u16(b + i);
        uint16x4_t c_u16x4 = vld1_u16(c + i);
        // Widen u16 -> u32 -> f32; exact for every u16 value:
        float32x4_t a_f32x4 = vcvtq_f32_u32(vmovl_u16(a_u16x4));
        float32x4_t b_f32x4 = vcvtq_f32_u32(vmovl_u16(b_u16x4));
        float32x4_t c_f32x4 = vcvtq_f32_u32(vmovl_u16(c_u16x4));
        float32x4_t ab_f32x4 = vmulq_f32(a_f32x4, b_f32x4);
        float32x4_t ab_scaled_f32x4 = vmulq_n_f32(ab_f32x4, alpha_f32);
        float32x4_t result_f32x4 = vfmaq_n_f32(ab_scaled_f32x4, c_f32x4, beta_f32);
        // Clamp before conversion so the rounded value always fits in u16:
        result_f32x4 = vmaxq_f32(vminq_f32(result_f32x4, max_f32x4), min_f32x4);
        // FCVTNU rounds to nearest unsigned; UQXTN narrows with saturation (no-op after the clamp):
        uint16x4_t result_u16x4 = vqmovn_u32(vcvtnq_u32_f32(result_f32x4));
        vst1_u16(result + i, result_u16x4);
    }

    // The tail: scalar fallback; nk_f32_to_u16_serial presumably applies the same
    // round-and-saturate semantics - defined in cast/serial.h (not visible here).
    for (; i < n; ++i) {
        nk_f32_t sum = alpha_f32 * a[i] * b[i] + beta_f32 * c[i];
        nk_f32_to_u16_serial(&sum, result + i);
    }
}
|
|
289
|
+
|
|
290
|
+
NK_PUBLIC void nk_each_sum_i32_neon(nk_i32_t const *a, nk_i32_t const *b, nk_size_t n, nk_i32_t *result) {
    // Saturating elementwise sum of two i32 arrays: result[i] = saturate(a[i] + b[i]).
    nk_size_t idx = 0;
    // SQADD processes 4 lanes per iteration with hardware saturation:
    while (idx + 4 <= n) {
        int32x4_t lhs_s32x4 = vld1q_s32(a + idx);
        int32x4_t rhs_s32x4 = vld1q_s32(b + idx);
        vst1q_s32(result + idx, vqaddq_s32(lhs_s32x4, rhs_s32x4));
        idx += 4;
    }

    // Scalar epilogue mirrors the saturating semantics:
    while (idx < n) {
        result[idx] = nk_i32_saturating_add_serial(a[idx], b[idx]);
        ++idx;
    }
}
|
|
303
|
+
|
|
304
|
+
/**
 * @brief Affine scaling of i32 values in f64 arithmetic: result[i] = alpha * a[i] + beta,
 *        rounded to nearest and clamped to the i32 range before narrowing.
 *        f64 is used because its 53-bit mantissa represents every i32 exactly.
 */
NK_PUBLIC void nk_each_scale_i32_neon(nk_i32_t const *a, nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta,
                                      nk_i32_t *result) {
    nk_f64_t alpha_val = *alpha;
    nk_f64_t beta_val = *beta;
    float64x2_t alpha_f64x2 = vdupq_n_f64(alpha_val);
    float64x2_t beta_f64x2 = vdupq_n_f64(beta_val);
    // i32 saturation bounds; both are exactly representable in f64:
    float64x2_t min_f64x2 = vdupq_n_f64(-2147483648.0);
    float64x2_t max_f64x2 = vdupq_n_f64(2147483647.0);

    // The main loop: only 2 lanes per iteration because of the i32 -> f64 widening.
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        int32x2_t a_i32x2 = vld1_s32(a + i);
        // Widen i32 -> i64 -> f64; exact for every i32 value:
        float64x2_t a_f64x2 = vcvtq_f64_s64(vmovl_s32(a_i32x2));
        // FMLA: beta + a * alpha in a single rounding step:
        float64x2_t result_f64x2 = vfmaq_f64(beta_f64x2, a_f64x2, alpha_f64x2);
        // Clamp before conversion so the rounded value always fits in i32:
        result_f64x2 = vmaxq_f64(vminq_f64(result_f64x2, max_f64x2), min_f64x2);
        // FCVTNS rounds to nearest; SQXTN narrows with saturation (a no-op after the clamp):
        int32x2_t result_i32x2 = vqmovn_s64(vcvtnq_s64_f64(result_f64x2));
        vst1_s32(result + i, result_i32x2);
    }

    // The tail: scalar fallback; nk_f64_to_i32_serial presumably applies the same
    // round-and-saturate semantics - defined in cast/serial.h (not visible here).
    for (; i < n; ++i) {
        nk_f64_t sum = alpha_val * a[i] + beta_val;
        nk_f64_to_i32_serial(&sum, result + i);
    }
}
|
|
330
|
+
|
|
331
|
+
/**
 * @brief Scaled fused multiply-add over i32 arrays in f64 arithmetic:
 *        result[i] = alpha * a[i] * b[i] + beta * c[i], rounded and clamped to i32.
 * NOTE(review): |a[i] * b[i]| can reach 2^62, beyond f64's 53-bit mantissa, so the
 * product may lose low bits for large operands - presumably an accepted tradeoff.
 */
NK_PUBLIC void nk_each_fma_i32_neon( //
    nk_i32_t const *a, nk_i32_t const *b, nk_i32_t const *c, //
    nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta, nk_i32_t *result) {
    nk_f64_t alpha_val = *alpha;
    nk_f64_t beta_val = *beta;
    // i32 saturation bounds; both are exactly representable in f64:
    float64x2_t min_f64x2 = vdupq_n_f64(-2147483648.0);
    float64x2_t max_f64x2 = vdupq_n_f64(2147483647.0);

    // The main loop: only 2 lanes per iteration because of the i32 -> f64 widening.
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        int32x2_t a_i32x2 = vld1_s32(a + i);
        int32x2_t b_i32x2 = vld1_s32(b + i);
        int32x2_t c_i32x2 = vld1_s32(c + i);
        // Widen i32 -> i64 -> f64; exact for every i32 value:
        float64x2_t a_f64x2 = vcvtq_f64_s64(vmovl_s32(a_i32x2));
        float64x2_t b_f64x2 = vcvtq_f64_s64(vmovl_s32(b_i32x2));
        float64x2_t c_f64x2 = vcvtq_f64_s64(vmovl_s32(c_i32x2));
        float64x2_t ab_f64x2 = vmulq_f64(a_f64x2, b_f64x2);
        float64x2_t ab_scaled_f64x2 = vmulq_n_f64(ab_f64x2, alpha_val);
        float64x2_t result_f64x2 = vfmaq_n_f64(ab_scaled_f64x2, c_f64x2, beta_val);
        // Clamp before conversion so the rounded value always fits in i32:
        result_f64x2 = vmaxq_f64(vminq_f64(result_f64x2, max_f64x2), min_f64x2);
        // FCVTNS rounds to nearest; SQXTN narrows with saturation (a no-op after the clamp):
        int32x2_t result_i32x2 = vqmovn_s64(vcvtnq_s64_f64(result_f64x2));
        vst1_s32(result + i, result_i32x2);
    }

    // The tail: scalar fallback; nk_f64_to_i32_serial presumably applies the same
    // round-and-saturate semantics - defined in cast/serial.h (not visible here).
    for (; i < n; ++i) {
        nk_f64_t sum = alpha_val * a[i] * b[i] + beta_val * c[i];
        nk_f64_to_i32_serial(&sum, result + i);
    }
}
|
|
362
|
+
|
|
363
|
+
NK_PUBLIC void nk_each_sum_u32_neon(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_u32_t *result) {
    // Saturating elementwise sum of two u32 arrays: result[i] = saturate(a[i] + b[i]).
    nk_size_t idx = 0;
    // UQADD processes 4 lanes per iteration with hardware saturation:
    while (idx + 4 <= n) {
        uint32x4_t lhs_u32x4 = vld1q_u32(a + idx);
        uint32x4_t rhs_u32x4 = vld1q_u32(b + idx);
        vst1q_u32(result + idx, vqaddq_u32(lhs_u32x4, rhs_u32x4));
        idx += 4;
    }

    // Scalar epilogue mirrors the saturating semantics:
    while (idx < n) {
        result[idx] = nk_u32_saturating_add_serial(a[idx], b[idx]);
        ++idx;
    }
}
|
|
376
|
+
|
|
377
|
+
/**
 * @brief Affine scaling of u32 values in f64 arithmetic: result[i] = alpha * a[i] + beta,
 *        rounded to nearest and clamped to the u32 range before narrowing.
 *        f64 is used because its 53-bit mantissa represents every u32 exactly.
 */
NK_PUBLIC void nk_each_scale_u32_neon(nk_u32_t const *a, nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta,
                                      nk_u32_t *result) {
    nk_f64_t alpha_val = *alpha;
    nk_f64_t beta_val = *beta;
    float64x2_t alpha_f64x2 = vdupq_n_f64(alpha_val);
    float64x2_t beta_f64x2 = vdupq_n_f64(beta_val);
    // u32 saturation bounds; 4294967295 is exactly representable in f64:
    float64x2_t min_f64x2 = vdupq_n_f64(0.0);
    float64x2_t max_f64x2 = vdupq_n_f64(4294967295.0);

    // The main loop: only 2 lanes per iteration because of the u32 -> f64 widening.
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        uint32x2_t a_u32x2 = vld1_u32(a + i);
        // Widen u32 -> u64 -> f64; exact for every u32 value:
        float64x2_t a_f64x2 = vcvtq_f64_u64(vmovl_u32(a_u32x2));
        // FMLA: beta + a * alpha in a single rounding step:
        float64x2_t result_f64x2 = vfmaq_f64(beta_f64x2, a_f64x2, alpha_f64x2);
        // Clamp before conversion so the rounded value always fits in u32:
        result_f64x2 = vmaxq_f64(vminq_f64(result_f64x2, max_f64x2), min_f64x2);
        // FCVTNU rounds to nearest unsigned; UQXTN narrows with saturation (no-op after the clamp):
        uint32x2_t result_u32x2 = vqmovn_u64(vcvtnq_u64_f64(result_f64x2));
        vst1_u32(result + i, result_u32x2);
    }

    // The tail: scalar fallback; nk_f64_to_u32_serial presumably applies the same
    // round-and-saturate semantics - defined in cast/serial.h (not visible here).
    for (; i < n; ++i) {
        nk_f64_t sum = alpha_val * a[i] + beta_val;
        nk_f64_to_u32_serial(&sum, result + i);
    }
}
|
|
403
|
+
|
|
404
|
+
// Element-wise fused multiply-add over `u32` vectors:
// result[i] = alpha * a[i] * b[i] + beta * c[i],
// evaluated in f64 and clamped back to the unsigned 32-bit range.
NK_PUBLIC void nk_each_fma_u32_neon( //
    nk_u32_t const *a, nk_u32_t const *b, nk_u32_t const *c, //
    nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta, nk_u32_t *result) {
    nk_f64_t alpha_val = *alpha;
    nk_f64_t beta_val = *beta;
    float64x2_t min_f64x2 = vdupq_n_f64(0.0);
    float64x2_t max_f64x2 = vdupq_n_f64(4294967295.0); // UINT32_MAX, exactly representable in f64

    // The main loop:
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        uint32x2_t a_u32x2 = vld1_u32(a + i);
        uint32x2_t b_u32x2 = vld1_u32(b + i);
        uint32x2_t c_u32x2 = vld1_u32(c + i);
        // Widen every operand to f64 so the u32*u32 product cannot overflow.
        float64x2_t a_f64x2 = vcvtq_f64_u64(vmovl_u32(a_u32x2));
        float64x2_t b_f64x2 = vcvtq_f64_u64(vmovl_u32(b_u32x2));
        float64x2_t c_f64x2 = vcvtq_f64_u64(vmovl_u32(c_u32x2));
        float64x2_t ab_f64x2 = vmulq_f64(a_f64x2, b_f64x2);
        float64x2_t ab_scaled_f64x2 = vmulq_n_f64(ab_f64x2, alpha_val);
        // Fold beta * c into the scaled product with a fused multiply-add.
        float64x2_t result_f64x2 = vfmaq_n_f64(ab_scaled_f64x2, c_f64x2, beta_val);
        // Clamp to [0, UINT32_MAX], round-to-nearest-even, saturating narrow.
        result_f64x2 = vmaxq_f64(vminq_f64(result_f64x2, max_f64x2), min_f64x2);
        uint32x2_t result_u32x2 = vqmovn_u64(vcvtnq_u64_f64(result_f64x2));
        vst1_u32(result + i, result_u32x2);
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f64_t sum = alpha_val * a[i] * b[i] + beta_val * c[i];
        nk_f64_to_u32_serial(&sum, result + i);
    }
}
|
|
435
|
+
|
|
436
|
+
/// Element-wise saturating addition of two `i64` vectors using NEON (two lanes
/// per iteration); at most one tail element is handled by the scalar helper.
NK_PUBLIC void nk_each_sum_i64_neon(nk_i64_t const *a, nk_i64_t const *b, nk_size_t n, nk_i64_t *result) {
    nk_size_t idx = 0;
    nk_size_t const simd_end = n & ~(nk_size_t)1; // largest even count
    while (idx < simd_end) {
        int64x2_t lhs_i64x2 = vld1q_s64(a + idx);
        int64x2_t rhs_i64x2 = vld1q_s64(b + idx);
        // `vqaddq_s64` saturates at the i64 extremes instead of wrapping.
        vst1q_s64(result + idx, vqaddq_s64(lhs_i64x2, rhs_i64x2));
        idx += 2;
    }
    // Scalar tail:
    for (; idx < n; ++idx) result[idx] = nk_i64_saturating_add_serial(a[idx], b[idx]);
}
|
|
449
|
+
|
|
450
|
+
// Affine transform of an `i64` vector: result[i] = alpha * a[i] + beta,
// computed in f64 and clamped back to the signed 64-bit range.
// NOTE(review): i64 values above 2^53 lose precision in the f64 widening;
// presumably acceptable for this kernel's use cases - confirm.
NK_PUBLIC void nk_each_scale_i64_neon(nk_i64_t const *a, nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta,
                                      nk_i64_t *result) {
    nk_f64_t alpha_val = *alpha;
    nk_f64_t beta_val = *beta;
    float64x2_t alpha_f64x2 = vdupq_n_f64(alpha_val);
    float64x2_t beta_f64x2 = vdupq_n_f64(beta_val);
    // (nk_f64_t)NK_I64_MAX rounds up to 2^63; vcvtnq_s64_f64 saturates on
    // out-of-range inputs, so the converted result still stays in range.
    float64x2_t min_f64x2 = vdupq_n_f64((nk_f64_t)NK_I64_MIN);
    float64x2_t max_f64x2 = vdupq_n_f64((nk_f64_t)NK_I64_MAX);

    // The main loop:
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        int64x2_t a_i64x2 = vld1q_s64(a + i);
        float64x2_t a_f64x2 = vcvtq_f64_s64(a_i64x2);
        float64x2_t result_f64x2 = vfmaq_f64(beta_f64x2, a_f64x2, alpha_f64x2);
        // Clamp, then convert with round-to-nearest-even.
        result_f64x2 = vmaxq_f64(vminq_f64(result_f64x2, max_f64x2), min_f64x2);
        int64x2_t result_i64x2 = vcvtnq_s64_f64(result_f64x2);
        vst1q_s64(result + i, result_i64x2);
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f64_t sum = alpha_val * a[i] + beta_val;
        nk_f64_to_i64_serial(&sum, result + i);
    }
}
|
|
476
|
+
|
|
477
|
+
// Element-wise fused multiply-add over `i64` vectors:
// result[i] = alpha * a[i] * b[i] + beta * c[i],
// evaluated in f64 and clamped back to the signed 64-bit range.
// NOTE(review): products above 2^53 lose precision in f64 - confirm acceptable.
NK_PUBLIC void nk_each_fma_i64_neon( //
    nk_i64_t const *a, nk_i64_t const *b, nk_i64_t const *c, //
    nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta, nk_i64_t *result) {
    nk_f64_t alpha_val = *alpha;
    nk_f64_t beta_val = *beta;
    // (nk_f64_t)NK_I64_MAX rounds up to 2^63; vcvtnq_s64_f64 saturates, so
    // the final conversion still yields an in-range i64.
    float64x2_t min_f64x2 = vdupq_n_f64((nk_f64_t)NK_I64_MIN);
    float64x2_t max_f64x2 = vdupq_n_f64((nk_f64_t)NK_I64_MAX);

    // The main loop:
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        int64x2_t a_i64x2 = vld1q_s64(a + i);
        int64x2_t b_i64x2 = vld1q_s64(b + i);
        int64x2_t c_i64x2 = vld1q_s64(c + i);
        // Convert all operands to f64 so products do not overflow i64 math.
        float64x2_t a_f64x2 = vcvtq_f64_s64(a_i64x2);
        float64x2_t b_f64x2 = vcvtq_f64_s64(b_i64x2);
        float64x2_t c_f64x2 = vcvtq_f64_s64(c_i64x2);
        float64x2_t ab_f64x2 = vmulq_f64(a_f64x2, b_f64x2);
        float64x2_t ab_scaled_f64x2 = vmulq_n_f64(ab_f64x2, alpha_val);
        // Fold beta * c into the scaled product with a fused multiply-add.
        float64x2_t result_f64x2 = vfmaq_n_f64(ab_scaled_f64x2, c_f64x2, beta_val);
        // Clamp, then round-to-nearest-even back to i64.
        result_f64x2 = vmaxq_f64(vminq_f64(result_f64x2, max_f64x2), min_f64x2);
        int64x2_t result_i64x2 = vcvtnq_s64_f64(result_f64x2);
        vst1q_s64(result + i, result_i64x2);
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f64_t sum = alpha_val * a[i] * b[i] + beta_val * c[i];
        nk_f64_to_i64_serial(&sum, result + i);
    }
}
|
|
508
|
+
|
|
509
|
+
/// Element-wise saturating addition of two `u64` vectors using NEON (two lanes
/// per iteration); at most one tail element is handled by the scalar helper.
NK_PUBLIC void nk_each_sum_u64_neon(nk_u64_t const *a, nk_u64_t const *b, nk_size_t n, nk_u64_t *result) {
    nk_size_t idx = 0;
    nk_size_t const simd_end = n & ~(nk_size_t)1; // largest even count
    while (idx < simd_end) {
        uint64x2_t lhs_u64x2 = vld1q_u64(a + idx);
        uint64x2_t rhs_u64x2 = vld1q_u64(b + idx);
        // `vqaddq_u64` clamps at UINT64_MAX instead of wrapping around.
        vst1q_u64(result + idx, vqaddq_u64(lhs_u64x2, rhs_u64x2));
        idx += 2;
    }
    // Scalar tail:
    for (; idx < n; ++idx) result[idx] = nk_u64_saturating_add_serial(a[idx], b[idx]);
}
|
|
522
|
+
|
|
523
|
+
// Affine transform of a `u64` vector: result[i] = alpha * a[i] + beta,
// computed in f64 and clamped to the unsigned 64-bit range.
// NOTE(review): u64 values above 2^53 lose precision in the f64 widening;
// presumably acceptable here - confirm.
NK_PUBLIC void nk_each_scale_u64_neon(nk_u64_t const *a, nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta,
                                      nk_u64_t *result) {
    nk_f64_t alpha_val = *alpha;
    nk_f64_t beta_val = *beta;
    float64x2_t alpha_f64x2 = vdupq_n_f64(alpha_val);
    float64x2_t beta_f64x2 = vdupq_n_f64(beta_val);
    float64x2_t min_f64x2 = vdupq_n_f64(0.0);
    // (nk_f64_t)NK_U64_MAX rounds up to 2^64; vcvtnq_u64_f64 saturates, so
    // the final conversion still yields an in-range u64.
    float64x2_t max_f64x2 = vdupq_n_f64((nk_f64_t)NK_U64_MAX);

    // The main loop:
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        uint64x2_t a_u64x2 = vld1q_u64(a + i);
        float64x2_t a_f64x2 = vcvtq_f64_u64(a_u64x2);
        float64x2_t result_f64x2 = vfmaq_f64(beta_f64x2, a_f64x2, alpha_f64x2);
        // Clamp, then convert with round-to-nearest-even.
        result_f64x2 = vmaxq_f64(vminq_f64(result_f64x2, max_f64x2), min_f64x2);
        uint64x2_t result_u64x2 = vcvtnq_u64_f64(result_f64x2);
        vst1q_u64(result + i, result_u64x2);
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f64_t sum = alpha_val * a[i] + beta_val;
        nk_f64_to_u64_serial(&sum, result + i);
    }
}
|
|
549
|
+
|
|
550
|
+
// Element-wise fused multiply-add over `u64` vectors:
// result[i] = alpha * a[i] * b[i] + beta * c[i],
// evaluated in f64 and clamped to the unsigned 64-bit range.
// NOTE(review): operands above 2^53 lose precision in f64 - confirm acceptable.
NK_PUBLIC void nk_each_fma_u64_neon( //
    nk_u64_t const *a, nk_u64_t const *b, nk_u64_t const *c, //
    nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta, nk_u64_t *result) {
    nk_f64_t alpha_val = *alpha;
    nk_f64_t beta_val = *beta;
    float64x2_t min_f64x2 = vdupq_n_f64(0.0);
    // (nk_f64_t)NK_U64_MAX rounds up to 2^64; vcvtnq_u64_f64 saturates.
    float64x2_t max_f64x2 = vdupq_n_f64((nk_f64_t)NK_U64_MAX);

    // The main loop:
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        uint64x2_t a_u64x2 = vld1q_u64(a + i);
        uint64x2_t b_u64x2 = vld1q_u64(b + i);
        uint64x2_t c_u64x2 = vld1q_u64(c + i);
        // Convert all operands to f64 so products do not overflow u64 math.
        float64x2_t a_f64x2 = vcvtq_f64_u64(a_u64x2);
        float64x2_t b_f64x2 = vcvtq_f64_u64(b_u64x2);
        float64x2_t c_f64x2 = vcvtq_f64_u64(c_u64x2);
        float64x2_t ab_f64x2 = vmulq_f64(a_f64x2, b_f64x2);
        float64x2_t ab_scaled_f64x2 = vmulq_n_f64(ab_f64x2, alpha_val);
        // Fold beta * c into the scaled product with a fused multiply-add.
        float64x2_t result_f64x2 = vfmaq_n_f64(ab_scaled_f64x2, c_f64x2, beta_val);
        // Clamp, then round-to-nearest-even back to u64.
        result_f64x2 = vmaxq_f64(vminq_f64(result_f64x2, max_f64x2), min_f64x2);
        uint64x2_t result_u64x2 = vcvtnq_u64_f64(result_f64x2);
        vst1q_u64(result + i, result_u64x2);
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f64_t sum = alpha_val * a[i] * b[i] + beta_val * c[i];
        nk_f64_to_u64_serial(&sum, result + i);
    }
}
|
|
581
|
+
|
|
582
|
+
/// Element-wise addition of two `f64` vectors (plain IEEE addition,
/// two lanes per iteration).
NK_PUBLIC void nk_each_sum_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
    nk_size_t idx = 0;
    while (idx + 2 <= n) {
        float64x2_t lhs_f64x2 = vld1q_f64(a + idx);
        float64x2_t rhs_f64x2 = vld1q_f64(b + idx);
        vst1q_f64(result + idx, vaddq_f64(lhs_f64x2, rhs_f64x2));
        idx += 2;
    }
    // Scalar tail (at most one element):
    for (; idx < n; ++idx) result[idx] = a[idx] + b[idx];
}
|
|
595
|
+
|
|
596
|
+
/// Affine transform of an `f64` vector: result[i] = alpha * a[i] + beta.
/// The fused multiply-add computes the whole expression in one instruction.
NK_PUBLIC void nk_each_scale_f64_neon(nk_f64_t const *a, nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta,
                                      nk_f64_t *result) {
    nk_f64_t const scale = *alpha;
    nk_f64_t const shift = *beta;
    float64x2_t const scale_f64x2 = vdupq_n_f64(scale);
    float64x2_t const shift_f64x2 = vdupq_n_f64(shift);

    // Vectorized body, two lanes at a time:
    nk_size_t idx = 0;
    for (; idx + 2 <= n; idx += 2) {
        float64x2_t loaded_f64x2 = vld1q_f64(a + idx);
        vst1q_f64(result + idx, vfmaq_f64(shift_f64x2, loaded_f64x2, scale_f64x2));
    }

    // Scalar tail:
    for (; idx < n; ++idx) result[idx] = scale * a[idx] + shift;
}
|
|
614
|
+
|
|
615
|
+
/// Weighted blend of two `f64` vectors: result[i] = alpha * a[i] + beta * b[i].
/// Trivial weight combinations are dispatched to cheaper kernels.
NK_PUBLIC void nk_each_blend_f64_neon( //
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, //
    nk_f64_t const *alpha, nk_f64_t const *beta, nk_f64_t *result) {

    nk_f64_t const weight_a = *alpha;
    nk_f64_t const weight_b = *beta;

    // Special case 1: unit weights degenerate into plain addition,
    // avoiding the multiplications entirely.
    if (weight_a == 1 && weight_b == 1) {
        nk_each_sum_f64_neon(a, b, n, result);
        return;
    }
    // Special case 2: a zero weight degenerates into scaling a single operand,
    // saving half of the load instructions.
    if (weight_a == 0 || weight_b == 0) {
        nk_f64_t zero = 0;
        if (weight_b == 0) { nk_each_scale_f64_neon(a, n, alpha, &zero, result); }
        else { nk_each_scale_f64_neon(b, n, beta, &zero, result); }
        return;
    }

    // General case, two lanes per iteration:
    nk_size_t idx = 0;
    for (; idx + 2 <= n; idx += 2) {
        float64x2_t scaled_a_f64x2 = vmulq_n_f64(vld1q_f64(a + idx), weight_a);
        float64x2_t scaled_b_f64x2 = vmulq_n_f64(vld1q_f64(b + idx), weight_b);
        vst1q_f64(result + idx, vaddq_f64(scaled_a_f64x2, scaled_b_f64x2));
    }

    // Scalar tail:
    for (; idx < n; ++idx) result[idx] = weight_a * a[idx] + weight_b * b[idx];
}
|
|
653
|
+
|
|
654
|
+
/// Element-wise fused multiply-add over `f64` vectors:
/// result[i] = alpha * a[i] * b[i] + beta * c[i].
NK_PUBLIC void nk_each_fma_f64_neon( //
    nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, //
    nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta, nk_f64_t *result) {
    nk_f64_t const alpha_scalar = *alpha;
    nk_f64_t const beta_scalar = *beta;

    // Vectorized body, two lanes at a time:
    nk_size_t idx = 0;
    for (; idx + 2 <= n; idx += 2) {
        float64x2_t product_f64x2 = vmulq_f64(vld1q_f64(a + idx), vld1q_f64(b + idx));
        float64x2_t scaled_f64x2 = vmulq_n_f64(product_f64x2, alpha_scalar);
        // beta * c is folded in with a fused multiply-add.
        vst1q_f64(result + idx, vfmaq_n_f64(scaled_f64x2, vld1q_f64(c + idx), beta_scalar));
    }

    // Scalar tail:
    for (; idx < n; ++idx) result[idx] = alpha_scalar * a[idx] * b[idx] + beta_scalar * c[idx];
}
|
|
675
|
+
|
|
676
|
+
// Element-wise addition of two `e4m3` (fp8) vectors: eight values per
// iteration are widened fp8 -> f16 -> f32, added in f32, and narrowed back.
// Assumes nk_e4m3_t is byte-sized (loaded directly with vld1_u8) - per the
// visible pointer arithmetic.
NK_PUBLIC void nk_each_sum_e4m3_neon(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_e4m3_t *result) {
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        float16x8_t a_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(a + i));
        float16x8_t b_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(b + i));
        // Split each f16x8 into low/high f32x4 halves for full-precision math.
        float32x4_t a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
        float32x4_t a_high_f32x4 = vcvt_f32_f16(vget_high_f16(a_f16x8));
        float32x4_t b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
        float32x4_t b_high_f32x4 = vcvt_f32_f16(vget_high_f16(b_f16x8));
        float32x4_t result_low_f32x4 = vaddq_f32(a_low_f32x4, b_low_f32x4);
        float32x4_t result_high_f32x4 = vaddq_f32(a_high_f32x4, b_high_f32x4);
        // Each f32x4 packs down to 4 e4m3 bytes; merge both halves into one u64 lane.
        nk_b32_vec_t result_low_vec = nk_f32x4_to_e4m3x4_neon_(result_low_f32x4);
        nk_b32_vec_t result_high_vec = nk_f32x4_to_e4m3x4_neon_(result_high_f32x4);
        vst1_u8(result + i, vcreate_u8((nk_u64_t)result_low_vec.u32 | ((nk_u64_t)result_high_vec.u32 << 32)));
    }
    // Scalar tail via the serial fp8 <-> f32 converters:
    for (; i < n; ++i) {
        nk_f32_t ai, bi, sum;
        nk_e4m3_to_f32_serial(a + i, &ai);
        nk_e4m3_to_f32_serial(b + i, &bi);
        sum = ai + bi;
        nk_f32_to_e4m3_serial(&sum, result + i);
    }
}
|
|
699
|
+
|
|
700
|
+
// Element-wise addition of two `e5m2` (fp8) vectors: eight values per
// iteration are widened fp8 -> f16 -> f32, added in f32, and narrowed back.
// Assumes nk_e5m2_t is byte-sized (loaded directly with vld1_u8).
NK_PUBLIC void nk_each_sum_e5m2_neon(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_e5m2_t *result) {
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        float16x8_t a_f16x8 = nk_e5m2x8_to_f16x8_neon_(vld1_u8(a + i));
        float16x8_t b_f16x8 = nk_e5m2x8_to_f16x8_neon_(vld1_u8(b + i));
        // Split each f16x8 into low/high f32x4 halves for full-precision math.
        float32x4_t a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
        float32x4_t a_high_f32x4 = vcvt_f32_f16(vget_high_f16(a_f16x8));
        float32x4_t b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
        float32x4_t b_high_f32x4 = vcvt_f32_f16(vget_high_f16(b_f16x8));
        float32x4_t result_low_f32x4 = vaddq_f32(a_low_f32x4, b_low_f32x4);
        float32x4_t result_high_f32x4 = vaddq_f32(a_high_f32x4, b_high_f32x4);
        // Each f32x4 packs down to 4 e5m2 bytes; merge both halves into one u64 lane.
        nk_b32_vec_t result_low_vec = nk_f32x4_to_e5m2x4_neon_(result_low_f32x4);
        nk_b32_vec_t result_high_vec = nk_f32x4_to_e5m2x4_neon_(result_high_f32x4);
        vst1_u8(result + i, vcreate_u8((nk_u64_t)result_low_vec.u32 | ((nk_u64_t)result_high_vec.u32 << 32)));
    }
    // Scalar tail via the serial fp8 <-> f32 converters:
    for (; i < n; ++i) {
        nk_f32_t ai, bi, sum;
        nk_e5m2_to_f32_serial(a + i, &ai);
        nk_e5m2_to_f32_serial(b + i, &bi);
        sum = ai + bi;
        nk_f32_to_e5m2_serial(&sum, result + i);
    }
}
|
|
723
|
+
|
|
724
|
+
// Affine transform of an `e4m3` (fp8) vector: result[i] = alpha * a[i] + beta,
// computed in f32 after widening fp8 -> f16 -> f32.
NK_PUBLIC void nk_each_scale_e4m3_neon(nk_e4m3_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                       nk_e4m3_t *result) {
    float32x4_t alpha_f32x4 = vdupq_n_f32(*alpha);
    float32x4_t beta_f32x4 = vdupq_n_f32(*beta);
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        float16x8_t a_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(a + i));
        float32x4_t a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
        float32x4_t a_high_f32x4 = vcvt_f32_f16(vget_high_f16(a_f16x8));
        // Fused multiply-add: beta + a * alpha, per half.
        float32x4_t result_low_f32x4 = vfmaq_f32(beta_f32x4, a_low_f32x4, alpha_f32x4);
        float32x4_t result_high_f32x4 = vfmaq_f32(beta_f32x4, a_high_f32x4, alpha_f32x4);
        // Pack each f32x4 back to 4 e4m3 bytes and merge into one 8-byte store.
        nk_b32_vec_t result_low_vec = nk_f32x4_to_e4m3x4_neon_(result_low_f32x4);
        nk_b32_vec_t result_high_vec = nk_f32x4_to_e4m3x4_neon_(result_high_f32x4);
        vst1_u8(result + i, vcreate_u8((nk_u64_t)result_low_vec.u32 | ((nk_u64_t)result_high_vec.u32 << 32)));
    }
    // Scalar tail via the serial converters:
    for (; i < n; ++i) {
        nk_f32_t ai, scaled;
        nk_e4m3_to_f32_serial(a + i, &ai);
        scaled = *alpha * ai + *beta;
        nk_f32_to_e4m3_serial(&scaled, result + i);
    }
}
|
|
746
|
+
|
|
747
|
+
// Affine transform of an `e5m2` (fp8) vector: result[i] = alpha * a[i] + beta,
// computed in f32 after widening fp8 -> f16 -> f32.
NK_PUBLIC void nk_each_scale_e5m2_neon(nk_e5m2_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                       nk_e5m2_t *result) {
    float32x4_t alpha_f32x4 = vdupq_n_f32(*alpha);
    float32x4_t beta_f32x4 = vdupq_n_f32(*beta);
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        float16x8_t a_f16x8 = nk_e5m2x8_to_f16x8_neon_(vld1_u8(a + i));
        float32x4_t a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
        float32x4_t a_high_f32x4 = vcvt_f32_f16(vget_high_f16(a_f16x8));
        // Fused multiply-add: beta + a * alpha, per half.
        float32x4_t result_low_f32x4 = vfmaq_f32(beta_f32x4, a_low_f32x4, alpha_f32x4);
        float32x4_t result_high_f32x4 = vfmaq_f32(beta_f32x4, a_high_f32x4, alpha_f32x4);
        // Pack each f32x4 back to 4 e5m2 bytes and merge into one 8-byte store.
        nk_b32_vec_t result_low_vec = nk_f32x4_to_e5m2x4_neon_(result_low_f32x4);
        nk_b32_vec_t result_high_vec = nk_f32x4_to_e5m2x4_neon_(result_high_f32x4);
        vst1_u8(result + i, vcreate_u8((nk_u64_t)result_low_vec.u32 | ((nk_u64_t)result_high_vec.u32 << 32)));
    }
    // Scalar tail via the serial converters:
    for (; i < n; ++i) {
        nk_f32_t ai, scaled;
        nk_e5m2_to_f32_serial(a + i, &ai);
        scaled = *alpha * ai + *beta;
        nk_f32_to_e5m2_serial(&scaled, result + i);
    }
}
|
|
769
|
+
|
|
770
|
+
// Weighted blend of two `e4m3` (fp8) vectors:
// result[i] = alpha * a[i] + beta * b[i], computed in f32.
NK_PUBLIC void nk_each_blend_e4m3_neon(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t const *alpha,
                                       nk_f32_t const *beta, nk_e4m3_t *result) {
    float32x4_t alpha_f32x4 = vdupq_n_f32(*alpha);
    float32x4_t beta_f32x4 = vdupq_n_f32(*beta);
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        // Widen both inputs fp8 -> f16 -> f32 in low/high halves.
        float16x8_t a_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(a + i));
        float16x8_t b_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(b + i));
        float32x4_t a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
        float32x4_t a_high_f32x4 = vcvt_f32_f16(vget_high_f16(a_f16x8));
        float32x4_t b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
        float32x4_t b_high_f32x4 = vcvt_f32_f16(vget_high_f16(b_f16x8));
        // alpha * a, then beta * b folded in with a fused multiply-add.
        float32x4_t a_scaled_low_f32x4 = vmulq_f32(a_low_f32x4, alpha_f32x4);
        float32x4_t a_scaled_high_f32x4 = vmulq_f32(a_high_f32x4, alpha_f32x4);
        float32x4_t result_low_f32x4 = vfmaq_f32(a_scaled_low_f32x4, b_low_f32x4, beta_f32x4);
        float32x4_t result_high_f32x4 = vfmaq_f32(a_scaled_high_f32x4, b_high_f32x4, beta_f32x4);
        // Pack back to fp8 and merge both halves into one 8-byte store.
        nk_b32_vec_t result_low_vec = nk_f32x4_to_e4m3x4_neon_(result_low_f32x4);
        nk_b32_vec_t result_high_vec = nk_f32x4_to_e4m3x4_neon_(result_high_f32x4);
        vst1_u8(result + i, vcreate_u8((nk_u64_t)result_low_vec.u32 | ((nk_u64_t)result_high_vec.u32 << 32)));
    }
    // Scalar tail:
    for (; i < n; ++i) {
        nk_f32_t ai, bi, blended;
        nk_e4m3_to_f32_serial(a + i, &ai);
        nk_e4m3_to_f32_serial(b + i, &bi);
        blended = *alpha * ai + *beta * bi;
        nk_f32_to_e4m3_serial(&blended, result + i);
    }
}
|
|
798
|
+
|
|
799
|
+
// Weighted blend of two `e5m2` (fp8) vectors:
// result[i] = alpha * a[i] + beta * b[i], computed in f32.
NK_PUBLIC void nk_each_blend_e5m2_neon(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t const *alpha,
                                       nk_f32_t const *beta, nk_e5m2_t *result) {
    float32x4_t alpha_f32x4 = vdupq_n_f32(*alpha);
    float32x4_t beta_f32x4 = vdupq_n_f32(*beta);
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        // Widen both inputs fp8 -> f16 -> f32 in low/high halves.
        float16x8_t a_f16x8 = nk_e5m2x8_to_f16x8_neon_(vld1_u8(a + i));
        float16x8_t b_f16x8 = nk_e5m2x8_to_f16x8_neon_(vld1_u8(b + i));
        float32x4_t a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
        float32x4_t a_high_f32x4 = vcvt_f32_f16(vget_high_f16(a_f16x8));
        float32x4_t b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
        float32x4_t b_high_f32x4 = vcvt_f32_f16(vget_high_f16(b_f16x8));
        // alpha * a, then beta * b folded in with a fused multiply-add.
        float32x4_t a_scaled_low_f32x4 = vmulq_f32(a_low_f32x4, alpha_f32x4);
        float32x4_t a_scaled_high_f32x4 = vmulq_f32(a_high_f32x4, alpha_f32x4);
        float32x4_t result_low_f32x4 = vfmaq_f32(a_scaled_low_f32x4, b_low_f32x4, beta_f32x4);
        float32x4_t result_high_f32x4 = vfmaq_f32(a_scaled_high_f32x4, b_high_f32x4, beta_f32x4);
        // Pack back to fp8 and merge both halves into one 8-byte store.
        nk_b32_vec_t result_low_vec = nk_f32x4_to_e5m2x4_neon_(result_low_f32x4);
        nk_b32_vec_t result_high_vec = nk_f32x4_to_e5m2x4_neon_(result_high_f32x4);
        vst1_u8(result + i, vcreate_u8((nk_u64_t)result_low_vec.u32 | ((nk_u64_t)result_high_vec.u32 << 32)));
    }
    // Scalar tail:
    for (; i < n; ++i) {
        nk_f32_t ai, bi, blended;
        nk_e5m2_to_f32_serial(a + i, &ai);
        nk_e5m2_to_f32_serial(b + i, &bi);
        blended = *alpha * ai + *beta * bi;
        nk_f32_to_e5m2_serial(&blended, result + i);
    }
}
|
|
827
|
+
|
|
828
|
+
// Element-wise fused multiply-add over `e4m3` (fp8) vectors:
// result[i] = alpha * a[i] * b[i] + beta * c[i], computed in f32.
NK_PUBLIC void nk_each_fma_e4m3_neon(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_e4m3_t const *c, nk_size_t n,
                                     nk_f32_t const *alpha, nk_f32_t const *beta, nk_e4m3_t *result) {
    float32x4_t alpha_f32x4 = vdupq_n_f32(*alpha);
    float32x4_t beta_f32x4 = vdupq_n_f32(*beta);
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        // Widen all three inputs fp8 -> f16 -> f32 in low/high halves.
        float16x8_t a_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(a + i));
        float16x8_t b_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(b + i));
        float16x8_t c_f16x8 = nk_e4m3x8_to_f16x8_neon_(vld1_u8(c + i));
        float32x4_t a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
        float32x4_t a_high_f32x4 = vcvt_f32_f16(vget_high_f16(a_f16x8));
        float32x4_t b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
        float32x4_t b_high_f32x4 = vcvt_f32_f16(vget_high_f16(b_f16x8));
        float32x4_t c_low_f32x4 = vcvt_f32_f16(vget_low_f16(c_f16x8));
        float32x4_t c_high_f32x4 = vcvt_f32_f16(vget_high_f16(c_f16x8));
        // alpha * (a * b), then beta * c folded in with a fused multiply-add.
        float32x4_t ab_low_f32x4 = vmulq_f32(a_low_f32x4, b_low_f32x4);
        float32x4_t ab_high_f32x4 = vmulq_f32(a_high_f32x4, b_high_f32x4);
        float32x4_t ab_scaled_low_f32x4 = vmulq_f32(ab_low_f32x4, alpha_f32x4);
        float32x4_t ab_scaled_high_f32x4 = vmulq_f32(ab_high_f32x4, alpha_f32x4);
        float32x4_t result_low_f32x4 = vfmaq_f32(ab_scaled_low_f32x4, c_low_f32x4, beta_f32x4);
        float32x4_t result_high_f32x4 = vfmaq_f32(ab_scaled_high_f32x4, c_high_f32x4, beta_f32x4);
        // Pack back to fp8 and merge both halves into one 8-byte store.
        nk_b32_vec_t result_low_vec = nk_f32x4_to_e4m3x4_neon_(result_low_f32x4);
        nk_b32_vec_t result_high_vec = nk_f32x4_to_e4m3x4_neon_(result_high_f32x4);
        vst1_u8(result + i, vcreate_u8((nk_u64_t)result_low_vec.u32 | ((nk_u64_t)result_high_vec.u32 << 32)));
    }
    // Scalar tail:
    for (; i < n; ++i) {
        nk_f32_t ai, bi, ci, fma;
        nk_e4m3_to_f32_serial(a + i, &ai);
        nk_e4m3_to_f32_serial(b + i, &bi);
        nk_e4m3_to_f32_serial(c + i, &ci);
        fma = *alpha * ai * bi + *beta * ci;
        nk_f32_to_e4m3_serial(&fma, result + i);
    }
}
|
|
862
|
+
|
|
863
|
+
// Element-wise fused multiply-add over `e5m2` (fp8) vectors:
// result[i] = alpha * a[i] * b[i] + beta * c[i], computed in f32.
NK_PUBLIC void nk_each_fma_e5m2_neon(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_e5m2_t const *c, nk_size_t n,
                                     nk_f32_t const *alpha, nk_f32_t const *beta, nk_e5m2_t *result) {
    float32x4_t alpha_f32x4 = vdupq_n_f32(*alpha);
    float32x4_t beta_f32x4 = vdupq_n_f32(*beta);
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        // Widen all three inputs fp8 -> f16 -> f32 in low/high halves.
        float16x8_t a_f16x8 = nk_e5m2x8_to_f16x8_neon_(vld1_u8(a + i));
        float16x8_t b_f16x8 = nk_e5m2x8_to_f16x8_neon_(vld1_u8(b + i));
        float16x8_t c_f16x8 = nk_e5m2x8_to_f16x8_neon_(vld1_u8(c + i));
        float32x4_t a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
        float32x4_t a_high_f32x4 = vcvt_f32_f16(vget_high_f16(a_f16x8));
        float32x4_t b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
        float32x4_t b_high_f32x4 = vcvt_f32_f16(vget_high_f16(b_f16x8));
        float32x4_t c_low_f32x4 = vcvt_f32_f16(vget_low_f16(c_f16x8));
        float32x4_t c_high_f32x4 = vcvt_f32_f16(vget_high_f16(c_f16x8));
        // alpha * (a * b), then beta * c folded in with a fused multiply-add.
        float32x4_t ab_low_f32x4 = vmulq_f32(a_low_f32x4, b_low_f32x4);
        float32x4_t ab_high_f32x4 = vmulq_f32(a_high_f32x4, b_high_f32x4);
        float32x4_t ab_scaled_low_f32x4 = vmulq_f32(ab_low_f32x4, alpha_f32x4);
        float32x4_t ab_scaled_high_f32x4 = vmulq_f32(ab_high_f32x4, alpha_f32x4);
        float32x4_t result_low_f32x4 = vfmaq_f32(ab_scaled_low_f32x4, c_low_f32x4, beta_f32x4);
        float32x4_t result_high_f32x4 = vfmaq_f32(ab_scaled_high_f32x4, c_high_f32x4, beta_f32x4);
        // Pack back to fp8 and merge both halves into one 8-byte store.
        nk_b32_vec_t result_low_vec = nk_f32x4_to_e5m2x4_neon_(result_low_f32x4);
        nk_b32_vec_t result_high_vec = nk_f32x4_to_e5m2x4_neon_(result_high_f32x4);
        vst1_u8(result + i, vcreate_u8((nk_u64_t)result_low_vec.u32 | ((nk_u64_t)result_high_vec.u32 << 32)));
    }
    // Scalar tail:
    for (; i < n; ++i) {
        nk_f32_t ai, bi, ci, fma;
        nk_e5m2_to_f32_serial(a + i, &ai);
        nk_e5m2_to_f32_serial(b + i, &bi);
        nk_e5m2_to_f32_serial(c + i, &ci);
        fma = *alpha * ai * bi + *beta * ci;
        nk_f32_to_e5m2_serial(&fma, result + i);
    }
}
|
|
897
|
+
|
|
898
|
+
// Complex affine transform of an `f32c` vector: result[i] = alpha * a[i] + beta,
// where alpha, beta, and a[i] are complex; four complex numbers per iteration.
NK_PUBLIC void nk_each_scale_f32c_neon(nk_f32c_t const *a, nk_size_t n, nk_f32c_t const *alpha, nk_f32c_t const *beta,
                                       nk_f32c_t *result) {
    float32x4_t alpha_real_f32x4 = vdupq_n_f32(alpha->real);
    float32x4_t alpha_imag_f32x4 = vdupq_n_f32(alpha->imag);
    float32x4_t beta_real_f32x4 = vdupq_n_f32(beta->real);
    float32x4_t beta_imag_f32x4 = vdupq_n_f32(beta->imag);
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        // vld2q de-interleaves into a real plane (val[0]) and an imaginary plane (val[1]).
        float32x4x2_t a_f32x4x2 = vld2q_f32((nk_f32_t const *)(a + i));
        // real = beta.re + alpha.re * a.re - alpha.im * a.im
        float32x4_t y_real_f32x4 = vfmaq_f32(beta_real_f32x4, alpha_real_f32x4, a_f32x4x2.val[0]);
        y_real_f32x4 = vfmsq_f32(y_real_f32x4, alpha_imag_f32x4, a_f32x4x2.val[1]);
        // imag = beta.im + alpha.re * a.im + alpha.im * a.re
        float32x4_t y_imag_f32x4 = vfmaq_f32(beta_imag_f32x4, alpha_real_f32x4, a_f32x4x2.val[1]);
        y_imag_f32x4 = vfmaq_f32(y_imag_f32x4, alpha_imag_f32x4, a_f32x4x2.val[0]);
        float32x4x2_t out = {y_real_f32x4, y_imag_f32x4};
        vst2q_f32((nk_f32_t *)(result + i), out); // re-interleave on store
    }
    // Scalar tail using the same complex multiply-add formula:
    for (; i < n; i++) {
        nk_f32_t a_real = a[i].real, a_imag = a[i].imag;
        result[i].real = alpha->real * a_real - alpha->imag * a_imag + beta->real;
        result[i].imag = alpha->real * a_imag + alpha->imag * a_real + beta->imag;
    }
}
|
|
920
|
+
|
|
921
|
+
// Complex affine transform of an `f64c` vector: result[i] = alpha * a[i] + beta,
// where alpha, beta, and a[i] are complex; two complex numbers per iteration.
NK_PUBLIC void nk_each_scale_f64c_neon(nk_f64c_t const *a, nk_size_t n, nk_f64c_t const *alpha, nk_f64c_t const *beta,
                                       nk_f64c_t *result) {
    float64x2_t alpha_real_f64x2 = vdupq_n_f64(alpha->real);
    float64x2_t alpha_imag_f64x2 = vdupq_n_f64(alpha->imag);
    float64x2_t beta_real_f64x2 = vdupq_n_f64(beta->real);
    float64x2_t beta_imag_f64x2 = vdupq_n_f64(beta->imag);
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        // vld2q de-interleaves into a real plane (val[0]) and an imaginary plane (val[1]).
        float64x2x2_t a_f64x2x2 = vld2q_f64((nk_f64_t const *)(a + i));
        // real = beta.re + alpha.re * a.re - alpha.im * a.im
        float64x2_t y_real_f64x2 = vfmaq_f64(beta_real_f64x2, alpha_real_f64x2, a_f64x2x2.val[0]);
        y_real_f64x2 = vfmsq_f64(y_real_f64x2, alpha_imag_f64x2, a_f64x2x2.val[1]);
        // imag = beta.im + alpha.re * a.im + alpha.im * a.re
        float64x2_t y_imag_f64x2 = vfmaq_f64(beta_imag_f64x2, alpha_real_f64x2, a_f64x2x2.val[1]);
        y_imag_f64x2 = vfmaq_f64(y_imag_f64x2, alpha_imag_f64x2, a_f64x2x2.val[0]);
        float64x2x2_t out = {y_real_f64x2, y_imag_f64x2};
        vst2q_f64((nk_f64_t *)(result + i), out); // re-interleave on store
    }
    // Scalar tail using the same complex multiply-add formula:
    for (; i < n; i++) {
        nk_f64_t a_real = a[i].real, a_imag = a[i].imag;
        result[i].real = alpha->real * a_real - alpha->imag * a_imag + beta->real;
        result[i].imag = alpha->real * a_imag + alpha->imag * a_real + beta->imag;
    }
}
|
|
943
|
+
|
|
944
|
+
// Complex weighted blend of two `f32c` vectors:
// result[i] = alpha * a[i] + beta * b[i], with complex alpha/beta;
// four complex numbers per iteration.
NK_PUBLIC void nk_each_blend_f32c_neon(nk_f32c_t const *a, nk_f32c_t const *b, nk_size_t n, nk_f32c_t const *alpha,
                                       nk_f32c_t const *beta, nk_f32c_t *result) {
    float32x4_t alpha_real_f32x4 = vdupq_n_f32(alpha->real);
    float32x4_t alpha_imag_f32x4 = vdupq_n_f32(alpha->imag);
    float32x4_t beta_real_f32x4 = vdupq_n_f32(beta->real);
    float32x4_t beta_imag_f32x4 = vdupq_n_f32(beta->imag);
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        // De-interleave both inputs into real (val[0]) and imaginary (val[1]) planes.
        float32x4x2_t a_f32x4x2 = vld2q_f32((nk_f32_t const *)(a + i));
        float32x4x2_t b_f32x4x2 = vld2q_f32((nk_f32_t const *)(b + i));
        // alpha * a: real part = al.re * a.re - al.im * a.im
        float32x4_t ya_real_f32x4 = vmulq_f32(alpha_real_f32x4, a_f32x4x2.val[0]);
        ya_real_f32x4 = vfmsq_f32(ya_real_f32x4, alpha_imag_f32x4, a_f32x4x2.val[1]);
        // alpha * a: imag part = al.re * a.im + al.im * a.re
        float32x4_t ya_imag_f32x4 = vmulq_f32(alpha_real_f32x4, a_f32x4x2.val[1]);
        ya_imag_f32x4 = vfmaq_f32(ya_imag_f32x4, alpha_imag_f32x4, a_f32x4x2.val[0]);
        // Accumulate beta * b on top of alpha * a, same real/imag expansion.
        float32x4_t y_real_f32x4 = vfmaq_f32(ya_real_f32x4, beta_real_f32x4, b_f32x4x2.val[0]);
        y_real_f32x4 = vfmsq_f32(y_real_f32x4, beta_imag_f32x4, b_f32x4x2.val[1]);
        float32x4_t y_imag_f32x4 = vfmaq_f32(ya_imag_f32x4, beta_real_f32x4, b_f32x4x2.val[1]);
        y_imag_f32x4 = vfmaq_f32(y_imag_f32x4, beta_imag_f32x4, b_f32x4x2.val[0]);
        float32x4x2_t out = {y_real_f32x4, y_imag_f32x4};
        vst2q_f32((nk_f32_t *)(result + i), out); // re-interleave on store
    }
    // Scalar tail: expand both complex products and sum.
    for (; i < n; i++) {
        nk_f32_t a_real = a[i].real, a_imag = a[i].imag;
        nk_f32_t b_real = b[i].real, b_imag = b[i].imag;
        nk_f32_t ar = alpha->real * a_real - alpha->imag * a_imag;
        nk_f32_t ai = alpha->real * a_imag + alpha->imag * a_real;
        nk_f32_t br = beta->real * b_real - beta->imag * b_imag;
        nk_f32_t bi = beta->real * b_imag + beta->imag * b_real;
        result[i].real = ar + br;
        result[i].imag = ai + bi;
    }
}
|
|
976
|
+
|
|
977
|
+
// Complex weighted blend of two `f64c` vectors:
// result[i] = alpha * a[i] + beta * b[i], with complex alpha/beta;
// two complex numbers per iteration.
NK_PUBLIC void nk_each_blend_f64c_neon(nk_f64c_t const *a, nk_f64c_t const *b, nk_size_t n, nk_f64c_t const *alpha,
                                       nk_f64c_t const *beta, nk_f64c_t *result) {
    float64x2_t alpha_real_f64x2 = vdupq_n_f64(alpha->real);
    float64x2_t alpha_imag_f64x2 = vdupq_n_f64(alpha->imag);
    float64x2_t beta_real_f64x2 = vdupq_n_f64(beta->real);
    float64x2_t beta_imag_f64x2 = vdupq_n_f64(beta->imag);
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        // De-interleave both inputs into real (val[0]) and imaginary (val[1]) planes.
        float64x2x2_t a_f64x2x2 = vld2q_f64((nk_f64_t const *)(a + i));
        float64x2x2_t b_f64x2x2 = vld2q_f64((nk_f64_t const *)(b + i));
        // alpha * a: real part = al.re * a.re - al.im * a.im
        float64x2_t ya_real_f64x2 = vmulq_f64(alpha_real_f64x2, a_f64x2x2.val[0]);
        ya_real_f64x2 = vfmsq_f64(ya_real_f64x2, alpha_imag_f64x2, a_f64x2x2.val[1]);
        // alpha * a: imag part = al.re * a.im + al.im * a.re
        float64x2_t ya_imag_f64x2 = vmulq_f64(alpha_real_f64x2, a_f64x2x2.val[1]);
        ya_imag_f64x2 = vfmaq_f64(ya_imag_f64x2, alpha_imag_f64x2, a_f64x2x2.val[0]);
        // Accumulate beta * b on top of alpha * a, same real/imag expansion.
        float64x2_t y_real_f64x2 = vfmaq_f64(ya_real_f64x2, beta_real_f64x2, b_f64x2x2.val[0]);
        y_real_f64x2 = vfmsq_f64(y_real_f64x2, beta_imag_f64x2, b_f64x2x2.val[1]);
        float64x2_t y_imag_f64x2 = vfmaq_f64(ya_imag_f64x2, beta_real_f64x2, b_f64x2x2.val[1]);
        y_imag_f64x2 = vfmaq_f64(y_imag_f64x2, beta_imag_f64x2, b_f64x2x2.val[0]);
        float64x2x2_t out = {y_real_f64x2, y_imag_f64x2};
        vst2q_f64((nk_f64_t *)(result + i), out); // re-interleave on store
    }
    // Scalar tail: expand both complex products and sum.
    for (; i < n; i++) {
        nk_f64_t a_real = a[i].real, a_imag = a[i].imag;
        nk_f64_t b_real = b[i].real, b_imag = b[i].imag;
        nk_f64_t ar = alpha->real * a_real - alpha->imag * a_imag;
        nk_f64_t ai = alpha->real * a_imag + alpha->imag * a_real;
        nk_f64_t br = beta->real * b_real - beta->imag * b_imag;
        nk_f64_t bi = beta->real * b_imag + beta->imag * b_real;
        result[i].real = ar + br;
        result[i].imag = ai + bi;
    }
}
|
|
1009
|
+
|
|
1010
|
+
/*  Complex fused multiply-add: result[i] = alpha * (a[i] * b[i]) + beta * c[i],
 *  single precision. Processes four interleaved complex numbers per iteration;
 *  `vld2q_f32` de-interleaves real parts into .val[0] and imaginary parts into
 *  .val[1], and `vst2q_f32` re-interleaves on store. `vfmsq_f32(acc, x, y)`
 *  computes acc - x*y, which supplies the minus sign in the complex product.  */
NK_PUBLIC void nk_each_fma_f32c_neon(nk_f32c_t const *a, nk_f32c_t const *b, nk_f32c_t const *c, nk_size_t n,
                                     nk_f32c_t const *alpha, nk_f32c_t const *beta, nk_f32c_t *result) {
    // Broadcast scalar complex coefficients across all lanes once, up front.
    float32x4_t alpha_real_f32x4 = vdupq_n_f32(alpha->real);
    float32x4_t alpha_imag_f32x4 = vdupq_n_f32(alpha->imag);
    float32x4_t beta_real_f32x4 = vdupq_n_f32(beta->real);
    float32x4_t beta_imag_f32x4 = vdupq_n_f32(beta->imag);
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        float32x4x2_t a_f32x4x2 = vld2q_f32((nk_f32_t const *)(a + i));
        float32x4x2_t b_f32x4x2 = vld2q_f32((nk_f32_t const *)(b + i));
        float32x4x2_t c_f32x4x2 = vld2q_f32((nk_f32_t const *)(c + i));
        // ab = a * b: real = ar*br - ai*bi, imag = ar*bi + ai*br.
        float32x4_t ab_real_f32x4 = vmulq_f32(a_f32x4x2.val[0], b_f32x4x2.val[0]);
        ab_real_f32x4 = vfmsq_f32(ab_real_f32x4, a_f32x4x2.val[1], b_f32x4x2.val[1]);
        float32x4_t ab_imag_f32x4 = vmulq_f32(a_f32x4x2.val[0], b_f32x4x2.val[1]);
        ab_imag_f32x4 = vfmaq_f32(ab_imag_f32x4, a_f32x4x2.val[1], b_f32x4x2.val[0]);
        // y = alpha * ab (another complex multiply).
        float32x4_t y_real_f32x4 = vmulq_f32(alpha_real_f32x4, ab_real_f32x4);
        y_real_f32x4 = vfmsq_f32(y_real_f32x4, alpha_imag_f32x4, ab_imag_f32x4);
        float32x4_t y_imag_f32x4 = vmulq_f32(alpha_real_f32x4, ab_imag_f32x4);
        y_imag_f32x4 = vfmaq_f32(y_imag_f32x4, alpha_imag_f32x4, ab_real_f32x4);
        // y += beta * c, accumulated directly into the result lanes.
        y_real_f32x4 = vfmaq_f32(y_real_f32x4, beta_real_f32x4, c_f32x4x2.val[0]);
        y_real_f32x4 = vfmsq_f32(y_real_f32x4, beta_imag_f32x4, c_f32x4x2.val[1]);
        y_imag_f32x4 = vfmaq_f32(y_imag_f32x4, beta_real_f32x4, c_f32x4x2.val[1]);
        y_imag_f32x4 = vfmaq_f32(y_imag_f32x4, beta_imag_f32x4, c_f32x4x2.val[0]);
        float32x4x2_t out = {y_real_f32x4, y_imag_f32x4};
        vst2q_f32((nk_f32_t *)(result + i), out);
    }
    // Scalar tail (up to 3 remaining elements), same arithmetic as above.
    for (; i < n; i++) {
        nk_f32_t a_real = a[i].real, a_imag = a[i].imag;
        nk_f32_t b_real = b[i].real, b_imag = b[i].imag;
        nk_f32_t c_real = c[i].real, c_imag = c[i].imag;
        nk_f32_t ab_real = a_real * b_real - a_imag * b_imag;
        nk_f32_t ab_imag = a_real * b_imag + a_imag * b_real;
        nk_f32_t aab_real = alpha->real * ab_real - alpha->imag * ab_imag;
        nk_f32_t aab_imag = alpha->real * ab_imag + alpha->imag * ab_real;
        nk_f32_t bc_real = beta->real * c_real - beta->imag * c_imag;
        nk_f32_t bc_imag = beta->real * c_imag + beta->imag * c_real;
        result[i].real = aab_real + bc_real;
        result[i].imag = aab_imag + bc_imag;
    }
}
|
|
1050
|
+
|
|
1051
|
+
NK_PUBLIC void nk_each_fma_f64c_neon(nk_f64c_t const *a, nk_f64c_t const *b, nk_f64c_t const *c, nk_size_t n,
                                     nk_f64c_t const *alpha, nk_f64c_t const *beta, nk_f64c_t *result) {
    // Complex fused multiply-add: result[i] = alpha * (a[i] * b[i]) + beta * c[i],
    // double precision, two interleaved complex elements per vector iteration.
    float64x2_t const alpha_re = vdupq_n_f64(alpha->real);
    float64x2_t const alpha_im = vdupq_n_f64(alpha->imag);
    float64x2_t const beta_re = vdupq_n_f64(beta->real);
    float64x2_t const beta_im = vdupq_n_f64(beta->imag);
    nk_size_t idx = 0;
    while (idx + 2 <= n) {
        // vld2q de-interleaves: val[0] holds reals, val[1] holds imaginaries.
        float64x2x2_t av = vld2q_f64((nk_f64_t const *)(a + idx));
        float64x2x2_t bv = vld2q_f64((nk_f64_t const *)(b + idx));
        float64x2x2_t cv = vld2q_f64((nk_f64_t const *)(c + idx));
        // Product ab = a * b (vfmsq supplies the minus sign: acc - x*y).
        float64x2_t prod_re = vmulq_f64(av.val[0], bv.val[0]);
        prod_re = vfmsq_f64(prod_re, av.val[1], bv.val[1]);
        float64x2_t prod_im = vmulq_f64(av.val[0], bv.val[1]);
        prod_im = vfmaq_f64(prod_im, av.val[1], bv.val[0]);
        // Scale by alpha: another complex multiply.
        float64x2_t out_re = vmulq_f64(alpha_re, prod_re);
        out_re = vfmsq_f64(out_re, alpha_im, prod_im);
        float64x2_t out_im = vmulq_f64(alpha_re, prod_im);
        out_im = vfmaq_f64(out_im, alpha_im, prod_re);
        // Accumulate beta * c.
        out_re = vfmaq_f64(out_re, beta_re, cv.val[0]);
        out_re = vfmsq_f64(out_re, beta_im, cv.val[1]);
        out_im = vfmaq_f64(out_im, beta_re, cv.val[1]);
        out_im = vfmaq_f64(out_im, beta_im, cv.val[0]);
        float64x2x2_t packed = {out_re, out_im};
        vst2q_f64((nk_f64_t *)(result + idx), packed);
        idx += 2;
    }
    // Scalar tail for the (at most one) remaining element.
    for (; idx < n; ++idx) {
        nk_f64_t xr = a[idx].real, xi = a[idx].imag;
        nk_f64_t yr = b[idx].real, yi = b[idx].imag;
        nk_f64_t zr = c[idx].real, zi = c[idx].imag;
        nk_f64_t pr = xr * yr - xi * yi;
        nk_f64_t pi = xr * yi + xi * yr;
        nk_f64_t sr = alpha->real * pr - alpha->imag * pi;
        nk_f64_t si = alpha->real * pi + alpha->imag * pr;
        nk_f64_t tr = beta->real * zr - beta->imag * zi;
        nk_f64_t ti = beta->real * zi + beta->imag * zr;
        result[idx].real = sr + tr;
        result[idx].imag = si + ti;
    }
}
|
|
1091
|
+
|
|
1092
|
+
#if defined(__clang__)
|
|
1093
|
+
#pragma clang attribute pop
|
|
1094
|
+
#elif defined(__GNUC__)
|
|
1095
|
+
#pragma GCC pop_options
|
|
1096
|
+
#endif
|
|
1097
|
+
|
|
1098
|
+
#if defined(__cplusplus)
|
|
1099
|
+
} // extern "C"
|
|
1100
|
+
#endif
|
|
1101
|
+
|
|
1102
|
+
#endif // NK_TARGET_NEON
|
|
1103
|
+
#endif // NK_TARGET_ARM_
|
|
1104
|
+
#endif // NK_EACH_NEON_H
|