numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,1658 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Elementwise Arithmetic for Haswell.
|
|
3
|
+
* @file include/numkong/each/haswell.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date December 27, 2025
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/each.h
|
|
8
|
+
*
|
|
9
|
+
* @section haswell_elementwise_instructions Key AVX2 Elementwise Instructions
|
|
10
|
+
*
|
|
11
|
+
* Intrinsic Instruction Latency Throughput Ports
|
|
12
|
+
* _mm256_fmadd_ps VFMADD (YMM, YMM, YMM) 5cy 0.5/cy p01
|
|
13
|
+
* _mm256_add_ps VADDPS (YMM, YMM, YMM) 3cy 1/cy p01
|
|
14
|
+
* _mm256_mul_ps VMULPS (YMM, YMM, YMM) 5cy 0.5/cy p01
|
|
15
|
+
* _mm256_cvtepi32_ps VCVTDQ2PS (YMM, YMM) 4cy 1/cy p01
|
|
16
|
+
* _mm256_cvtepi8_epi32 VPMOVSXBD (YMM, XMM) 3cy 1/cy p5
|
|
17
|
+
*
|
|
18
|
+
* Elementwise operations (sum, scale, blend, fma) are compute-bound on FMA throughput. For mixed-
|
|
19
|
+
* precision operations, type conversion chains (e.g., i8->i32->f32) add ~7-10 cycles overhead.
|
|
20
|
+
* The FMA unit handles both multiply-add fusion and standalone multiply/add operations.
|
|
21
|
+
*/
|
|
22
|
+
#ifndef NK_EACH_HASWELL_H
|
|
23
|
+
#define NK_EACH_HASWELL_H
|
|
24
|
+
|
|
25
|
+
#if NK_TARGET_X86_
|
|
26
|
+
#if NK_TARGET_HASWELL
|
|
27
|
+
|
|
28
|
+
#include "numkong/types.h"
|
|
29
|
+
#include "numkong/cast/serial.h" // `nk_f32_to_i8_serial`
|
|
30
|
+
#include "numkong/reduce/haswell.h" // `nk_e4m3x8_to_f32x8_haswell_`
|
|
31
|
+
|
|
32
|
+
#if defined(__cplusplus)
|
|
33
|
+
extern "C" {
|
|
34
|
+
#endif
|
|
35
|
+
|
|
36
|
+
#if defined(__clang__)
|
|
37
|
+
#pragma clang attribute push(__attribute__((target("avx2,f16c,fma,bmi,bmi2"))), apply_to = function)
|
|
38
|
+
#elif defined(__GNUC__)
|
|
39
|
+
#pragma GCC push_options
|
|
40
|
+
#pragma GCC target("avx2", "f16c", "fma", "bmi", "bmi2")
|
|
41
|
+
#endif
|
|
42
|
+
|
|
43
|
+
/**
 *  @brief Elementwise sum of two f32 vectors: `result[i] = a[i] + b[i]`.
 *  Processes eight single-precision lanes per AVX iteration, then a scalar tail.
 */
NK_PUBLIC void nk_each_sum_f32_haswell(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *result) {
    nk_size_t idx = 0;
    // Vectorized body: 8 lanes at a time with unaligned loads/stores.
    while (idx + 8 <= n) {
        __m256 lhs_f32x8 = _mm256_loadu_ps(a + idx);
        __m256 rhs_f32x8 = _mm256_loadu_ps(b + idx);
        _mm256_storeu_ps(result + idx, _mm256_add_ps(lhs_f32x8, rhs_f32x8));
        idx += 8;
    }
    // Scalar remainder for the last (n % 8) entries.
    while (idx < n) {
        result[idx] = a[idx] + b[idx];
        ++idx;
    }
}
|
|
56
|
+
|
|
57
|
+
/**
 *  @brief Affine transform of an f32 vector: `result[i] = alpha * a[i] + beta`.
 *  The broadcast multiplier and offset feed a single FMA per 8-lane step.
 */
NK_PUBLIC void nk_each_scale_f32_haswell(nk_f32_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                         nk_f32_t *result) {
    nk_f32_t scale = *alpha;
    nk_f32_t shift = *beta;
    __m256 scale_f32x8 = _mm256_set1_ps(scale);
    __m256 shift_f32x8 = _mm256_set1_ps(shift);

    nk_size_t idx = 0;
    // Vectorized body: fused multiply-add over 8 lanes.
    while (idx + 8 <= n) {
        __m256 input_f32x8 = _mm256_loadu_ps(a + idx);
        _mm256_storeu_ps(result + idx, _mm256_fmadd_ps(input_f32x8, scale_f32x8, shift_f32x8));
        idx += 8;
    }
    // Scalar remainder.
    while (idx < n) {
        result[idx] = scale * a[idx] + shift;
        ++idx;
    }
}
|
|
75
|
+
|
|
76
|
+
/**
 *  @brief Weighted blend of two f32 vectors: `result[i] = alpha * a[i] + beta * b[i]`.
 *  Dispatches to cheaper kernels when the weights degenerate to 1s or a 0.
 */
NK_PUBLIC void nk_each_blend_f32_haswell( //
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
    nk_f32_t const *alpha, nk_f32_t const *beta, nk_f32_t *result) {
    nk_f32_t weight_a = *alpha;
    nk_f32_t weight_b = *beta;

    // Fast path 1: unit weights reduce to a plain sum, skipping multiplications.
    if (weight_a == 1 && weight_b == 1) {
        nk_each_sum_f32_haswell(a, b, n, result);
        return;
    }
    // Fast path 2: a zero weight reduces to a pure scale, halving the loads.
    if (weight_a == 0 || weight_b == 0) {
        nk_f32_t no_shift = 0;
        if (weight_b == 0) { nk_each_scale_f32_haswell(a, n, alpha, &no_shift, result); }
        else { nk_each_scale_f32_haswell(b, n, beta, &no_shift, result); }
        return;
    }

    // General case: multiply `a` by alpha, then fuse `b * beta` into the sum.
    __m256 weight_a_f32x8 = _mm256_set1_ps(weight_a);
    __m256 weight_b_f32x8 = _mm256_set1_ps(weight_b);

    nk_size_t idx = 0;
    while (idx + 8 <= n) {
        __m256 lhs_f32x8 = _mm256_loadu_ps(a + idx);
        __m256 rhs_f32x8 = _mm256_loadu_ps(b + idx);
        __m256 lhs_weighted_f32x8 = _mm256_mul_ps(lhs_f32x8, weight_a_f32x8);
        __m256 blended_f32x8 = _mm256_fmadd_ps(rhs_f32x8, weight_b_f32x8, lhs_weighted_f32x8);
        _mm256_storeu_ps(result + idx, blended_f32x8);
        idx += 8;
    }
    // Scalar remainder.
    while (idx < n) {
        result[idx] = weight_a * a[idx] + weight_b * b[idx];
        ++idx;
    }
}
|
|
115
|
+
|
|
116
|
+
/**
 *  @brief Elementwise sum of two f64 vectors: `result[i] = a[i] + b[i]`.
 *  Four double-precision lanes per AVX iteration, then a scalar tail.
 */
NK_PUBLIC void nk_each_sum_f64_haswell(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
    nk_size_t idx = 0;
    // Vectorized body: 4 lanes at a time.
    while (idx + 4 <= n) {
        __m256d lhs_f64x4 = _mm256_loadu_pd(a + idx);
        __m256d rhs_f64x4 = _mm256_loadu_pd(b + idx);
        _mm256_storeu_pd(result + idx, _mm256_add_pd(lhs_f64x4, rhs_f64x4));
        idx += 4;
    }
    // Scalar remainder for the last (n % 4) entries.
    while (idx < n) {
        result[idx] = a[idx] + b[idx];
        ++idx;
    }
}
|
|
129
|
+
|
|
130
|
+
/**
 *  @brief Affine transform of an f64 vector: `result[i] = alpha * a[i] + beta`.
 *  Broadcast multiplier and offset feed a single FMA per 4-lane step.
 */
NK_PUBLIC void nk_each_scale_f64_haswell(nk_f64_t const *a, nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta,
                                         nk_f64_t *result) {
    nk_f64_t scale = *alpha;
    nk_f64_t shift = *beta;
    __m256d scale_f64x4 = _mm256_set1_pd(scale);
    __m256d shift_f64x4 = _mm256_set1_pd(shift);

    nk_size_t idx = 0;
    // Vectorized body: fused multiply-add over 4 lanes.
    while (idx + 4 <= n) {
        __m256d input_f64x4 = _mm256_loadu_pd(a + idx);
        _mm256_storeu_pd(result + idx, _mm256_fmadd_pd(input_f64x4, scale_f64x4, shift_f64x4));
        idx += 4;
    }
    // Scalar remainder.
    while (idx < n) {
        result[idx] = scale * a[idx] + shift;
        ++idx;
    }
}
|
|
148
|
+
|
|
149
|
+
/**
 *  @brief Weighted blend of two f64 vectors: `result[i] = alpha * a[i] + beta * b[i]`.
 *  Dispatches to cheaper kernels when the weights degenerate to 1s or a 0.
 */
NK_PUBLIC void nk_each_blend_f64_haswell( //
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, //
    nk_f64_t const *alpha, nk_f64_t const *beta, nk_f64_t *result) {
    nk_f64_t weight_a = *alpha;
    nk_f64_t weight_b = *beta;

    // Fast path 1: unit weights reduce to a plain sum, skipping multiplications.
    if (weight_a == 1 && weight_b == 1) {
        nk_each_sum_f64_haswell(a, b, n, result);
        return;
    }
    // Fast path 2: a zero weight reduces to a pure scale, halving the loads.
    if (weight_a == 0 || weight_b == 0) {
        nk_f64_t no_shift = 0;
        if (weight_b == 0) { nk_each_scale_f64_haswell(a, n, alpha, &no_shift, result); }
        else { nk_each_scale_f64_haswell(b, n, beta, &no_shift, result); }
        return;
    }

    // General case: multiply `a` by alpha, then fuse `b * beta` into the sum.
    __m256d weight_a_f64x4 = _mm256_set1_pd(weight_a);
    __m256d weight_b_f64x4 = _mm256_set1_pd(weight_b);

    nk_size_t idx = 0;
    while (idx + 4 <= n) {
        __m256d lhs_f64x4 = _mm256_loadu_pd(a + idx);
        __m256d rhs_f64x4 = _mm256_loadu_pd(b + idx);
        __m256d lhs_weighted_f64x4 = _mm256_mul_pd(lhs_f64x4, weight_a_f64x4);
        __m256d blended_f64x4 = _mm256_fmadd_pd(rhs_f64x4, weight_b_f64x4, lhs_weighted_f64x4);
        _mm256_storeu_pd(result + idx, blended_f64x4);
        idx += 4;
    }
    // Scalar remainder.
    while (idx < n) {
        result[idx] = weight_a * a[idx] + weight_b * b[idx];
        ++idx;
    }
}
|
|
188
|
+
|
|
189
|
+
/**
 *  @brief Elementwise sum of two f16 vectors, accumulated in f32.
 *  Widens eight halves via F16C (`VCVTPH2PS`), adds, and narrows back with
 *  round-to-nearest; the tail uses the scalar f16<->f32 helpers.
 */
NK_PUBLIC void nk_each_sum_f16_haswell(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f16_t *result) {
    nk_size_t idx = 0;
    // Vectorized body: widen to f32, add, narrow back to f16.
    while (idx + 8 <= n) {
        __m128i lhs_f16x8 = _mm_loadu_si128((__m128i const *)(a + idx));
        __m128i rhs_f16x8 = _mm_loadu_si128((__m128i const *)(b + idx));
        __m256 sum_f32x8 = _mm256_add_ps(_mm256_cvtph_ps(lhs_f16x8), _mm256_cvtph_ps(rhs_f16x8));
        __m128i sum_f16x8 = _mm256_cvtps_ph(sum_f32x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        _mm_storeu_si128((__m128i *)(result + idx), sum_f16x8);
        idx += 8;
    }
    // Scalar remainder: convert each lane through f32.
    while (idx < n) {
        nk_f32_t lhs, rhs;
        nk_f16_to_f32_haswell(a + idx, &lhs);
        nk_f16_to_f32_haswell(b + idx, &rhs);
        nk_f32_t sum = lhs + rhs;
        nk_f32_to_f16_haswell(&sum, result + idx);
        ++idx;
    }
}
|
|
212
|
+
|
|
213
|
+
/**
 *  @brief Affine transform of an f16 vector with f32 weights:
 *  `result[i] = alpha * a[i] + beta`, computed in f32 and narrowed back.
 */
NK_PUBLIC void nk_each_scale_f16_haswell(nk_f16_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                         nk_f16_t *result) {
    nk_f32_t scale = *alpha;
    nk_f32_t shift = *beta;
    __m256 scale_f32x8 = _mm256_set1_ps(scale);
    __m256 shift_f32x8 = _mm256_set1_ps(shift);

    nk_size_t idx = 0;
    // Vectorized body: widen, fused multiply-add, narrow with round-to-nearest.
    while (idx + 8 <= n) {
        __m128i input_f16x8 = _mm_loadu_si128((__m128i const *)(a + idx));
        __m256 scaled_f32x8 = _mm256_fmadd_ps(_mm256_cvtph_ps(input_f16x8), scale_f32x8, shift_f32x8);
        __m128i scaled_f16x8 = _mm256_cvtps_ph(scaled_f32x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        _mm_storeu_si128((__m128i *)(result + idx), scaled_f16x8);
        idx += 8;
    }
    // Scalar remainder: convert each lane through f32.
    while (idx < n) {
        nk_f32_t input;
        nk_f16_to_f32_haswell(a + idx, &input);
        nk_f32_t scaled = scale * input + shift;
        nk_f32_to_f16_haswell(&scaled, result + idx);
        ++idx;
    }
}
|
|
238
|
+
|
|
239
|
+
/**
 *  @brief Weighted blend of two f16 vectors with f32 weights:
 *  `result[i] = alpha * a[i] + beta * b[i]`, computed in f32.
 *  Dispatches to cheaper kernels when the weights degenerate to 1s or a 0.
 */
NK_PUBLIC void nk_each_blend_f16_haswell( //
    nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, //
    nk_f32_t const *alpha, nk_f32_t const *beta, nk_f16_t *result) {
    nk_f32_t weight_a = *alpha;
    nk_f32_t weight_b = *beta;

    // Fast path 1: unit weights reduce to a plain sum, skipping multiplications.
    if (weight_a == 1 && weight_b == 1) {
        nk_each_sum_f16_haswell(a, b, n, result);
        return;
    }
    // Fast path 2: a zero weight reduces to a pure scale, halving the loads.
    if (weight_a == 0 || weight_b == 0) {
        nk_f32_t no_shift = 0;
        if (weight_b == 0) { nk_each_scale_f16_haswell(a, n, alpha, &no_shift, result); }
        else { nk_each_scale_f16_haswell(b, n, beta, &no_shift, result); }
        return;
    }

    // General case: widen both sides, weigh `a`, fuse `b * beta` in, narrow back.
    __m256 weight_a_f32x8 = _mm256_set1_ps(weight_a);
    __m256 weight_b_f32x8 = _mm256_set1_ps(weight_b);

    nk_size_t idx = 0;
    while (idx + 8 <= n) {
        __m128i lhs_f16x8 = _mm_loadu_si128((__m128i const *)(a + idx));
        __m128i rhs_f16x8 = _mm_loadu_si128((__m128i const *)(b + idx));
        __m256 lhs_f32x8 = _mm256_cvtph_ps(lhs_f16x8);
        __m256 rhs_f32x8 = _mm256_cvtph_ps(rhs_f16x8);
        __m256 lhs_weighted_f32x8 = _mm256_mul_ps(lhs_f32x8, weight_a_f32x8);
        __m256 blended_f32x8 = _mm256_fmadd_ps(rhs_f32x8, weight_b_f32x8, lhs_weighted_f32x8);
        __m128i blended_f16x8 = _mm256_cvtps_ph(blended_f32x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        _mm_storeu_si128((__m128i *)(result + idx), blended_f16x8);
        idx += 8;
    }
    // Scalar remainder: convert each lane through f32.
    while (idx < n) {
        nk_f32_t lhs, rhs;
        nk_f16_to_f32_haswell(a + idx, &lhs);
        nk_f16_to_f32_haswell(b + idx, &rhs);
        nk_f32_t blended = weight_a * lhs + weight_b * rhs;
        nk_f32_to_f16_haswell(&blended, result + idx);
        ++idx;
    }
}
|
|
287
|
+
|
|
288
|
+
/**
 *  @brief Elementwise sum of two bf16 vectors, accumulated in f32.
 *  Widens eight bf16 lanes via the project's helper, adds, and narrows back;
 *  the tail goes through the serial bf16<->f32 converters.
 */
NK_PUBLIC void nk_each_sum_bf16_haswell(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_bf16_t *result) {
    nk_size_t idx = 0;
    // Vectorized body: widen to f32, add, narrow back to bf16.
    while (idx + 8 <= n) {
        __m128i lhs_bf16x8 = _mm_loadu_si128((__m128i const *)(a + idx));
        __m128i rhs_bf16x8 = _mm_loadu_si128((__m128i const *)(b + idx));
        __m256 sum_f32x8 = _mm256_add_ps(nk_bf16x8_to_f32x8_haswell_(lhs_bf16x8), nk_bf16x8_to_f32x8_haswell_(rhs_bf16x8));
        _mm_storeu_si128((__m128i *)(result + idx), nk_f32x8_to_bf16x8_haswell_(sum_f32x8));
        idx += 8;
    }
    // Scalar remainder: convert each lane through f32.
    while (idx < n) {
        nk_f32_t lhs, rhs;
        nk_bf16_to_f32_serial(a + idx, &lhs);
        nk_bf16_to_f32_serial(b + idx, &rhs);
        nk_f32_t sum = lhs + rhs;
        nk_f32_to_bf16_serial(&sum, result + idx);
        ++idx;
    }
}
|
|
310
|
+
|
|
311
|
+
/**
 *  @brief Affine transform of a bf16 vector with f32 weights:
 *  `result[i] = alpha * a[i] + beta`, computed in f32 and narrowed back.
 */
NK_PUBLIC void nk_each_scale_bf16_haswell(nk_bf16_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                          nk_bf16_t *result) {
    nk_f32_t scale = *alpha;
    nk_f32_t shift = *beta;
    __m256 scale_f32x8 = _mm256_set1_ps(scale);
    __m256 shift_f32x8 = _mm256_set1_ps(shift);

    nk_size_t idx = 0;
    // Vectorized body: widen, fused multiply-add, narrow back to bf16.
    while (idx + 8 <= n) {
        __m128i input_bf16x8 = _mm_loadu_si128((__m128i const *)(a + idx));
        __m256 scaled_f32x8 = _mm256_fmadd_ps(nk_bf16x8_to_f32x8_haswell_(input_bf16x8), scale_f32x8, shift_f32x8);
        _mm_storeu_si128((__m128i *)(result + idx), nk_f32x8_to_bf16x8_haswell_(scaled_f32x8));
        idx += 8;
    }
    // Scalar remainder: convert each lane through f32.
    while (idx < n) {
        nk_f32_t input;
        nk_bf16_to_f32_serial(a + idx, &input);
        nk_f32_t scaled = scale * input + shift;
        nk_f32_to_bf16_serial(&scaled, result + idx);
        ++idx;
    }
}
|
|
336
|
+
|
|
337
|
+
/**
 *  @brief Weighted blend of two bf16 vectors with f32 weights:
 *  `result[i] = alpha * a[i] + beta * b[i]`, computed in f32.
 *  Dispatches to cheaper kernels when the weights degenerate to 1s or a 0.
 */
NK_PUBLIC void nk_each_blend_bf16_haswell( //
    nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, //
    nk_f32_t const *alpha, nk_f32_t const *beta, nk_bf16_t *result) {
    nk_f32_t weight_a = *alpha;
    nk_f32_t weight_b = *beta;

    // Fast path 1: unit weights reduce to a plain sum, skipping multiplications.
    if (weight_a == 1 && weight_b == 1) {
        nk_each_sum_bf16_haswell(a, b, n, result);
        return;
    }
    // Fast path 2: a zero weight reduces to a pure scale, halving the loads.
    if (weight_a == 0 || weight_b == 0) {
        nk_f32_t no_shift = 0;
        if (weight_b == 0) { nk_each_scale_bf16_haswell(a, n, alpha, &no_shift, result); }
        else { nk_each_scale_bf16_haswell(b, n, beta, &no_shift, result); }
        return;
    }

    // General case: widen both sides, weigh `a`, fuse `b * beta` in, narrow back.
    __m256 weight_a_f32x8 = _mm256_set1_ps(weight_a);
    __m256 weight_b_f32x8 = _mm256_set1_ps(weight_b);

    nk_size_t idx = 0;
    while (idx + 8 <= n) {
        __m128i lhs_bf16x8 = _mm_loadu_si128((__m128i const *)(a + idx));
        __m128i rhs_bf16x8 = _mm_loadu_si128((__m128i const *)(b + idx));
        __m256 lhs_f32x8 = nk_bf16x8_to_f32x8_haswell_(lhs_bf16x8);
        __m256 rhs_f32x8 = nk_bf16x8_to_f32x8_haswell_(rhs_bf16x8);
        __m256 lhs_weighted_f32x8 = _mm256_mul_ps(lhs_f32x8, weight_a_f32x8);
        __m256 blended_f32x8 = _mm256_fmadd_ps(rhs_f32x8, weight_b_f32x8, lhs_weighted_f32x8);
        _mm_storeu_si128((__m128i *)(result + idx), nk_f32x8_to_bf16x8_haswell_(blended_f32x8));
        idx += 8;
    }
    // Scalar remainder: convert each lane through f32.
    while (idx < n) {
        nk_f32_t lhs, rhs;
        nk_bf16_to_f32_serial(a + idx, &lhs);
        nk_bf16_to_f32_serial(b + idx, &rhs);
        nk_f32_t blended = weight_a * lhs + weight_b * rhs;
        nk_f32_to_bf16_serial(&blended, result + idx);
        ++idx;
    }
}
|
|
385
|
+
|
|
386
|
+
NK_PUBLIC void nk_each_fma_f32_haswell( //
|
|
387
|
+
nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, //
|
|
388
|
+
nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta, nk_f32_t *result) {
|
|
389
|
+
nk_f32_t alpha_val = *alpha;
|
|
390
|
+
nk_f32_t beta_val = *beta;
|
|
391
|
+
__m256 alpha_f32x8 = _mm256_set1_ps(alpha_val);
|
|
392
|
+
__m256 beta_f32x8 = _mm256_set1_ps(beta_val);
|
|
393
|
+
|
|
394
|
+
// The main loop:
|
|
395
|
+
nk_size_t i = 0;
|
|
396
|
+
for (; i + 8 <= n; i += 8) {
|
|
397
|
+
__m256 a_f32x8 = _mm256_loadu_ps(a + i);
|
|
398
|
+
__m256 b_f32x8 = _mm256_loadu_ps(b + i);
|
|
399
|
+
__m256 c_f32x8 = _mm256_loadu_ps(c + i);
|
|
400
|
+
__m256 ab_f32x8 = _mm256_mul_ps(a_f32x8, b_f32x8);
|
|
401
|
+
__m256 ab_scaled_f32x8 = _mm256_mul_ps(ab_f32x8, alpha_f32x8);
|
|
402
|
+
__m256 result_f32x8 = _mm256_fmadd_ps(c_f32x8, beta_f32x8, ab_scaled_f32x8);
|
|
403
|
+
_mm256_storeu_ps(result + i, result_f32x8);
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
// The tail:
|
|
407
|
+
for (; i < n; ++i) result[i] = alpha_val * a[i] * b[i] + beta_val * c[i];
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
NK_PUBLIC void nk_each_fma_f64_haswell( //
|
|
411
|
+
nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, //
|
|
412
|
+
nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta, nk_f64_t *result) {
|
|
413
|
+
nk_f64_t alpha_val = *alpha;
|
|
414
|
+
nk_f64_t beta_val = *beta;
|
|
415
|
+
__m256d alpha_f64x4 = _mm256_set1_pd(alpha_val);
|
|
416
|
+
__m256d beta_f64x4 = _mm256_set1_pd(beta_val);
|
|
417
|
+
|
|
418
|
+
// The main loop:
|
|
419
|
+
nk_size_t i = 0;
|
|
420
|
+
for (; i + 4 <= n; i += 4) {
|
|
421
|
+
__m256d a_f64x4 = _mm256_loadu_pd(a + i);
|
|
422
|
+
__m256d b_f64x4 = _mm256_loadu_pd(b + i);
|
|
423
|
+
__m256d c_f64x4 = _mm256_loadu_pd(c + i);
|
|
424
|
+
__m256d ab_f64x4 = _mm256_mul_pd(a_f64x4, b_f64x4);
|
|
425
|
+
__m256d abc_f64x4 = _mm256_mul_pd(ab_f64x4, alpha_f64x4);
|
|
426
|
+
__m256d result_f64x4 = _mm256_fmadd_pd(c_f64x4, beta_f64x4, abc_f64x4);
|
|
427
|
+
_mm256_storeu_pd(result + i, result_f64x4);
|
|
428
|
+
}
|
|
429
|
+
|
|
430
|
+
// The tail:
|
|
431
|
+
for (; i < n; ++i) result[i] = alpha_val * a[i] * b[i] + beta_val * c[i];
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
NK_PUBLIC void nk_each_fma_f16_haswell( //
|
|
435
|
+
nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, //
|
|
436
|
+
nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta, nk_f16_t *result) {
|
|
437
|
+
nk_f32_t alpha_val = *alpha;
|
|
438
|
+
nk_f32_t beta_val = *beta;
|
|
439
|
+
__m256 alpha_f32x8 = _mm256_set1_ps(alpha_val);
|
|
440
|
+
__m256 beta_f32x8 = _mm256_set1_ps(beta_val);
|
|
441
|
+
|
|
442
|
+
// The main loop:
|
|
443
|
+
nk_size_t i = 0;
|
|
444
|
+
for (; i + 8 <= n; i += 8) {
|
|
445
|
+
__m128i a_f16x8 = _mm_loadu_si128((__m128i const *)(a + i));
|
|
446
|
+
__m128i b_f16x8 = _mm_loadu_si128((__m128i const *)(b + i));
|
|
447
|
+
__m128i c_f16x8 = _mm_loadu_si128((__m128i const *)(c + i));
|
|
448
|
+
__m256 a_f32x8 = _mm256_cvtph_ps(a_f16x8);
|
|
449
|
+
__m256 b_f32x8 = _mm256_cvtph_ps(b_f16x8);
|
|
450
|
+
__m256 c_f32x8 = _mm256_cvtph_ps(c_f16x8);
|
|
451
|
+
__m256 ab_f32x8 = _mm256_mul_ps(a_f32x8, b_f32x8);
|
|
452
|
+
__m256 abc_f32x8 = _mm256_mul_ps(ab_f32x8, alpha_f32x8);
|
|
453
|
+
__m256 result_f32x8 = _mm256_fmadd_ps(c_f32x8, beta_f32x8, abc_f32x8);
|
|
454
|
+
__m128i result_f16x8 = _mm256_cvtps_ph(result_f32x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
|
455
|
+
_mm_storeu_si128((__m128i *)(result + i), result_f16x8);
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
// The tail:
|
|
459
|
+
for (; i < n; ++i) {
|
|
460
|
+
nk_f32_t ai, bi, ci;
|
|
461
|
+
nk_f16_to_f32_haswell(a + i, &ai);
|
|
462
|
+
nk_f16_to_f32_haswell(b + i, &bi);
|
|
463
|
+
nk_f16_to_f32_haswell(c + i, &ci);
|
|
464
|
+
nk_f32_t sum = alpha_val * ai * bi + beta_val * ci;
|
|
465
|
+
nk_f32_to_f16_haswell(&sum, result + i);
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
|
|
469
|
+
NK_PUBLIC void nk_each_fma_bf16_haswell( //
|
|
470
|
+
nk_bf16_t const *a, nk_bf16_t const *b, nk_bf16_t const *c, //
|
|
471
|
+
nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta, nk_bf16_t *result) {
|
|
472
|
+
nk_f32_t alpha_val = *alpha;
|
|
473
|
+
nk_f32_t beta_val = *beta;
|
|
474
|
+
__m256 alpha_f32x8 = _mm256_set1_ps(alpha_val);
|
|
475
|
+
__m256 beta_f32x8 = _mm256_set1_ps(beta_val);
|
|
476
|
+
|
|
477
|
+
// The main loop:
|
|
478
|
+
nk_size_t i = 0;
|
|
479
|
+
for (; i + 8 <= n; i += 8) {
|
|
480
|
+
__m128i a_bf16x8 = _mm_loadu_si128((__m128i const *)(a + i));
|
|
481
|
+
__m128i b_bf16x8 = _mm_loadu_si128((__m128i const *)(b + i));
|
|
482
|
+
__m128i c_bf16x8 = _mm_loadu_si128((__m128i const *)(c + i));
|
|
483
|
+
__m256 a_f32x8 = nk_bf16x8_to_f32x8_haswell_(a_bf16x8);
|
|
484
|
+
__m256 b_f32x8 = nk_bf16x8_to_f32x8_haswell_(b_bf16x8);
|
|
485
|
+
__m256 c_f32x8 = nk_bf16x8_to_f32x8_haswell_(c_bf16x8);
|
|
486
|
+
__m256 ab_f32x8 = _mm256_mul_ps(a_f32x8, b_f32x8);
|
|
487
|
+
__m256 abc_f32x8 = _mm256_mul_ps(ab_f32x8, alpha_f32x8);
|
|
488
|
+
__m256 result_f32x8 = _mm256_fmadd_ps(c_f32x8, beta_f32x8, abc_f32x8);
|
|
489
|
+
__m128i result_bf16x8 = nk_f32x8_to_bf16x8_haswell_(result_f32x8);
|
|
490
|
+
_mm_storeu_si128((__m128i *)(result + i), result_bf16x8);
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
// The tail:
|
|
494
|
+
for (; i < n; ++i) {
|
|
495
|
+
nk_f32_t ai, bi, ci;
|
|
496
|
+
nk_bf16_to_f32_serial(a + i, &ai);
|
|
497
|
+
nk_bf16_to_f32_serial(b + i, &bi);
|
|
498
|
+
nk_bf16_to_f32_serial(c + i, &ci);
|
|
499
|
+
nk_f32_t sum = alpha_val * ai * bi + beta_val * ci;
|
|
500
|
+
nk_f32_to_bf16_serial(&sum, result + i);
|
|
501
|
+
}
|
|
502
|
+
}
|
|
503
|
+
|
|
504
|
+
NK_PUBLIC void nk_each_sum_i8_haswell(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i8_t *result) {
    // Saturating element-wise sum of two i8 arrays: result[i] = saturate_i8(a[i] + b[i]).
    nk_size_t idx = 0;

    // Vectorized part: 32 lanes per iteration with saturating AVX2 adds.
    while (idx + 32 <= n) {
        __m256i lhs_i8x32 = _mm256_loadu_si256((__m256i const *)(a + idx));
        __m256i rhs_i8x32 = _mm256_loadu_si256((__m256i const *)(b + idx));
        _mm256_storeu_si256((__m256i *)(result + idx), _mm256_adds_epi8(lhs_i8x32, rhs_i8x32));
        idx += 32;
    }

    // Scalar remainder: the f32 conversion helper performs the same saturation.
    for (; idx < n; ++idx) {
        nk_f32_t lhs = a[idx], rhs = b[idx];
        nk_f32_t total = lhs + rhs;
        nk_f32_to_i8_serial(&total, result + idx);
    }
}
|
|
521
|
+
|
|
522
|
+
NK_PUBLIC void nk_each_scale_i8_haswell(nk_i8_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                        nk_i8_t *result) {
    // Affine scaling of an i8 array: result[i] = saturate_i8(alpha * a[i] + beta).
    nk_f32_t alpha_val = *alpha;
    nk_f32_t beta_val = *beta;
    __m256 alpha_f32x8 = _mm256_set1_ps(alpha_val);
    __m256 beta_f32x8 = _mm256_set1_ps(beta_val);
    int sum_i32s[8];

    // The main loop:
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        // Sign-extend 8x i8 to i32 entirely in SIMD: one 64-bit load plus `_mm256_cvtepi8_epi32`,
        // instead of bouncing every element through a scalar i32 buffer.
        __m256 a_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i const *)(a + i))));
        // The normal part.
        __m256 result_f32x8 = _mm256_fmadd_ps(a_f32x8, alpha_f32x8, beta_f32x8);
        // Instead of serial calls to expensive `nk_f32_to_i8_serial`, convert and clip with SIMD.
        __m256i result_i32x8 = _mm256_cvtps_epi32(result_f32x8);
        result_i32x8 = _mm256_max_epi32(result_i32x8, _mm256_set1_epi32(-128));
        result_i32x8 = _mm256_min_epi32(result_i32x8, _mm256_set1_epi32(127));
        // Export into a serial buffer; every lane already fits into i8 after clipping.
        _mm256_storeu_si256((__m256i *)sum_i32s, result_i32x8);
        for (int lane = 0; lane < 8; ++lane) result[i + lane] = (nk_i8_t)sum_i32s[lane];
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f32_t ai = a[i];
        nk_f32_t sum = alpha_val * ai + beta_val;
        nk_f32_to_i8_serial(&sum, result + i);
    }
}
|
|
565
|
+
|
|
566
|
+
NK_PUBLIC void nk_each_blend_i8_haswell( //
    nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, //
    nk_f32_t const *alpha, nk_f32_t const *beta, nk_i8_t *result) {
    // Weighted blend of two i8 arrays: result[i] = saturate_i8(alpha * a[i] + beta * b[i]).
    nk_f32_t alpha_val = *alpha;
    nk_f32_t beta_val = *beta;

    // There are several special cases we may want to implement:
    // 1. Simple addition, when both weights are equal to 1.0.
    if (alpha_val == 1 && beta_val == 1) {
        // In this case we can avoid expensive multiplications.
        nk_each_sum_i8_haswell(a, b, n, result);
        return;
    }
    // 2. Just scaling, when one of the weights is equal to zero.
    else if (alpha_val == 0 || beta_val == 0) {
        // In this case we can avoid half of the load instructions.
        nk_f32_t zero = 0;
        if (beta_val == 0) { nk_each_scale_i8_haswell(a, n, alpha, &zero, result); }
        else { nk_each_scale_i8_haswell(b, n, beta, &zero, result); }
        return;
    }

    // The general case.
    __m256 alpha_f32x8 = _mm256_set1_ps(alpha_val);
    __m256 beta_f32x8 = _mm256_set1_ps(beta_val);
    int sum_i32s[8];

    // The main loop:
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        // Sign-extend 8x i8 to i32 entirely in SIMD: one 64-bit load plus `_mm256_cvtepi8_epi32`,
        // instead of bouncing every element through scalar i32 buffers.
        __m256 a_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i const *)(a + i))));
        __m256 b_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i const *)(b + i))));
        // The normal part.
        __m256 ab_f32x8 = _mm256_mul_ps(a_f32x8, alpha_f32x8);
        __m256 result_f32x8 = _mm256_fmadd_ps(b_f32x8, beta_f32x8, ab_f32x8);
        // Instead of serial calls to expensive `nk_f32_to_i8_serial`, convert and clip with SIMD.
        __m256i result_i32x8 = _mm256_cvtps_epi32(result_f32x8);
        result_i32x8 = _mm256_max_epi32(result_i32x8, _mm256_set1_epi32(-128));
        result_i32x8 = _mm256_min_epi32(result_i32x8, _mm256_set1_epi32(127));
        // Export into a serial buffer; every lane already fits into i8 after clipping.
        _mm256_storeu_si256((__m256i *)sum_i32s, result_i32x8);
        for (int lane = 0; lane < 8; ++lane) result[i + lane] = (nk_i8_t)sum_i32s[lane];
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f32_t ai = a[i], bi = b[i];
        nk_f32_t sum = alpha_val * ai + beta_val * bi;
        nk_f32_to_i8_serial(&sum, result + i);
    }
}
|
|
632
|
+
|
|
633
|
+
NK_PUBLIC void nk_each_sum_u8_haswell(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u8_t *result) {
    // Saturating element-wise sum of two u8 arrays: result[i] = saturate_u8(a[i] + b[i]).
    nk_size_t idx = 0;

    // Vectorized part: 32 lanes per iteration with saturating AVX2 adds.
    while (idx + 32 <= n) {
        __m256i lhs_u8x32 = _mm256_loadu_si256((__m256i const *)(a + idx));
        __m256i rhs_u8x32 = _mm256_loadu_si256((__m256i const *)(b + idx));
        _mm256_storeu_si256((__m256i *)(result + idx), _mm256_adds_epu8(lhs_u8x32, rhs_u8x32));
        idx += 32;
    }

    // Scalar remainder: the f32 conversion helper performs the same saturation.
    for (; idx < n; ++idx) {
        nk_f32_t lhs = a[idx], rhs = b[idx];
        nk_f32_t total = lhs + rhs;
        nk_f32_to_u8_serial(&total, result + idx);
    }
}
|
|
650
|
+
|
|
651
|
+
NK_PUBLIC void nk_each_scale_u8_haswell(nk_u8_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                        nk_u8_t *result) {
    // Affine scaling of a u8 array: result[i] = saturate_u8(alpha * a[i] + beta).
    nk_f32_t alpha_val = *alpha;
    nk_f32_t beta_val = *beta;
    __m256 alpha_f32x8 = _mm256_set1_ps(alpha_val);
    __m256 beta_f32x8 = _mm256_set1_ps(beta_val);
    int sum_i32s[8];

    // The main loop:
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        // Zero-extend 8x u8 to i32 entirely in SIMD: one 64-bit load plus `_mm256_cvtepu8_epi32`,
        // instead of bouncing every element through a scalar i32 buffer.
        __m256 a_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i const *)(a + i))));
        // The normal part.
        __m256 result_f32x8 = _mm256_fmadd_ps(a_f32x8, alpha_f32x8, beta_f32x8);
        // Instead of serial calls to expensive `nk_f32_to_u8_serial`, convert and clip with SIMD.
        __m256i result_i32x8 = _mm256_cvtps_epi32(result_f32x8);
        result_i32x8 = _mm256_max_epi32(result_i32x8, _mm256_set1_epi32(0));
        result_i32x8 = _mm256_min_epi32(result_i32x8, _mm256_set1_epi32(255));
        // Export into a serial buffer; every lane already fits into u8 after clipping.
        _mm256_storeu_si256((__m256i *)sum_i32s, result_i32x8);
        for (int lane = 0; lane < 8; ++lane) result[i + lane] = (nk_u8_t)sum_i32s[lane];
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f32_t ai = a[i];
        nk_f32_t sum = alpha_val * ai + beta_val;
        nk_f32_to_u8_serial(&sum, result + i);
    }
}
|
|
694
|
+
|
|
695
|
+
NK_PUBLIC void nk_each_blend_u8_haswell( //
    nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, //
    nk_f32_t const *alpha, nk_f32_t const *beta, nk_u8_t *result) {
    // Weighted blend of two u8 arrays: result[i] = saturate_u8(alpha * a[i] + beta * b[i]).
    nk_f32_t alpha_val = *alpha;
    nk_f32_t beta_val = *beta;

    // There are several special cases we may want to implement:
    // 1. Simple addition, when both weights are equal to 1.0.
    if (alpha_val == 1 && beta_val == 1) {
        // In this case we can avoid expensive multiplications.
        nk_each_sum_u8_haswell(a, b, n, result);
        return;
    }
    // 2. Just scaling, when one of the weights is equal to zero.
    else if (alpha_val == 0 || beta_val == 0) {
        // In this case we can avoid half of the load instructions.
        nk_f32_t zero = 0;
        if (beta_val == 0) { nk_each_scale_u8_haswell(a, n, alpha, &zero, result); }
        else { nk_each_scale_u8_haswell(b, n, beta, &zero, result); }
        return;
    }

    // The general case.
    __m256 alpha_f32x8 = _mm256_set1_ps(alpha_val);
    __m256 beta_f32x8 = _mm256_set1_ps(beta_val);
    int sum_i32s[8];

    // The main loop:
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        // Zero-extend 8x u8 to i32 entirely in SIMD: one 64-bit load plus `_mm256_cvtepu8_epi32`,
        // instead of bouncing every element through scalar i32 buffers.
        __m256 a_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i const *)(a + i))));
        __m256 b_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i const *)(b + i))));
        // The normal part.
        __m256 ab_f32x8 = _mm256_mul_ps(a_f32x8, alpha_f32x8);
        __m256 result_f32x8 = _mm256_fmadd_ps(b_f32x8, beta_f32x8, ab_f32x8);
        // Instead of serial calls to expensive `nk_f32_to_u8_serial`, convert and clip with SIMD.
        __m256i result_i32x8 = _mm256_cvtps_epi32(result_f32x8);
        result_i32x8 = _mm256_max_epi32(result_i32x8, _mm256_set1_epi32(0));
        result_i32x8 = _mm256_min_epi32(result_i32x8, _mm256_set1_epi32(255));
        // Export into a serial buffer; every lane already fits into u8 after clipping.
        _mm256_storeu_si256((__m256i *)sum_i32s, result_i32x8);
        for (int lane = 0; lane < 8; ++lane) result[i + lane] = (nk_u8_t)sum_i32s[lane];
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f32_t ai = a[i], bi = b[i];
        nk_f32_t sum = alpha_val * ai + beta_val * bi;
        nk_f32_to_u8_serial(&sum, result + i);
    }
}
|
|
761
|
+
|
|
762
|
+
NK_PUBLIC void nk_each_fma_i8_haswell( //
    nk_i8_t const *a, nk_i8_t const *b, nk_i8_t const *c, nk_size_t n, //
    nk_f32_t const *alpha, nk_f32_t const *beta, nk_i8_t *result) {
    // Element-wise FMA on i8 arrays, accumulated in f32:
    // result[i] = saturate_i8(alpha * a[i] * b[i] + beta * c[i]).
    nk_f32_t alpha_val = *alpha;
    nk_f32_t beta_val = *beta;
    __m256 alpha_f32x8 = _mm256_set1_ps(alpha_val);
    __m256 beta_f32x8 = _mm256_set1_ps(beta_val);
    int sum_i32s[8];

    // The main loop:
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        // Sign-extend 8x i8 to i32 entirely in SIMD: one 64-bit load plus `_mm256_cvtepi8_epi32`,
        // instead of bouncing every element through scalar i32 buffers.
        __m256 a_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i const *)(a + i))));
        __m256 b_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i const *)(b + i))));
        __m256 c_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i const *)(c + i))));
        // The normal part.
        __m256 ab_f32x8 = _mm256_mul_ps(a_f32x8, b_f32x8);
        __m256 abc_f32x8 = _mm256_mul_ps(ab_f32x8, alpha_f32x8);
        __m256 result_f32x8 = _mm256_fmadd_ps(c_f32x8, beta_f32x8, abc_f32x8);
        // Instead of serial calls to expensive `nk_f32_to_i8_serial`, convert and clip with SIMD.
        __m256i result_i32x8 = _mm256_cvtps_epi32(result_f32x8);
        result_i32x8 = _mm256_max_epi32(result_i32x8, _mm256_set1_epi32(-128));
        result_i32x8 = _mm256_min_epi32(result_i32x8, _mm256_set1_epi32(127));
        // Export into a serial buffer; every lane already fits into i8 after clipping.
        _mm256_storeu_si256((__m256i *)sum_i32s, result_i32x8);
        for (int lane = 0; lane < 8; ++lane) result[i + lane] = (nk_i8_t)sum_i32s[lane];
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f32_t ai = a[i], bi = b[i], ci = c[i];
        nk_f32_t sum = alpha_val * ai * bi + beta_val * ci;
        nk_f32_to_i8_serial(&sum, result + i);
    }
}
|
|
814
|
+
|
|
815
|
+
NK_PUBLIC void nk_each_fma_u8_haswell( //
    nk_u8_t const *a, nk_u8_t const *b, nk_u8_t const *c, nk_size_t n, //
    nk_f32_t const *alpha, nk_f32_t const *beta, nk_u8_t *result) {
    // Element-wise FMA on u8 arrays, accumulated in f32:
    // result[i] = saturate_u8(alpha * a[i] * b[i] + beta * c[i]).
    nk_f32_t alpha_val = *alpha;
    nk_f32_t beta_val = *beta;
    __m256 alpha_f32x8 = _mm256_set1_ps(alpha_val);
    __m256 beta_f32x8 = _mm256_set1_ps(beta_val);
    int sum_i32s[8];

    // The main loop:
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        // Zero-extend 8x u8 to i32 entirely in SIMD: one 64-bit load plus `_mm256_cvtepu8_epi32`,
        // instead of bouncing every element through scalar i32 buffers.
        __m256 a_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i const *)(a + i))));
        __m256 b_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i const *)(b + i))));
        __m256 c_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i const *)(c + i))));
        // The normal part.
        __m256 ab_f32x8 = _mm256_mul_ps(a_f32x8, b_f32x8);
        __m256 abc_f32x8 = _mm256_mul_ps(ab_f32x8, alpha_f32x8);
        __m256 result_f32x8 = _mm256_fmadd_ps(c_f32x8, beta_f32x8, abc_f32x8);
        // Instead of serial calls to expensive `nk_f32_to_u8_serial`, convert and clip with SIMD.
        __m256i result_i32x8 = _mm256_cvtps_epi32(result_f32x8);
        result_i32x8 = _mm256_max_epi32(result_i32x8, _mm256_set1_epi32(0));
        result_i32x8 = _mm256_min_epi32(result_i32x8, _mm256_set1_epi32(255));
        // Export into a serial buffer; every lane already fits into u8 after clipping.
        _mm256_storeu_si256((__m256i *)sum_i32s, result_i32x8);
        for (int lane = 0; lane < 8; ++lane) result[i + lane] = (nk_u8_t)sum_i32s[lane];
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f32_t ai = a[i], bi = b[i], ci = c[i];
        nk_f32_t sum = alpha_val * ai * bi + beta_val * ci;
        nk_f32_to_u8_serial(&sum, result + i);
    }
}
|
|
867
|
+
|
|
868
|
+
NK_PUBLIC void nk_each_sum_i16_haswell(nk_i16_t const *a, nk_i16_t const *b, nk_size_t n, nk_i16_t *result) {
    // Saturating element-wise sum of two i16 arrays: result[i] = saturate_i16(a[i] + b[i]).
    nk_size_t idx = 0;

    // Vectorized part: 16 lanes per iteration with saturating AVX2 adds.
    while (idx + 16 <= n) {
        __m256i lhs_i16x16 = _mm256_loadu_si256((__m256i const *)(a + idx));
        __m256i rhs_i16x16 = _mm256_loadu_si256((__m256i const *)(b + idx));
        _mm256_storeu_si256((__m256i *)(result + idx), _mm256_adds_epi16(lhs_i16x16, rhs_i16x16));
        idx += 16;
    }

    // Scalar remainder: widen to i64 so the sum can't overflow, then saturate on narrowing.
    for (; idx < n; ++idx) {
        nk_i64_t lhs = a[idx], rhs = b[idx];
        nk_i64_t total = lhs + rhs;
        nk_i64_to_i16_serial(&total, result + idx);
    }
}
|
|
885
|
+
|
|
886
|
+
NK_PUBLIC void nk_each_scale_i16_haswell(nk_i16_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                         nk_i16_t *result) {
    // Affine scaling of an i16 array: result[i] = saturate_i16(alpha * a[i] + beta).
    nk_f32_t scale_f32 = *alpha;
    nk_f32_t bias_f32 = *beta;
    __m256 scale_f32x8 = _mm256_set1_ps(scale_f32);
    __m256 bias_f32x8 = _mm256_set1_ps(bias_f32);
    __m256 lower_f32x8 = _mm256_set1_ps(-32768.0f);
    __m256 upper_f32x8 = _mm256_set1_ps(32767.0f);

    // Vectorized part: widen 8x i16 -> i32 -> f32, scale, clamp, narrow back.
    nk_size_t idx = 0;
    while (idx + 8 <= n) {
        __m128i raw_i16x8 = _mm_loadu_si128((__m128i const *)(a + idx));
        __m256 vals_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(raw_i16x8));
        __m256 scaled_f32x8 = _mm256_fmadd_ps(vals_f32x8, scale_f32x8, bias_f32x8);
        // Clamp before the f32 -> i32 conversion so narrowing cannot overflow.
        scaled_f32x8 = _mm256_min_ps(_mm256_max_ps(scaled_f32x8, lower_f32x8), upper_f32x8);
        __m256i scaled_i32x8 = _mm256_cvtps_epi32(scaled_f32x8);
        // Saturating pack folds both 128-bit halves back into 8x i16.
        __m128i packed_i16x8 =
            _mm_packs_epi32(_mm256_castsi256_si128(scaled_i32x8), _mm256_extracti128_si256(scaled_i32x8, 1));
        _mm_storeu_si128((__m128i *)(result + idx), packed_i16x8);
        idx += 8;
    }

    // Scalar remainder.
    for (; idx < n; ++idx) {
        nk_f32_t val = a[idx];
        nk_f32_t scaled = scale_f32 * val + bias_f32;
        nk_f32_to_i16_serial(&scaled, result + idx);
    }
}
|
|
916
|
+
|
|
917
|
+
NK_PUBLIC void nk_each_fma_i16_haswell( //
    nk_i16_t const *a, nk_i16_t const *b, nk_i16_t const *c, nk_size_t n, //
    nk_f32_t const *alpha, nk_f32_t const *beta, nk_i16_t *result) {
    // Element-wise FMA on i16 arrays, accumulated in f32:
    // result[i] = saturate_i16(alpha * a[i] * b[i] + beta * c[i]).
    nk_f32_t product_weight = *alpha;
    nk_f32_t addend_weight = *beta;
    __m256 product_weight_f32x8 = _mm256_set1_ps(product_weight);
    __m256 addend_weight_f32x8 = _mm256_set1_ps(addend_weight);
    __m256 lower_f32x8 = _mm256_set1_ps(-32768.0f);
    __m256 upper_f32x8 = _mm256_set1_ps(32767.0f);

    // Vectorized part: widen 8x i16 -> i32 -> f32, compute, clamp, narrow back.
    nk_size_t idx = 0;
    while (idx + 8 <= n) {
        __m256 lhs_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i const *)(a + idx))));
        __m256 rhs_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i const *)(b + idx))));
        __m256 addend_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i const *)(c + idx))));
        __m256 product_f32x8 = _mm256_mul_ps(lhs_f32x8, rhs_f32x8);
        __m256 scaled_product_f32x8 = _mm256_mul_ps(product_f32x8, product_weight_f32x8);
        __m256 out_f32x8 = _mm256_fmadd_ps(addend_f32x8, addend_weight_f32x8, scaled_product_f32x8);
        // Clamp before the f32 -> i32 conversion so narrowing cannot overflow.
        out_f32x8 = _mm256_min_ps(_mm256_max_ps(out_f32x8, lower_f32x8), upper_f32x8);
        __m256i out_i32x8 = _mm256_cvtps_epi32(out_f32x8);
        // Saturating pack folds both 128-bit halves back into 8x i16.
        __m128i out_i16x8 =
            _mm_packs_epi32(_mm256_castsi256_si128(out_i32x8), _mm256_extracti128_si256(out_i32x8, 1));
        _mm_storeu_si128((__m128i *)(result + idx), out_i16x8);
        idx += 8;
    }

    // Scalar remainder.
    for (; idx < n; ++idx) {
        nk_f32_t lhs = a[idx], rhs = b[idx], addend = c[idx];
        nk_f32_t out = product_weight * lhs * rhs + addend_weight * addend;
        nk_f32_to_i16_serial(&out, result + idx);
    }
}
|
|
952
|
+
|
|
953
|
+
NK_PUBLIC void nk_each_sum_u16_haswell(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_u16_t *result) {
    // Saturating element-wise sum of two u16 arrays: result[i] = saturate_u16(a[i] + b[i]).
    nk_size_t idx = 0;

    // Vectorized part: 16 lanes per iteration with saturating AVX2 adds.
    while (idx + 16 <= n) {
        __m256i lhs_u16x16 = _mm256_loadu_si256((__m256i const *)(a + idx));
        __m256i rhs_u16x16 = _mm256_loadu_si256((__m256i const *)(b + idx));
        _mm256_storeu_si256((__m256i *)(result + idx), _mm256_adds_epu16(lhs_u16x16, rhs_u16x16));
        idx += 16;
    }

    // Scalar remainder: widen to u64 so the sum can't overflow, then saturate on narrowing.
    for (; idx < n; ++idx) {
        nk_u64_t lhs = a[idx], rhs = b[idx];
        nk_u64_t total = lhs + rhs;
        nk_u64_to_u16_serial(&total, result + idx);
    }
}
|
|
970
|
+
|
|
971
|
+
NK_PUBLIC void nk_each_scale_u16_haswell(nk_u16_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                         nk_u16_t *result) {
    // Affine scaling of a u16 array: result[i] = saturate_u16(alpha * a[i] + beta).
    nk_f32_t scale_f32 = *alpha;
    nk_f32_t bias_f32 = *beta;
    __m256 scale_f32x8 = _mm256_set1_ps(scale_f32);
    __m256 bias_f32x8 = _mm256_set1_ps(bias_f32);
    __m256 lower_f32x8 = _mm256_setzero_ps();
    __m256 upper_f32x8 = _mm256_set1_ps(65535.0f);

    // Vectorized part: widen 8x u16 -> i32 -> f32, scale, clamp, narrow back.
    nk_size_t idx = 0;
    while (idx + 8 <= n) {
        __m128i raw_u16x8 = _mm_loadu_si128((__m128i const *)(a + idx));
        __m256 vals_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(raw_u16x8));
        __m256 scaled_f32x8 = _mm256_fmadd_ps(vals_f32x8, scale_f32x8, bias_f32x8);
        // Clamp before the f32 -> i32 conversion so narrowing cannot overflow.
        scaled_f32x8 = _mm256_min_ps(_mm256_max_ps(scaled_f32x8, lower_f32x8), upper_f32x8);
        __m256i scaled_i32x8 = _mm256_cvtps_epi32(scaled_f32x8);
        // Unsigned saturating pack folds both 128-bit halves back into 8x u16.
        __m128i packed_u16x8 =
            _mm_packus_epi32(_mm256_castsi256_si128(scaled_i32x8), _mm256_extracti128_si256(scaled_i32x8, 1));
        _mm_storeu_si128((__m128i *)(result + idx), packed_u16x8);
        idx += 8;
    }

    // Scalar remainder.
    for (; idx < n; ++idx) {
        nk_f32_t val = a[idx];
        nk_f32_t scaled = scale_f32 * val + bias_f32;
        nk_f32_to_u16_serial(&scaled, result + idx);
    }
}
|
|
1001
|
+
|
|
1002
|
+
NK_PUBLIC void nk_each_fma_u16_haswell( //
    nk_u16_t const *a, nk_u16_t const *b, nk_u16_t const *c, nk_size_t n, //
    nk_f32_t const *alpha, nk_f32_t const *beta, nk_u16_t *result) {
    // Element-wise FMA on u16 arrays, accumulated in f32:
    // result[i] = saturate_u16(alpha * a[i] * b[i] + beta * c[i]).
    nk_f32_t product_weight = *alpha;
    nk_f32_t addend_weight = *beta;
    __m256 product_weight_f32x8 = _mm256_set1_ps(product_weight);
    __m256 addend_weight_f32x8 = _mm256_set1_ps(addend_weight);
    __m256 lower_f32x8 = _mm256_setzero_ps();
    __m256 upper_f32x8 = _mm256_set1_ps(65535.0f);

    // Vectorized part: widen 8x u16 -> i32 -> f32, compute, clamp, narrow back.
    nk_size_t idx = 0;
    while (idx + 8 <= n) {
        __m256 lhs_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i const *)(a + idx))));
        __m256 rhs_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i const *)(b + idx))));
        __m256 addend_f32x8 = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i const *)(c + idx))));
        __m256 product_f32x8 = _mm256_mul_ps(lhs_f32x8, rhs_f32x8);
        __m256 scaled_product_f32x8 = _mm256_mul_ps(product_f32x8, product_weight_f32x8);
        __m256 out_f32x8 = _mm256_fmadd_ps(addend_f32x8, addend_weight_f32x8, scaled_product_f32x8);
        // Clamp before the f32 -> i32 conversion so narrowing cannot overflow.
        out_f32x8 = _mm256_min_ps(_mm256_max_ps(out_f32x8, lower_f32x8), upper_f32x8);
        __m256i out_i32x8 = _mm256_cvtps_epi32(out_f32x8);
        // Unsigned saturating pack folds both 128-bit halves back into 8x u16.
        __m128i out_u16x8 =
            _mm_packus_epi32(_mm256_castsi256_si128(out_i32x8), _mm256_extracti128_si256(out_i32x8, 1));
        _mm_storeu_si128((__m128i *)(result + idx), out_u16x8);
        idx += 8;
    }

    // Scalar remainder.
    for (; idx < n; ++idx) {
        nk_f32_t lhs = a[idx], rhs = b[idx], addend = c[idx];
        nk_f32_t out = product_weight * lhs * rhs + addend_weight * addend;
        nk_f32_to_u16_serial(&out, result + idx);
    }
}
|
|
1037
|
+
|
|
1038
|
+
// Saturating signed 32-bit addition for AVX2, which has no native `adds_epi32`.
// Returns a + b per lane, clamped to [INT32_MIN, INT32_MAX] when the sum overflows.
NK_INTERNAL __m256i _mm256_adds_epi32_haswell(__m256i a, __m256i b) {
    __m256i sum_i32x8 = _mm256_add_epi32(a, b);
    __m256i a_xor_b_i32x8 = _mm256_xor_si256(a, b);
    __m256i sum_xor_a_i32x8 = _mm256_xor_si256(sum_i32x8, a);
    // ~(a^b) & (sum^a): overflow iff same-sign inputs produce different-sign result
    // The arithmetic right shift by 31 smears each lane's sign bit into a full-lane mask.
    __m256i overflow_i32x8 = _mm256_srai_epi32(_mm256_andnot_si256(a_xor_b_i32x8, sum_xor_a_i32x8), 31);
    // Positive overflow → INT32_MAX, negative overflow → INT32_MIN
    __m256i max_i32x8 = _mm256_set1_epi32(0x7FFFFFFF);
    __m256i min_i32x8 = _mm256_set1_epi32((int)0x80000000);
    // The sign of `a` selects the saturation value: a negative addend can only overflow downward.
    __m256i saturated_i32x8 = _mm256_blendv_epi8(max_i32x8, min_i32x8, _mm256_srai_epi32(a, 31));
    return _mm256_blendv_epi8(sum_i32x8, saturated_i32x8, overflow_i32x8);
}
|
|
1050
|
+
|
|
1051
|
+
// Element-wise saturating sum of two signed 32-bit arrays: result[i] = sat(a[i] + b[i]).
NK_PUBLIC void nk_each_sum_i32_haswell(nk_i32_t const *a, nk_i32_t const *b, nk_size_t n, nk_i32_t *result) {
    // Vectorized body: 8 lanes per iteration with emulated saturating addition.
    nk_size_t idx = 0;
    while (idx + 8 <= n) {
        __m256i lhs_i32x8 = _mm256_loadu_si256((__m256i *)(a + idx));
        __m256i rhs_i32x8 = _mm256_loadu_si256((__m256i *)(b + idx));
        _mm256_storeu_si256((__m256i *)(result + idx), _mm256_adds_epi32_haswell(lhs_i32x8, rhs_i32x8));
        idx += 8;
    }

    // Scalar tail: widen to 64 bits so the addition can't overflow, then saturate on narrowing.
    for (; idx < n; ++idx) {
        nk_i64_t wide_sum = (nk_i64_t)a[idx] + (nk_i64_t)b[idx];
        nk_i64_to_i32_serial(&wide_sum, result + idx);
    }
}
|
|
1068
|
+
|
|
1069
|
+
// Computes result[i] = alpha * a[i] + beta over signed 32-bit integers, saturating to
// [INT32_MIN, INT32_MAX]. Arithmetic runs in f64, where every i32 value is exactly
// representable; the scalar tail delegates final rounding to nk_f64_to_i32_serial.
NK_PUBLIC void nk_each_scale_i32_haswell(nk_i32_t const *a, nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta,
                                         nk_i32_t *result) {
    nk_f64_t alpha_val = *alpha;
    nk_f64_t beta_val = *beta;
    __m256d alpha_f64x4 = _mm256_set1_pd(alpha_val);
    __m256d beta_f64x4 = _mm256_set1_pd(beta_val);
    // Both bounds are exactly representable as doubles.
    __m256d min_f64x4 = _mm256_set1_pd(-2147483648.0);
    __m256d max_f64x4 = _mm256_set1_pd(2147483647.0);

    // The main loop:
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        __m256d a_f64x4 = _mm256_cvtepi32_pd(_mm_loadu_si128((__m128i *)(a + i)));
        __m256d result_f64x4 = _mm256_fmadd_pd(a_f64x4, alpha_f64x4, beta_f64x4);
        // Clip to the largest values representable by 32-bit integers.
        result_f64x4 = _mm256_max_pd(result_f64x4, min_f64x4);
        result_f64x4 = _mm256_min_pd(result_f64x4, max_f64x4);
        __m128i result_i32x4 = _mm256_cvtpd_epi32(result_f64x4);
        _mm_storeu_si128((__m128i *)(result + i), result_i32x4);
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f64_t ai = a[i];
        nk_f64_t sum = alpha_val * ai + beta_val;
        nk_f64_to_i32_serial(&sum, result + i);
    }
}
|
|
1097
|
+
|
|
1098
|
+
// Computes result[i] = alpha * a[i] * b[i] + beta * c[i] over signed 32-bit integers,
// in f64 precision, saturating the output to [INT32_MIN, INT32_MAX].
NK_PUBLIC void nk_each_fma_i32_haswell( //
    nk_i32_t const *a, nk_i32_t const *b, nk_i32_t const *c, nk_size_t n, //
    nk_f64_t const *alpha, nk_f64_t const *beta, nk_i32_t *result) {
    nk_f64_t alpha_val = *alpha;
    nk_f64_t beta_val = *beta;
    __m256d alpha_f64x4 = _mm256_set1_pd(alpha_val);
    __m256d beta_f64x4 = _mm256_set1_pd(beta_val);
    __m256d min_f64x4 = _mm256_set1_pd(-2147483648.0);
    __m256d max_f64x4 = _mm256_set1_pd(2147483647.0);

    // The main loop:
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        __m256d a_f64x4 = _mm256_cvtepi32_pd(_mm_loadu_si128((__m128i *)(a + i)));
        __m256d b_f64x4 = _mm256_cvtepi32_pd(_mm_loadu_si128((__m128i *)(b + i)));
        __m256d c_f64x4 = _mm256_cvtepi32_pd(_mm_loadu_si128((__m128i *)(c + i)));
        // Separate multiplies for (a*b) and (alpha*a*b), then one FMA for "+ beta*c",
        // mirroring the evaluation order of the scalar tail below.
        __m256d ab_f64x4 = _mm256_mul_pd(a_f64x4, b_f64x4);
        __m256d ab_scaled_f64x4 = _mm256_mul_pd(ab_f64x4, alpha_f64x4);
        __m256d result_f64x4 = _mm256_fmadd_pd(c_f64x4, beta_f64x4, ab_scaled_f64x4);
        // Clip to the largest values representable by 32-bit integers.
        result_f64x4 = _mm256_max_pd(result_f64x4, min_f64x4);
        result_f64x4 = _mm256_min_pd(result_f64x4, max_f64x4);
        __m128i result_i32x4 = _mm256_cvtpd_epi32(result_f64x4);
        _mm_storeu_si128((__m128i *)(result + i), result_i32x4);
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f64_t ai = a[i], bi = b[i], ci = c[i];
        nk_f64_t sum = alpha_val * ai * bi + beta_val * ci;
        nk_f64_to_i32_serial(&sum, result + i);
    }
}
|
|
1131
|
+
|
|
1132
|
+
// Saturating unsigned 32-bit addition for AVX2, which has no native `adds_epu32`.
// Returns a + b per lane, clamped to UINT32_MAX when the sum wraps.
NK_INTERNAL __m256i _mm256_adds_epu32_haswell(__m256i a, __m256i b) {
    __m256i sum_u32x8 = _mm256_add_epi32(a, b);
    __m256i max_u32x8 = _mm256_set1_epi32((int)0xFFFFFFFF);
    // Overflow iff sum < a (unsigned wrapping). max_epu32(sum, a) != sum means overflow.
    __m256i no_overflow_u32x8 = _mm256_cmpeq_epi32(_mm256_max_epu32(sum_u32x8, a), sum_u32x8);
    return _mm256_blendv_epi8(max_u32x8, sum_u32x8, no_overflow_u32x8);
}
|
|
1139
|
+
|
|
1140
|
+
// Converts 4 unsigned 32-bit integers to 4 doubles. Done with a scalar round-trip
// through memory, since AVX2 lacks an unsigned epi32 -> pd conversion (AVX-512 adds one).
NK_INTERNAL __m256d _mm256_cvtepu32_pd_haswell(__m128i a) {
    // TODO: Converting unsigned 32-bit integers to double-precision floats isn't trivial in AVX2.
    // Let's convert the lower 31 bits to a double-precision float.
    // And then conditionally add 2³¹ to the result if the MSB is set.
    //
    // __m256d result = _mm256_cvtepi32_pd(_mm_and_si128(a, _mm_set1_epi32(0x7FFFFFFF)));
    // int should_increment = (_mm_movemask_epi8(a) & 0x8888);
    // should_increment = should_increment / 0x8888; // Transform something like 0b1000100010001000 to 0b1111
    // __m256d incremented = _mm256_add_pd(result, _mm256_set1_pd(2147483648.0));
    // result = _mm256_blend_pd(result, incremented, should_increment);
    nk_u32_t from[4];
    nk_f64_t to[4];
    _mm_storeu_si128((__m128i *)from, a);
    // Each u32 is exactly representable in f64, so these casts are lossless.
    to[0] = (nk_f64_t)from[0];
    to[1] = (nk_f64_t)from[1];
    to[2] = (nk_f64_t)from[2];
    to[3] = (nk_f64_t)from[3];
    return _mm256_loadu_pd(to);
}
|
|
1159
|
+
|
|
1160
|
+
// Converts 4 doubles to 4 unsigned 32-bit integers via scalar casts.
// NOTE(review): a negative or > UINT32_MAX input makes the (nk_u32_t) cast undefined
// behavior in C — callers in this file clamp to [0.0, 4294967295.0] first.
NK_INTERNAL __m128i _mm256_cvtpd_epu32_haswell(__m256d a) {
    //? For now let's avoid SIMD and just use serial conversion.
    nk_f64_t from[4];
    nk_u32_t to[4];
    _mm256_storeu_pd(from, a);
    to[0] = (nk_u32_t)from[0];
    to[1] = (nk_u32_t)from[1];
    to[2] = (nk_u32_t)from[2];
    to[3] = (nk_u32_t)from[3];
    return _mm_loadu_si128((__m128i *)to);
}
|
|
1171
|
+
|
|
1172
|
+
// Element-wise saturating sum of two unsigned 32-bit arrays: result[i] = sat(a[i] + b[i]).
NK_PUBLIC void nk_each_sum_u32_haswell(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_u32_t *result) {
    // Vectorized body: 8 lanes per iteration with emulated saturating addition.
    nk_size_t idx = 0;
    while (idx + 8 <= n) {
        __m256i lhs_u32x8 = _mm256_loadu_si256((__m256i *)(a + idx));
        __m256i rhs_u32x8 = _mm256_loadu_si256((__m256i *)(b + idx));
        _mm256_storeu_si256((__m256i *)(result + idx), _mm256_adds_epu32_haswell(lhs_u32x8, rhs_u32x8));
        idx += 8;
    }

    // Scalar tail: widen to signed 64 bits (every u32 fits), then saturate on narrowing.
    for (; idx < n; ++idx) {
        nk_i64_t wide_sum = (nk_i64_t)a[idx] + (nk_i64_t)b[idx];
        nk_i64_to_u32_serial(&wide_sum, result + idx);
    }
}
|
|
1189
|
+
|
|
1190
|
+
// Computes result[i] = alpha * a[i] + beta over unsigned 32-bit integers, saturating to
// [0, UINT32_MAX]. Arithmetic runs in f64, where every u32 value is exactly representable;
// conversions go through the serial helpers since AVX2 lacks unsigned pd conversions.
NK_PUBLIC void nk_each_scale_u32_haswell(nk_u32_t const *a, nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta,
                                         nk_u32_t *result) {
    nk_f64_t alpha_val = *alpha;
    nk_f64_t beta_val = *beta;
    __m256d alpha_f64x4 = _mm256_set1_pd(alpha_val);
    __m256d beta_f64x4 = _mm256_set1_pd(beta_val);
    __m256d min_f64x4 = _mm256_set1_pd(0);
    __m256d max_f64x4 = _mm256_set1_pd(4294967295.0);

    // The main loop:
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        __m256d a_f64x4 = _mm256_cvtepu32_pd_haswell(_mm_loadu_si128((__m128i *)(a + i)));
        __m256d result_f64x4 = _mm256_fmadd_pd(a_f64x4, alpha_f64x4, beta_f64x4);
        // Clip to the largest values representable by 32-bit integers.
        // This also makes the following unsigned conversion well-defined.
        result_f64x4 = _mm256_max_pd(result_f64x4, min_f64x4);
        result_f64x4 = _mm256_min_pd(result_f64x4, max_f64x4);
        __m128i result_u32x4 = _mm256_cvtpd_epu32_haswell(result_f64x4);
        _mm_storeu_si128((__m128i *)(result + i), result_u32x4);
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f64_t ai = a[i];
        nk_f64_t sum = alpha_val * ai + beta_val;
        nk_f64_to_u32_serial(&sum, result + i);
    }
}
|
|
1218
|
+
|
|
1219
|
+
// Computes result[i] = alpha * a[i] * b[i] + beta * c[i] over unsigned 32-bit integers,
// in f64 precision, saturating the output to [0, UINT32_MAX].
NK_PUBLIC void nk_each_fma_u32_haswell( //
    nk_u32_t const *a, nk_u32_t const *b, nk_u32_t const *c, nk_size_t n, //
    nk_f64_t const *alpha, nk_f64_t const *beta, nk_u32_t *result) {
    nk_f64_t alpha_val = *alpha;
    nk_f64_t beta_val = *beta;
    __m256d alpha_f64x4 = _mm256_set1_pd(alpha_val);
    __m256d beta_f64x4 = _mm256_set1_pd(beta_val);
    __m256d min_f64x4 = _mm256_set1_pd(0);
    __m256d max_f64x4 = _mm256_set1_pd(4294967295.0);

    // The main loop:
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        __m256d a_f64x4 = _mm256_cvtepu32_pd_haswell(_mm_loadu_si128((__m128i *)(a + i)));
        __m256d b_f64x4 = _mm256_cvtepu32_pd_haswell(_mm_loadu_si128((__m128i *)(b + i)));
        __m256d c_f64x4 = _mm256_cvtepu32_pd_haswell(_mm_loadu_si128((__m128i *)(c + i)));
        // Separate multiplies for (a*b) and (alpha*a*b), then one FMA for "+ beta*c",
        // mirroring the evaluation order of the scalar tail below.
        __m256d ab_f64x4 = _mm256_mul_pd(a_f64x4, b_f64x4);
        __m256d ab_scaled_f64x4 = _mm256_mul_pd(ab_f64x4, alpha_f64x4);
        __m256d result_f64x4 = _mm256_fmadd_pd(c_f64x4, beta_f64x4, ab_scaled_f64x4);
        // Clip to the largest values representable by 32-bit integers.
        // This also makes the following unsigned conversion well-defined.
        result_f64x4 = _mm256_max_pd(result_f64x4, min_f64x4);
        result_f64x4 = _mm256_min_pd(result_f64x4, max_f64x4);
        __m128i result_u32x4 = _mm256_cvtpd_epu32_haswell(result_f64x4);
        _mm_storeu_si128((__m128i *)(result + i), result_u32x4);
    }

    // The tail:
    for (; i < n; ++i) {
        nk_f64_t ai = a[i], bi = b[i], ci = c[i];
        nk_f64_t sum = alpha_val * ai * bi + beta_val * ci;
        nk_f64_to_u32_serial(&sum, result + i);
    }
}
|
|
1252
|
+
|
|
1253
|
+
// Element-wise sum of two FP8 (e4m3) arrays, computed in f32 precision:
// result[i] = e4m3(f32(a[i]) + f32(b[i])).
NK_PUBLIC void nk_each_sum_e4m3_haswell(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_e4m3_t *result) {
    nk_size_t idx = 0;
    // Vectorized body: 8 bytes per operand, upcast to f32, add, downcast back.
    while (idx + 8 <= n) {
        __m256 lhs_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadl_epi64((__m128i const *)(a + idx)));
        __m256 rhs_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadl_epi64((__m128i const *)(b + idx)));
        __m128i sum_e4m3x8 = nk_f32x8_to_e4m3x8_haswell_(_mm256_add_ps(lhs_f32x8, rhs_f32x8));
        _mm_storel_epi64((__m128i *)(result + idx), sum_e4m3x8);
        idx += 8;
    }
    // Scalar tail:
    for (; idx < n; ++idx) {
        nk_f32_t lhs, rhs;
        nk_e4m3_to_f32_serial(a + idx, &lhs);
        nk_e4m3_to_f32_serial(b + idx, &rhs);
        nk_f32_t total = lhs + rhs;
        nk_f32_to_e4m3_serial(&total, result + idx);
    }
}
|
|
1272
|
+
|
|
1273
|
+
// Element-wise sum of two FP8 (e5m2) arrays, computed in f32 precision:
// result[i] = e5m2(f32(a[i]) + f32(b[i])).
NK_PUBLIC void nk_each_sum_e5m2_haswell(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_e5m2_t *result) {
    nk_size_t idx = 0;
    // Vectorized body: 8 bytes per operand, upcast to f32, add, downcast back.
    while (idx + 8 <= n) {
        __m256 lhs_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadl_epi64((__m128i const *)(a + idx)));
        __m256 rhs_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadl_epi64((__m128i const *)(b + idx)));
        __m128i sum_e5m2x8 = nk_f32x8_to_e5m2x8_haswell_(_mm256_add_ps(lhs_f32x8, rhs_f32x8));
        _mm_storel_epi64((__m128i *)(result + idx), sum_e5m2x8);
        idx += 8;
    }
    // Scalar tail:
    for (; idx < n; ++idx) {
        nk_f32_t lhs, rhs;
        nk_e5m2_to_f32_serial(a + idx, &lhs);
        nk_e5m2_to_f32_serial(b + idx, &rhs);
        nk_f32_t total = lhs + rhs;
        nk_f32_to_e5m2_serial(&total, result + idx);
    }
}
|
|
1292
|
+
|
|
1293
|
+
// Scales an FP8 (e4m3) array: result[i] = e4m3(alpha * f32(a[i]) + beta).
NK_PUBLIC void nk_each_scale_e4m3_haswell(nk_e4m3_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                          nk_e4m3_t *result) {
    __m256 const scale_f32x8 = _mm256_set1_ps(*alpha);
    __m256 const shift_f32x8 = _mm256_set1_ps(*beta);
    nk_size_t idx = 0;
    while (idx + 8 <= n) {
        __m256 lanes_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadl_epi64((__m128i const *)(a + idx)));
        // FP8 rounding note: FMA is acceptable here because scale computes (α × a + β),
        // a single multiply-add operation where single-rounding preserves accuracy.
        __m256 scaled_f32x8 = _mm256_fmadd_ps(lanes_f32x8, scale_f32x8, shift_f32x8);
        _mm_storel_epi64((__m128i *)(result + idx), nk_f32x8_to_e4m3x8_haswell_(scaled_f32x8));
        idx += 8;
    }
    // Scalar tail:
    for (; idx < n; ++idx) {
        nk_f32_t lane;
        nk_e4m3_to_f32_serial(a + idx, &lane);
        nk_f32_t scaled = *alpha * lane + *beta;
        nk_f32_to_e4m3_serial(&scaled, result + idx);
    }
}
|
|
1314
|
+
|
|
1315
|
+
// Scales an FP8 (e5m2) array: result[i] = e5m2(alpha * f32(a[i]) + beta).
NK_PUBLIC void nk_each_scale_e5m2_haswell(nk_e5m2_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                          nk_e5m2_t *result) {
    __m256 const scale_f32x8 = _mm256_set1_ps(*alpha);
    __m256 const shift_f32x8 = _mm256_set1_ps(*beta);
    nk_size_t idx = 0;
    while (idx + 8 <= n) {
        __m256 lanes_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadl_epi64((__m128i const *)(a + idx)));
        // FP8 rounding note: FMA is acceptable here because scale computes (α × a + β),
        // a single multiply-add operation where single-rounding preserves accuracy.
        __m256 scaled_f32x8 = _mm256_fmadd_ps(lanes_f32x8, scale_f32x8, shift_f32x8);
        _mm_storel_epi64((__m128i *)(result + idx), nk_f32x8_to_e5m2x8_haswell_(scaled_f32x8));
        idx += 8;
    }
    // Scalar tail:
    for (; idx < n; ++idx) {
        nk_f32_t lane;
        nk_e5m2_to_f32_serial(a + idx, &lane);
        nk_f32_t scaled = *alpha * lane + *beta;
        nk_f32_to_e5m2_serial(&scaled, result + idx);
    }
}
|
|
1336
|
+
|
|
1337
|
+
// Blends two FP8 (e4m3) arrays: result[i] = e4m3(alpha * f32(a[i]) + beta * f32(b[i])).
NK_PUBLIC void nk_each_blend_e4m3_haswell(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t const *alpha,
                                          nk_f32_t const *beta, nk_e4m3_t *result) {
    __m256 const alpha_vec_f32x8 = _mm256_set1_ps(*alpha);
    __m256 const beta_vec_f32x8 = _mm256_set1_ps(*beta);
    nk_size_t idx = 0;
    while (idx + 8 <= n) {
        __m256 lhs_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadl_epi64((__m128i const *)(a + idx)));
        __m256 rhs_f32x8 = nk_e4m3x8_to_f32x8_haswell_(_mm_loadl_epi64((__m128i const *)(b + idx)));
        // Scale the first operand, then fold in the second with a fused multiply-add.
        __m256 lhs_scaled_f32x8 = _mm256_mul_ps(lhs_f32x8, alpha_vec_f32x8);
        __m256 blended_f32x8 = _mm256_fmadd_ps(rhs_f32x8, beta_vec_f32x8, lhs_scaled_f32x8);
        _mm_storel_epi64((__m128i *)(result + idx), nk_f32x8_to_e4m3x8_haswell_(blended_f32x8));
        idx += 8;
    }
    // Scalar tail:
    for (; idx < n; ++idx) {
        nk_f32_t lhs, rhs;
        nk_e4m3_to_f32_serial(a + idx, &lhs);
        nk_e4m3_to_f32_serial(b + idx, &rhs);
        nk_f32_t blended = *alpha * lhs + *beta * rhs;
        nk_f32_to_e4m3_serial(&blended, result + idx);
    }
}
|
|
1360
|
+
|
|
1361
|
+
// Blends two FP8 (e5m2) arrays: result[i] = e5m2(alpha * f32(a[i]) + beta * f32(b[i])).
NK_PUBLIC void nk_each_blend_e5m2_haswell(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t const *alpha,
                                          nk_f32_t const *beta, nk_e5m2_t *result) {
    __m256 const alpha_vec_f32x8 = _mm256_set1_ps(*alpha);
    __m256 const beta_vec_f32x8 = _mm256_set1_ps(*beta);
    nk_size_t idx = 0;
    while (idx + 8 <= n) {
        __m256 lhs_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadl_epi64((__m128i const *)(a + idx)));
        __m256 rhs_f32x8 = nk_e5m2x8_to_f32x8_haswell_(_mm_loadl_epi64((__m128i const *)(b + idx)));
        // Scale the first operand, then fold in the second with a fused multiply-add.
        __m256 lhs_scaled_f32x8 = _mm256_mul_ps(lhs_f32x8, alpha_vec_f32x8);
        __m256 blended_f32x8 = _mm256_fmadd_ps(rhs_f32x8, beta_vec_f32x8, lhs_scaled_f32x8);
        _mm_storel_epi64((__m128i *)(result + idx), nk_f32x8_to_e5m2x8_haswell_(blended_f32x8));
        idx += 8;
    }
    // Scalar tail:
    for (; idx < n; ++idx) {
        nk_f32_t lhs, rhs;
        nk_e5m2_to_f32_serial(a + idx, &lhs);
        nk_e5m2_to_f32_serial(b + idx, &rhs);
        nk_f32_t blended = *alpha * lhs + *beta * rhs;
        nk_f32_to_e5m2_serial(&blended, result + idx);
    }
}
|
|
1384
|
+
|
|
1385
|
+
// Computes result[i] = e4m3(alpha * f32(a[i]) * f32(b[i]) + beta * f32(c[i])).
NK_PUBLIC void nk_each_fma_e4m3_haswell(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_e4m3_t const *c, nk_size_t n,
                                        nk_f32_t const *alpha, nk_f32_t const *beta, nk_e4m3_t *result) {
    __m256 alpha_f32x8 = _mm256_set1_ps(*alpha);
    __m256 beta_f32x8 = _mm256_set1_ps(*beta);
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        __m128i a_e4m3x8 = _mm_loadl_epi64((__m128i const *)(a + i));
        __m128i b_e4m3x8 = _mm_loadl_epi64((__m128i const *)(b + i));
        __m128i c_e4m3x8 = _mm_loadl_epi64((__m128i const *)(c + i));
        __m256 a_f32x8 = nk_e4m3x8_to_f32x8_haswell_(a_e4m3x8);
        __m256 b_f32x8 = nk_e4m3x8_to_f32x8_haswell_(b_e4m3x8);
        __m256 c_f32x8 = nk_e4m3x8_to_f32x8_haswell_(c_e4m3x8);
        // FP8 rounding note: Hybrid approach - use separate MUL for (a × b) and (α × a × b) to
        // preserve intermediate rounding, then FMA for final addition since it matches scalar
        // semantics of (α × a × b + β × c) when the multiply term is already computed.
        __m256 ab_f32x8 = _mm256_mul_ps(a_f32x8, b_f32x8);
        __m256 ab_scaled_f32x8 = _mm256_mul_ps(ab_f32x8, alpha_f32x8);
        __m256 result_f32x8 = _mm256_fmadd_ps(c_f32x8, beta_f32x8, ab_scaled_f32x8);
        __m128i result_e4m3x8 = nk_f32x8_to_e4m3x8_haswell_(result_f32x8);
        _mm_storel_epi64((__m128i *)(result + i), result_e4m3x8);
    }
    // Scalar tail for the trailing (< 8) elements:
    for (; i < n; ++i) {
        nk_f32_t ai, bi, ci;
        nk_e4m3_to_f32_serial(a + i, &ai);
        nk_e4m3_to_f32_serial(b + i, &bi);
        nk_e4m3_to_f32_serial(c + i, &ci);
        nk_f32_t fma = *alpha * ai * bi + *beta * ci;
        nk_f32_to_e4m3_serial(&fma, result + i);
    }
}
|
|
1415
|
+
|
|
1416
|
+
// Computes result[i] = e5m2(alpha * f32(a[i]) * f32(b[i]) + beta * f32(c[i])).
NK_PUBLIC void nk_each_fma_e5m2_haswell(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_e5m2_t const *c, nk_size_t n,
                                        nk_f32_t const *alpha, nk_f32_t const *beta, nk_e5m2_t *result) {
    __m256 alpha_f32x8 = _mm256_set1_ps(*alpha);
    __m256 beta_f32x8 = _mm256_set1_ps(*beta);
    nk_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        __m128i a_e5m2x8 = _mm_loadl_epi64((__m128i const *)(a + i));
        __m128i b_e5m2x8 = _mm_loadl_epi64((__m128i const *)(b + i));
        __m128i c_e5m2x8 = _mm_loadl_epi64((__m128i const *)(c + i));
        __m256 a_f32x8 = nk_e5m2x8_to_f32x8_haswell_(a_e5m2x8);
        __m256 b_f32x8 = nk_e5m2x8_to_f32x8_haswell_(b_e5m2x8);
        __m256 c_f32x8 = nk_e5m2x8_to_f32x8_haswell_(c_e5m2x8);
        // FP8 rounding note: Hybrid approach - use separate MUL for (a × b) and (α × a × b) to
        // preserve intermediate rounding, then FMA for final addition since it matches scalar
        // semantics of (α × a × b + β × c) when the multiply term is already computed.
        __m256 ab_f32x8 = _mm256_mul_ps(a_f32x8, b_f32x8);
        __m256 ab_scaled_f32x8 = _mm256_mul_ps(ab_f32x8, alpha_f32x8);
        __m256 result_f32x8 = _mm256_fmadd_ps(c_f32x8, beta_f32x8, ab_scaled_f32x8);
        __m128i result_e5m2x8 = nk_f32x8_to_e5m2x8_haswell_(result_f32x8);
        _mm_storel_epi64((__m128i *)(result + i), result_e5m2x8);
    }
    // Scalar tail for the trailing (< 8) elements:
    for (; i < n; ++i) {
        nk_f32_t ai, bi, ci;
        nk_e5m2_to_f32_serial(a + i, &ai);
        nk_e5m2_to_f32_serial(b + i, &bi);
        nk_e5m2_to_f32_serial(c + i, &ci);
        nk_f32_t fma = *alpha * ai * bi + *beta * ci;
        nk_f32_to_e5m2_serial(&fma, result + i);
    }
}
|
|
1446
|
+
|
|
1447
|
+
// Complex scale: result[i] = alpha * a[i] + beta for interleaved (real, imag) f32 pairs.
// Each 256-bit register holds four complex numbers as r0,i0,r1,i1,r2,i2,r3,i3.
NK_PUBLIC void nk_each_scale_f32c_haswell(nk_f32c_t const *a, nk_size_t n, nk_f32c_t const *alpha,
                                          nk_f32c_t const *beta, nk_f32c_t *result) {
    nk_f32_t const *a_f32 = (nk_f32_t const *)a;
    nk_f32_t *result_f32 = (nk_f32_t *)result;
    __m256 alpha_real_f32x8 = _mm256_set1_ps(alpha->real);
    __m256 alpha_imag_f32x8 = _mm256_set1_ps(alpha->imag);
    // Beta is added verbatim to each complex lane, so broadcast the (real, imag) pair.
    __m256 beta_f32x8 = _mm256_setr_ps(beta->real, beta->imag, beta->real, beta->imag, beta->real, beta->imag,
                                       beta->real, beta->imag);
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        __m256 a_f32x8 = _mm256_loadu_ps(a_f32 + 2 * i);
        // Control 0xB1 swaps (real, imag) -> (imag, real) within each pair.
        __m256 a_swapped_f32x8 = _mm256_permute_ps(a_f32x8, 0xB1);
        __m256 temp_f32x8 = _mm256_mul_ps(alpha_imag_f32x8, a_swapped_f32x8);
        // fmaddsub subtracts in even (real) lanes and adds in odd (imag) lanes:
        //   real: ar*xr - ai*xi, imag: ar*xi + ai*xr — i.e. the complex product alpha * x.
        __m256 y_f32x8 = _mm256_fmaddsub_ps(alpha_real_f32x8, a_f32x8, temp_f32x8);
        y_f32x8 = _mm256_add_ps(y_f32x8, beta_f32x8);
        _mm256_storeu_ps(result_f32 + 2 * i, y_f32x8);
    }
    // Scalar tail:
    for (; i < n; i++) {
        nk_f32_t a_real = a[i].real, a_imag = a[i].imag;
        result[i].real = alpha->real * a_real - alpha->imag * a_imag + beta->real;
        result[i].imag = alpha->real * a_imag + alpha->imag * a_real + beta->imag;
    }
}
|
|
1470
|
+
|
|
1471
|
+
// Complex scale: result[i] = alpha * a[i] + beta for interleaved (real, imag) f64 pairs.
// Each 256-bit register holds two complex numbers as r0,i0,r1,i1.
NK_PUBLIC void nk_each_scale_f64c_haswell(nk_f64c_t const *a, nk_size_t n, nk_f64c_t const *alpha,
                                          nk_f64c_t const *beta, nk_f64c_t *result) {
    nk_f64_t const *a_f64 = (nk_f64_t const *)a;
    nk_f64_t *result_f64 = (nk_f64_t *)result;
    __m256d alpha_real_f64x4 = _mm256_set1_pd(alpha->real);
    __m256d alpha_imag_f64x4 = _mm256_set1_pd(alpha->imag);
    // Beta is added verbatim to each complex lane, so broadcast the (real, imag) pair.
    __m256d beta_f64x4 = _mm256_setr_pd(beta->real, beta->imag, beta->real, beta->imag);
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        __m256d a_f64x4 = _mm256_loadu_pd(a_f64 + 2 * i);
        // Control 0x5 swaps (real, imag) -> (imag, real) within each pair.
        __m256d a_swapped_f64x4 = _mm256_permute_pd(a_f64x4, 0x5);
        __m256d temp_f64x4 = _mm256_mul_pd(alpha_imag_f64x4, a_swapped_f64x4);
        // fmaddsub subtracts in even (real) lanes and adds in odd (imag) lanes,
        // producing the complex product alpha * x in one instruction pair.
        __m256d y_f64x4 = _mm256_fmaddsub_pd(alpha_real_f64x4, a_f64x4, temp_f64x4);
        y_f64x4 = _mm256_add_pd(y_f64x4, beta_f64x4);
        _mm256_storeu_pd(result_f64 + 2 * i, y_f64x4);
    }
    // Scalar tail:
    for (; i < n; i++) {
        nk_f64_t a_real = a[i].real, a_imag = a[i].imag;
        result[i].real = alpha->real * a_real - alpha->imag * a_imag + beta->real;
        result[i].imag = alpha->real * a_imag + alpha->imag * a_real + beta->imag;
    }
}
|
|
1493
|
+
|
|
1494
|
+
// Complex blend: result[i] = alpha * a[i] + beta * b[i] for interleaved (real, imag) f32 pairs.
NK_PUBLIC void nk_each_blend_f32c_haswell(nk_f32c_t const *a, nk_f32c_t const *b, nk_size_t n, nk_f32c_t const *alpha,
                                          nk_f32c_t const *beta, nk_f32c_t *result) {
    nk_f32_t const *a_f32 = (nk_f32_t const *)a;
    nk_f32_t const *b_f32 = (nk_f32_t const *)b;
    nk_f32_t *result_f32 = (nk_f32_t *)result;
    __m256 alpha_real_f32x8 = _mm256_set1_ps(alpha->real);
    __m256 alpha_imag_f32x8 = _mm256_set1_ps(alpha->imag);
    __m256 beta_real_f32x8 = _mm256_set1_ps(beta->real);
    __m256 beta_imag_f32x8 = _mm256_set1_ps(beta->imag);
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        __m256 a_f32x8 = _mm256_loadu_ps(a_f32 + 2 * i);
        __m256 b_f32x8 = _mm256_loadu_ps(b_f32 + 2 * i);
        // Complex product alpha * a: swap pairs (0xB1), multiply by the imaginary part,
        // then fmaddsub (subtract in real lanes, add in imag lanes).
        __m256 a_swapped_f32x8 = _mm256_permute_ps(a_f32x8, 0xB1);
        __m256 ta_f32x8 = _mm256_mul_ps(alpha_imag_f32x8, a_swapped_f32x8);
        __m256 ya_f32x8 = _mm256_fmaddsub_ps(alpha_real_f32x8, a_f32x8, ta_f32x8);
        // Same pattern for beta * b.
        __m256 b_swapped_f32x8 = _mm256_permute_ps(b_f32x8, 0xB1);
        __m256 tb_f32x8 = _mm256_mul_ps(beta_imag_f32x8, b_swapped_f32x8);
        __m256 yb_f32x8 = _mm256_fmaddsub_ps(beta_real_f32x8, b_f32x8, tb_f32x8);
        _mm256_storeu_ps(result_f32 + 2 * i, _mm256_add_ps(ya_f32x8, yb_f32x8));
    }
    // Scalar tail:
    for (; i < n; i++) {
        nk_f32_t a_real = a[i].real, a_imag = a[i].imag;
        nk_f32_t b_real = b[i].real, b_imag = b[i].imag;
        nk_f32_t ar = alpha->real * a_real - alpha->imag * a_imag;
        nk_f32_t ai = alpha->real * a_imag + alpha->imag * a_real;
        nk_f32_t br = beta->real * b_real - beta->imag * b_imag;
        nk_f32_t bi = beta->real * b_imag + beta->imag * b_real;
        result[i].real = ar + br;
        result[i].imag = ai + bi;
    }
}
|
|
1526
|
+
|
|
1527
|
+
// Complex blend: result[i] = alpha * a[i] + beta * b[i] for interleaved (real, imag) f64 pairs.
NK_PUBLIC void nk_each_blend_f64c_haswell(nk_f64c_t const *a, nk_f64c_t const *b, nk_size_t n, nk_f64c_t const *alpha,
                                          nk_f64c_t const *beta, nk_f64c_t *result) {
    nk_f64_t const *a_f64 = (nk_f64_t const *)a;
    nk_f64_t const *b_f64 = (nk_f64_t const *)b;
    nk_f64_t *result_f64 = (nk_f64_t *)result;
    __m256d alpha_real_f64x4 = _mm256_set1_pd(alpha->real);
    __m256d alpha_imag_f64x4 = _mm256_set1_pd(alpha->imag);
    __m256d beta_real_f64x4 = _mm256_set1_pd(beta->real);
    __m256d beta_imag_f64x4 = _mm256_set1_pd(beta->imag);
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        __m256d a_f64x4 = _mm256_loadu_pd(a_f64 + 2 * i);
        __m256d b_f64x4 = _mm256_loadu_pd(b_f64 + 2 * i);
        // Complex product alpha * a: swap pairs (0x5), multiply by the imaginary part,
        // then fmaddsub (subtract in real lanes, add in imag lanes).
        __m256d a_swapped_f64x4 = _mm256_permute_pd(a_f64x4, 0x5);
        __m256d ta_f64x4 = _mm256_mul_pd(alpha_imag_f64x4, a_swapped_f64x4);
        __m256d ya_f64x4 = _mm256_fmaddsub_pd(alpha_real_f64x4, a_f64x4, ta_f64x4);
        // Same pattern for beta * b.
        __m256d b_swapped_f64x4 = _mm256_permute_pd(b_f64x4, 0x5);
        __m256d tb_f64x4 = _mm256_mul_pd(beta_imag_f64x4, b_swapped_f64x4);
        __m256d yb_f64x4 = _mm256_fmaddsub_pd(beta_real_f64x4, b_f64x4, tb_f64x4);
        _mm256_storeu_pd(result_f64 + 2 * i, _mm256_add_pd(ya_f64x4, yb_f64x4));
    }
    // Scalar tail:
    for (; i < n; i++) {
        nk_f64_t a_real = a[i].real, a_imag = a[i].imag;
        nk_f64_t b_real = b[i].real, b_imag = b[i].imag;
        nk_f64_t ar = alpha->real * a_real - alpha->imag * a_imag;
        nk_f64_t ai = alpha->real * a_imag + alpha->imag * a_real;
        nk_f64_t br = beta->real * b_real - beta->imag * b_imag;
        nk_f64_t bi = beta->real * b_imag + beta->imag * b_real;
        result[i].real = ar + br;
        result[i].imag = ai + bi;
    }
}
|
|
1559
|
+
|
|
1560
|
+
// Complex FMA: result[i] = alpha * a[i] * b[i] + beta * c[i] for interleaved f32 pairs.
// Computed in three complex products: ab = a*b, then alpha*ab, plus beta*c.
NK_PUBLIC void nk_each_fma_f32c_haswell(nk_f32c_t const *a, nk_f32c_t const *b, nk_f32c_t const *c, nk_size_t n,
                                        nk_f32c_t const *alpha, nk_f32c_t const *beta, nk_f32c_t *result) {
    nk_f32_t const *a_f32 = (nk_f32_t const *)a;
    nk_f32_t const *b_f32 = (nk_f32_t const *)b;
    nk_f32_t const *c_f32 = (nk_f32_t const *)c;
    nk_f32_t *result_f32 = (nk_f32_t *)result;
    __m256 alpha_real_f32x8 = _mm256_set1_ps(alpha->real);
    __m256 alpha_imag_f32x8 = _mm256_set1_ps(alpha->imag);
    __m256 beta_real_f32x8 = _mm256_set1_ps(beta->real);
    __m256 beta_imag_f32x8 = _mm256_set1_ps(beta->imag);
    nk_size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        __m256 a_f32x8 = _mm256_loadu_ps(a_f32 + 2 * i);
        __m256 b_f32x8 = _mm256_loadu_ps(b_f32 + 2 * i);
        __m256 c_f32x8 = _mm256_loadu_ps(c_f32 + 2 * i);
        // Complex product ab = a * b. moveldup/movehdup broadcast each pair's
        // real/imag component across both lanes of that pair.
        __m256 b_swapped_f32x8 = _mm256_permute_ps(b_f32x8, 0xB1);
        __m256 a_real_f32x8 = _mm256_moveldup_ps(a_f32x8);
        __m256 a_imag_f32x8 = _mm256_movehdup_ps(a_f32x8);
        __m256 tab_f32x8 = _mm256_mul_ps(a_imag_f32x8, b_swapped_f32x8);
        __m256 ab_f32x8 = _mm256_fmaddsub_ps(a_real_f32x8, b_f32x8, tab_f32x8);
        // Complex product alpha * ab.
        __m256 ab_swapped_f32x8 = _mm256_permute_ps(ab_f32x8, 0xB1);
        __m256 taa_f32x8 = _mm256_mul_ps(alpha_imag_f32x8, ab_swapped_f32x8);
        __m256 ya_f32x8 = _mm256_fmaddsub_ps(alpha_real_f32x8, ab_f32x8, taa_f32x8);
        // Complex product beta * c.
        __m256 c_swapped_f32x8 = _mm256_permute_ps(c_f32x8, 0xB1);
        __m256 tbc_f32x8 = _mm256_mul_ps(beta_imag_f32x8, c_swapped_f32x8);
        __m256 yb_f32x8 = _mm256_fmaddsub_ps(beta_real_f32x8, c_f32x8, tbc_f32x8);
        _mm256_storeu_ps(result_f32 + 2 * i, _mm256_add_ps(ya_f32x8, yb_f32x8));
    }
    // Scalar tail:
    for (; i < n; i++) {
        nk_f32_t a_real = a[i].real, a_imag = a[i].imag;
        nk_f32_t b_real = b[i].real, b_imag = b[i].imag;
        nk_f32_t c_real = c[i].real, c_imag = c[i].imag;
        nk_f32_t ab_real = a_real * b_real - a_imag * b_imag;
        nk_f32_t ab_imag = a_real * b_imag + a_imag * b_real;
        nk_f32_t aab_real = alpha->real * ab_real - alpha->imag * ab_imag;
        nk_f32_t aab_imag = alpha->real * ab_imag + alpha->imag * ab_real;
        nk_f32_t bc_real = beta->real * c_real - beta->imag * c_imag;
        nk_f32_t bc_imag = beta->real * c_imag + beta->imag * c_real;
        result[i].real = aab_real + bc_real;
        result[i].imag = aab_imag + bc_imag;
    }
}
|
|
1602
|
+
|
|
1603
|
+
// Complex FMA: result[i] = alpha * a[i] * b[i] + beta * c[i] for interleaved f64 pairs.
// Computed in three complex products: ab = a*b, then alpha*ab, plus beta*c.
NK_PUBLIC void nk_each_fma_f64c_haswell(nk_f64c_t const *a, nk_f64c_t const *b, nk_f64c_t const *c, nk_size_t n,
                                        nk_f64c_t const *alpha, nk_f64c_t const *beta, nk_f64c_t *result) {
    nk_f64_t const *a_f64 = (nk_f64_t const *)a;
    nk_f64_t const *b_f64 = (nk_f64_t const *)b;
    nk_f64_t const *c_f64 = (nk_f64_t const *)c;
    nk_f64_t *result_f64 = (nk_f64_t *)result;
    __m256d alpha_real_f64x4 = _mm256_set1_pd(alpha->real);
    __m256d alpha_imag_f64x4 = _mm256_set1_pd(alpha->imag);
    __m256d beta_real_f64x4 = _mm256_set1_pd(beta->real);
    __m256d beta_imag_f64x4 = _mm256_set1_pd(beta->imag);
    nk_size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        __m256d a_f64x4 = _mm256_loadu_pd(a_f64 + 2 * i);
        __m256d b_f64x4 = _mm256_loadu_pd(b_f64 + 2 * i);
        __m256d c_f64x4 = _mm256_loadu_pd(c_f64 + 2 * i);
        // Complex product ab = a * b. movedup broadcasts each pair's real part;
        // permute 0xF broadcasts each pair's imaginary part.
        __m256d b_swapped_f64x4 = _mm256_permute_pd(b_f64x4, 0x5);
        __m256d a_real_f64x4 = _mm256_movedup_pd(a_f64x4);
        __m256d a_imag_f64x4 = _mm256_permute_pd(a_f64x4, 0xF);
        __m256d tab_f64x4 = _mm256_mul_pd(a_imag_f64x4, b_swapped_f64x4);
        __m256d ab_f64x4 = _mm256_fmaddsub_pd(a_real_f64x4, b_f64x4, tab_f64x4);
        // Complex product alpha * ab.
        __m256d ab_swapped_f64x4 = _mm256_permute_pd(ab_f64x4, 0x5);
        __m256d taa_f64x4 = _mm256_mul_pd(alpha_imag_f64x4, ab_swapped_f64x4);
        __m256d ya_f64x4 = _mm256_fmaddsub_pd(alpha_real_f64x4, ab_f64x4, taa_f64x4);
        // Complex product beta * c.
        __m256d c_swapped_f64x4 = _mm256_permute_pd(c_f64x4, 0x5);
        __m256d tbc_f64x4 = _mm256_mul_pd(beta_imag_f64x4, c_swapped_f64x4);
        __m256d yb_f64x4 = _mm256_fmaddsub_pd(beta_real_f64x4, c_f64x4, tbc_f64x4);
        _mm256_storeu_pd(result_f64 + 2 * i, _mm256_add_pd(ya_f64x4, yb_f64x4));
    }
    // Scalar tail:
    for (; i < n; i++) {
        nk_f64_t a_real = a[i].real, a_imag = a[i].imag;
        nk_f64_t b_real = b[i].real, b_imag = b[i].imag;
        nk_f64_t c_real = c[i].real, c_imag = c[i].imag;
        nk_f64_t ab_real = a_real * b_real - a_imag * b_imag;
        nk_f64_t ab_imag = a_real * b_imag + a_imag * b_real;
        nk_f64_t aab_real = alpha->real * ab_real - alpha->imag * ab_imag;
        nk_f64_t aab_imag = alpha->real * ab_imag + alpha->imag * ab_real;
        nk_f64_t bc_real = beta->real * c_real - beta->imag * c_imag;
        nk_f64_t bc_imag = beta->real * c_imag + beta->imag * c_real;
        result[i].real = aab_real + bc_real;
        result[i].imag = aab_imag + bc_imag;
    }
}
|
|
1645
|
+
|
|
1646
|
+
#if defined(__clang__)
|
|
1647
|
+
#pragma clang attribute pop
|
|
1648
|
+
#elif defined(__GNUC__)
|
|
1649
|
+
#pragma GCC pop_options
|
|
1650
|
+
#endif
|
|
1651
|
+
|
|
1652
|
+
#if defined(__cplusplus)
|
|
1653
|
+
} // extern "C"
|
|
1654
|
+
#endif
|
|
1655
|
+
|
|
1656
|
+
#endif // NK_TARGET_HASWELL
|
|
1657
|
+
#endif // NK_TARGET_X86_
|
|
1658
|
+
#endif // NK_EACH_HASWELL_H
|