numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,477 @@
1
+ /**
2
+ * @brief SIMD-accelerated Elementwise Arithmetic for Sapphire Rapids.
3
+ * @file include/numkong/each/sapphire.h
4
+ * @author Ash Vardanian
5
+ * @date December 27, 2025
6
+ *
7
+ * @sa include/numkong/each.h
8
+ *
9
+ * @section sapphire_elementwise_instructions Relevant Instructions
10
+ *
11
+ * Intrinsic Instruction Sapphire Genoa
12
+ * _mm512_add_ph VADDPH (ZMM, ZMM, ZMM) 4cy @ p05 3cy @ p01
13
+ * _mm512_mul_ph VMULPH (ZMM, ZMM, ZMM) 4cy @ p05 3cy @ p01
14
+ * _mm512_fmadd_ph VFMADD (ZMM, ZMM, ZMM) 4cy @ p05 4cy @ p01
15
+ * _mm512_cvtepi16_ph VCVTW2PH (ZMM, ZMM) 4cy @ p05 4cy @ p01
16
+ * _mm512_cvtph_epi16 VCVTPH2W (ZMM, ZMM) 4cy @ p05 4cy @ p01
17
+ * _mm512_cvtepi8_epi16 VPMOVSXBW (ZMM, YMM) 3cy @ p5 3cy @ p12
18
+ * _mm512_cvtsepi16_epi8 VPMOVSWB (YMM, ZMM) 4cy @ p5 4cy @ p12
19
+ * _mm512_packus_epi16 VPACKUSWB (ZMM, ZMM, ZMM) 1cy @ p5 1cy @ p12
20
+ * _mm256_add_ph VADDPH (YMM, YMM, YMM) 4cy @ p05 3cy @ p01
21
+ * _mm512_maskz_loadu_epi16 VMOVDQU16 (ZMM {K}, M512) 7cy @ p23 7cy @ p23
22
+ * _mm512_mask_storeu_epi16 VMOVDQU16 (M512 {K}, ZMM) 4cy @ p4 4cy @ p4
23
+ */
24
+ #ifndef NK_EACH_SAPPHIRE_H
25
+ #define NK_EACH_SAPPHIRE_H
26
+
27
+ #if NK_TARGET_X86_
28
+ #if NK_TARGET_SAPPHIRE
29
+
30
+ #include "numkong/types.h"
31
+ #include "numkong/cast/sapphire.h" // `nk_f32_to_f16_sapphire`
32
+
33
+ #if defined(__cplusplus)
34
+ extern "C" {
35
+ #endif
36
+
37
+ #if defined(__clang__)
38
+ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,avx512bw,avx512fp16,f16c,fma,bmi,bmi2"))), \
39
+ apply_to = function)
40
+ #elif defined(__GNUC__)
41
+ #pragma GCC push_options
42
+ #pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512fp16", "f16c", "fma", "bmi", "bmi2")
43
+ #endif
44
+
45
+ NK_PUBLIC void nk_each_sum_f16_sapphire(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f16_t *result) {
46
+ __mmask32 mask = 0xFFFFFFFF;
47
+ __m512h a_f16_vec, b_f16_vec;
48
+ __m512h sum_f16_vec;
49
+ nk_each_sum_f16_sapphire_cycle:
50
+ if (n < 32) {
51
+ mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
52
+ a_f16_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, a));
53
+ b_f16_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, b));
54
+ n = 0;
55
+ }
56
+ else {
57
+ a_f16_vec = _mm512_loadu_ph(a);
58
+ b_f16_vec = _mm512_loadu_ph(b);
59
+ a += 32, b += 32, n -= 32;
60
+ }
61
+ sum_f16_vec = _mm512_add_ph(a_f16_vec, b_f16_vec);
62
+ _mm512_mask_storeu_epi16(result, mask, _mm512_castph_si512(sum_f16_vec));
63
+ result += 32;
64
+ if (n) goto nk_each_sum_f16_sapphire_cycle;
65
+ }
66
+
67
/**
 *  @brief Affine scaling of a `u8` vector: result[i] = alpha * a[i] + beta,
 *  computed in half precision and saturated back to the [0, 255] range.
 *
 *  Processes 64 bytes per iteration; the final partial chunk uses masked
 *  load/store, so no scalar tail is needed.
 */
NK_PUBLIC void nk_each_scale_u8_sapphire(nk_u8_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
                                         nk_u8_t *result) {
    // Convert the `f32` weights to `f16` bit patterns once, then broadcast.
    short alpha_short, beta_short;
    nk_f32_to_f16_sapphire(alpha, (nk_f16_t *)&alpha_short);
    nk_f32_to_f16_sapphire(beta, (nk_f16_t *)&beta_short);
    __mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
    __m512h alpha_f16x32 = _mm512_castsi512_ph(_mm512_set1_epi16(alpha_short));
    __m512h beta_f16x32 = _mm512_castsi512_ph(_mm512_set1_epi16(beta_short));
    __m512i a_u8x64, result_u8x64;
    __m512h a_low_f16x32, a_high_f16x32;
    __m512h result_low_f16x32, result_high_f16x32;
    __m512i result_low_i16x32, result_high_i16x32;
nk_each_scale_u8_sapphire_cycle:
    if (n < 64) {
        // Tail (runs at most once): mask covers the `n` remaining bytes.
        mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
        a_u8x64 = _mm512_maskz_loadu_epi8(mask, a);
        n = 0;
    }
    else {
        a_u8x64 = _mm512_loadu_epi8(a);
        a += 64, n -= 64;
    }
    // Upcast: interleaving with zero yields zero-extended 16-bit values.
    // Note `unpacklo`/`unpackhi` interleave within each 128-bit lane; the
    // matching lane-wise `packus` below restores the original byte order.
    a_low_f16x32 = _mm512_cvtepi16_ph(_mm512_unpacklo_epi8(a_u8x64, _mm512_setzero_si512()));
    a_high_f16x32 = _mm512_cvtepi16_ph(_mm512_unpackhi_epi8(a_u8x64, _mm512_setzero_si512()));
    // Scale: alpha * x + beta in a single FMA.
    result_low_f16x32 = _mm512_fmadd_ph(a_low_f16x32, alpha_f16x32, beta_f16x32);
    result_high_f16x32 = _mm512_fmadd_ph(a_high_f16x32, alpha_f16x32, beta_f16x32);
    // Downcast: f16 -> i16, then `packus` saturates each i16 to [0, 255].
    result_low_i16x32 = _mm512_cvtph_epi16(result_low_f16x32);
    result_high_i16x32 = _mm512_cvtph_epi16(result_high_f16x32);
    result_u8x64 = _mm512_packus_epi16(result_low_i16x32, result_high_i16x32);
    _mm512_mask_storeu_epi8(result, mask, result_u8x64);
    result += 64;
    if (n) goto nk_each_scale_u8_sapphire_cycle;
}
103
+
104
/**
 *  @brief Weighted blend of two `u8` vectors: result[i] = alpha * a[i] + beta * b[i],
 *  computed in half precision and saturated back to the [0, 255] range.
 *
 *  Trivial weight combinations are dispatched to cheaper kernels before
 *  entering the SIMD loop.
 */
NK_PUBLIC void nk_each_blend_u8_sapphire( //
    nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, //
    nk_f32_t const *alpha, nk_f32_t const *beta, nk_u8_t *result) {

    nk_f32_t alpha_val = *alpha;
    nk_f32_t beta_val = *beta;

    // There are several special cases we may want to implement:
    // 1. Simple addition, when both weights are equal to 1.0.
    if (alpha_val == 1 && beta_val == 1) {
        // In this case we can avoid expensive multiplications.
        nk_each_sum_u8_icelake(a, b, n, result);
        return;
    }
    // 2. Just scaling, when one of the weights is equal to zero.
    else if (alpha_val == 0 || beta_val == 0) {
        // In this case we can avoid half of the load instructions.
        // (When both weights are zero, the first branch scales `a` by 0.)
        nk_f32_t zero = 0;
        if (beta_val == 0) { nk_each_scale_u8_sapphire(a, n, alpha, &zero, result); }
        else { nk_each_scale_u8_sapphire(b, n, beta, &zero, result); }
        return;
    }

    // The general case: convert weights to `f16` once and broadcast them.
    short alpha_short, beta_short;
    nk_f32_to_f16_sapphire(&alpha_val, (nk_f16_t *)&alpha_short);
    nk_f32_to_f16_sapphire(&beta_val, (nk_f16_t *)&beta_short);
    __mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
    __m512h alpha_f16x32 = _mm512_castsi512_ph(_mm512_set1_epi16(alpha_short));
    __m512h beta_f16x32 = _mm512_castsi512_ph(_mm512_set1_epi16(beta_short));
    __m512i a_u8x64, b_u8x64, result_u8x64;
    __m512h a_low_f16x32, a_high_f16x32, b_low_f16x32, b_high_f16x32;
    __m512h a_scaled_low_f16x32, a_scaled_high_f16x32, result_low_f16x32, result_high_f16x32;
    __m512i result_low_i16x32, result_high_i16x32;
nk_each_blend_u8_sapphire_cycle:
    if (n < 64) {
        // Tail (runs at most once): mask covers the `n` remaining bytes.
        mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
        a_u8x64 = _mm512_maskz_loadu_epi8(mask, a);
        b_u8x64 = _mm512_maskz_loadu_epi8(mask, b);
        n = 0;
    }
    else {
        a_u8x64 = _mm512_loadu_epi8(a);
        b_u8x64 = _mm512_loadu_epi8(b);
        a += 64, b += 64, n -= 64;
    }
    // Upcast: interleaving with zero yields zero-extended 16-bit values.
    // `unpacklo`/`unpackhi` interleave within each 128-bit lane; the matching
    // lane-wise `packus` below restores the original byte order.
    a_low_f16x32 = _mm512_cvtepi16_ph(_mm512_unpacklo_epi8(a_u8x64, _mm512_setzero_si512()));
    a_high_f16x32 = _mm512_cvtepi16_ph(_mm512_unpackhi_epi8(a_u8x64, _mm512_setzero_si512()));
    b_low_f16x32 = _mm512_cvtepi16_ph(_mm512_unpacklo_epi8(b_u8x64, _mm512_setzero_si512()));
    b_high_f16x32 = _mm512_cvtepi16_ph(_mm512_unpackhi_epi8(b_u8x64, _mm512_setzero_si512()));
    // Scale:
    a_scaled_low_f16x32 = _mm512_mul_ph(a_low_f16x32, alpha_f16x32);
    a_scaled_high_f16x32 = _mm512_mul_ph(a_high_f16x32, alpha_f16x32);
    // Add: beta * b + (alpha * a) via FMA.
    result_low_f16x32 = _mm512_fmadd_ph(b_low_f16x32, beta_f16x32, a_scaled_low_f16x32);
    result_high_f16x32 = _mm512_fmadd_ph(b_high_f16x32, beta_f16x32, a_scaled_high_f16x32);
    // Downcast: f16 -> i16, then `packus` saturates each i16 to [0, 255].
    result_low_i16x32 = _mm512_cvtph_epi16(result_low_f16x32);
    result_high_i16x32 = _mm512_cvtph_epi16(result_high_f16x32);
    result_u8x64 = _mm512_packus_epi16(result_low_i16x32, result_high_i16x32);
    _mm512_mask_storeu_epi8(result, mask, result_u8x64);
    result += 64;
    if (n) goto nk_each_blend_u8_sapphire_cycle;
}
169
+
170
+ NK_PUBLIC void nk_each_scale_i8_sapphire(nk_i8_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
171
+ nk_i8_t *result) {
172
+ short alpha_short, beta_short;
173
+ nk_f32_to_f16_sapphire(alpha, (nk_f16_t *)&alpha_short);
174
+ nk_f32_to_f16_sapphire(beta, (nk_f16_t *)&beta_short);
175
+ __mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
176
+ __m512h alpha_f16x32 = _mm512_castsi512_ph(_mm512_set1_epi16(alpha_short));
177
+ __m512h beta_f16x32 = _mm512_castsi512_ph(_mm512_set1_epi16(beta_short));
178
+ __m256i a_low_i8x32, a_high_i8x32;
179
+ __m512i result_i8x64;
180
+ __m512h a_low_f16x32, a_high_f16x32;
181
+ __m512h result_low_f16x32, result_high_f16x32;
182
+ __m512i result_low_i16x32, result_high_i16x32;
183
+ nk_each_scale_i8_sapphire_cycle:
184
+ if (n < 64) {
185
+ // Tail: use masked 512-bit load and extract (runs once)
186
+ mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
187
+ __m512i a_i8x64 = _mm512_maskz_loadu_epi8(mask, a);
188
+ a_low_i8x32 = _mm512_castsi512_si256(a_i8x64);
189
+ a_high_i8x32 = _mm512_extracti64x4_epi64(a_i8x64, 1);
190
+ n = 0;
191
+ }
192
+ else {
193
+ // Hot path: 2×256-bit loads to avoid VEXTRACTI64X4 (Port 5)
194
+ a_low_i8x32 = _mm256_loadu_epi8(a);
195
+ a_high_i8x32 = _mm256_loadu_epi8(a + 32);
196
+ a += 64, n -= 64;
197
+ }
198
+ // Upcast from 256-bit halves:
199
+ a_low_f16x32 = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(a_low_i8x32));
200
+ a_high_f16x32 = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(a_high_i8x32));
201
+ // Scale:
202
+ result_low_f16x32 = _mm512_fmadd_ph(a_low_f16x32, alpha_f16x32, beta_f16x32);
203
+ result_high_f16x32 = _mm512_fmadd_ph(a_high_f16x32, alpha_f16x32, beta_f16x32);
204
+ // Downcast:
205
+ result_low_i16x32 = _mm512_cvtph_epi16(result_low_f16x32);
206
+ result_high_i16x32 = _mm512_cvtph_epi16(result_high_f16x32);
207
+ result_i8x64 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtsepi16_epi8(result_low_i16x32)),
208
+ _mm512_cvtsepi16_epi8(result_high_i16x32), 1);
209
+ _mm512_mask_storeu_epi8(result, mask, result_i8x64);
210
+ result += 64;
211
+ if (n) goto nk_each_scale_i8_sapphire_cycle;
212
+ }
213
+
214
/**
 *  @brief Weighted blend of two `i8` vectors: result[i] = alpha * a[i] + beta * b[i],
 *  computed in half precision and saturated back to the [-128, 127] range.
 *
 *  Trivial weight combinations are dispatched to cheaper kernels before
 *  entering the SIMD loop.
 */
NK_PUBLIC void nk_each_blend_i8_sapphire( //
    nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, //
    nk_f32_t const *alpha, nk_f32_t const *beta, nk_i8_t *result) {

    nk_f32_t alpha_val = *alpha;
    nk_f32_t beta_val = *beta;

    // There are several special cases we may want to implement:
    // 1. Simple addition, when both weights are equal to 1.0.
    if (alpha_val == 1 && beta_val == 1) {
        // In this case we can avoid expensive multiplications.
        nk_each_sum_i8_icelake(a, b, n, result);
        return;
    }
    // 2. Just scaling, when one of the weights is equal to zero.
    else if (alpha_val == 0 || beta_val == 0) {
        // In this case we can avoid half of the load instructions.
        // (When both weights are zero, the first branch scales `a` by 0.)
        nk_f32_t zero = 0;
        if (beta_val == 0) { nk_each_scale_i8_sapphire(a, n, alpha, &zero, result); }
        else { nk_each_scale_i8_sapphire(b, n, beta, &zero, result); }
        return;
    }

    // The general case: convert weights to `f16` once and broadcast them.
    short alpha_short, beta_short;
    nk_f32_to_f16_sapphire(&alpha_val, (nk_f16_t *)&alpha_short);
    nk_f32_to_f16_sapphire(&beta_val, (nk_f16_t *)&beta_short);
    __mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
    __m512h alpha_f16x32 = _mm512_castsi512_ph(_mm512_set1_epi16(alpha_short));
    __m512h beta_f16x32 = _mm512_castsi512_ph(_mm512_set1_epi16(beta_short));
    __m256i a_low_i8x32, a_high_i8x32, b_low_i8x32, b_high_i8x32;
    __m512i result_i8x64;
    __m512h a_low_f16x32, a_high_f16x32, b_low_f16x32, b_high_f16x32;
    __m512h a_scaled_low_f16x32, a_scaled_high_f16x32, result_low_f16x32, result_high_f16x32;
    __m512i result_low_i16x32, result_high_i16x32;
nk_each_blend_i8_sapphire_cycle:
    if (n < 64) {
        // Tail: use masked 512-bit loads and extract (runs once)
        mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
        __m512i a_i8x64 = _mm512_maskz_loadu_epi8(mask, a);
        __m512i b_i8x64 = _mm512_maskz_loadu_epi8(mask, b);
        a_low_i8x32 = _mm512_castsi512_si256(a_i8x64);
        a_high_i8x32 = _mm512_extracti64x4_epi64(a_i8x64, 1);
        b_low_i8x32 = _mm512_castsi512_si256(b_i8x64);
        b_high_i8x32 = _mm512_extracti64x4_epi64(b_i8x64, 1);
        n = 0;
    }
    else {
        // Hot path: 2×256-bit loads per vector to avoid VEXTRACTI64X4 (Port 5)
        a_low_i8x32 = _mm256_loadu_epi8(a);
        a_high_i8x32 = _mm256_loadu_epi8(a + 32);
        b_low_i8x32 = _mm256_loadu_epi8(b);
        b_high_i8x32 = _mm256_loadu_epi8(b + 32);
        a += 64, b += 64, n -= 64;
    }
    // Upcast from 256-bit halves: i8 -> i16 (sign-extend) -> f16.
    a_low_f16x32 = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(a_low_i8x32));
    a_high_f16x32 = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(a_high_i8x32));
    b_low_f16x32 = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(b_low_i8x32));
    b_high_f16x32 = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(b_high_i8x32));
    // Scale:
    a_scaled_low_f16x32 = _mm512_mul_ph(a_low_f16x32, alpha_f16x32);
    a_scaled_high_f16x32 = _mm512_mul_ph(a_high_f16x32, alpha_f16x32);
    // Add: beta * b + (alpha * a) via FMA.
    result_low_f16x32 = _mm512_fmadd_ph(b_low_f16x32, beta_f16x32, a_scaled_low_f16x32);
    result_high_f16x32 = _mm512_fmadd_ph(b_high_f16x32, beta_f16x32, a_scaled_high_f16x32);
    // Downcast: f16 -> i16, then signed-saturating i16 -> i8; re-join halves.
    result_low_i16x32 = _mm512_cvtph_epi16(result_low_f16x32);
    result_high_i16x32 = _mm512_cvtph_epi16(result_high_f16x32);
    result_i8x64 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtsepi16_epi8(result_low_i16x32)),
                                      _mm512_cvtsepi16_epi8(result_high_i16x32), 1);
    _mm512_mask_storeu_epi8(result, mask, result_i8x64);
    result += 64;
    if (n) goto nk_each_blend_i8_sapphire_cycle;
}
289
+
290
+ NK_PUBLIC void nk_each_fma_i8_sapphire( //
291
+ nk_i8_t const *a, nk_i8_t const *b, nk_i8_t const *c, nk_size_t n, //
292
+ nk_f32_t const *alpha, nk_f32_t const *beta, nk_i8_t *result) {
293
+
294
+ short alpha_short, beta_short;
295
+ nk_f32_to_f16_sapphire(alpha, (nk_f16_t *)&alpha_short);
296
+ nk_f32_to_f16_sapphire(beta, (nk_f16_t *)&beta_short);
297
+ __mmask64 mask = 0xFFFFFFFFFFFFFFFF;
298
+ __m512h alpha_f16x32 = _mm512_castsi512_ph(_mm512_set1_epi16(alpha_short));
299
+ __m512h beta_f16x32 = _mm512_castsi512_ph(_mm512_set1_epi16(beta_short));
300
+ __m256i a_low_i8x32, a_high_i8x32, b_low_i8x32, b_high_i8x32, c_low_i8x32, c_high_i8x32;
301
+ __m512i result_i8x64;
302
+ __m512h a_low_f16x32, a_high_f16x32, b_low_f16x32, b_high_f16x32;
303
+ __m512h c_low_f16x32, c_high_f16x32, ab_low_f16x32, ab_high_f16x32;
304
+ __m512h ab_scaled_low_f16x32, ab_scaled_high_f16x32, result_low_f16x32, result_high_f16x32;
305
+ __m512i result_low_i16x32, result_high_i16x32;
306
+ __m512h min_f16x32 = _mm512_cvtepi16_ph(_mm512_set1_epi16(-128));
307
+ __m512h max_f16x32 = _mm512_cvtepi16_ph(_mm512_set1_epi16(127));
308
+
309
+ nk_each_fma_i8_sapphire_cycle:
310
+ if (n < 64) {
311
+ // Tail: use masked 512-bit loads and extract (runs once)
312
+ mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
313
+ __m512i a_i8x64 = _mm512_maskz_loadu_epi8(mask, a);
314
+ __m512i b_i8x64 = _mm512_maskz_loadu_epi8(mask, b);
315
+ __m512i c_i8x64 = _mm512_maskz_loadu_epi8(mask, c);
316
+ a_low_i8x32 = _mm512_castsi512_si256(a_i8x64);
317
+ a_high_i8x32 = _mm512_extracti64x4_epi64(a_i8x64, 1);
318
+ b_low_i8x32 = _mm512_castsi512_si256(b_i8x64);
319
+ b_high_i8x32 = _mm512_extracti64x4_epi64(b_i8x64, 1);
320
+ c_low_i8x32 = _mm512_castsi512_si256(c_i8x64);
321
+ c_high_i8x32 = _mm512_extracti64x4_epi64(c_i8x64, 1);
322
+ n = 0;
323
+ }
324
+ else {
325
+ // Hot path: 2×256-bit loads per vector to avoid VEXTRACTI64X4 (Port 5)
326
+ a_low_i8x32 = _mm256_loadu_epi8(a);
327
+ a_high_i8x32 = _mm256_loadu_epi8(a + 32);
328
+ b_low_i8x32 = _mm256_loadu_epi8(b);
329
+ b_high_i8x32 = _mm256_loadu_epi8(b + 32);
330
+ c_low_i8x32 = _mm256_loadu_epi8(c);
331
+ c_high_i8x32 = _mm256_loadu_epi8(c + 32);
332
+ a += 64, b += 64, c += 64, n -= 64;
333
+ }
334
+ // Upcast from 256-bit halves:
335
+ a_low_f16x32 = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(a_low_i8x32));
336
+ a_high_f16x32 = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(a_high_i8x32));
337
+ b_low_f16x32 = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(b_low_i8x32));
338
+ b_high_f16x32 = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(b_high_i8x32));
339
+ c_low_f16x32 = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(c_low_i8x32));
340
+ c_high_f16x32 = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(c_high_i8x32));
341
+ // Multiply:
342
+ ab_low_f16x32 = _mm512_mul_ph(a_low_f16x32, b_low_f16x32);
343
+ ab_high_f16x32 = _mm512_mul_ph(a_high_f16x32, b_high_f16x32);
344
+ // Scale:
345
+ ab_scaled_low_f16x32 = _mm512_mul_ph(ab_low_f16x32, alpha_f16x32);
346
+ ab_scaled_high_f16x32 = _mm512_mul_ph(ab_high_f16x32, alpha_f16x32);
347
+ // Add:
348
+ result_low_f16x32 = _mm512_fmadd_ph(c_low_f16x32, beta_f16x32, ab_scaled_low_f16x32);
349
+ result_high_f16x32 = _mm512_fmadd_ph(c_high_f16x32, beta_f16x32, ab_scaled_high_f16x32);
350
+ // Clip the 16-bit result to 8-bit:
351
+ result_low_f16x32 = _mm512_max_ph(_mm512_min_ph(result_low_f16x32, max_f16x32), min_f16x32);
352
+ result_high_f16x32 = _mm512_max_ph(_mm512_min_ph(result_high_f16x32, max_f16x32), min_f16x32);
353
+ // Downcast:
354
+ result_low_i16x32 = _mm512_cvtph_epi16(result_low_f16x32);
355
+ result_high_i16x32 = _mm512_cvtph_epi16(result_high_f16x32);
356
+ // Merge back:
357
+ result_i8x64 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtsepi16_epi8(result_low_i16x32)),
358
+ _mm512_cvtsepi16_epi8(result_high_i16x32), 1);
359
+ _mm512_mask_storeu_epi8(result, mask, result_i8x64);
360
+ result += 64;
361
+ if (n) goto nk_each_fma_i8_sapphire_cycle;
362
+ }
363
+
364
+ NK_PUBLIC void nk_each_fma_u8_sapphire( //
365
+ nk_u8_t const *a, nk_u8_t const *b, nk_u8_t const *c, nk_size_t n, //
366
+ nk_f32_t const *alpha, nk_f32_t const *beta, nk_u8_t *result) {
367
+
368
+ short alpha_short, beta_short;
369
+ nk_f32_to_f16_sapphire(alpha, (nk_f16_t *)&alpha_short);
370
+ nk_f32_to_f16_sapphire(beta, (nk_f16_t *)&beta_short);
371
+ __mmask64 mask = 0xFFFFFFFFFFFFFFFF;
372
+ __m512h alpha_f16x32 = _mm512_castsi512_ph(_mm512_set1_epi16(alpha_short));
373
+ __m512h beta_f16x32 = _mm512_castsi512_ph(_mm512_set1_epi16(beta_short));
374
+ __m512i a_u8x64, b_u8x64, c_u8x64, result_u8x64;
375
+ __m512h a_low_f16x32, a_high_f16x32, b_low_f16x32, b_high_f16x32;
376
+ __m512h c_low_f16x32, c_high_f16x32, ab_low_f16x32, ab_high_f16x32;
377
+ __m512h ab_scaled_low_f16x32, ab_scaled_high_f16x32, result_low_f16x32, result_high_f16x32;
378
+ __m512i result_low_i16x32, result_high_i16x32;
379
+ __m512h min_f16x32 = _mm512_cvtepi16_ph(_mm512_set1_epi16(0));
380
+ __m512h max_f16x32 = _mm512_cvtepi16_ph(_mm512_set1_epi16(255));
381
+
382
+ nk_each_fma_u8_sapphire_cycle:
383
+ if (n < 64) {
384
+ mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
385
+ a_u8x64 = _mm512_maskz_loadu_epi8(mask, a);
386
+ b_u8x64 = _mm512_maskz_loadu_epi8(mask, b);
387
+ c_u8x64 = _mm512_maskz_loadu_epi8(mask, c);
388
+ n = 0;
389
+ }
390
+ else {
391
+ a_u8x64 = _mm512_loadu_epi8(a);
392
+ b_u8x64 = _mm512_loadu_epi8(b);
393
+ c_u8x64 = _mm512_loadu_epi8(c);
394
+ a += 64, b += 64, c += 64, n -= 64;
395
+ }
396
+ // Upcast:
397
+ a_low_f16x32 = _mm512_cvtepi16_ph(_mm512_unpacklo_epi8(a_u8x64, _mm512_setzero_si512()));
398
+ a_high_f16x32 = _mm512_cvtepi16_ph(_mm512_unpackhi_epi8(a_u8x64, _mm512_setzero_si512()));
399
+ b_low_f16x32 = _mm512_cvtepi16_ph(_mm512_unpacklo_epi8(b_u8x64, _mm512_setzero_si512()));
400
+ b_high_f16x32 = _mm512_cvtepi16_ph(_mm512_unpackhi_epi8(b_u8x64, _mm512_setzero_si512()));
401
+ c_low_f16x32 = _mm512_cvtepi16_ph(_mm512_unpacklo_epi8(c_u8x64, _mm512_setzero_si512()));
402
+ c_high_f16x32 = _mm512_cvtepi16_ph(_mm512_unpackhi_epi8(c_u8x64, _mm512_setzero_si512()));
403
+ // Multiply:
404
+ ab_low_f16x32 = _mm512_mul_ph(a_low_f16x32, b_low_f16x32);
405
+ ab_high_f16x32 = _mm512_mul_ph(a_high_f16x32, b_high_f16x32);
406
+ // Scale:
407
+ ab_scaled_low_f16x32 = _mm512_mul_ph(ab_low_f16x32, alpha_f16x32);
408
+ ab_scaled_high_f16x32 = _mm512_mul_ph(ab_high_f16x32, alpha_f16x32);
409
+ // Add:
410
+ result_low_f16x32 = _mm512_fmadd_ph(c_low_f16x32, beta_f16x32, ab_scaled_low_f16x32);
411
+ result_high_f16x32 = _mm512_fmadd_ph(c_high_f16x32, beta_f16x32, ab_scaled_high_f16x32);
412
+ // Clip the 16-bit result to 8-bit:
413
+ result_low_f16x32 = _mm512_max_ph(_mm512_min_ph(result_low_f16x32, max_f16x32), min_f16x32);
414
+ result_high_f16x32 = _mm512_max_ph(_mm512_min_ph(result_high_f16x32, max_f16x32), min_f16x32);
415
+ // Downcast:
416
+ result_low_i16x32 = _mm512_cvtph_epi16(result_low_f16x32);
417
+ result_high_i16x32 = _mm512_cvtph_epi16(result_high_f16x32);
418
+ // Merge back:
419
+ result_u8x64 = _mm512_packus_epi16(result_low_i16x32, result_high_i16x32);
420
+ _mm512_mask_storeu_epi8(result, mask, result_u8x64);
421
+ result += 64;
422
+ if (n) goto nk_each_fma_u8_sapphire_cycle;
423
+ }
424
+
425
+ NK_PUBLIC void nk_each_sum_e4m3_sapphire(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_e4m3_t *result) {
426
+ __m256i a_e4m3x32, b_e4m3x32;
427
+ __m256h a_lo_f16x16, a_hi_f16x16, b_lo_f16x16, b_hi_f16x16;
428
+ __m256h sum_lo_f16x16, sum_hi_f16x16;
429
+ __m128i result_lo_e4m3x16, result_hi_e4m3x16;
430
+ __mmask32 mask = 0xFFFFFFFF;
431
+ nk_each_sum_e4m3_sapphire_cycle:
432
+ if (n < 32) {
433
+ mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, (unsigned int)n);
434
+ a_e4m3x32 = _mm256_maskz_loadu_epi8(mask, a);
435
+ b_e4m3x32 = _mm256_maskz_loadu_epi8(mask, b);
436
+ n = 0;
437
+ }
438
+ else {
439
+ a_e4m3x32 = _mm256_loadu_si256((__m256i const *)a);
440
+ b_e4m3x32 = _mm256_loadu_si256((__m256i const *)b);
441
+ a += 32, b += 32, n -= 32;
442
+ }
443
+
444
+ // Convert e4m3x16 → f16x16 (two halves)
445
+ a_lo_f16x16 = nk_e4m3x16_to_f16x16_sapphire_(_mm256_castsi256_si128(a_e4m3x32));
446
+ a_hi_f16x16 = nk_e4m3x16_to_f16x16_sapphire_(_mm256_extracti128_si256(a_e4m3x32, 1));
447
+ b_lo_f16x16 = nk_e4m3x16_to_f16x16_sapphire_(_mm256_castsi256_si128(b_e4m3x32));
448
+ b_hi_f16x16 = nk_e4m3x16_to_f16x16_sapphire_(_mm256_extracti128_si256(b_e4m3x32, 1));
449
+
450
+ // Add in F16 - e4m3 sum is safe (max 896 < 65504)
451
+ sum_lo_f16x16 = _mm256_add_ph(a_lo_f16x16, b_lo_f16x16);
452
+ sum_hi_f16x16 = _mm256_add_ph(a_hi_f16x16, b_hi_f16x16);
453
+
454
+ // Convert f16x16 → e4m3x16
455
+ result_lo_e4m3x16 = nk_f16x16_to_e4m3x16_sapphire_(sum_lo_f16x16);
456
+ result_hi_e4m3x16 = nk_f16x16_to_e4m3x16_sapphire_(sum_hi_f16x16);
457
+
458
+ // Pack and store
459
+ __m256i result_e4m3x32 = _mm256_inserti128_si256(_mm256_castsi128_si256(result_lo_e4m3x16), result_hi_e4m3x16, 1);
460
+ _mm256_mask_storeu_epi8(result, mask, result_e4m3x32);
461
+ result += 32;
462
+ if (n) goto nk_each_sum_e4m3_sapphire_cycle;
463
+ }
464
+
465
+ #if defined(__clang__)
466
+ #pragma clang attribute pop
467
+ #elif defined(__GNUC__)
468
+ #pragma GCC pop_options
469
+ #endif
470
+
471
+ #if defined(__cplusplus)
472
+ } // extern "C"
473
+ #endif
474
+
475
+ #endif // NK_TARGET_SAPPHIRE
476
+ #endif // NK_TARGET_X86_
477
+ #endif // NK_EACH_SAPPHIRE_H