numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief Dispatch Initialization for Type Conversions and Scalar Math.
|
|
3
|
+
* @file c/dispatch_other.c
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date February 3, 2026
|
|
6
|
+
*/
|
|
7
|
+
#include "dispatch.h"
|
|
8
|
+
|
|
9
|
+
/**
 *  @brief Resolves the buffer-to-buffer cast kernel for a capability mask.
 *
 *  Only `nk_kernel_cast_k` is served by this lookup. Backends are probed in a fixed
 *  order (NEON, Sapphire, Icelake, Skylake, Haswell, RVV, serial); the first one that is
 *  enabled by its `NK_TARGET_*` guard and whose bit is set in @p v is returned.
 *
 *  @param[in] v Capability bitmask to choose from.
 *  @param[in] k Requested kernel kind.
 *  @param[out] m Receives the selected kernel, or `nk_error_dense_` if nothing matched.
 *  @param[out] c Receives the capability bit of the selected backend, or 0 if nothing matched.
 */
void nk_dispatch_cast_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punned_t *m, nk_capability_t *c) {
    typedef nk_kernel_punned_t m_t;

    // Every backend below implements exactly one kernel kind, so test the kind once.
    if (k == nk_kernel_cast_k) {
#if NK_TARGET_NEON
        if (v & nk_cap_neon_k) {
            *m = (m_t)&nk_cast_neon;
            *c = nk_cap_neon_k;
            return;
        }
#endif
#if NK_TARGET_SAPPHIRE
        if (v & nk_cap_sapphire_k) {
            *m = (m_t)&nk_cast_sapphire;
            *c = nk_cap_sapphire_k;
            return;
        }
#endif
#if NK_TARGET_ICELAKE
        if (v & nk_cap_icelake_k) {
            *m = (m_t)&nk_cast_icelake;
            *c = nk_cap_icelake_k;
            return;
        }
#endif
#if NK_TARGET_SKYLAKE
        if (v & nk_cap_skylake_k) {
            *m = (m_t)&nk_cast_skylake;
            *c = nk_cap_skylake_k;
            return;
        }
#endif
#if NK_TARGET_HASWELL
        if (v & nk_cap_haswell_k) {
            *m = (m_t)&nk_cast_haswell;
            *c = nk_cap_haswell_k;
            return;
        }
#endif
#if NK_TARGET_RVV
        if (v & nk_cap_rvv_k) {
            *m = (m_t)&nk_cast_rvv;
            *c = nk_cap_rvv_k;
            return;
        }
#endif
        if (v & nk_cap_serial_k) {
            *m = (m_t)&nk_cast_serial;
            *c = nk_cap_serial_k;
            return;
        }
    }

    // Nothing matched: hand back the error kernel and a zero capability to flag the failed lookup.
    *m = (m_t)nk_error_dense_;
    *c = 0;
}
|
|
55
|
+
|
|
56
|
+
/**
 *  @brief Fills the type-conversion entries of `nk_dispatch_table`.
 *
 *  The buffer cast kernel is resolved through `nk_dispatch_cast_find_`. Scalar conversions
 *  start from the serial implementations; the f16 <-> f32 pair is then overridden by each
 *  enabled backend whose bit is set in @p caps. Later overrides win, so the effective
 *  priority for f16 is NEON, then Sapphire, then Haswell, then serial.
 *
 *  @param[in] caps Capability bitmask used to pick implementations.
 */
void nk_dispatch_cast_init_(nk_capability_t caps) {
    nk_implementations_t *table = &nk_dispatch_table;
    nk_capability_t chosen_cap; // Written by the lookup below; not read afterwards.

    // Buffer-to-buffer type casting.
    nk_dispatch_cast_find_(caps, nk_kernel_cast_k, (nk_kernel_punned_t *)&table->cast, &chosen_cap);

    // Serial baselines for every scalar conversion.
    table->bf16_to_f32 = &nk_bf16_to_f32_serial;
    table->f32_to_bf16 = &nk_f32_to_bf16_serial;
    table->f16_to_f32 = &nk_f16_to_f32_serial;
    table->f32_to_f16 = &nk_f32_to_f16_serial;
    table->e5m2_to_f32 = &nk_e5m2_to_f32_serial;
    table->f32_to_e5m2 = &nk_f32_to_e5m2_serial;
    table->e4m3_to_f32 = &nk_e4m3_to_f32_serial;
    table->f32_to_e4m3 = &nk_f32_to_e4m3_serial;
    table->e3m2_to_f32 = &nk_e3m2_to_f32_serial;
    table->f32_to_e3m2 = &nk_f32_to_e3m2_serial;
    table->e2m3_to_f32 = &nk_e2m3_to_f32_serial;
    table->f32_to_e2m3 = &nk_f32_to_e2m3_serial;

    // Backend overrides for f16 <-> f32; the order of these blocks defines their priority.
#if NK_TARGET_HASWELL
    if (caps & nk_cap_haswell_k) {
        table->f16_to_f32 = &nk_f16_to_f32_haswell;
        table->f32_to_f16 = &nk_f32_to_f16_haswell;
    }
#endif

#if NK_TARGET_SAPPHIRE
    if (caps & nk_cap_sapphire_k) {
        table->f16_to_f32 = &nk_f16_to_f32_sapphire;
        table->f32_to_f16 = &nk_f32_to_f16_sapphire;
    }
#endif

#if NK_TARGET_NEON
    if (caps & nk_cap_neon_k) {
        table->f16_to_f32 = &nk_f16_to_f32_neon;
        table->f32_to_f16 = &nk_f32_to_f16_neon;
    }
#endif
}
|
|
102
|
+
|
|
103
|
+
/**
 *  @brief Fills the scalar-math, saturating-arithmetic and ordering entries of `nk_dispatch_table`.
 *
 *  Every entry is first set to its serial implementation. Backend blocks guarded by
 *  `NK_TARGET_*` then overwrite the entries they provide when the matching bit is set in
 *  @p caps. A later block overwrites an earlier one, so the sequence of blocks below
 *  is the priority order for each entry.
 *
 *  @param[in] caps Capability bitmask used to pick implementations.
 */
void nk_dispatch_math_init_(nk_capability_t caps) {
    nk_implementations_t *table = &nk_dispatch_table;

    // Serial baselines: scalar math for f64, f32 and f16.
    table->f64_sqrt = &nk_f64_sqrt_serial;
    table->f64_rsqrt = &nk_f64_rsqrt_serial;
    table->f64_fma = &nk_f64_fma_serial;
    table->f32_sqrt = &nk_f32_sqrt_serial;
    table->f32_rsqrt = &nk_f32_rsqrt_serial;
    table->f32_fma = &nk_f32_fma_serial;
    table->f16_sqrt = &nk_f16_sqrt_serial;
    table->f16_rsqrt = &nk_f16_rsqrt_serial;
    table->f16_fma = &nk_f16_fma_serial;

    // Serial baselines: saturating arithmetic.
    table->i64_saturating_add = &nk_i64_saturating_add_serial;
    table->i64_saturating_mul = &nk_i64_saturating_mul_serial;
    table->i32_saturating_add = &nk_i32_saturating_add_serial;
    table->i32_saturating_mul = &nk_i32_saturating_mul_serial;
    table->i16_saturating_add = &nk_i16_saturating_add_serial;
    table->i16_saturating_mul = &nk_i16_saturating_mul_serial;
    table->i8_saturating_add = &nk_i8_saturating_add_serial;
    table->i8_saturating_mul = &nk_i8_saturating_mul_serial;
    table->i4x2_saturating_add = &nk_i4x2_saturating_add_serial;
    table->i4x2_saturating_mul = &nk_i4x2_saturating_mul_serial;
    table->u64_saturating_add = &nk_u64_saturating_add_serial;
    table->u64_saturating_mul = &nk_u64_saturating_mul_serial;
    table->u32_saturating_add = &nk_u32_saturating_add_serial;
    table->u32_saturating_mul = &nk_u32_saturating_mul_serial;
    table->u16_saturating_add = &nk_u16_saturating_add_serial;
    table->u16_saturating_mul = &nk_u16_saturating_mul_serial;
    table->u8_saturating_add = &nk_u8_saturating_add_serial;
    table->u8_saturating_mul = &nk_u8_saturating_mul_serial;
    table->u4x2_saturating_add = &nk_u4x2_saturating_add_serial;
    table->u4x2_saturating_mul = &nk_u4x2_saturating_mul_serial;

    // Serial baselines: conversion-free ordering for mini-floats.
    table->bf16_order = &nk_bf16_order_serial;
    table->f16_order = &nk_f16_order_serial;
    table->e5m2_order = &nk_e5m2_order_serial;
    table->e4m3_order = &nk_e4m3_order_serial;
    table->e3m2_order = &nk_e3m2_order_serial;
    table->e2m3_order = &nk_e2m3_order_serial;

    // v128relaxed overrides rsqrt and fma only; sqrt stays on the previously chosen kernel.
#if NK_TARGET_V128RELAXED
    if (caps & nk_cap_v128relaxed_k) {
        table->f64_rsqrt = &nk_f64_rsqrt_v128relaxed;
        table->f64_fma = &nk_f64_fma_v128relaxed;
        table->f32_rsqrt = &nk_f32_rsqrt_v128relaxed;
        table->f32_fma = &nk_f32_fma_v128relaxed;
    }
#endif

    // Haswell scalar math for all three float widths.
#if NK_TARGET_HASWELL
    if (caps & nk_cap_haswell_k) {
        table->f64_sqrt = &nk_f64_sqrt_haswell;
        table->f64_rsqrt = &nk_f64_rsqrt_haswell;
        table->f64_fma = &nk_f64_fma_haswell;
        table->f32_sqrt = &nk_f32_sqrt_haswell;
        table->f32_rsqrt = &nk_f32_rsqrt_haswell;
        table->f32_fma = &nk_f32_fma_haswell;
        table->f16_sqrt = &nk_f16_sqrt_haswell;
        table->f16_rsqrt = &nk_f16_rsqrt_haswell;
        table->f16_fma = &nk_f16_fma_haswell;
    }
#endif

    // NEON scalar math for f64 and f32.
#if NK_TARGET_NEON
    if (caps & nk_cap_neon_k) {
        table->f64_sqrt = &nk_f64_sqrt_neon;
        table->f64_rsqrt = &nk_f64_rsqrt_neon;
        table->f64_fma = &nk_f64_fma_neon;
        table->f32_sqrt = &nk_f32_sqrt_neon;
        table->f32_rsqrt = &nk_f32_rsqrt_neon;
        table->f32_fma = &nk_f32_fma_neon;
    }
#endif

    // NEONHALF scalar math for f16.
#if NK_TARGET_NEONHALF
    if (caps & nk_cap_neonhalf_k) {
        table->f16_sqrt = &nk_f16_sqrt_neonhalf;
        table->f16_rsqrt = &nk_f16_rsqrt_neonhalf;
        table->f16_fma = &nk_f16_fma_neonhalf;
    }
#endif

    // Sapphire scalar math and ordering for f16.
#if NK_TARGET_SAPPHIRE
    if (caps & nk_cap_sapphire_k) {
        table->f16_sqrt = &nk_f16_sqrt_sapphire;
        table->f16_rsqrt = &nk_f16_rsqrt_sapphire;
        table->f16_fma = &nk_f16_fma_sapphire;
        table->f16_order = &nk_f16_order_sapphire;
    }
#endif

    // RVV: fma for f64/f32, plus saturating arithmetic for the 8- to 64-bit integers.
    // This block must stay after the NEON math block and before the NEON/Haswell saturating blocks.
#if NK_TARGET_RVV
    if (caps & nk_cap_rvv_k) {
        table->f64_fma = &nk_f64_fma_rvv;
        table->f32_fma = &nk_f32_fma_rvv;
        table->i64_saturating_add = &nk_i64_saturating_add_rvv;
        table->i64_saturating_mul = &nk_i64_saturating_mul_rvv;
        table->i32_saturating_add = &nk_i32_saturating_add_rvv;
        table->i32_saturating_mul = &nk_i32_saturating_mul_rvv;
        table->i16_saturating_add = &nk_i16_saturating_add_rvv;
        table->i16_saturating_mul = &nk_i16_saturating_mul_rvv;
        table->i8_saturating_add = &nk_i8_saturating_add_rvv;
        table->i8_saturating_mul = &nk_i8_saturating_mul_rvv;
        table->u64_saturating_add = &nk_u64_saturating_add_rvv;
        table->u64_saturating_mul = &nk_u64_saturating_mul_rvv;
        table->u32_saturating_add = &nk_u32_saturating_add_rvv;
        table->u32_saturating_mul = &nk_u32_saturating_mul_rvv;
        table->u16_saturating_add = &nk_u16_saturating_add_rvv;
        table->u16_saturating_mul = &nk_u16_saturating_mul_rvv;
        table->u8_saturating_add = &nk_u8_saturating_add_rvv;
        table->u8_saturating_mul = &nk_u8_saturating_mul_rvv;
    }
#endif

    // NEON saturating arithmetic: all additions, multiplication only for the 64-bit types.
#if NK_TARGET_NEON
    if (caps & nk_cap_neon_k) {
        table->i64_saturating_add = &nk_i64_saturating_add_neon;
        table->i64_saturating_mul = &nk_i64_saturating_mul_neon;
        table->i32_saturating_add = &nk_i32_saturating_add_neon;
        table->i16_saturating_add = &nk_i16_saturating_add_neon;
        table->i8_saturating_add = &nk_i8_saturating_add_neon;
        table->u64_saturating_add = &nk_u64_saturating_add_neon;
        table->u64_saturating_mul = &nk_u64_saturating_mul_neon;
        table->u32_saturating_add = &nk_u32_saturating_add_neon;
        table->u16_saturating_add = &nk_u16_saturating_add_neon;
        table->u8_saturating_add = &nk_u8_saturating_add_neon;
    }
#endif

    // Haswell saturating arithmetic: 64-bit multiplication and 8/16-bit addition.
#if NK_TARGET_HASWELL
    if (caps & nk_cap_haswell_k) {
        table->i64_saturating_mul = &nk_i64_saturating_mul_haswell;
        table->i16_saturating_add = &nk_i16_saturating_add_haswell;
        table->i8_saturating_add = &nk_i8_saturating_add_haswell;
        table->u64_saturating_mul = &nk_u64_saturating_mul_haswell;
        table->u16_saturating_add = &nk_u16_saturating_add_haswell;
        table->u8_saturating_add = &nk_u8_saturating_add_haswell;
    }
#endif
}
|
|
264
|
+
|
|
265
|
+
// Scalar conversion dispatch functions
|
|
266
|
+
|
|
267
|
+
NK_DYNAMIC void nk_bf16_to_f32(nk_bf16_t const *src, nk_f32_t *dest) { nk_dispatch_table.bf16_to_f32(src, dest); }
|
|
268
|
+
NK_DYNAMIC void nk_f32_to_bf16(nk_f32_t const *src, nk_bf16_t *dest) { nk_dispatch_table.f32_to_bf16(src, dest); }
|
|
269
|
+
NK_DYNAMIC void nk_f16_to_f32(nk_f16_t const *src, nk_f32_t *dest) { nk_dispatch_table.f16_to_f32(src, dest); }
|
|
270
|
+
NK_DYNAMIC void nk_f32_to_f16(nk_f32_t const *src, nk_f16_t *dest) { nk_dispatch_table.f32_to_f16(src, dest); }
|
|
271
|
+
NK_DYNAMIC void nk_e5m2_to_f32(nk_e5m2_t const *src, nk_f32_t *dest) { nk_dispatch_table.e5m2_to_f32(src, dest); }
|
|
272
|
+
NK_DYNAMIC void nk_f32_to_e5m2(nk_f32_t const *src, nk_e5m2_t *dest) { nk_dispatch_table.f32_to_e5m2(src, dest); }
|
|
273
|
+
NK_DYNAMIC void nk_e4m3_to_f32(nk_e4m3_t const *src, nk_f32_t *dest) { nk_dispatch_table.e4m3_to_f32(src, dest); }
|
|
274
|
+
NK_DYNAMIC void nk_f32_to_e4m3(nk_f32_t const *src, nk_e4m3_t *dest) { nk_dispatch_table.f32_to_e4m3(src, dest); }
|
|
275
|
+
NK_DYNAMIC void nk_e3m2_to_f32(nk_e3m2_t const *src, nk_f32_t *dest) { nk_dispatch_table.e3m2_to_f32(src, dest); }
|
|
276
|
+
NK_DYNAMIC void nk_f32_to_e3m2(nk_f32_t const *src, nk_e3m2_t *dest) { nk_dispatch_table.f32_to_e3m2(src, dest); }
|
|
277
|
+
NK_DYNAMIC void nk_e2m3_to_f32(nk_e2m3_t const *src, nk_f32_t *dest) { nk_dispatch_table.e2m3_to_f32(src, dest); }
|
|
278
|
+
NK_DYNAMIC void nk_f32_to_e2m3(nk_f32_t const *src, nk_e2m3_t *dest) { nk_dispatch_table.f32_to_e2m3(src, dest); }
|
|
279
|
+
|
|
280
|
+
// Scalar math dispatch functions
|
|
281
|
+
|
|
282
|
+
NK_DYNAMIC nk_f64_t nk_f64_sqrt(nk_f64_t x) { return nk_dispatch_table.f64_sqrt(x); }
|
|
283
|
+
NK_DYNAMIC nk_f64_t nk_f64_rsqrt(nk_f64_t x) { return nk_dispatch_table.f64_rsqrt(x); }
|
|
284
|
+
NK_DYNAMIC nk_f64_t nk_f64_fma(nk_f64_t a, nk_f64_t b, nk_f64_t c) { return nk_dispatch_table.f64_fma(a, b, c); }
|
|
285
|
+
NK_DYNAMIC nk_f32_t nk_f32_sqrt(nk_f32_t x) { return nk_dispatch_table.f32_sqrt(x); }
|
|
286
|
+
NK_DYNAMIC nk_f32_t nk_f32_rsqrt(nk_f32_t x) { return nk_dispatch_table.f32_rsqrt(x); }
|
|
287
|
+
NK_DYNAMIC nk_f32_t nk_f32_fma(nk_f32_t a, nk_f32_t b, nk_f32_t c) { return nk_dispatch_table.f32_fma(a, b, c); }
|
|
288
|
+
NK_DYNAMIC nk_f16_t nk_f16_sqrt(nk_f16_t x) { return nk_dispatch_table.f16_sqrt(x); }
|
|
289
|
+
NK_DYNAMIC nk_f16_t nk_f16_rsqrt(nk_f16_t x) { return nk_dispatch_table.f16_rsqrt(x); }
|
|
290
|
+
NK_DYNAMIC nk_f16_t nk_f16_fma(nk_f16_t a, nk_f16_t b, nk_f16_t c) { return nk_dispatch_table.f16_fma(a, b, c); }
|
|
291
|
+
|
|
292
|
+
// Saturating arithmetic dispatch functions
|
|
293
|
+
|
|
294
|
+
NK_DYNAMIC nk_i64_t nk_i64_saturating_add(nk_i64_t a, nk_i64_t b) { return nk_dispatch_table.i64_saturating_add(a, b); }
|
|
295
|
+
NK_DYNAMIC nk_i64_t nk_i64_saturating_mul(nk_i64_t a, nk_i64_t b) { return nk_dispatch_table.i64_saturating_mul(a, b); }
|
|
296
|
+
NK_DYNAMIC nk_i32_t nk_i32_saturating_add(nk_i32_t a, nk_i32_t b) { return nk_dispatch_table.i32_saturating_add(a, b); }
|
|
297
|
+
NK_DYNAMIC nk_i32_t nk_i32_saturating_mul(nk_i32_t a, nk_i32_t b) { return nk_dispatch_table.i32_saturating_mul(a, b); }
|
|
298
|
+
NK_DYNAMIC nk_i16_t nk_i16_saturating_add(nk_i16_t a, nk_i16_t b) { return nk_dispatch_table.i16_saturating_add(a, b); }
|
|
299
|
+
NK_DYNAMIC nk_i16_t nk_i16_saturating_mul(nk_i16_t a, nk_i16_t b) { return nk_dispatch_table.i16_saturating_mul(a, b); }
|
|
300
|
+
NK_DYNAMIC nk_i8_t nk_i8_saturating_add(nk_i8_t a, nk_i8_t b) { return nk_dispatch_table.i8_saturating_add(a, b); }
|
|
301
|
+
NK_DYNAMIC nk_i8_t nk_i8_saturating_mul(nk_i8_t a, nk_i8_t b) { return nk_dispatch_table.i8_saturating_mul(a, b); }
|
|
302
|
+
NK_DYNAMIC nk_i4x2_t nk_i4x2_saturating_add(nk_i4x2_t a, nk_i4x2_t b) {
|
|
303
|
+
return nk_dispatch_table.i4x2_saturating_add(a, b);
|
|
304
|
+
}
|
|
305
|
+
NK_DYNAMIC nk_i4x2_t nk_i4x2_saturating_mul(nk_i4x2_t a, nk_i4x2_t b) {
|
|
306
|
+
return nk_dispatch_table.i4x2_saturating_mul(a, b);
|
|
307
|
+
}
|
|
308
|
+
NK_DYNAMIC nk_u64_t nk_u64_saturating_add(nk_u64_t a, nk_u64_t b) { return nk_dispatch_table.u64_saturating_add(a, b); }
|
|
309
|
+
NK_DYNAMIC nk_u64_t nk_u64_saturating_mul(nk_u64_t a, nk_u64_t b) { return nk_dispatch_table.u64_saturating_mul(a, b); }
|
|
310
|
+
NK_DYNAMIC nk_u32_t nk_u32_saturating_add(nk_u32_t a, nk_u32_t b) { return nk_dispatch_table.u32_saturating_add(a, b); }
|
|
311
|
+
NK_DYNAMIC nk_u32_t nk_u32_saturating_mul(nk_u32_t a, nk_u32_t b) { return nk_dispatch_table.u32_saturating_mul(a, b); }
|
|
312
|
+
NK_DYNAMIC nk_u16_t nk_u16_saturating_add(nk_u16_t a, nk_u16_t b) { return nk_dispatch_table.u16_saturating_add(a, b); }
|
|
313
|
+
NK_DYNAMIC nk_u16_t nk_u16_saturating_mul(nk_u16_t a, nk_u16_t b) { return nk_dispatch_table.u16_saturating_mul(a, b); }
|
|
314
|
+
NK_DYNAMIC nk_u8_t nk_u8_saturating_add(nk_u8_t a, nk_u8_t b) { return nk_dispatch_table.u8_saturating_add(a, b); }
|
|
315
|
+
NK_DYNAMIC nk_u8_t nk_u8_saturating_mul(nk_u8_t a, nk_u8_t b) { return nk_dispatch_table.u8_saturating_mul(a, b); }
|
|
316
|
+
NK_DYNAMIC nk_u4x2_t nk_u4x2_saturating_add(nk_u4x2_t a, nk_u4x2_t b) {
|
|
317
|
+
return nk_dispatch_table.u4x2_saturating_add(a, b);
|
|
318
|
+
}
|
|
319
|
+
NK_DYNAMIC nk_u4x2_t nk_u4x2_saturating_mul(nk_u4x2_t a, nk_u4x2_t b) {
|
|
320
|
+
return nk_dispatch_table.u4x2_saturating_mul(a, b);
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
// Ordering dispatch functions
|
|
324
|
+
|
|
325
|
+
NK_DYNAMIC int nk_bf16_order(nk_bf16_t a, nk_bf16_t b) { return nk_dispatch_table.bf16_order(a, b); }
|
|
326
|
+
NK_DYNAMIC int nk_f16_order(nk_f16_t a, nk_f16_t b) { return nk_dispatch_table.f16_order(a, b); }
|
|
327
|
+
NK_DYNAMIC int nk_e5m2_order(nk_e5m2_t a, nk_e5m2_t b) { return nk_dispatch_table.e5m2_order(a, b); }
|
|
328
|
+
NK_DYNAMIC int nk_e4m3_order(nk_e4m3_t a, nk_e4m3_t b) { return nk_dispatch_table.e4m3_order(a, b); }
|
|
329
|
+
NK_DYNAMIC int nk_e3m2_order(nk_e3m2_t a, nk_e3m2_t b) { return nk_dispatch_table.e3m2_order(a, b); }
|
|
330
|
+
NK_DYNAMIC int nk_e2m3_order(nk_e2m3_t a, nk_e2m3_t b) { return nk_dispatch_table.e2m3_order(a, b); }
|
package/c/dispatch_u1.c
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief Dispatch Initialization for U1 Data Types.
|
|
3
|
+
* @file c/dispatch_u1.c
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date February 3, 2026
|
|
6
|
+
*/
|
|
7
|
+
#include "dispatch.h"
|
|
8
|
+
|
|
9
|
+
/**
 *  @brief Looks up the U1 kernel of kind `k` among the backends enabled in the capability mask `v`.
 *
 *  Backend sections compiled in through their `NK_TARGET_*` macro are probed in the order they appear below.
 *  The first section whose capability bit is set in `v` and that lists `k` stores its kernel and returns.
 *  The serial section comes last and is not wrapped in any `NK_TARGET_*` guard.
 *
 *  @param[in] v Capability mask tested against each section's `nk_cap_*_k` bit.
 *  @param[in] k Kernel kind to resolve.
 *  @param[out] m Receives the chosen kernel cast to `nk_kernel_punned_t`, or `nk_error_dense_` when nothing matches.
 *  @param[out] c Receives the capability bit of the chosen section, or 0 when nothing matches.
 */
void nk_dispatch_u1_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punned_t *m, nk_capability_t *c) {
    typedef nk_kernel_punned_t punned_t;
#if NK_TARGET_V128RELAXED
    if (v & nk_cap_v128relaxed_k) {
        switch (k) {
        case nk_kernel_dot_k: *m = (punned_t)&nk_dot_u1_v128relaxed; *c = nk_cap_v128relaxed_k; return;
        case nk_kernel_hamming_k: *m = (punned_t)&nk_hamming_u1_v128relaxed; *c = nk_cap_v128relaxed_k; return;
        case nk_kernel_jaccard_k: *m = (punned_t)&nk_jaccard_u1_v128relaxed; *c = nk_cap_v128relaxed_k; return;
        default: break;
        }
    }
#endif
#if NK_TARGET_SMEBI32
    // NOTE(review): this section is compiled under `NK_TARGET_SMEBI32` but tests and reports `nk_cap_sme_k`;
    // confirm that this is the intended capability bit for the `_smebi32` kernels.
    if (v & nk_cap_sme_k) {
        switch (k) {
        case nk_kernel_dots_packed_size_k: *m = (punned_t)&nk_dots_packed_size_u1_smebi32; *c = nk_cap_sme_k; return;
        case nk_kernel_dots_pack_k: *m = (punned_t)&nk_dots_pack_u1_smebi32; *c = nk_cap_sme_k; return;
        case nk_kernel_dots_packed_k: *m = (punned_t)&nk_dots_packed_u1_smebi32; *c = nk_cap_sme_k; return;
        case nk_kernel_dots_symmetric_k: *m = (punned_t)&nk_dots_symmetric_u1_smebi32; *c = nk_cap_sme_k; return;
        case nk_kernel_hammings_packed_k: *m = (punned_t)&nk_hammings_packed_u1_smebi32; *c = nk_cap_sme_k; return;
        case nk_kernel_hammings_symmetric_k:
            *m = (punned_t)&nk_hammings_symmetric_u1_smebi32;
            *c = nk_cap_sme_k;
            return;
        case nk_kernel_jaccards_packed_k: *m = (punned_t)&nk_jaccards_packed_u1_smebi32; *c = nk_cap_sme_k; return;
        case nk_kernel_jaccards_symmetric_k:
            *m = (punned_t)&nk_jaccards_symmetric_u1_smebi32;
            *c = nk_cap_sme_k;
            return;
        default: break;
        }
    }
#endif
#if NK_TARGET_SVE
    if (v & nk_cap_sve_k) {
        switch (k) {
        case nk_kernel_hamming_k: *m = (punned_t)&nk_hamming_u1_sve; *c = nk_cap_sve_k; return;
        case nk_kernel_jaccard_k: *m = (punned_t)&nk_jaccard_u1_sve; *c = nk_cap_sve_k; return;
        default: break;
        }
    }
#endif
#if NK_TARGET_NEON
    if (v & nk_cap_neon_k) {
        switch (k) {
        case nk_kernel_dot_k: *m = (punned_t)&nk_dot_u1_neon; *c = nk_cap_neon_k; return;
        case nk_kernel_hamming_k: *m = (punned_t)&nk_hamming_u1_neon; *c = nk_cap_neon_k; return;
        case nk_kernel_jaccard_k: *m = (punned_t)&nk_jaccard_u1_neon; *c = nk_cap_neon_k; return;
        case nk_kernel_dots_packed_size_k: *m = (punned_t)&nk_dots_packed_size_u1_neon; *c = nk_cap_neon_k; return;
        case nk_kernel_dots_pack_k: *m = (punned_t)&nk_dots_pack_u1_neon; *c = nk_cap_neon_k; return;
        case nk_kernel_dots_packed_k: *m = (punned_t)&nk_dots_packed_u1_neon; *c = nk_cap_neon_k; return;
        case nk_kernel_dots_symmetric_k: *m = (punned_t)&nk_dots_symmetric_u1_neon; *c = nk_cap_neon_k; return;
        case nk_kernel_hammings_packed_k: *m = (punned_t)&nk_hammings_packed_u1_neon; *c = nk_cap_neon_k; return;
        case nk_kernel_hammings_symmetric_k:
            *m = (punned_t)&nk_hammings_symmetric_u1_neon;
            *c = nk_cap_neon_k;
            return;
        case nk_kernel_jaccards_packed_k: *m = (punned_t)&nk_jaccards_packed_u1_neon; *c = nk_cap_neon_k; return;
        case nk_kernel_jaccards_symmetric_k:
            *m = (punned_t)&nk_jaccards_symmetric_u1_neon;
            *c = nk_cap_neon_k;
            return;
        default: break;
        }
    }
#endif
#if NK_TARGET_ICELAKE
    if (v & nk_cap_icelake_k) {
        switch (k) {
        case nk_kernel_dot_k: *m = (punned_t)&nk_dot_u1_icelake; *c = nk_cap_icelake_k; return;
        case nk_kernel_hamming_k: *m = (punned_t)&nk_hamming_u1_icelake; *c = nk_cap_icelake_k; return;
        case nk_kernel_jaccard_k: *m = (punned_t)&nk_jaccard_u1_icelake; *c = nk_cap_icelake_k; return;
        case nk_kernel_dots_packed_size_k:
            *m = (punned_t)&nk_dots_packed_size_u1_icelake;
            *c = nk_cap_icelake_k;
            return;
        case nk_kernel_dots_pack_k: *m = (punned_t)&nk_dots_pack_u1_icelake; *c = nk_cap_icelake_k; return;
        case nk_kernel_dots_packed_k: *m = (punned_t)&nk_dots_packed_u1_icelake; *c = nk_cap_icelake_k; return;
        case nk_kernel_dots_symmetric_k:
            *m = (punned_t)&nk_dots_symmetric_u1_icelake;
            *c = nk_cap_icelake_k;
            return;
        case nk_kernel_hammings_packed_k:
            *m = (punned_t)&nk_hammings_packed_u1_icelake;
            *c = nk_cap_icelake_k;
            return;
        case nk_kernel_hammings_symmetric_k:
            *m = (punned_t)&nk_hammings_symmetric_u1_icelake;
            *c = nk_cap_icelake_k;
            return;
        case nk_kernel_jaccards_packed_k:
            *m = (punned_t)&nk_jaccards_packed_u1_icelake;
            *c = nk_cap_icelake_k;
            return;
        case nk_kernel_jaccards_symmetric_k:
            *m = (punned_t)&nk_jaccards_symmetric_u1_icelake;
            *c = nk_cap_icelake_k;
            return;
        default: break;
        }
    }
#endif
#if NK_TARGET_SKYLAKE
    if (v & nk_cap_skylake_k) {
        switch (k) {
        case nk_kernel_reduce_moments_k: *m = (punned_t)&nk_reduce_moments_u1_skylake; *c = nk_cap_skylake_k; return;
        default: break;
        }
    }
#endif
#if NK_TARGET_HASWELL
    if (v & nk_cap_haswell_k) {
        switch (k) {
        case nk_kernel_dot_k: *m = (punned_t)&nk_dot_u1_haswell; *c = nk_cap_haswell_k; return;
        case nk_kernel_hamming_k: *m = (punned_t)&nk_hamming_u1_haswell; *c = nk_cap_haswell_k; return;
        case nk_kernel_jaccard_k: *m = (punned_t)&nk_jaccard_u1_haswell; *c = nk_cap_haswell_k; return;
        case nk_kernel_dots_packed_size_k:
            *m = (punned_t)&nk_dots_packed_size_u1_haswell;
            *c = nk_cap_haswell_k;
            return;
        case nk_kernel_dots_pack_k: *m = (punned_t)&nk_dots_pack_u1_haswell; *c = nk_cap_haswell_k; return;
        case nk_kernel_dots_packed_k: *m = (punned_t)&nk_dots_packed_u1_haswell; *c = nk_cap_haswell_k; return;
        case nk_kernel_dots_symmetric_k:
            *m = (punned_t)&nk_dots_symmetric_u1_haswell;
            *c = nk_cap_haswell_k;
            return;
        case nk_kernel_hammings_packed_k:
            *m = (punned_t)&nk_hammings_packed_u1_haswell;
            *c = nk_cap_haswell_k;
            return;
        case nk_kernel_hammings_symmetric_k:
            *m = (punned_t)&nk_hammings_symmetric_u1_haswell;
            *c = nk_cap_haswell_k;
            return;
        case nk_kernel_jaccards_packed_k:
            *m = (punned_t)&nk_jaccards_packed_u1_haswell;
            *c = nk_cap_haswell_k;
            return;
        case nk_kernel_jaccards_symmetric_k:
            *m = (punned_t)&nk_jaccards_symmetric_u1_haswell;
            *c = nk_cap_haswell_k;
            return;
        case nk_kernel_reduce_moments_k: *m = (punned_t)&nk_reduce_moments_u1_haswell; *c = nk_cap_haswell_k; return;
        default: break;
        }
    }
#endif
#if NK_TARGET_RVVBB
    if (v & nk_cap_rvvbb_k) {
        switch (k) {
        case nk_kernel_dot_k: *m = (punned_t)&nk_dot_u1_rvvbb; *c = nk_cap_rvvbb_k; return;
        case nk_kernel_hamming_k: *m = (punned_t)&nk_hamming_u1_rvvbb; *c = nk_cap_rvvbb_k; return;
        case nk_kernel_jaccard_k: *m = (punned_t)&nk_jaccard_u1_rvvbb; *c = nk_cap_rvvbb_k; return;
        default: break;
        }
    }
#endif
#if NK_TARGET_RVV
    if (v & nk_cap_rvv_k) {
        switch (k) {
        case nk_kernel_dot_k: *m = (punned_t)&nk_dot_u1_rvv; *c = nk_cap_rvv_k; return;
        case nk_kernel_hamming_k: *m = (punned_t)&nk_hamming_u1_rvv; *c = nk_cap_rvv_k; return;
        case nk_kernel_jaccard_k: *m = (punned_t)&nk_jaccard_u1_rvv; *c = nk_cap_rvv_k; return;
        default: break;
        }
    }
#endif
    if (v & nk_cap_serial_k) {
        switch (k) {
        case nk_kernel_dot_k: *m = (punned_t)&nk_dot_u1_serial; *c = nk_cap_serial_k; return;
        case nk_kernel_hamming_k: *m = (punned_t)&nk_hamming_u1_serial; *c = nk_cap_serial_k; return;
        case nk_kernel_jaccard_k: *m = (punned_t)&nk_jaccard_u1_serial; *c = nk_cap_serial_k; return;
        case nk_kernel_dots_packed_size_k:
            *m = (punned_t)&nk_dots_packed_size_u1_serial;
            *c = nk_cap_serial_k;
            return;
        case nk_kernel_dots_pack_k: *m = (punned_t)&nk_dots_pack_u1_serial; *c = nk_cap_serial_k; return;
        case nk_kernel_dots_packed_k: *m = (punned_t)&nk_dots_packed_u1_serial; *c = nk_cap_serial_k; return;
        case nk_kernel_dots_symmetric_k: *m = (punned_t)&nk_dots_symmetric_u1_serial; *c = nk_cap_serial_k; return;
        case nk_kernel_hammings_packed_k: *m = (punned_t)&nk_hammings_packed_u1_serial; *c = nk_cap_serial_k; return;
        case nk_kernel_hammings_symmetric_k:
            *m = (punned_t)&nk_hammings_symmetric_u1_serial;
            *c = nk_cap_serial_k;
            return;
        case nk_kernel_jaccards_packed_k: *m = (punned_t)&nk_jaccards_packed_u1_serial; *c = nk_cap_serial_k; return;
        case nk_kernel_jaccards_symmetric_k:
            *m = (punned_t)&nk_jaccards_symmetric_u1_serial;
            *c = nk_cap_serial_k;
            return;
        case nk_kernel_reduce_moments_k: *m = (punned_t)&nk_reduce_moments_u1_serial; *c = nk_cap_serial_k; return;
        case nk_kernel_reduce_minmax_k: *m = (punned_t)&nk_reduce_minmax_u1_serial; *c = nk_cap_serial_k; return;
        default: break;
        }
    }

    // No section matched: hand back the error stub and a zero capability so the caller can detect the failure.
    *m = (punned_t)nk_error_dense_;
    *c = 0;
}
|
|
130
|
+
|
|
131
|
+
/**
 *  @brief Fills every U1 slot of `nk_dispatch_table` by resolving it through `nk_dispatch_u1_find_`.
 *
 *  @param[in] caps Capability mask forwarded unchanged to each lookup.
 *
 *  The capability reported by each lookup is written to a scratch local and never read here.
 */
void nk_dispatch_u1_init_(nk_capability_t caps) {
    nk_implementations_t *table = &nk_dispatch_table;
    nk_capability_t reported; // Required out-parameter of the finder; its value is ignored.

    // Per-pair kernels.
    nk_dispatch_u1_find_(caps, nk_kernel_dot_k, (nk_kernel_punned_t *)&table->dot_u1, &reported);
    nk_dispatch_u1_find_(caps, nk_kernel_hamming_k, (nk_kernel_punned_t *)&table->hamming_u1, &reported);
    nk_dispatch_u1_find_(caps, nk_kernel_jaccard_k, (nk_kernel_punned_t *)&table->jaccard_u1, &reported);

    // Reductions.
    nk_dispatch_u1_find_(caps, nk_kernel_reduce_moments_k, (nk_kernel_punned_t *)&table->reduce_moments_u1,
                         &reported);
    nk_dispatch_u1_find_(caps, nk_kernel_reduce_minmax_k, (nk_kernel_punned_t *)&table->reduce_minmax_u1, &reported);

    // Packed and symmetric batch kernels.
    nk_dispatch_u1_find_(caps, nk_kernel_dots_packed_size_k, (nk_kernel_punned_t *)&table->dots_packed_size_u1,
                         &reported);
    nk_dispatch_u1_find_(caps, nk_kernel_dots_pack_k, (nk_kernel_punned_t *)&table->dots_pack_u1, &reported);
    nk_dispatch_u1_find_(caps, nk_kernel_dots_packed_k, (nk_kernel_punned_t *)&table->dots_packed_u1, &reported);
    nk_dispatch_u1_find_(caps, nk_kernel_dots_symmetric_k, (nk_kernel_punned_t *)&table->dots_symmetric_u1,
                         &reported);
    nk_dispatch_u1_find_(caps, nk_kernel_hammings_packed_k, (nk_kernel_punned_t *)&table->hammings_packed_u1,
                         &reported);
    nk_dispatch_u1_find_(caps, nk_kernel_hammings_symmetric_k, (nk_kernel_punned_t *)&table->hammings_symmetric_u1,
                         &reported);
    nk_dispatch_u1_find_(caps, nk_kernel_jaccards_packed_k, (nk_kernel_punned_t *)&table->jaccards_packed_u1,
                         &reported);
    nk_dispatch_u1_find_(caps, nk_kernel_jaccards_symmetric_k, (nk_kernel_punned_t *)&table->jaccards_symmetric_u1,
                         &reported);
}
|
package/c/dispatch_u16.c
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief Dispatch Initialization for U16 Data Types.
|
|
3
|
+
* @file c/dispatch_u16.c
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date February 3, 2026
|
|
6
|
+
*/
|
|
7
|
+
#include "dispatch.h"
|
|
8
|
+
|
|
9
|
+
/**
 *  @brief Looks up the U16 kernel of kind `k` among the backends enabled in the capability mask `v`.
 *
 *  Backend sections compiled in through their `NK_TARGET_*` macro are probed in the order they appear below.
 *  The first section whose capability bit is set in `v` and that lists `k` stores its kernel and returns.
 *  The serial section comes last and is not wrapped in any `NK_TARGET_*` guard.
 *
 *  @param[in] v Capability mask tested against each section's `nk_cap_*_k` bit.
 *  @param[in] k Kernel kind to resolve.
 *  @param[out] m Receives the chosen kernel cast to `nk_kernel_punned_t`, or `nk_error_dense_` when nothing matches.
 *  @param[out] c Receives the capability bit of the section that supplied the kernel, or 0 when nothing matches.
 */
void nk_dispatch_u16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punned_t *m, nk_capability_t *c) {
    typedef nk_kernel_punned_t m_t;
#if NK_TARGET_V128RELAXED
    if (v & nk_cap_v128relaxed_k) switch (k) {
        case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
        case nk_kernel_reduce_moments_k:
            *m = (m_t)&nk_reduce_moments_u16_v128relaxed, *c = nk_cap_v128relaxed_k;
            return;
        case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
        default: break;
        }
#endif
#if NK_TARGET_SVE2
    if (v & nk_cap_sve2_k) switch (k) {
        case nk_kernel_sparse_intersect_k: *m = (m_t)&nk_sparse_intersect_u16_sve2, *c = nk_cap_sve2_k; return;
        default: break;
        }
#endif
#if NK_TARGET_SVE
    if (v & nk_cap_sve_k) switch (k) {
        case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_sve, *c = nk_cap_sve_k; return;
        default: break;
        }
#endif
#if NK_TARGET_NEON
    if (v & nk_cap_neon_k) switch (k) {
        case nk_kernel_sparse_intersect_k: *m = (m_t)&nk_sparse_intersect_u16_neon, *c = nk_cap_neon_k; return;
        case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_neon, *c = nk_cap_neon_k; return;
        case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u16_neon, *c = nk_cap_neon_k; return;
        case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u16_neon, *c = nk_cap_neon_k; return;
        case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u16_neon, *c = nk_cap_neon_k; return;
        case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u16_neon, *c = nk_cap_neon_k; return;
        case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u16_neon, *c = nk_cap_neon_k; return;
        default: break;
        }
#endif
#if NK_TARGET_TURIN
    if (v & nk_cap_turin_k) switch (k) {
        case nk_kernel_sparse_intersect_k: *m = (m_t)&nk_sparse_intersect_u16_turin, *c = nk_cap_turin_k; return;
        default: break;
        }
#endif
#if NK_TARGET_ICELAKE
    if (v & nk_cap_icelake_k) switch (k) {
        // The reported capability must be the bit this section tested, so callers see which backend was chosen.
        case nk_kernel_sparse_intersect_k: *m = (m_t)&nk_sparse_intersect_u16_icelake, *c = nk_cap_icelake_k; return;
        case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_icelake, *c = nk_cap_icelake_k; return;
        case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u16_icelake, *c = nk_cap_icelake_k; return;
        default: break;
        }
#endif
#if NK_TARGET_SKYLAKE
    if (v & nk_cap_skylake_k) switch (k) {
        case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u16_skylake, *c = nk_cap_skylake_k; return;
        case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u16_skylake, *c = nk_cap_skylake_k; return;
        case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u16_skylake, *c = nk_cap_skylake_k; return;
        case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u16_skylake, *c = nk_cap_skylake_k; return;
        default: break;
        }
#endif
#if NK_TARGET_ALDER
    if (v & nk_cap_alder_k) switch (k) {
        case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u16_alder, *c = nk_cap_alder_k; return;
        default: break;
        }
#endif
#if NK_TARGET_HASWELL
    if (v & nk_cap_haswell_k) switch (k) {
        case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_haswell, *c = nk_cap_haswell_k; return;
        case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u16_haswell, *c = nk_cap_haswell_k; return;
        case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u16_haswell, *c = nk_cap_haswell_k; return;
        case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u16_haswell, *c = nk_cap_haswell_k; return;
        case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u16_haswell, *c = nk_cap_haswell_k; return;
        case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u16_haswell, *c = nk_cap_haswell_k; return;
        default: break;
        }
#endif
#if NK_TARGET_RVV
    if (v & nk_cap_rvv_k) switch (k) {
        case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_rvv, *c = nk_cap_rvv_k; return;
        case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u16_rvv, *c = nk_cap_rvv_k; return;
        case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u16_rvv, *c = nk_cap_rvv_k; return;
        case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u16_rvv, *c = nk_cap_rvv_k; return;
        case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u16_rvv, *c = nk_cap_rvv_k; return;
        case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u16_rvv, *c = nk_cap_rvv_k; return;
        default: break;
        }
#endif
    if (v & nk_cap_serial_k) switch (k) {
        case nk_kernel_sparse_intersect_k: *m = (m_t)&nk_sparse_intersect_u16_serial, *c = nk_cap_serial_k; return;
        case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_serial, *c = nk_cap_serial_k; return;
        case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u16_serial, *c = nk_cap_serial_k; return;
        case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u16_serial, *c = nk_cap_serial_k; return;
        case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u16_serial, *c = nk_cap_serial_k; return;
        case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_u16_serial, *c = nk_cap_serial_k; return;
        case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u16_serial, *c = nk_cap_serial_k; return;
        case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u16_serial, *c = nk_cap_serial_k; return;
        default: break;
        }

    // Error fallback - zero capability signals lookup failure
    *m = (m_t)nk_error_dense_, *c = 0;
}
|
|
111
|
+
|
|
112
|
+
/**
 *  @brief Fills every U16 slot of `nk_dispatch_table` by resolving it through `nk_dispatch_u16_find_`.
 *
 *  @param[in] caps Capability mask forwarded unchanged to each lookup.
 *
 *  The capability reported by each lookup is written to a scratch local and never read here.
 */
void nk_dispatch_u16_init_(nk_capability_t caps) {
    nk_implementations_t *table = &nk_dispatch_table;
    nk_capability_t reported; // Required out-parameter of the finder; its value is ignored.

    // Set kernels.
    nk_dispatch_u16_find_(caps, nk_kernel_jaccard_k, (nk_kernel_punned_t *)&table->jaccard_u16, &reported);
    nk_dispatch_u16_find_(caps, nk_kernel_sparse_intersect_k, (nk_kernel_punned_t *)&table->sparse_intersect_u16,
                          &reported);

    // Elementwise kernels.
    nk_dispatch_u16_find_(caps, nk_kernel_each_scale_k, (nk_kernel_punned_t *)&table->each_scale_u16, &reported);
    nk_dispatch_u16_find_(caps, nk_kernel_each_sum_k, (nk_kernel_punned_t *)&table->each_sum_u16, &reported);
    nk_dispatch_u16_find_(caps, nk_kernel_each_blend_k, (nk_kernel_punned_t *)&table->each_blend_u16, &reported);
    nk_dispatch_u16_find_(caps, nk_kernel_each_fma_k, (nk_kernel_punned_t *)&table->each_fma_u16, &reported);

    // Reductions.
    nk_dispatch_u16_find_(caps, nk_kernel_reduce_moments_k, (nk_kernel_punned_t *)&table->reduce_moments_u16,
                          &reported);
    nk_dispatch_u16_find_(caps, nk_kernel_reduce_minmax_k, (nk_kernel_punned_t *)&table->reduce_minmax_u16,
                          &reported);
}
|