numkong 7.0.0 → 7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
package/c/dispatch_e2m3.c
CHANGED
|
@@ -15,6 +15,9 @@ void nk_dispatch_e2m3_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
15
15
|
#if NK_TARGET_V128RELAXED
|
|
16
16
|
if (v & nk_cap_v128relaxed_k) switch (k) {
|
|
17
17
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e2m3_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
18
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e2m3_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
19
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e2m3_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
20
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e2m3_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
18
21
|
case nk_kernel_reduce_moments_k:
|
|
19
22
|
*m = (m_t)&nk_reduce_moments_e2m3_v128relaxed, *c = nk_cap_v128relaxed_k;
|
|
20
23
|
return;
|
|
@@ -55,10 +58,45 @@ void nk_dispatch_e2m3_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
55
58
|
default: break;
|
|
56
59
|
}
|
|
57
60
|
#endif
|
|
61
|
+
#if NK_TARGET_NEONFP8
|
|
62
|
+
if (v & nk_cap_neonfp8_k) switch (k) {
|
|
63
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e2m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
64
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e2m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
65
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e2m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
66
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e2m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
67
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e2m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
68
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e2m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
69
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e2m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
70
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e2m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
71
|
+
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e2m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
72
|
+
case nk_kernel_angulars_symmetric_k:
|
|
73
|
+
*m = (m_t)&nk_angulars_symmetric_e2m3_neonfp8, *c = nk_cap_neonfp8_k;
|
|
74
|
+
return;
|
|
75
|
+
case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e2m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
76
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
77
|
+
*m = (m_t)&nk_euclideans_symmetric_e2m3_neonfp8, *c = nk_cap_neonfp8_k;
|
|
78
|
+
return;
|
|
79
|
+
default: break;
|
|
80
|
+
}
|
|
81
|
+
#endif
|
|
58
82
|
#if NK_TARGET_NEONSDOT
|
|
59
83
|
if (v & nk_cap_neonsdot_k) switch (k) {
|
|
60
84
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e2m3_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
61
85
|
case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e2m3_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
86
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e2m3_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
87
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e2m3_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
88
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e2m3_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
89
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e2m3_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
90
|
+
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e2m3_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
91
|
+
case nk_kernel_angulars_symmetric_k:
|
|
92
|
+
*m = (m_t)&nk_angulars_symmetric_e2m3_neonsdot, *c = nk_cap_neonsdot_k;
|
|
93
|
+
return;
|
|
94
|
+
case nk_kernel_euclideans_packed_k:
|
|
95
|
+
*m = (m_t)&nk_euclideans_packed_e2m3_neonsdot, *c = nk_cap_neonsdot_k;
|
|
96
|
+
return;
|
|
97
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
98
|
+
*m = (m_t)&nk_euclideans_symmetric_e2m3_neonsdot, *c = nk_cap_neonsdot_k;
|
|
99
|
+
return;
|
|
62
100
|
default: break;
|
|
63
101
|
}
|
|
64
102
|
#endif
|
|
@@ -98,17 +136,12 @@ void nk_dispatch_e2m3_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
98
136
|
default: break;
|
|
99
137
|
}
|
|
100
138
|
#endif
|
|
101
|
-
#if NK_TARGET_SAPPHIRE
|
|
102
|
-
if (v & nk_cap_sapphire_k) switch (k) {
|
|
103
|
-
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e2m3_sapphire, *c = nk_cap_sapphire_k; return;
|
|
104
|
-
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e2m3_sapphire, *c = nk_cap_sapphire_k; return;
|
|
105
|
-
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e2m3_sapphire, *c = nk_cap_sapphire_k; return;
|
|
106
|
-
default: break;
|
|
107
|
-
}
|
|
108
|
-
#endif
|
|
109
139
|
#if NK_TARGET_ICELAKE
|
|
110
140
|
if (v & nk_cap_icelake_k) switch (k) {
|
|
111
141
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e2m3_icelake, *c = nk_cap_icelake_k; return;
|
|
142
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e2m3_icelake, *c = nk_cap_icelake_k; return;
|
|
143
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e2m3_icelake, *c = nk_cap_icelake_k; return;
|
|
144
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e2m3_icelake, *c = nk_cap_icelake_k; return;
|
|
112
145
|
case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e2m3_icelake, *c = nk_cap_icelake_k; return;
|
|
113
146
|
default: break;
|
|
114
147
|
}
|
package/c/dispatch_e3m2.c
CHANGED
|
@@ -15,6 +15,9 @@ void nk_dispatch_e3m2_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
15
15
|
#if NK_TARGET_V128RELAXED
|
|
16
16
|
if (v & nk_cap_v128relaxed_k) switch (k) {
|
|
17
17
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e3m2_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
18
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e3m2_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
19
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e3m2_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
20
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e3m2_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
18
21
|
case nk_kernel_reduce_moments_k:
|
|
19
22
|
*m = (m_t)&nk_reduce_moments_e3m2_v128relaxed, *c = nk_cap_v128relaxed_k;
|
|
20
23
|
return;
|
|
@@ -55,9 +58,44 @@ void nk_dispatch_e3m2_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
55
58
|
default: break;
|
|
56
59
|
}
|
|
57
60
|
#endif
|
|
61
|
+
#if NK_TARGET_NEONFP8
|
|
62
|
+
if (v & nk_cap_neonfp8_k) switch (k) {
|
|
63
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e3m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
64
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e3m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
65
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e3m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
66
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e3m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
67
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e3m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
68
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e3m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
69
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e3m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
70
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e3m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
71
|
+
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e3m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
72
|
+
case nk_kernel_angulars_symmetric_k:
|
|
73
|
+
*m = (m_t)&nk_angulars_symmetric_e3m2_neonfp8, *c = nk_cap_neonfp8_k;
|
|
74
|
+
return;
|
|
75
|
+
case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e3m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
76
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
77
|
+
*m = (m_t)&nk_euclideans_symmetric_e3m2_neonfp8, *c = nk_cap_neonfp8_k;
|
|
78
|
+
return;
|
|
79
|
+
default: break;
|
|
80
|
+
}
|
|
81
|
+
#endif
|
|
58
82
|
#if NK_TARGET_NEONSDOT
|
|
59
83
|
if (v & nk_cap_neonsdot_k) switch (k) {
|
|
60
84
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e3m2_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
85
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e3m2_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
86
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e3m2_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
87
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e3m2_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
88
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e3m2_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
89
|
+
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e3m2_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
90
|
+
case nk_kernel_angulars_symmetric_k:
|
|
91
|
+
*m = (m_t)&nk_angulars_symmetric_e3m2_neonsdot, *c = nk_cap_neonsdot_k;
|
|
92
|
+
return;
|
|
93
|
+
case nk_kernel_euclideans_packed_k:
|
|
94
|
+
*m = (m_t)&nk_euclideans_packed_e3m2_neonsdot, *c = nk_cap_neonsdot_k;
|
|
95
|
+
return;
|
|
96
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
97
|
+
*m = (m_t)&nk_euclideans_symmetric_e3m2_neonsdot, *c = nk_cap_neonsdot_k;
|
|
98
|
+
return;
|
|
61
99
|
default: break;
|
|
62
100
|
}
|
|
63
101
|
#endif
|
|
@@ -97,17 +135,12 @@ void nk_dispatch_e3m2_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
97
135
|
default: break;
|
|
98
136
|
}
|
|
99
137
|
#endif
|
|
100
|
-
#if NK_TARGET_SAPPHIRE
|
|
101
|
-
if (v & nk_cap_sapphire_k) switch (k) {
|
|
102
|
-
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e3m2_sapphire, *c = nk_cap_sapphire_k; return;
|
|
103
|
-
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e3m2_sapphire, *c = nk_cap_sapphire_k; return;
|
|
104
|
-
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e3m2_sapphire, *c = nk_cap_sapphire_k; return;
|
|
105
|
-
default: break;
|
|
106
|
-
}
|
|
107
|
-
#endif
|
|
108
138
|
#if NK_TARGET_ICELAKE
|
|
109
139
|
if (v & nk_cap_icelake_k) switch (k) {
|
|
110
140
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e3m2_icelake, *c = nk_cap_icelake_k; return;
|
|
141
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e3m2_icelake, *c = nk_cap_icelake_k; return;
|
|
142
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e3m2_icelake, *c = nk_cap_icelake_k; return;
|
|
143
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e3m2_icelake, *c = nk_cap_icelake_k; return;
|
|
111
144
|
case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e3m2_icelake, *c = nk_cap_icelake_k; return;
|
|
112
145
|
default: break;
|
|
113
146
|
}
|
|
@@ -135,6 +168,14 @@ void nk_dispatch_e3m2_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
135
168
|
default: break;
|
|
136
169
|
}
|
|
137
170
|
#endif
|
|
171
|
+
#if NK_TARGET_SIERRA
|
|
172
|
+
if (v & nk_cap_sierra_k) switch (k) {
|
|
173
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e3m2_sierra, *c = nk_cap_sierra_k; return;
|
|
174
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e3m2_sierra, *c = nk_cap_sierra_k; return;
|
|
175
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e3m2_sierra, *c = nk_cap_sierra_k; return;
|
|
176
|
+
default: break;
|
|
177
|
+
}
|
|
178
|
+
#endif
|
|
138
179
|
#if NK_TARGET_ALDER
|
|
139
180
|
if (v & nk_cap_alder_k) switch (k) {
|
|
140
181
|
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e3m2_alder, *c = nk_cap_alder_k; return;
|
package/c/dispatch_e4m3.c
CHANGED
|
@@ -15,6 +15,9 @@ void nk_dispatch_e4m3_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
15
15
|
return;
|
|
16
16
|
case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
17
17
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
18
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
19
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
20
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
18
21
|
case nk_kernel_dots_packed_size_k:
|
|
19
22
|
*m = (m_t)&nk_dots_packed_size_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k;
|
|
20
23
|
return;
|
|
@@ -51,11 +54,31 @@ void nk_dispatch_e4m3_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
51
54
|
default: break;
|
|
52
55
|
}
|
|
53
56
|
#endif
|
|
57
|
+
#if NK_TARGET_NEONFP8
|
|
58
|
+
if (v & nk_cap_neonfp8_k) switch (k) {
|
|
59
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
60
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e4m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
61
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e4m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
62
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e4m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
63
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e4m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
64
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e4m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
65
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e4m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
66
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e4m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
67
|
+
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e4m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
68
|
+
case nk_kernel_angulars_symmetric_k:
|
|
69
|
+
*m = (m_t)&nk_angulars_symmetric_e4m3_neonfp8, *c = nk_cap_neonfp8_k;
|
|
70
|
+
return;
|
|
71
|
+
case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e4m3_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
72
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
73
|
+
*m = (m_t)&nk_euclideans_symmetric_e4m3_neonfp8, *c = nk_cap_neonfp8_k;
|
|
74
|
+
return;
|
|
75
|
+
default: break;
|
|
76
|
+
}
|
|
77
|
+
#endif
|
|
54
78
|
#if NK_TARGET_NEONFHM
|
|
55
79
|
if (v & nk_cap_neonfhm_k) switch (k) {
|
|
56
80
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
|
|
57
81
|
case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
|
|
58
|
-
case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
|
|
59
82
|
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
|
|
60
83
|
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
|
|
61
84
|
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
|
|
@@ -115,19 +138,38 @@ void nk_dispatch_e4m3_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
115
138
|
default: break;
|
|
116
139
|
}
|
|
117
140
|
#endif
|
|
118
|
-
#if
|
|
119
|
-
if (v &
|
|
120
|
-
case
|
|
121
|
-
case
|
|
141
|
+
#if NK_TARGET_DIAMOND
|
|
142
|
+
if (v & nk_cap_diamond_k) switch (k) {
|
|
143
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_diamond, *c = nk_cap_diamond_k; return;
|
|
144
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e4m3_diamond, *c = nk_cap_diamond_k; return;
|
|
145
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e4m3_diamond, *c = nk_cap_diamond_k; return;
|
|
146
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e4m3_diamond, *c = nk_cap_diamond_k; return;
|
|
147
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e4m3_diamond, *c = nk_cap_diamond_k; return;
|
|
148
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e4m3_diamond, *c = nk_cap_diamond_k; return;
|
|
149
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e4m3_diamond, *c = nk_cap_diamond_k; return;
|
|
150
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e4m3_diamond, *c = nk_cap_diamond_k; return;
|
|
151
|
+
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e4m3_diamond, *c = nk_cap_diamond_k; return;
|
|
152
|
+
case nk_kernel_angulars_symmetric_k:
|
|
153
|
+
*m = (m_t)&nk_angulars_symmetric_e4m3_diamond, *c = nk_cap_diamond_k;
|
|
154
|
+
return;
|
|
155
|
+
case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e4m3_diamond, *c = nk_cap_diamond_k; return;
|
|
156
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
157
|
+
*m = (m_t)&nk_euclideans_symmetric_e4m3_diamond, *c = nk_cap_diamond_k;
|
|
158
|
+
return;
|
|
159
|
+
default: break;
|
|
160
|
+
}
|
|
161
|
+
#endif
|
|
162
|
+
#if NK_TARGET_ICELAKE
|
|
163
|
+
if (v & nk_cap_icelake_k) switch (k) {
|
|
164
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_icelake, *c = nk_cap_icelake_k; return;
|
|
165
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e4m3_icelake, *c = nk_cap_icelake_k; return;
|
|
166
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e4m3_icelake, *c = nk_cap_icelake_k; return;
|
|
167
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e4m3_icelake, *c = nk_cap_icelake_k; return;
|
|
122
168
|
default: break;
|
|
123
169
|
}
|
|
124
170
|
#endif
|
|
125
171
|
#if NK_TARGET_GENOA
|
|
126
172
|
if (v & nk_cap_genoa_k) switch (k) {
|
|
127
|
-
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_genoa, *c = nk_cap_genoa_k; return;
|
|
128
|
-
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e4m3_genoa, *c = nk_cap_genoa_k; return;
|
|
129
|
-
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e4m3_genoa, *c = nk_cap_genoa_k; return;
|
|
130
|
-
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e4m3_genoa, *c = nk_cap_genoa_k; return;
|
|
131
173
|
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e4m3_genoa, *c = nk_cap_genoa_k; return;
|
|
132
174
|
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e4m3_genoa, *c = nk_cap_genoa_k; return;
|
|
133
175
|
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e4m3_genoa, *c = nk_cap_genoa_k; return;
|
package/c/dispatch_e5m2.c
CHANGED
|
@@ -15,6 +15,9 @@ void nk_dispatch_e5m2_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
15
15
|
return;
|
|
16
16
|
case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
17
17
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
18
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
19
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
20
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
18
21
|
case nk_kernel_dots_packed_size_k:
|
|
19
22
|
*m = (m_t)&nk_dots_packed_size_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k;
|
|
20
23
|
return;
|
|
@@ -51,11 +54,31 @@ void nk_dispatch_e5m2_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
51
54
|
default: break;
|
|
52
55
|
}
|
|
53
56
|
#endif
|
|
57
|
+
#if NK_TARGET_NEONFP8
|
|
58
|
+
if (v & nk_cap_neonfp8_k) switch (k) {
|
|
59
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
60
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e5m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
61
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e5m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
62
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e5m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
63
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e5m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
64
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
65
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
66
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e5m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
67
|
+
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e5m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
68
|
+
case nk_kernel_angulars_symmetric_k:
|
|
69
|
+
*m = (m_t)&nk_angulars_symmetric_e5m2_neonfp8, *c = nk_cap_neonfp8_k;
|
|
70
|
+
return;
|
|
71
|
+
case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e5m2_neonfp8, *c = nk_cap_neonfp8_k; return;
|
|
72
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
73
|
+
*m = (m_t)&nk_euclideans_symmetric_e5m2_neonfp8, *c = nk_cap_neonfp8_k;
|
|
74
|
+
return;
|
|
75
|
+
default: break;
|
|
76
|
+
}
|
|
77
|
+
#endif
|
|
54
78
|
#if NK_TARGET_NEONFHM
|
|
55
79
|
if (v & nk_cap_neonfhm_k) switch (k) {
|
|
56
80
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
|
|
57
81
|
case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
|
|
58
|
-
case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
|
|
59
82
|
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
|
|
60
83
|
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
|
|
61
84
|
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
|
|
@@ -115,6 +138,27 @@ void nk_dispatch_e5m2_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
115
138
|
default: break;
|
|
116
139
|
}
|
|
117
140
|
#endif
|
|
141
|
+
#if NK_TARGET_DIAMOND
|
|
142
|
+
if (v & nk_cap_diamond_k) switch (k) {
|
|
143
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_diamond, *c = nk_cap_diamond_k; return;
|
|
144
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e5m2_diamond, *c = nk_cap_diamond_k; return;
|
|
145
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e5m2_diamond, *c = nk_cap_diamond_k; return;
|
|
146
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e5m2_diamond, *c = nk_cap_diamond_k; return;
|
|
147
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e5m2_diamond, *c = nk_cap_diamond_k; return;
|
|
148
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_diamond, *c = nk_cap_diamond_k; return;
|
|
149
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_diamond, *c = nk_cap_diamond_k; return;
|
|
150
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e5m2_diamond, *c = nk_cap_diamond_k; return;
|
|
151
|
+
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e5m2_diamond, *c = nk_cap_diamond_k; return;
|
|
152
|
+
case nk_kernel_angulars_symmetric_k:
|
|
153
|
+
*m = (m_t)&nk_angulars_symmetric_e5m2_diamond, *c = nk_cap_diamond_k;
|
|
154
|
+
return;
|
|
155
|
+
case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e5m2_diamond, *c = nk_cap_diamond_k; return;
|
|
156
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
157
|
+
*m = (m_t)&nk_euclideans_symmetric_e5m2_diamond, *c = nk_cap_diamond_k;
|
|
158
|
+
return;
|
|
159
|
+
default: break;
|
|
160
|
+
}
|
|
161
|
+
#endif
|
|
118
162
|
#if NK_TARGET_GENOA
|
|
119
163
|
if (v & nk_cap_genoa_k) switch (k) {
|
|
120
164
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_genoa, *c = nk_cap_genoa_k; return;
|
package/c/dispatch_f16.c
CHANGED
|
@@ -43,6 +43,10 @@ void nk_dispatch_f16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punn
|
|
|
43
43
|
case nk_kernel_euclideans_symmetric_k:
|
|
44
44
|
*m = (m_t)&nk_euclideans_symmetric_f16_v128relaxed, *c = nk_cap_v128relaxed_k;
|
|
45
45
|
return;
|
|
46
|
+
case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_f16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
47
|
+
case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_f16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
48
|
+
case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_f16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
49
|
+
case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_f16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
46
50
|
default: break;
|
|
47
51
|
}
|
|
48
52
|
#endif
|
|
@@ -91,42 +95,27 @@ void nk_dispatch_f16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punn
|
|
|
91
95
|
#endif
|
|
92
96
|
#if NK_TARGET_NEONHALF
|
|
93
97
|
if (v & nk_cap_neonhalf_k) switch (k) {
|
|
94
|
-
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
95
|
-
case nk_kernel_angular_k: *m = (m_t)&nk_angular_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
96
|
-
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
97
|
-
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
98
|
-
case nk_kernel_jsd_k: *m = (m_t)&nk_jsd_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
99
|
-
case nk_kernel_kld_k: *m = (m_t)&nk_kld_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
100
|
-
case nk_kernel_bilinear_k: *m = (m_t)&nk_bilinear_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
101
|
-
case nk_kernel_mahalanobis_k: *m = (m_t)&nk_mahalanobis_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
102
98
|
case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
103
99
|
case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
104
100
|
case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
105
101
|
case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
106
|
-
case nk_kernel_kabsch_k: *m = (m_t)&nk_kabsch_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
107
|
-
case nk_kernel_rmsd_k: *m = (m_t)&nk_rmsd_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
108
|
-
case nk_kernel_umeyama_k: *m = (m_t)&nk_umeyama_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
109
|
-
case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
110
|
-
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
111
|
-
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
112
|
-
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
113
|
-
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
114
|
-
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_f16_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
115
|
-
case nk_kernel_angulars_symmetric_k:
|
|
116
|
-
*m = (m_t)&nk_angulars_symmetric_f16_neonhalf, *c = nk_cap_neonhalf_k;
|
|
117
|
-
return;
|
|
118
|
-
case nk_kernel_euclideans_packed_k:
|
|
119
|
-
*m = (m_t)&nk_euclideans_packed_f16_neonhalf, *c = nk_cap_neonhalf_k;
|
|
120
|
-
return;
|
|
121
|
-
case nk_kernel_euclideans_symmetric_k:
|
|
122
|
-
*m = (m_t)&nk_euclideans_symmetric_f16_neonhalf, *c = nk_cap_neonhalf_k;
|
|
123
|
-
return;
|
|
124
102
|
default: break;
|
|
125
103
|
}
|
|
126
104
|
#endif
|
|
127
105
|
#if NK_TARGET_NEON
|
|
128
106
|
if (v & nk_cap_neon_k) switch (k) {
|
|
129
107
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f16_neon, *c = nk_cap_neon_k; return;
|
|
108
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_f16_neon, *c = nk_cap_neon_k; return;
|
|
109
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_f16_neon, *c = nk_cap_neon_k; return;
|
|
110
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_f16_neon, *c = nk_cap_neon_k; return;
|
|
111
|
+
case nk_kernel_jsd_k: *m = (m_t)&nk_jsd_f16_neon, *c = nk_cap_neon_k; return;
|
|
112
|
+
case nk_kernel_kld_k: *m = (m_t)&nk_kld_f16_neon, *c = nk_cap_neon_k; return;
|
|
113
|
+
case nk_kernel_bilinear_k: *m = (m_t)&nk_bilinear_f16_neon, *c = nk_cap_neon_k; return;
|
|
114
|
+
case nk_kernel_mahalanobis_k: *m = (m_t)&nk_mahalanobis_f16_neon, *c = nk_cap_neon_k; return;
|
|
115
|
+
case nk_kernel_kabsch_k: *m = (m_t)&nk_kabsch_f16_neon, *c = nk_cap_neon_k; return;
|
|
116
|
+
case nk_kernel_rmsd_k: *m = (m_t)&nk_rmsd_f16_neon, *c = nk_cap_neon_k; return;
|
|
117
|
+
case nk_kernel_umeyama_k: *m = (m_t)&nk_umeyama_f16_neon, *c = nk_cap_neon_k; return;
|
|
118
|
+
case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_f16_neon, *c = nk_cap_neon_k; return;
|
|
130
119
|
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_f16_neon, *c = nk_cap_neon_k; return;
|
|
131
120
|
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_f16_neon, *c = nk_cap_neon_k; return;
|
|
132
121
|
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_f16_neon, *c = nk_cap_neon_k; return;
|
|
@@ -174,6 +163,15 @@ void nk_dispatch_f16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punn
|
|
|
174
163
|
default: break;
|
|
175
164
|
}
|
|
176
165
|
#endif
|
|
166
|
+
#if NK_TARGET_DIAMOND
|
|
167
|
+
if (v & nk_cap_diamond_k) switch (k) {
|
|
168
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f16_diamond, *c = nk_cap_diamond_k; return;
|
|
169
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_f16_diamond, *c = nk_cap_diamond_k; return;
|
|
170
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_f16_diamond, *c = nk_cap_diamond_k; return;
|
|
171
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_f16_diamond, *c = nk_cap_diamond_k; return;
|
|
172
|
+
default: break;
|
|
173
|
+
}
|
|
174
|
+
#endif
|
|
177
175
|
#if NK_TARGET_SKYLAKE
|
|
178
176
|
if (v & nk_cap_skylake_k) switch (k) {
|
|
179
177
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f16_skylake, *c = nk_cap_skylake_k; return;
|
|
@@ -202,6 +200,9 @@ void nk_dispatch_f16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punn
|
|
|
202
200
|
case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_f16_skylake, *c = nk_cap_skylake_k; return;
|
|
203
201
|
case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_f16_skylake, *c = nk_cap_skylake_k; return;
|
|
204
202
|
case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_f16_skylake, *c = nk_cap_skylake_k; return;
|
|
203
|
+
case nk_kernel_rmsd_k: *m = (m_t)&nk_rmsd_f16_skylake, *c = nk_cap_skylake_k; return;
|
|
204
|
+
case nk_kernel_kabsch_k: *m = (m_t)&nk_kabsch_f16_skylake, *c = nk_cap_skylake_k; return;
|
|
205
|
+
case nk_kernel_umeyama_k: *m = (m_t)&nk_umeyama_f16_skylake, *c = nk_cap_skylake_k; return;
|
|
205
206
|
default: break;
|
|
206
207
|
}
|
|
207
208
|
#endif
|
|
@@ -252,6 +253,58 @@ void nk_dispatch_f16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punn
|
|
|
252
253
|
default: break;
|
|
253
254
|
}
|
|
254
255
|
#endif
|
|
256
|
+
#if NK_TARGET_POWERVSX
|
|
257
|
+
if (v & nk_cap_powervsx_k) switch (k) {
|
|
258
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f16_powervsx, *c = nk_cap_powervsx_k; return;
|
|
259
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_f16_powervsx, *c = nk_cap_powervsx_k; return;
|
|
260
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_f16_powervsx, *c = nk_cap_powervsx_k; return;
|
|
261
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_f16_powervsx, *c = nk_cap_powervsx_k; return;
|
|
262
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_f16_powervsx, *c = nk_cap_powervsx_k; return;
|
|
263
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_f16_powervsx, *c = nk_cap_powervsx_k; return;
|
|
264
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_f16_powervsx, *c = nk_cap_powervsx_k; return;
|
|
265
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_f16_powervsx, *c = nk_cap_powervsx_k; return;
|
|
266
|
+
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_f16_powervsx, *c = nk_cap_powervsx_k; return;
|
|
267
|
+
case nk_kernel_angulars_symmetric_k:
|
|
268
|
+
*m = (m_t)&nk_angulars_symmetric_f16_powervsx, *c = nk_cap_powervsx_k;
|
|
269
|
+
return;
|
|
270
|
+
case nk_kernel_euclideans_packed_k:
|
|
271
|
+
*m = (m_t)&nk_euclideans_packed_f16_powervsx, *c = nk_cap_powervsx_k;
|
|
272
|
+
return;
|
|
273
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
274
|
+
*m = (m_t)&nk_euclideans_symmetric_f16_powervsx, *c = nk_cap_powervsx_k;
|
|
275
|
+
return;
|
|
276
|
+
default: break;
|
|
277
|
+
}
|
|
278
|
+
#endif
|
|
279
|
+
#if NK_TARGET_LOONGSONASX
|
|
280
|
+
if (v & nk_cap_loongsonasx_k) switch (k) {
|
|
281
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f16_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
282
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_f16_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
283
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_f16_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
284
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_f16_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
285
|
+
case nk_kernel_dots_packed_size_k:
|
|
286
|
+
*m = (m_t)&nk_dots_packed_size_f16_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
287
|
+
return;
|
|
288
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_f16_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
289
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_f16_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
290
|
+
case nk_kernel_dots_symmetric_k:
|
|
291
|
+
*m = (m_t)&nk_dots_symmetric_f16_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
292
|
+
return;
|
|
293
|
+
case nk_kernel_angulars_packed_k:
|
|
294
|
+
*m = (m_t)&nk_angulars_packed_f16_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
295
|
+
return;
|
|
296
|
+
case nk_kernel_angulars_symmetric_k:
|
|
297
|
+
*m = (m_t)&nk_angulars_symmetric_f16_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
298
|
+
return;
|
|
299
|
+
case nk_kernel_euclideans_packed_k:
|
|
300
|
+
*m = (m_t)&nk_euclideans_packed_f16_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
301
|
+
return;
|
|
302
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
303
|
+
*m = (m_t)&nk_euclideans_symmetric_f16_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
304
|
+
return;
|
|
305
|
+
default: break;
|
|
306
|
+
}
|
|
307
|
+
#endif
|
|
255
308
|
#if NK_TARGET_RVVHALF
|
|
256
309
|
if (v & nk_cap_rvvhalf_k) switch (k) {
|
|
257
310
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f16_rvvhalf, *c = nk_cap_rvvhalf_k; return;
|
package/c/dispatch_f16c.c
CHANGED
|
@@ -22,11 +22,11 @@ void nk_dispatch_f16c_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
22
22
|
default: break;
|
|
23
23
|
}
|
|
24
24
|
#endif
|
|
25
|
-
#if
|
|
26
|
-
if (v &
|
|
27
|
-
case nk_kernel_dot_k: *m = (m_t)&
|
|
28
|
-
case nk_kernel_vdot_k: *m = (m_t)&
|
|
29
|
-
case nk_kernel_bilinear_k: *m = (m_t)&
|
|
25
|
+
#if NK_TARGET_NEON
|
|
26
|
+
if (v & nk_cap_neon_k) switch (k) {
|
|
27
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f16c_neon, *c = nk_cap_neon_k; return;
|
|
28
|
+
case nk_kernel_vdot_k: *m = (m_t)&nk_vdot_f16c_neon, *c = nk_cap_neon_k; return;
|
|
29
|
+
case nk_kernel_bilinear_k: *m = (m_t)&nk_bilinear_f16c_neon, *c = nk_cap_neon_k; return;
|
|
30
30
|
default: break;
|
|
31
31
|
}
|
|
32
32
|
#endif
|
package/c/dispatch_f32.c
CHANGED
|
@@ -51,6 +51,10 @@ void nk_dispatch_f32_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punn
|
|
|
51
51
|
case nk_kernel_rmsd_k: *m = (m_t)&nk_rmsd_f32_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
52
52
|
case nk_kernel_kabsch_k: *m = (m_t)&nk_kabsch_f32_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
53
53
|
case nk_kernel_umeyama_k: *m = (m_t)&nk_umeyama_f32_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
54
|
+
case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_f32_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
55
|
+
case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_f32_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
56
|
+
case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_f32_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
57
|
+
case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_f32_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
54
58
|
default: break;
|
|
55
59
|
}
|
|
56
60
|
#endif
|
|
@@ -255,6 +259,58 @@ void nk_dispatch_f32_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punn
|
|
|
255
259
|
default: break;
|
|
256
260
|
}
|
|
257
261
|
#endif
|
|
262
|
+
#if NK_TARGET_POWERVSX
|
|
263
|
+
if (v & nk_cap_powervsx_k) switch (k) {
|
|
264
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f32_powervsx, *c = nk_cap_powervsx_k; return;
|
|
265
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_f32_powervsx, *c = nk_cap_powervsx_k; return;
|
|
266
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_f32_powervsx, *c = nk_cap_powervsx_k; return;
|
|
267
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_f32_powervsx, *c = nk_cap_powervsx_k; return;
|
|
268
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_f32_powervsx, *c = nk_cap_powervsx_k; return;
|
|
269
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_f32_powervsx, *c = nk_cap_powervsx_k; return;
|
|
270
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_f32_powervsx, *c = nk_cap_powervsx_k; return;
|
|
271
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_f32_powervsx, *c = nk_cap_powervsx_k; return;
|
|
272
|
+
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_f32_powervsx, *c = nk_cap_powervsx_k; return;
|
|
273
|
+
case nk_kernel_angulars_symmetric_k:
|
|
274
|
+
*m = (m_t)&nk_angulars_symmetric_f32_powervsx, *c = nk_cap_powervsx_k;
|
|
275
|
+
return;
|
|
276
|
+
case nk_kernel_euclideans_packed_k:
|
|
277
|
+
*m = (m_t)&nk_euclideans_packed_f32_powervsx, *c = nk_cap_powervsx_k;
|
|
278
|
+
return;
|
|
279
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
280
|
+
*m = (m_t)&nk_euclideans_symmetric_f32_powervsx, *c = nk_cap_powervsx_k;
|
|
281
|
+
return;
|
|
282
|
+
default: break;
|
|
283
|
+
}
|
|
284
|
+
#endif
|
|
285
|
+
#if NK_TARGET_LOONGSONASX
|
|
286
|
+
if (v & nk_cap_loongsonasx_k) switch (k) {
|
|
287
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f32_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
288
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_f32_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
289
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_f32_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
290
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_f32_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
291
|
+
case nk_kernel_dots_packed_size_k:
|
|
292
|
+
*m = (m_t)&nk_dots_packed_size_f32_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
293
|
+
return;
|
|
294
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_f32_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
295
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_f32_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
296
|
+
case nk_kernel_dots_symmetric_k:
|
|
297
|
+
*m = (m_t)&nk_dots_symmetric_f32_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
298
|
+
return;
|
|
299
|
+
case nk_kernel_angulars_packed_k:
|
|
300
|
+
*m = (m_t)&nk_angulars_packed_f32_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
301
|
+
return;
|
|
302
|
+
case nk_kernel_angulars_symmetric_k:
|
|
303
|
+
*m = (m_t)&nk_angulars_symmetric_f32_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
304
|
+
return;
|
|
305
|
+
case nk_kernel_euclideans_packed_k:
|
|
306
|
+
*m = (m_t)&nk_euclideans_packed_f32_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
307
|
+
return;
|
|
308
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
309
|
+
*m = (m_t)&nk_euclideans_symmetric_f32_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
310
|
+
return;
|
|
311
|
+
default: break;
|
|
312
|
+
}
|
|
313
|
+
#endif
|
|
258
314
|
#if NK_TARGET_RVV
|
|
259
315
|
if (v & nk_cap_rvv_k) switch (k) {
|
|
260
316
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f32_rvv, *c = nk_cap_rvv_k; return;
|
package/c/dispatch_f64.c
CHANGED
|
@@ -182,6 +182,58 @@ void nk_dispatch_f64_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punn
|
|
|
182
182
|
default: break;
|
|
183
183
|
}
|
|
184
184
|
#endif
|
|
185
|
+
#if NK_TARGET_POWERVSX
|
|
186
|
+
if (v & nk_cap_powervsx_k) switch (k) {
|
|
187
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f64_powervsx, *c = nk_cap_powervsx_k; return;
|
|
188
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_f64_powervsx, *c = nk_cap_powervsx_k; return;
|
|
189
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_f64_powervsx, *c = nk_cap_powervsx_k; return;
|
|
190
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_f64_powervsx, *c = nk_cap_powervsx_k; return;
|
|
191
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_f64_powervsx, *c = nk_cap_powervsx_k; return;
|
|
192
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_f64_powervsx, *c = nk_cap_powervsx_k; return;
|
|
193
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_f64_powervsx, *c = nk_cap_powervsx_k; return;
|
|
194
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_f64_powervsx, *c = nk_cap_powervsx_k; return;
|
|
195
|
+
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_f64_powervsx, *c = nk_cap_powervsx_k; return;
|
|
196
|
+
case nk_kernel_angulars_symmetric_k:
|
|
197
|
+
*m = (m_t)&nk_angulars_symmetric_f64_powervsx, *c = nk_cap_powervsx_k;
|
|
198
|
+
return;
|
|
199
|
+
case nk_kernel_euclideans_packed_k:
|
|
200
|
+
*m = (m_t)&nk_euclideans_packed_f64_powervsx, *c = nk_cap_powervsx_k;
|
|
201
|
+
return;
|
|
202
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
203
|
+
*m = (m_t)&nk_euclideans_symmetric_f64_powervsx, *c = nk_cap_powervsx_k;
|
|
204
|
+
return;
|
|
205
|
+
default: break;
|
|
206
|
+
}
|
|
207
|
+
#endif
|
|
208
|
+
#if NK_TARGET_LOONGSONASX
|
|
209
|
+
if (v & nk_cap_loongsonasx_k) switch (k) {
|
|
210
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f64_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
211
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_f64_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
212
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_f64_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
213
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_f64_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
214
|
+
case nk_kernel_dots_packed_size_k:
|
|
215
|
+
*m = (m_t)&nk_dots_packed_size_f64_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
216
|
+
return;
|
|
217
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_f64_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
218
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_f64_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
219
|
+
case nk_kernel_dots_symmetric_k:
|
|
220
|
+
*m = (m_t)&nk_dots_symmetric_f64_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
221
|
+
return;
|
|
222
|
+
case nk_kernel_angulars_packed_k:
|
|
223
|
+
*m = (m_t)&nk_angulars_packed_f64_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
224
|
+
return;
|
|
225
|
+
case nk_kernel_angulars_symmetric_k:
|
|
226
|
+
*m = (m_t)&nk_angulars_symmetric_f64_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
227
|
+
return;
|
|
228
|
+
case nk_kernel_euclideans_packed_k:
|
|
229
|
+
*m = (m_t)&nk_euclideans_packed_f64_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
230
|
+
return;
|
|
231
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
232
|
+
*m = (m_t)&nk_euclideans_symmetric_f64_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
233
|
+
return;
|
|
234
|
+
default: break;
|
|
235
|
+
}
|
|
236
|
+
#endif
|
|
185
237
|
#if NK_TARGET_RVV
|
|
186
238
|
if (v & nk_cap_rvv_k) switch (k) {
|
|
187
239
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_f64_rvv, *c = nk_cap_rvv_k; return;
|
package/c/dispatch_i4.c
CHANGED
|
@@ -52,6 +52,9 @@ void nk_dispatch_i4_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
52
52
|
case nk_kernel_euclideans_symmetric_k:
|
|
53
53
|
*m = (m_t)&nk_euclideans_symmetric_i4_neonsdot, *c = nk_cap_neonsdot_k;
|
|
54
54
|
return;
|
|
55
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_i4_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
56
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_i4_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
57
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_i4_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
55
58
|
default: break;
|
|
56
59
|
}
|
|
57
60
|
#endif
|