numkong 7.0.0 → 7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
package/c/dispatch_i8.c
CHANGED
|
@@ -34,6 +34,10 @@ void nk_dispatch_i8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
34
34
|
case nk_kernel_euclideans_symmetric_k:
|
|
35
35
|
*m = (m_t)&nk_euclideans_symmetric_i8_v128relaxed, *c = nk_cap_v128relaxed_k;
|
|
36
36
|
return;
|
|
37
|
+
case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_i8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
38
|
+
case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_i8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
39
|
+
case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_i8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
40
|
+
case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_i8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
37
41
|
default: break;
|
|
38
42
|
}
|
|
39
43
|
#endif
|
|
@@ -50,6 +54,15 @@ void nk_dispatch_i8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
50
54
|
default: break;
|
|
51
55
|
}
|
|
52
56
|
#endif
|
|
57
|
+
#if NK_TARGET_SVESDOT
|
|
58
|
+
if (v & nk_cap_svesdot_k) switch (k) {
|
|
59
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_i8_svesdot, *c = nk_cap_svesdot_k; return;
|
|
60
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_i8_svesdot, *c = nk_cap_svesdot_k; return;
|
|
61
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_i8_svesdot, *c = nk_cap_svesdot_k; return;
|
|
62
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_i8_svesdot, *c = nk_cap_svesdot_k; return;
|
|
63
|
+
default: break;
|
|
64
|
+
}
|
|
65
|
+
#endif
|
|
53
66
|
#if NK_TARGET_NEONSDOT
|
|
54
67
|
if (v & nk_cap_neonsdot_k) switch (k) {
|
|
55
68
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_i8_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
@@ -74,15 +87,14 @@ void nk_dispatch_i8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
74
87
|
#endif
|
|
75
88
|
#if NK_TARGET_NEONHALF
|
|
76
89
|
if (v & nk_cap_neonhalf_k) switch (k) {
|
|
77
|
-
case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_i8_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
78
90
|
case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_i8_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
79
91
|
case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_i8_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
80
|
-
case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_i8_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
81
92
|
default: break;
|
|
82
93
|
}
|
|
83
94
|
#endif
|
|
84
95
|
#if NK_TARGET_NEON
|
|
85
96
|
if (v & nk_cap_neon_k) switch (k) {
|
|
97
|
+
case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_i8_neon, *c = nk_cap_neon_k; return;
|
|
86
98
|
case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_i8_neon, *c = nk_cap_neon_k; return;
|
|
87
99
|
case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_i8_neon, *c = nk_cap_neon_k; return;
|
|
88
100
|
default: break;
|
|
@@ -113,7 +125,6 @@ void nk_dispatch_i8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
113
125
|
#endif
|
|
114
126
|
#if NK_TARGET_SAPPHIRE
|
|
115
127
|
if (v & nk_cap_sapphire_k) switch (k) {
|
|
116
|
-
case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_i8_sapphire, *c = nk_cap_sapphire_k; return;
|
|
117
128
|
case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_i8_sapphire, *c = nk_cap_sapphire_k; return;
|
|
118
129
|
case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_i8_sapphire, *c = nk_cap_sapphire_k; return;
|
|
119
130
|
default: break;
|
|
@@ -212,6 +223,54 @@ void nk_dispatch_i8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
212
223
|
default: break;
|
|
213
224
|
}
|
|
214
225
|
#endif
|
|
226
|
+
#if NK_TARGET_POWERVSX
|
|
227
|
+
if (v & nk_cap_powervsx_k) switch (k) {
|
|
228
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_i8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
229
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_i8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
230
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_i8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
231
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_i8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
232
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_i8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
233
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_i8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
234
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_i8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
235
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_i8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
236
|
+
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_i8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
237
|
+
case nk_kernel_angulars_symmetric_k:
|
|
238
|
+
*m = (m_t)&nk_angulars_symmetric_i8_powervsx, *c = nk_cap_powervsx_k;
|
|
239
|
+
return;
|
|
240
|
+
case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_i8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
241
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
242
|
+
*m = (m_t)&nk_euclideans_symmetric_i8_powervsx, *c = nk_cap_powervsx_k;
|
|
243
|
+
return;
|
|
244
|
+
default: break;
|
|
245
|
+
}
|
|
246
|
+
#endif
|
|
247
|
+
#if NK_TARGET_LOONGSONASX
|
|
248
|
+
if (v & nk_cap_loongsonasx_k) switch (k) {
|
|
249
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
250
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
251
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
252
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
253
|
+
case nk_kernel_dots_packed_size_k:
|
|
254
|
+
*m = (m_t)&nk_dots_packed_size_i8_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
255
|
+
return;
|
|
256
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
257
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
258
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
259
|
+
case nk_kernel_angulars_packed_k:
|
|
260
|
+
*m = (m_t)&nk_angulars_packed_i8_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
261
|
+
return;
|
|
262
|
+
case nk_kernel_angulars_symmetric_k:
|
|
263
|
+
*m = (m_t)&nk_angulars_symmetric_i8_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
264
|
+
return;
|
|
265
|
+
case nk_kernel_euclideans_packed_k:
|
|
266
|
+
*m = (m_t)&nk_euclideans_packed_i8_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
267
|
+
return;
|
|
268
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
269
|
+
*m = (m_t)&nk_euclideans_symmetric_i8_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
270
|
+
return;
|
|
271
|
+
default: break;
|
|
272
|
+
}
|
|
273
|
+
#endif
|
|
215
274
|
#if NK_TARGET_RVV
|
|
216
275
|
if (v & nk_cap_rvv_k) switch (k) {
|
|
217
276
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_i8_rvv, *c = nk_cap_rvv_k; return;
|
package/c/dispatch_other.c
CHANGED
|
@@ -89,6 +89,13 @@ void nk_dispatch_cast_init_(nk_capability_t caps) {
|
|
|
89
89
|
}
|
|
90
90
|
#endif
|
|
91
91
|
|
|
92
|
+
#if NK_TARGET_POWERVSX
|
|
93
|
+
if (caps & nk_cap_powervsx_k) {
|
|
94
|
+
t->f16_to_f32 = &nk_f16_to_f32_powervsx;
|
|
95
|
+
t->f32_to_f16 = &nk_f32_to_f16_powervsx;
|
|
96
|
+
}
|
|
97
|
+
#endif
|
|
98
|
+
|
|
92
99
|
// Scalar conversions: e5m2, e4m3, e3m2, e2m3 (serial only)
|
|
93
100
|
t->e5m2_to_f32 = &nk_e5m2_to_f32_serial;
|
|
94
101
|
t->f32_to_e5m2 = &nk_f32_to_e5m2_serial;
|
|
@@ -144,6 +151,17 @@ void nk_dispatch_math_init_(nk_capability_t caps) {
|
|
|
144
151
|
}
|
|
145
152
|
#endif
|
|
146
153
|
|
|
154
|
+
#if NK_TARGET_POWERVSX
|
|
155
|
+
if (caps & nk_cap_powervsx_k) {
|
|
156
|
+
t->f64_sqrt = &nk_f64_sqrt_powervsx;
|
|
157
|
+
t->f64_rsqrt = &nk_f64_rsqrt_powervsx;
|
|
158
|
+
t->f64_fma = &nk_f64_fma_powervsx;
|
|
159
|
+
t->f32_sqrt = &nk_f32_sqrt_powervsx;
|
|
160
|
+
t->f32_rsqrt = &nk_f32_rsqrt_powervsx;
|
|
161
|
+
t->f32_fma = &nk_f32_fma_powervsx;
|
|
162
|
+
}
|
|
163
|
+
#endif
|
|
164
|
+
|
|
147
165
|
// Scalar math: f16
|
|
148
166
|
t->f16_sqrt = &nk_f16_sqrt_serial;
|
|
149
167
|
t->f16_rsqrt = &nk_f16_rsqrt_serial;
|
package/c/dispatch_u1.c
CHANGED
|
@@ -17,15 +17,15 @@ void nk_dispatch_u1_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
17
17
|
}
|
|
18
18
|
#endif
|
|
19
19
|
#if NK_TARGET_SMEBI32
|
|
20
|
-
if (v &
|
|
21
|
-
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_u1_smebi32, *c =
|
|
22
|
-
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u1_smebi32, *c =
|
|
23
|
-
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u1_smebi32, *c =
|
|
24
|
-
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u1_smebi32, *c =
|
|
25
|
-
case nk_kernel_hammings_packed_k: *m = (m_t)&nk_hammings_packed_u1_smebi32, *c =
|
|
26
|
-
case nk_kernel_hammings_symmetric_k: *m = (m_t)&nk_hammings_symmetric_u1_smebi32, *c =
|
|
27
|
-
case nk_kernel_jaccards_packed_k: *m = (m_t)&nk_jaccards_packed_u1_smebi32, *c =
|
|
28
|
-
case nk_kernel_jaccards_symmetric_k: *m = (m_t)&nk_jaccards_symmetric_u1_smebi32, *c =
|
|
20
|
+
if (v & nk_cap_smebi32_k) switch (k) {
|
|
21
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_u1_smebi32, *c = nk_cap_smebi32_k; return;
|
|
22
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u1_smebi32, *c = nk_cap_smebi32_k; return;
|
|
23
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u1_smebi32, *c = nk_cap_smebi32_k; return;
|
|
24
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u1_smebi32, *c = nk_cap_smebi32_k; return;
|
|
25
|
+
case nk_kernel_hammings_packed_k: *m = (m_t)&nk_hammings_packed_u1_smebi32, *c = nk_cap_smebi32_k; return;
|
|
26
|
+
case nk_kernel_hammings_symmetric_k: *m = (m_t)&nk_hammings_symmetric_u1_smebi32, *c = nk_cap_smebi32_k; return;
|
|
27
|
+
case nk_kernel_jaccards_packed_k: *m = (m_t)&nk_jaccards_packed_u1_smebi32, *c = nk_cap_smebi32_k; return;
|
|
28
|
+
case nk_kernel_jaccards_symmetric_k: *m = (m_t)&nk_jaccards_symmetric_u1_smebi32, *c = nk_cap_smebi32_k; return;
|
|
29
29
|
default: break;
|
|
30
30
|
}
|
|
31
31
|
#endif
|
|
@@ -91,6 +91,51 @@ void nk_dispatch_u1_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
91
91
|
default: break;
|
|
92
92
|
}
|
|
93
93
|
#endif
|
|
94
|
+
#if NK_TARGET_POWERVSX
|
|
95
|
+
if (v & nk_cap_powervsx_k) switch (k) {
|
|
96
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_u1_powervsx, *c = nk_cap_powervsx_k; return;
|
|
97
|
+
case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u1_powervsx, *c = nk_cap_powervsx_k; return;
|
|
98
|
+
case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u1_powervsx, *c = nk_cap_powervsx_k; return;
|
|
99
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_u1_powervsx, *c = nk_cap_powervsx_k; return;
|
|
100
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u1_powervsx, *c = nk_cap_powervsx_k; return;
|
|
101
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u1_powervsx, *c = nk_cap_powervsx_k; return;
|
|
102
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u1_powervsx, *c = nk_cap_powervsx_k; return;
|
|
103
|
+
case nk_kernel_hammings_packed_k: *m = (m_t)&nk_hammings_packed_u1_powervsx, *c = nk_cap_powervsx_k; return;
|
|
104
|
+
case nk_kernel_hammings_symmetric_k:
|
|
105
|
+
*m = (m_t)&nk_hammings_symmetric_u1_powervsx, *c = nk_cap_powervsx_k;
|
|
106
|
+
return;
|
|
107
|
+
case nk_kernel_jaccards_packed_k: *m = (m_t)&nk_jaccards_packed_u1_powervsx, *c = nk_cap_powervsx_k; return;
|
|
108
|
+
case nk_kernel_jaccards_symmetric_k:
|
|
109
|
+
*m = (m_t)&nk_jaccards_symmetric_u1_powervsx, *c = nk_cap_powervsx_k;
|
|
110
|
+
return;
|
|
111
|
+
default: break;
|
|
112
|
+
}
|
|
113
|
+
#endif
|
|
114
|
+
#if NK_TARGET_LOONGSONASX
|
|
115
|
+
if (v & nk_cap_loongsonasx_k) switch (k) {
|
|
116
|
+
case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u1_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
117
|
+
case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u1_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
118
|
+
case nk_kernel_dots_packed_size_k:
|
|
119
|
+
*m = (m_t)&nk_dots_packed_size_u1_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
120
|
+
return;
|
|
121
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u1_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
122
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u1_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
123
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u1_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
124
|
+
case nk_kernel_hammings_packed_k:
|
|
125
|
+
*m = (m_t)&nk_hammings_packed_u1_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
126
|
+
return;
|
|
127
|
+
case nk_kernel_hammings_symmetric_k:
|
|
128
|
+
*m = (m_t)&nk_hammings_symmetric_u1_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
129
|
+
return;
|
|
130
|
+
case nk_kernel_jaccards_packed_k:
|
|
131
|
+
*m = (m_t)&nk_jaccards_packed_u1_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
132
|
+
return;
|
|
133
|
+
case nk_kernel_jaccards_symmetric_k:
|
|
134
|
+
*m = (m_t)&nk_jaccards_symmetric_u1_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
135
|
+
return;
|
|
136
|
+
default: break;
|
|
137
|
+
}
|
|
138
|
+
#endif
|
|
94
139
|
#if NK_TARGET_RVVBB
|
|
95
140
|
if (v & nk_cap_rvvbb_k) switch (k) {
|
|
96
141
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_u1_rvvbb, *c = nk_cap_rvvbb_k; return;
|
package/c/dispatch_u4.c
CHANGED
|
@@ -52,6 +52,9 @@ void nk_dispatch_u4_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
52
52
|
case nk_kernel_euclideans_symmetric_k:
|
|
53
53
|
*m = (m_t)&nk_euclideans_symmetric_u4_neonsdot, *c = nk_cap_neonsdot_k;
|
|
54
54
|
return;
|
|
55
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_u4_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
56
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_u4_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
57
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_u4_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
55
58
|
default: break;
|
|
56
59
|
}
|
|
57
60
|
#endif
|
package/c/dispatch_u8.c
CHANGED
|
@@ -35,6 +35,10 @@ void nk_dispatch_u8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
35
35
|
case nk_kernel_euclideans_symmetric_k:
|
|
36
36
|
*m = (m_t)&nk_euclideans_symmetric_u8_v128relaxed, *c = nk_cap_v128relaxed_k;
|
|
37
37
|
return;
|
|
38
|
+
case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
39
|
+
case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
40
|
+
case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_u8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
41
|
+
case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
|
|
38
42
|
default: break;
|
|
39
43
|
}
|
|
40
44
|
#endif
|
|
@@ -57,6 +61,15 @@ void nk_dispatch_u8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
57
61
|
default: break;
|
|
58
62
|
}
|
|
59
63
|
#endif
|
|
64
|
+
#if NK_TARGET_SVESDOT
|
|
65
|
+
if (v & nk_cap_svesdot_k) switch (k) {
|
|
66
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_u8_svesdot, *c = nk_cap_svesdot_k; return;
|
|
67
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_u8_svesdot, *c = nk_cap_svesdot_k; return;
|
|
68
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_u8_svesdot, *c = nk_cap_svesdot_k; return;
|
|
69
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_u8_svesdot, *c = nk_cap_svesdot_k; return;
|
|
70
|
+
default: break;
|
|
71
|
+
}
|
|
72
|
+
#endif
|
|
60
73
|
#if NK_TARGET_NEONSDOT
|
|
61
74
|
if (v & nk_cap_neonsdot_k) switch (k) {
|
|
62
75
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_u8_neonsdot, *c = nk_cap_neonsdot_k; return;
|
|
@@ -81,16 +94,15 @@ void nk_dispatch_u8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
81
94
|
#endif
|
|
82
95
|
#if NK_TARGET_NEONHALF
|
|
83
96
|
if (v & nk_cap_neonhalf_k) switch (k) {
|
|
84
|
-
case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u8_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
85
97
|
case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_u8_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
86
98
|
case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u8_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
87
|
-
case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u8_neonhalf, *c = nk_cap_neonhalf_k; return;
|
|
88
99
|
default: break;
|
|
89
100
|
}
|
|
90
101
|
#endif
|
|
91
102
|
#if NK_TARGET_NEON
|
|
92
103
|
if (v & nk_cap_neon_k) switch (k) {
|
|
93
104
|
case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u8_neon, *c = nk_cap_neon_k; return;
|
|
105
|
+
case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u8_neon, *c = nk_cap_neon_k; return;
|
|
94
106
|
case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u8_neon, *c = nk_cap_neon_k; return;
|
|
95
107
|
case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u8_neon, *c = nk_cap_neon_k; return;
|
|
96
108
|
default: break;
|
|
@@ -121,7 +133,6 @@ void nk_dispatch_u8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
121
133
|
#endif
|
|
122
134
|
#if NK_TARGET_SAPPHIRE
|
|
123
135
|
if (v & nk_cap_sapphire_k) switch (k) {
|
|
124
|
-
case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u8_sapphire, *c = nk_cap_sapphire_k; return;
|
|
125
136
|
case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_u8_sapphire, *c = nk_cap_sapphire_k; return;
|
|
126
137
|
case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u8_sapphire, *c = nk_cap_sapphire_k; return;
|
|
127
138
|
default: break;
|
|
@@ -223,6 +234,56 @@ void nk_dispatch_u8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
|
|
|
223
234
|
default: break;
|
|
224
235
|
}
|
|
225
236
|
#endif
|
|
237
|
+
#if NK_TARGET_POWERVSX
|
|
238
|
+
if (v & nk_cap_powervsx_k) switch (k) {
|
|
239
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_u8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
240
|
+
case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
241
|
+
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_u8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
242
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
243
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
244
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
245
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_u8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
246
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_u8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
247
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_u8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
248
|
+
case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_u8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
249
|
+
case nk_kernel_angulars_symmetric_k:
|
|
250
|
+
*m = (m_t)&nk_angulars_symmetric_u8_powervsx, *c = nk_cap_powervsx_k;
|
|
251
|
+
return;
|
|
252
|
+
case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_u8_powervsx, *c = nk_cap_powervsx_k; return;
|
|
253
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
254
|
+
*m = (m_t)&nk_euclideans_symmetric_u8_powervsx, *c = nk_cap_powervsx_k;
|
|
255
|
+
return;
|
|
256
|
+
default: break;
|
|
257
|
+
}
|
|
258
|
+
#endif
|
|
259
|
+
#if NK_TARGET_LOONGSONASX
|
|
260
|
+
if (v & nk_cap_loongsonasx_k) switch (k) {
|
|
261
|
+
case nk_kernel_dot_k: *m = (m_t)&nk_dot_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
262
|
+
case nk_kernel_angular_k: *m = (m_t)&nk_angular_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
263
|
+
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
264
|
+
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
265
|
+
case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
266
|
+
case nk_kernel_dots_packed_size_k:
|
|
267
|
+
*m = (m_t)&nk_dots_packed_size_u8_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
268
|
+
return;
|
|
269
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
270
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
271
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
|
|
272
|
+
case nk_kernel_angulars_packed_k:
|
|
273
|
+
*m = (m_t)&nk_angulars_packed_u8_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
274
|
+
return;
|
|
275
|
+
case nk_kernel_angulars_symmetric_k:
|
|
276
|
+
*m = (m_t)&nk_angulars_symmetric_u8_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
277
|
+
return;
|
|
278
|
+
case nk_kernel_euclideans_packed_k:
|
|
279
|
+
*m = (m_t)&nk_euclideans_packed_u8_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
280
|
+
return;
|
|
281
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
282
|
+
*m = (m_t)&nk_euclideans_symmetric_u8_loongsonasx, *c = nk_cap_loongsonasx_k;
|
|
283
|
+
return;
|
|
284
|
+
default: break;
|
|
285
|
+
}
|
|
286
|
+
#endif
|
|
226
287
|
#if NK_TARGET_RVV
|
|
227
288
|
if (v & nk_cap_rvv_k) switch (k) {
|
|
228
289
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_u8_rvv, *c = nk_cap_rvv_k; return;
|
package/c/numkong.c
CHANGED
|
@@ -842,6 +842,9 @@ NK_DYNAMIC nk_capability_t nk_capabilities(void) {
|
|
|
842
842
|
return static_capabilities;
|
|
843
843
|
}
|
|
844
844
|
|
|
845
|
+
/** @brief Returns the bitwise AND of `nk_capabilities()` and `nk_capabilities_compiled_()`, i.e. only the capability bits set in both masks. */
NK_DYNAMIC nk_capability_t nk_capabilities_available(void) { return nk_capabilities() & nk_capabilities_compiled_(); }
|
|
846
|
+
/** @brief Thin wrapper that forwards to `nk_capabilities_compiled_()` and returns its capability mask unchanged. */
NK_DYNAMIC nk_capability_t nk_capabilities_compiled(void) { return nk_capabilities_compiled_(); }
|
|
847
|
+
|
|
845
848
|
NK_DYNAMIC void nk_find_kernel_punned( //
|
|
846
849
|
nk_kernel_kind_t kind, //
|
|
847
850
|
nk_dtype_t dtype, //
|
package/include/README.md
CHANGED
|
@@ -25,7 +25,7 @@ int main(void) {
|
|
|
25
25
|
|
|
26
26
|
## Highlights
|
|
27
27
|
|
|
28
|
-
This is the
|
|
28
|
+
This is the primary SDK in the project.
|
|
29
29
|
It is the right layer if you want exact control over dtypes, allocators, packed buffers, dispatch, and host-side partitioning.
|
|
30
30
|
|
|
31
31
|
__Full kernel surface.__
|
|
@@ -167,6 +167,20 @@ For example, `f32_t::dot_result_t` is wider than `f32_t`.
|
|
|
167
167
|
The higher-level templates use `result_type_ = typename in_type_::dot_result_t` and similar defaults.
|
|
168
168
|
The fast typed overloads are constrained so that overriding the result type away from the native policy can disable the specialized path and fall back to the more generic one.
|
|
169
169
|
|
|
170
|
+
When `__cpp_lib_format >= 202110L` (i.e. the standard library provides the C++20 `<format>` header), all NumKong scalar types provide `std::formatter` specializations whose format specs are similar to those of the built-in `float`.
|
|
171
|
+
For the half-precision `f16_t` type, the output for `nk::f16_t::from_f32(3.14f)` will look like:
|
|
172
|
+
|
|
173
|
+
| Format spec | Output example | Description |
|
|
174
|
+
| ----------- | -------------------- | -------------------------------------- |
|
|
175
|
+
| `{}` | `3.140625` | Clean float value |
|
|
176
|
+
| `{:#}` | `3.140625 [0x4248]` | Annotated with hex bits |
|
|
177
|
+
| `{:.2f}` | `3.14` | Precision forwarded to float formatter |
|
|
178
|
+
| `{:x}` | `4248` | Raw hex bits |
|
|
179
|
+
| `{:#x}` | `0x4248` | Hex with prefix |
|
|
180
|
+
| `{:X}` | `4248` | Uppercase hex |
|
|
181
|
+
| `{:b}` | `0100001001001000` | Binary bits |
|
|
182
|
+
| `{:#b}` | `0b0100001001001000` | Binary with prefix |
|
|
183
|
+
|
|
170
184
|
## Dot Products
|
|
171
185
|
|
|
172
186
|
Dot products are one of the broadest parts of the native SDK.
|
|
@@ -243,7 +257,7 @@ nk_jsd_f32(q, p, 3, &js_reverse);
|
|
|
243
257
|
assert(js_forward == js_reverse && "JSD is symmetric");
|
|
244
258
|
```
|
|
245
259
|
|
|
246
|
-
These paths are
|
|
260
|
+
These paths are useful once you move below `f64`.
|
|
247
261
|
Naive implementations are usually dominated by repeated scalar transcendental calls and weak accumulation policy.
|
|
248
262
|
|
|
249
263
|
## Geospatial Metrics
|
|
@@ -365,9 +379,44 @@ nk::f64_t dot {};
|
|
|
365
379
|
nk::dot(view.row(0), view.row(1), md.extent(1), &dot);
|
|
366
380
|
```
|
|
367
381
|
|
|
382
|
+
## Iterators and Enumeration
|
|
383
|
+
|
|
384
|
+
NumKong containers expose random-access iterators for element and row traversal.
|
|
385
|
+
|
|
386
|
+
- __`dim_iterator`__ — random-access iterator over element values, used by `vector`, `vector_view`, and `vector_span`.
|
|
387
|
+
Supports all standard iterator operations plus `index()` to retrieve the current position.
|
|
388
|
+
- __`axis_iterator`__ — random-access iterator over sub-views (rows), used by `tensor_view` and `tensor_span`.
|
|
389
|
+
Also exposes `index()`.
|
|
390
|
+
- __`enumerate()`__ — free function returning a lightweight view that yields `{index, value}` pairs from any container with `begin()`/`end()`/`size()`.
|
|
391
|
+
|
|
392
|
+
```cpp
|
|
393
|
+
#include <numkong/numkong.hpp>
|
|
394
|
+
|
|
395
|
+
namespace nk = ashvardanian::numkong;
|
|
396
|
+
|
|
397
|
+
nk::vector<nk::f16_t> v(128);
|
|
398
|
+
for (auto [i, val] : nk::enumerate(v))
|
|
399
|
+
std::printf("[%zu] = %f\n", i, val.to_f32());
|
|
400
|
+
|
|
401
|
+
// index() on raw iterators
|
|
402
|
+
for (auto it = v.begin(); it != v.end(); ++it)
|
|
403
|
+
std::printf("[%zu] = %f\n", it.index(), (*it).to_f32());
|
|
404
|
+
```
|
|
405
|
+
|
|
406
|
+
Since `tensor.hpp` includes `vector.hpp`, `enumerate()` works on tensor row views too.
|
|
407
|
+
|
|
408
|
+
Tensors also support range-for over all logical scalar elements, yielding `(position, value)` pairs.
|
|
409
|
+
For sub-byte types each dimension is a logical scalar. Use `.dims()` to iterate values without positions.
|
|
410
|
+
|
|
411
|
+
```cpp
|
|
412
|
+
for (auto [pos, val] : matrix) { /* pos is std::array<size_t, R> */ }
|
|
413
|
+
for (auto [pos, ref] : matrix.span()) { ref = nk::f32_t{1}; }
|
|
414
|
+
for (auto val : matrix.dims()) { /* scalar only, no position */ }
|
|
415
|
+
```
|
|
416
|
+
|
|
368
417
|
## Packed Matrix Kernels for GEMM-Like Workloads
|
|
369
418
|
|
|
370
|
-
This is
|
|
419
|
+
This is a separate native subsystem from the raw vector kernels.
|
|
371
420
|
It is the right tool when the right-hand side is reused many times.
|
|
372
421
|
|
|
373
422
|
```cpp
|
|
@@ -414,7 +463,7 @@ This is SYRK-like in the sense that the output is square and symmetric.
|
|
|
414
463
|
The important difference from packed GEMM-style work is the partitioning model.
|
|
415
464
|
You typically split by output row windows, not by distinct left batches against a shared packed right-hand side.
|
|
416
465
|
|
|
417
|
-
The arithmetic advantage is
|
|
466
|
+
The arithmetic advantage is straightforward.
|
|
418
467
|
The symmetric kernels avoid recomputing both `(i, j)` and `(j, i)` pairs.
|
|
419
468
|
That cuts the pair count almost in half before any micro-kernel details matter.
|
|
420
469
|
|
|
@@ -479,8 +528,8 @@ Its footprint is exposed through `size_bytes()`.
|
|
|
479
528
|
## Runtime Dispatch and Capabilities
|
|
480
529
|
|
|
481
530
|
Dynamic dispatch is the default recommendation for shipping one binary across many CPU generations.
|
|
482
|
-
`nk_configure_thread`
|
|
483
|
-
It must be called once per thread before
|
|
531
|
+
`nk_configure_thread` enables CPU-specific acceleration features such as Intel AMX.
|
|
532
|
+
It must be called once per thread before using AMX operations and returns 1 on success, 0 on failure.
|
|
484
533
|
|
|
485
534
|
```c
|
|
486
535
|
nk_capability_t caps = nk_capabilities();
|
|
@@ -491,7 +540,7 @@ if (caps & nk_cap_sapphireamx_k) { /* AMX available */ }
|
|
|
491
540
|
For exact register-level details, see `capabilities.h`.
|
|
492
541
|
The C++ wrappers can also call directly into named backends if you want to pin a path for testing or benchmarking.
|
|
493
542
|
|
|
494
|
-
## Parallelism and
|
|
543
|
+
## Parallelism and ForkUnion
|
|
495
544
|
|
|
496
545
|
NumKong does not manage its own threads.
|
|
497
546
|
That is deliberate.
|
|
@@ -521,7 +570,7 @@ fork_union.parallel_for(0, worker_count, [&](std::size_t t) {
|
|
|
521
570
|
});
|
|
522
571
|
```
|
|
523
572
|
|
|
524
|
-
We recommend [
|
|
573
|
+
We recommend [ForkUnion](https://github.com/ashvardanian/ForkUnion) for that host-side orchestration.
|
|
525
574
|
OpenMP is still a reasonable fit if the rest of your application already uses it.
|
|
526
575
|
Manual thread pools and task systems also work well because the kernels have explicit row-range interfaces.
|
|
527
576
|
|
|
@@ -570,4 +619,25 @@ cmake -B build -D CMAKE_TOOLCHAIN_FILE=cmake/toolchain-aarch64-gnu.cmake
|
|
|
570
619
|
|
|
571
620
|
NumKong does not use OpenMP and does not create a hidden thread pool.
|
|
572
621
|
Standard pthreads are linked via CMake's `Threads` package.
|
|
573
|
-
Parallelism is host-controlled: partition work across row ranges and dispatch through
|
|
622
|
+
Parallelism is host-controlled: partition work across row ranges and dispatch through ForkUnion, `std::thread`, or any external scheduler.
|
|
623
|
+
|
|
624
|
+
## Addressing External Memory
|
|
625
|
+
|
|
626
|
+
Every kernel takes plain pointers, so any CPU-accessible memory works: mmap, pinned buffers, CUDA unified memory, custom arenas.
|
|
627
|
+
C++ views wrap any pointer without ownership.
|
|
628
|
+
Owning containers accept any C++ Allocator.
|
|
629
|
+
|
|
630
|
+
```cpp
|
|
631
|
+
template <typename T>
|
|
632
|
+
struct cuda_allocator {
|
|
633
|
+
using value_type = T;
|
|
634
|
+
T *allocate(std::size_t n) { T *p;
|
|
635
|
+
cudaMallocManaged(&p, n * sizeof(T), cudaMemAttachGlobal);
|
|
636
|
+
return p; }
|
|
637
|
+
void deallocate(T *p, std::size_t) noexcept { cudaFree(p); }
|
|
638
|
+
};
|
|
639
|
+
|
|
640
|
+
nk_dot_f32(cuda_managed_ptr, cuda_managed_ptr, 1024, &dot); // C ABI, any pointer
|
|
641
|
+
auto view = nk::tensor_view<nk::f32_t>(mmap_ptr, rows, cols); // non-owning view
|
|
642
|
+
auto v = nk::vector<float, cuda_allocator<float>>::try_zeros(1024); // allocator-aware owning
|
|
643
|
+
```
|