numkong 7.0.0 → 7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
package/include/numkong/dots.h
CHANGED
|
@@ -140,18 +140,18 @@
|
|
|
140
140
|
* Low-precision matmul relies on VPMADD* (AVX2), VNNI dot-products, and BF16 dot-products
|
|
141
141
|
* on AVX-512. Zen4 improves throughput by dual-issuing many integer ops on FP ports.
|
|
142
142
|
*
|
|
143
|
-
* Intrinsic
|
|
144
|
-
* _mm256_maddubs_epi16
|
|
145
|
-
* _mm256_madd_epi16
|
|
146
|
-
* _mm256_dpbusd_epi32
|
|
147
|
-
* _mm256_dpwssds_epi32
|
|
148
|
-
* _mm256_dpbf16_ps
|
|
143
|
+
* Intrinsic Instruction Haswell Genoa
|
|
144
|
+
* _mm256_maddubs_epi16 VPMADDUBSW (YMM, YMM, YMM) 5cy @ p0 3cy @ p01
|
|
145
|
+
* _mm256_madd_epi16 VPMADDWD (YMM, YMM, YMM) 5cy @ p0 3cy @ p01
|
|
146
|
+
* _mm256_dpbusd_epi32 VPDPBUSD (YMM, K, YMM, YMM) n/a 4cy @ p01
|
|
147
|
+
* _mm256_dpwssds_epi32 VPDPWSSDS (YMM, K, YMM, YMM) n/a 4cy @ p01
|
|
148
|
+
* _mm256_dpbf16_ps VDPBF16PS (YMM, YMM, YMM) n/a 6cy @ p01
|
|
149
149
|
*
|
|
150
150
|
* AMX tile ops (TDPBF16PS/TDPBUSD/TDPBSSD) are not covered by the uops.info 2022 dataset.
|
|
151
151
|
*
|
|
152
152
|
* @section references References
|
|
153
153
|
*
|
|
154
|
-
* - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
|
|
154
|
+
* - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
|
|
155
155
|
* - Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
|
|
156
156
|
* - uops.info: https://uops.info/
|
|
157
157
|
* - Matrix Multiplication in 40 lines: https://en.algorithmica.org/hpc/algorithms/matmul/
|
|
@@ -298,64 +298,64 @@ NK_DYNAMIC void nk_dots_packed_u1(nk_u1x8_t const *a, void const *b_packed, nk_u
|
|
|
298
298
|
/**
|
|
299
299
|
* @brief Computes C = A × Aᵀ symmetric Gram matrix.
|
|
300
300
|
* @param[in] vectors Input matrix of row vectors in row-major order.
|
|
301
|
-
* @param[in]
|
|
301
|
+
* @param[in] vectors_count Number of vectors (rows) in the input matrix.
|
|
302
302
|
* @param[in] depth Dimension of each vector (columns).
|
|
303
303
|
* @param[in] stride Row stride in bytes for the input matrix.
|
|
304
|
-
* @param[out] result Output symmetric matrix (
|
|
304
|
+
* @param[out] result Output symmetric matrix (vectors_count × vectors_count).
|
|
305
305
|
* @param[in] result_stride Row stride in bytes for the result matrix.
|
|
306
306
|
* @param[in] row_start Starting row offset of results to compute (needed for parallelism).
|
|
307
307
|
* @param[in] row_count Number of rows of results to compute (needed for parallelism).
|
|
308
308
|
*/
|
|
309
|
-
NK_DYNAMIC void nk_dots_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t
|
|
310
|
-
nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
309
|
+
NK_DYNAMIC void nk_dots_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
310
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
311
311
|
nk_size_t row_count);
|
|
312
312
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
313
|
-
NK_DYNAMIC void nk_dots_symmetric_f16(nk_f16_t const *vectors, nk_size_t
|
|
314
|
-
nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
313
|
+
NK_DYNAMIC void nk_dots_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
314
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
315
315
|
nk_size_t row_count);
|
|
316
316
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
317
|
-
NK_DYNAMIC void nk_dots_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t
|
|
318
|
-
nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
317
|
+
NK_DYNAMIC void nk_dots_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
318
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
319
319
|
nk_size_t row_count);
|
|
320
320
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
321
|
-
NK_DYNAMIC void nk_dots_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t
|
|
322
|
-
nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
321
|
+
NK_DYNAMIC void nk_dots_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
322
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
323
323
|
nk_size_t row_count);
|
|
324
324
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
325
|
-
NK_DYNAMIC void nk_dots_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t
|
|
326
|
-
nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
325
|
+
NK_DYNAMIC void nk_dots_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
326
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
327
327
|
nk_size_t row_count);
|
|
328
328
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
329
|
-
NK_DYNAMIC void nk_dots_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t
|
|
330
|
-
nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
329
|
+
NK_DYNAMIC void nk_dots_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
330
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
331
331
|
nk_size_t row_count);
|
|
332
332
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
333
|
-
NK_DYNAMIC void nk_dots_symmetric_f32(nk_f32_t const *vectors, nk_size_t
|
|
334
|
-
nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
333
|
+
NK_DYNAMIC void nk_dots_symmetric_f32(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
334
|
+
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
335
335
|
nk_size_t row_count);
|
|
336
336
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
337
|
-
NK_DYNAMIC void nk_dots_symmetric_f64(nk_f64_t const *vectors, nk_size_t
|
|
338
|
-
nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
337
|
+
NK_DYNAMIC void nk_dots_symmetric_f64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
338
|
+
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
339
339
|
nk_size_t row_count);
|
|
340
340
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
341
|
-
NK_DYNAMIC void nk_dots_symmetric_i8(nk_i8_t const *vectors, nk_size_t
|
|
341
|
+
NK_DYNAMIC void nk_dots_symmetric_i8(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride,
|
|
342
342
|
nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
343
343
|
nk_size_t row_count);
|
|
344
344
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
345
|
-
NK_DYNAMIC void nk_dots_symmetric_u8(nk_u8_t const *vectors, nk_size_t
|
|
345
|
+
NK_DYNAMIC void nk_dots_symmetric_u8(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride,
|
|
346
346
|
nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
347
347
|
nk_size_t row_count);
|
|
348
348
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
349
|
-
NK_DYNAMIC void nk_dots_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t
|
|
350
|
-
nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
349
|
+
NK_DYNAMIC void nk_dots_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
350
|
+
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
351
351
|
nk_size_t row_count);
|
|
352
352
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
353
|
-
NK_DYNAMIC void nk_dots_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t
|
|
354
|
-
nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
353
|
+
NK_DYNAMIC void nk_dots_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
354
|
+
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
355
355
|
nk_size_t row_count);
|
|
356
356
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
357
|
-
NK_DYNAMIC void nk_dots_symmetric_u1(nk_u1x8_t const *vectors, nk_size_t
|
|
358
|
-
nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
357
|
+
NK_DYNAMIC void nk_dots_symmetric_u1(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
358
|
+
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
359
359
|
nk_size_t row_count);
|
|
360
360
|
|
|
361
361
|
/** @copydoc nk_dots_packed_size_f32 */
|
|
@@ -367,7 +367,7 @@ NK_PUBLIC void nk_dots_pack_f32_serial(nk_f32_t const *b, nk_size_t width, nk_si
|
|
|
367
367
|
NK_PUBLIC void nk_dots_packed_f32_serial(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
368
368
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
369
369
|
/** @copydoc nk_dots_symmetric_f32 */
|
|
370
|
-
NK_PUBLIC void nk_dots_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t
|
|
370
|
+
NK_PUBLIC void nk_dots_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
371
371
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
372
372
|
nk_size_t row_start, nk_size_t row_count);
|
|
373
373
|
|
|
@@ -380,7 +380,7 @@ NK_PUBLIC void nk_dots_pack_f64_serial(nk_f64_t const *b, nk_size_t width, nk_si
|
|
|
380
380
|
NK_PUBLIC void nk_dots_packed_f64_serial(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
381
381
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
382
382
|
/** @copydoc nk_dots_symmetric_f64 */
|
|
383
|
-
NK_PUBLIC void nk_dots_symmetric_f64_serial(nk_f64_t const *vectors, nk_size_t
|
|
383
|
+
NK_PUBLIC void nk_dots_symmetric_f64_serial(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
384
384
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
385
385
|
nk_size_t row_start, nk_size_t row_count);
|
|
386
386
|
|
|
@@ -393,7 +393,7 @@ NK_PUBLIC void nk_dots_pack_f16_serial(nk_f16_t const *b, nk_size_t width, nk_si
|
|
|
393
393
|
NK_PUBLIC void nk_dots_packed_f16_serial(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
394
394
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
395
395
|
/** @copydoc nk_dots_symmetric_f16 */
|
|
396
|
-
NK_PUBLIC void nk_dots_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t
|
|
396
|
+
NK_PUBLIC void nk_dots_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
397
397
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
398
398
|
nk_size_t row_start, nk_size_t row_count);
|
|
399
399
|
|
|
@@ -406,7 +406,7 @@ NK_PUBLIC void nk_dots_pack_bf16_serial(nk_bf16_t const *b, nk_size_t width, nk_
|
|
|
406
406
|
NK_PUBLIC void nk_dots_packed_bf16_serial(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
407
407
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
408
408
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
409
|
-
NK_PUBLIC void nk_dots_symmetric_bf16_serial(nk_bf16_t const *vectors, nk_size_t
|
|
409
|
+
NK_PUBLIC void nk_dots_symmetric_bf16_serial(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
410
410
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
411
411
|
nk_size_t row_start, nk_size_t row_count);
|
|
412
412
|
|
|
@@ -419,7 +419,7 @@ NK_PUBLIC void nk_dots_pack_i8_serial(nk_i8_t const *b, nk_size_t width, nk_size
|
|
|
419
419
|
NK_PUBLIC void nk_dots_packed_i8_serial(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
420
420
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
421
421
|
/** @copydoc nk_dots_symmetric_i8 */
|
|
422
|
-
NK_PUBLIC void nk_dots_symmetric_i8_serial(nk_i8_t const *vectors, nk_size_t
|
|
422
|
+
NK_PUBLIC void nk_dots_symmetric_i8_serial(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
423
423
|
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
424
424
|
nk_size_t row_start, nk_size_t row_count);
|
|
425
425
|
|
|
@@ -432,7 +432,7 @@ NK_PUBLIC void nk_dots_pack_u8_serial(nk_u8_t const *b, nk_size_t width, nk_size
|
|
|
432
432
|
NK_PUBLIC void nk_dots_packed_u8_serial(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
433
433
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
434
434
|
/** @copydoc nk_dots_symmetric_u8 */
|
|
435
|
-
NK_PUBLIC void nk_dots_symmetric_u8_serial(nk_u8_t const *vectors, nk_size_t
|
|
435
|
+
NK_PUBLIC void nk_dots_symmetric_u8_serial(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
436
436
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
437
437
|
nk_size_t row_start, nk_size_t row_count);
|
|
438
438
|
|
|
@@ -445,7 +445,7 @@ NK_PUBLIC void nk_dots_pack_u4_serial(nk_u4x2_t const *b, nk_size_t width, nk_si
|
|
|
445
445
|
NK_PUBLIC void nk_dots_packed_u4_serial(nk_u4x2_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
446
446
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
447
447
|
/** @copydoc nk_dots_symmetric_u4 */
|
|
448
|
-
NK_PUBLIC void nk_dots_symmetric_u4_serial(nk_u4x2_t const *vectors, nk_size_t
|
|
448
|
+
NK_PUBLIC void nk_dots_symmetric_u4_serial(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
449
449
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
450
450
|
nk_size_t row_start, nk_size_t row_count);
|
|
451
451
|
|
|
@@ -458,7 +458,7 @@ NK_PUBLIC void nk_dots_pack_u1_serial(nk_u1x8_t const *b, nk_size_t width, nk_si
|
|
|
458
458
|
NK_PUBLIC void nk_dots_packed_u1_serial(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
459
459
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
460
460
|
/** @copydoc nk_dots_symmetric_u1 */
|
|
461
|
-
NK_PUBLIC void nk_dots_symmetric_u1_serial(nk_u1x8_t const *vectors, nk_size_t
|
|
461
|
+
NK_PUBLIC void nk_dots_symmetric_u1_serial(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
462
462
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
463
463
|
nk_size_t row_start, nk_size_t row_count);
|
|
464
464
|
|
|
@@ -471,23 +471,23 @@ NK_PUBLIC void nk_dots_pack_i4_serial(nk_i4x2_t const *b, nk_size_t width, nk_si
|
|
|
471
471
|
NK_PUBLIC void nk_dots_packed_i4_serial(nk_i4x2_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
472
472
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
473
473
|
/** @copydoc nk_dots_symmetric_i4 */
|
|
474
|
-
NK_PUBLIC void nk_dots_symmetric_i4_serial(nk_i4x2_t const *vectors, nk_size_t
|
|
474
|
+
NK_PUBLIC void nk_dots_symmetric_i4_serial(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
475
475
|
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
476
476
|
nk_size_t row_start, nk_size_t row_count);
|
|
477
477
|
/** @copydoc nk_dots_symmetric_e4m3 */
|
|
478
|
-
NK_PUBLIC void nk_dots_symmetric_e4m3_serial(nk_e4m3_t const *vectors, nk_size_t
|
|
478
|
+
NK_PUBLIC void nk_dots_symmetric_e4m3_serial(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
479
479
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
480
480
|
nk_size_t row_start, nk_size_t row_count);
|
|
481
481
|
/** @copydoc nk_dots_symmetric_e5m2 */
|
|
482
|
-
NK_PUBLIC void nk_dots_symmetric_e5m2_serial(nk_e5m2_t const *vectors, nk_size_t
|
|
482
|
+
NK_PUBLIC void nk_dots_symmetric_e5m2_serial(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
483
483
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
484
484
|
nk_size_t row_start, nk_size_t row_count);
|
|
485
485
|
/** @copydoc nk_dots_symmetric_e2m3 */
|
|
486
|
-
NK_PUBLIC void nk_dots_symmetric_e2m3_serial(nk_e2m3_t const *vectors, nk_size_t
|
|
486
|
+
NK_PUBLIC void nk_dots_symmetric_e2m3_serial(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
487
487
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
488
488
|
nk_size_t row_start, nk_size_t row_count);
|
|
489
489
|
/** @copydoc nk_dots_symmetric_e3m2 */
|
|
490
|
-
NK_PUBLIC void nk_dots_symmetric_e3m2_serial(nk_e3m2_t const *vectors, nk_size_t
|
|
490
|
+
NK_PUBLIC void nk_dots_symmetric_e3m2_serial(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
491
491
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
492
492
|
nk_size_t row_start, nk_size_t row_count);
|
|
493
493
|
/** @copydoc nk_dots_packed_size_e2m3 */
|
|
@@ -521,7 +521,7 @@ NK_PUBLIC void nk_dots_pack_bf16_genoa(nk_bf16_t const *b, nk_size_t width, nk_s
|
|
|
521
521
|
NK_PUBLIC void nk_dots_packed_bf16_genoa(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
522
522
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
523
523
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
524
|
-
NK_PUBLIC void nk_dots_symmetric_bf16_genoa(nk_bf16_t const *vectors, nk_size_t
|
|
524
|
+
NK_PUBLIC void nk_dots_symmetric_bf16_genoa(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
525
525
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
526
526
|
nk_size_t row_start, nk_size_t row_count);
|
|
527
527
|
|
|
@@ -542,15 +542,42 @@ NK_PUBLIC void nk_dots_pack_e5m2_genoa(nk_e5m2_t const *b, nk_size_t width, nk_s
|
|
|
542
542
|
NK_PUBLIC void nk_dots_packed_e5m2_genoa(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
543
543
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
544
544
|
/** @copydoc nk_dots_symmetric_e4m3 */
|
|
545
|
-
NK_PUBLIC void nk_dots_symmetric_e4m3_genoa(nk_e4m3_t const *vectors, nk_size_t
|
|
545
|
+
NK_PUBLIC void nk_dots_symmetric_e4m3_genoa(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
546
546
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
547
547
|
nk_size_t row_start, nk_size_t row_count);
|
|
548
548
|
/** @copydoc nk_dots_symmetric_e5m2 */
|
|
549
|
-
NK_PUBLIC void nk_dots_symmetric_e5m2_genoa(nk_e5m2_t const *vectors, nk_size_t
|
|
549
|
+
NK_PUBLIC void nk_dots_symmetric_e5m2_genoa(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
550
550
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
551
551
|
nk_size_t row_start, nk_size_t row_count);
|
|
552
552
|
#endif // NK_TARGET_GENOA
|
|
553
553
|
|
|
554
|
+
#if NK_TARGET_DIAMOND
|
|
555
|
+
/** @copydoc nk_dots_packed_size_e4m3 */
|
|
556
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_e4m3_diamond(nk_size_t width, nk_size_t depth);
|
|
557
|
+
/** @copydoc nk_dots_pack_e4m3 */
|
|
558
|
+
NK_PUBLIC void nk_dots_pack_e4m3_diamond(nk_e4m3_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
559
|
+
void *b_packed);
|
|
560
|
+
/** @copydoc nk_dots_packed_e4m3 */
|
|
561
|
+
NK_PUBLIC void nk_dots_packed_e4m3_diamond(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
562
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
563
|
+
/** @copydoc nk_dots_packed_size_e5m2 */
|
|
564
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_e5m2_diamond(nk_size_t width, nk_size_t depth);
|
|
565
|
+
/** @copydoc nk_dots_pack_e5m2 */
|
|
566
|
+
NK_PUBLIC void nk_dots_pack_e5m2_diamond(nk_e5m2_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
567
|
+
void *b_packed);
|
|
568
|
+
/** @copydoc nk_dots_packed_e5m2 */
|
|
569
|
+
NK_PUBLIC void nk_dots_packed_e5m2_diamond(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
570
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
571
|
+
/** @copydoc nk_dots_symmetric_e4m3 */
|
|
572
|
+
NK_PUBLIC void nk_dots_symmetric_e4m3_diamond(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
573
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
574
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
575
|
+
/** @copydoc nk_dots_symmetric_e5m2 */
|
|
576
|
+
NK_PUBLIC void nk_dots_symmetric_e5m2_diamond(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
577
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
578
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
579
|
+
#endif // NK_TARGET_DIAMOND
|
|
580
|
+
|
|
554
581
|
/* Sapphire Rapids backends using Intel AMX (Advanced Matrix Extensions).
|
|
555
582
|
* AMX provides 8 tile registers (TMM0-TMM7), each holding up to 1KB of data.
|
|
556
583
|
* Tiles are configured as 16 rows × 64 bytes, enabling (16 × 32) BF16 or (16 × 64) INT8 tiles.
|
|
@@ -567,7 +594,7 @@ NK_PUBLIC void nk_dots_packed_bf16_sapphireamx(nk_bf16_t const *a, void const *b
|
|
|
567
594
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride,
|
|
568
595
|
nk_size_t c_stride);
|
|
569
596
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
570
|
-
NK_PUBLIC void nk_dots_symmetric_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t
|
|
597
|
+
NK_PUBLIC void nk_dots_symmetric_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
571
598
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
572
599
|
nk_size_t row_start, nk_size_t row_count);
|
|
573
600
|
|
|
@@ -580,7 +607,7 @@ NK_PUBLIC void nk_dots_pack_i8_sapphireamx(nk_i8_t const *b, nk_size_t width, nk
|
|
|
580
607
|
NK_PUBLIC void nk_dots_packed_i8_sapphireamx(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
581
608
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
582
609
|
/** @copydoc nk_dots_symmetric_i8 */
|
|
583
|
-
NK_PUBLIC void nk_dots_symmetric_i8_sapphireamx(nk_i8_t const *vectors, nk_size_t
|
|
610
|
+
NK_PUBLIC void nk_dots_symmetric_i8_sapphireamx(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
584
611
|
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
585
612
|
nk_size_t row_start, nk_size_t row_count);
|
|
586
613
|
|
|
@@ -595,7 +622,7 @@ NK_PUBLIC void nk_dots_packed_e4m3_sapphireamx(nk_e4m3_t const *a, void const *b
|
|
|
595
622
|
nk_size_t c_stride);
|
|
596
623
|
|
|
597
624
|
/** @copydoc nk_dots_symmetric_e4m3 */
|
|
598
|
-
NK_PUBLIC void nk_dots_symmetric_e4m3_sapphireamx(nk_e4m3_t const *vectors, nk_size_t
|
|
625
|
+
NK_PUBLIC void nk_dots_symmetric_e4m3_sapphireamx(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
599
626
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
600
627
|
nk_size_t row_start, nk_size_t row_count);
|
|
601
628
|
|
|
@@ -609,7 +636,7 @@ NK_PUBLIC void nk_dots_packed_e5m2_sapphireamx(nk_e5m2_t const *a, void const *b
|
|
|
609
636
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride,
|
|
610
637
|
nk_size_t c_stride);
|
|
611
638
|
/** @copydoc nk_dots_symmetric_e5m2 */
|
|
612
|
-
NK_PUBLIC void nk_dots_symmetric_e5m2_sapphireamx(nk_e5m2_t const *vectors, nk_size_t
|
|
639
|
+
NK_PUBLIC void nk_dots_symmetric_e5m2_sapphireamx(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
613
640
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
614
641
|
nk_size_t row_start, nk_size_t row_count);
|
|
615
642
|
/** @copydoc nk_dots_packed_size_e2m3 */
|
|
@@ -622,7 +649,7 @@ NK_PUBLIC void nk_dots_packed_e2m3_sapphireamx(nk_e2m3_t const *a, void const *b
|
|
|
622
649
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride,
|
|
623
650
|
nk_size_t c_stride);
|
|
624
651
|
/** @copydoc nk_dots_symmetric_e2m3 */
|
|
625
|
-
NK_PUBLIC void nk_dots_symmetric_e2m3_sapphireamx(nk_e2m3_t const *vectors, nk_size_t
|
|
652
|
+
NK_PUBLIC void nk_dots_symmetric_e2m3_sapphireamx(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
626
653
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
627
654
|
nk_size_t row_start, nk_size_t row_count);
|
|
628
655
|
|
|
@@ -636,7 +663,7 @@ NK_PUBLIC void nk_dots_packed_e3m2_sapphireamx(nk_e3m2_t const *a, void const *b
|
|
|
636
663
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride,
|
|
637
664
|
nk_size_t c_stride);
|
|
638
665
|
/** @copydoc nk_dots_symmetric_e3m2 */
|
|
639
|
-
NK_PUBLIC void nk_dots_symmetric_e3m2_sapphireamx(nk_e3m2_t const *vectors, nk_size_t
|
|
666
|
+
NK_PUBLIC void nk_dots_symmetric_e3m2_sapphireamx(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
640
667
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
641
668
|
nk_size_t row_start, nk_size_t row_count);
|
|
642
669
|
|
|
@@ -649,7 +676,7 @@ NK_PUBLIC void nk_dots_pack_u8_sapphireamx(nk_u8_t const *b, nk_size_t width, nk
|
|
|
649
676
|
NK_PUBLIC void nk_dots_packed_u8_sapphireamx(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
650
677
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
651
678
|
/** @copydoc nk_dots_symmetric_u8 */
|
|
652
|
-
NK_PUBLIC void nk_dots_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk_size_t
|
|
679
|
+
NK_PUBLIC void nk_dots_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
653
680
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
654
681
|
nk_size_t row_start, nk_size_t row_count);
|
|
655
682
|
#endif // NK_TARGET_SAPPHIREAMX
|
|
@@ -668,7 +695,7 @@ NK_PUBLIC void nk_dots_pack_f16_sme(nk_f16_t const *b, nk_size_t width, nk_size_
|
|
|
668
695
|
NK_PUBLIC void nk_dots_packed_f16_sme(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
669
696
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
670
697
|
/** @copydoc nk_dots_symmetric_f16 */
|
|
671
|
-
NK_PUBLIC void nk_dots_symmetric_f16_sme(nk_f16_t const *vectors, nk_size_t
|
|
698
|
+
NK_PUBLIC void nk_dots_symmetric_f16_sme(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
672
699
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
673
700
|
nk_size_t row_start, nk_size_t row_count);
|
|
674
701
|
|
|
@@ -681,7 +708,7 @@ NK_PUBLIC void nk_dots_pack_bf16_sme(nk_bf16_t const *b, nk_size_t width, nk_siz
|
|
|
681
708
|
NK_PUBLIC void nk_dots_packed_bf16_sme(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
682
709
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
683
710
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
684
|
-
NK_PUBLIC void nk_dots_symmetric_bf16_sme(nk_bf16_t const *vectors, nk_size_t
|
|
711
|
+
NK_PUBLIC void nk_dots_symmetric_bf16_sme(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
685
712
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
686
713
|
nk_size_t row_start, nk_size_t row_count);
|
|
687
714
|
|
|
@@ -694,9 +721,9 @@ NK_PUBLIC void nk_dots_pack_i8_sme(nk_i8_t const *b, nk_size_t width, nk_size_t
|
|
|
694
721
|
NK_PUBLIC void nk_dots_packed_i8_sme(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
695
722
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
696
723
|
/** @copydoc nk_dots_symmetric_i8 */
|
|
697
|
-
NK_PUBLIC void nk_dots_symmetric_i8_sme(nk_i8_t const *vectors, nk_size_t
|
|
698
|
-
nk_i32_t *result, nk_size_t result_stride,
|
|
699
|
-
nk_size_t row_count);
|
|
724
|
+
NK_PUBLIC void nk_dots_symmetric_i8_sme(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
725
|
+
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
726
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
700
727
|
|
|
701
728
|
/** @copydoc nk_dots_packed_size_u8 */
|
|
702
729
|
NK_PUBLIC nk_size_t nk_dots_packed_size_u8_sme(nk_size_t width, nk_size_t depth);
|
|
@@ -707,9 +734,9 @@ NK_PUBLIC void nk_dots_pack_u8_sme(nk_u8_t const *b, nk_size_t width, nk_size_t
|
|
|
707
734
|
NK_PUBLIC void nk_dots_packed_u8_sme(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
708
735
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
709
736
|
/** @copydoc nk_dots_symmetric_u8 */
|
|
710
|
-
NK_PUBLIC void nk_dots_symmetric_u8_sme(nk_u8_t const *vectors, nk_size_t
|
|
711
|
-
nk_u32_t *result, nk_size_t result_stride,
|
|
712
|
-
nk_size_t row_count);
|
|
737
|
+
NK_PUBLIC void nk_dots_symmetric_u8_sme(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
738
|
+
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
739
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
713
740
|
|
|
714
741
|
/** @copydoc nk_dots_packed_size_e4m3 */
|
|
715
742
|
NK_PUBLIC nk_size_t nk_dots_packed_size_e4m3_sme(nk_size_t width, nk_size_t depth);
|
|
@@ -720,7 +747,7 @@ NK_PUBLIC void nk_dots_pack_e4m3_sme(nk_e4m3_t const *b, nk_size_t width, nk_siz
|
|
|
720
747
|
NK_PUBLIC void nk_dots_packed_e4m3_sme(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
721
748
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
722
749
|
/** @copydoc nk_dots_symmetric_e4m3 */
|
|
723
|
-
NK_PUBLIC void nk_dots_symmetric_e4m3_sme(nk_e4m3_t const *vectors, nk_size_t
|
|
750
|
+
NK_PUBLIC void nk_dots_symmetric_e4m3_sme(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
724
751
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
725
752
|
nk_size_t row_start, nk_size_t row_count);
|
|
726
753
|
|
|
@@ -733,7 +760,7 @@ NK_PUBLIC void nk_dots_pack_e5m2_sme(nk_e5m2_t const *b, nk_size_t width, nk_siz
|
|
|
733
760
|
NK_PUBLIC void nk_dots_packed_e5m2_sme(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
734
761
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
735
762
|
/** @copydoc nk_dots_symmetric_e5m2 */
|
|
736
|
-
NK_PUBLIC void nk_dots_symmetric_e5m2_sme(nk_e5m2_t const *vectors, nk_size_t
|
|
763
|
+
NK_PUBLIC void nk_dots_symmetric_e5m2_sme(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
737
764
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
738
765
|
nk_size_t row_start, nk_size_t row_count);
|
|
739
766
|
|
|
@@ -746,7 +773,7 @@ NK_PUBLIC void nk_dots_pack_u4_sme(nk_u4x2_t const *b, nk_size_t width, nk_size_
|
|
|
746
773
|
NK_PUBLIC void nk_dots_packed_u4_sme(nk_u4x2_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
747
774
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
748
775
|
/** @copydoc nk_dots_symmetric_u4 */
|
|
749
|
-
NK_PUBLIC void nk_dots_symmetric_u4_sme(nk_u4x2_t const *vectors, nk_size_t
|
|
776
|
+
NK_PUBLIC void nk_dots_symmetric_u4_sme(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
750
777
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
751
778
|
nk_size_t row_start, nk_size_t row_count);
|
|
752
779
|
|
|
@@ -759,7 +786,7 @@ NK_PUBLIC void nk_dots_pack_i4_sme(nk_i4x2_t const *b, nk_size_t width, nk_size_
|
|
|
759
786
|
NK_PUBLIC void nk_dots_packed_i4_sme(nk_i4x2_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
760
787
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
761
788
|
/** @copydoc nk_dots_symmetric_i4 */
|
|
762
|
-
NK_PUBLIC void nk_dots_symmetric_i4_sme(nk_i4x2_t const *vectors, nk_size_t
|
|
789
|
+
NK_PUBLIC void nk_dots_symmetric_i4_sme(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
763
790
|
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
764
791
|
nk_size_t row_start, nk_size_t row_count);
|
|
765
792
|
|
|
@@ -772,7 +799,7 @@ NK_PUBLIC void nk_dots_pack_e2m3_sme(nk_e2m3_t const *b, nk_size_t width, nk_siz
|
|
|
772
799
|
NK_PUBLIC void nk_dots_packed_e2m3_sme(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
773
800
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
774
801
|
/** @copydoc nk_dots_symmetric_e2m3 */
|
|
775
|
-
NK_PUBLIC void nk_dots_symmetric_e2m3_sme(nk_e2m3_t const *vectors, nk_size_t
|
|
802
|
+
NK_PUBLIC void nk_dots_symmetric_e2m3_sme(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
776
803
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
777
804
|
nk_size_t row_start, nk_size_t row_count);
|
|
778
805
|
|
|
@@ -785,7 +812,7 @@ NK_PUBLIC void nk_dots_pack_e3m2_sme(nk_e3m2_t const *b, nk_size_t width, nk_siz
|
|
|
785
812
|
NK_PUBLIC void nk_dots_packed_e3m2_sme(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
786
813
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
787
814
|
/** @copydoc nk_dots_symmetric_e3m2 */
|
|
788
|
-
NK_PUBLIC void nk_dots_symmetric_e3m2_sme(nk_e3m2_t const *vectors, nk_size_t
|
|
815
|
+
NK_PUBLIC void nk_dots_symmetric_e3m2_sme(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
789
816
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
790
817
|
nk_size_t row_start, nk_size_t row_count);
|
|
791
818
|
#endif // NK_TARGET_SME
|
|
@@ -803,7 +830,7 @@ NK_PUBLIC void nk_dots_pack_u1_smebi32(nk_u1x8_t const *b, nk_size_t width, nk_s
|
|
|
803
830
|
NK_PUBLIC void nk_dots_packed_u1_smebi32(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
804
831
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
805
832
|
/** @copydoc nk_dots_symmetric_u1 */
|
|
806
|
-
NK_PUBLIC void nk_dots_symmetric_u1_smebi32(nk_u1x8_t const *vectors, nk_size_t
|
|
833
|
+
NK_PUBLIC void nk_dots_symmetric_u1_smebi32(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
807
834
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
808
835
|
nk_size_t row_start, nk_size_t row_count);
|
|
809
836
|
#endif // NK_TARGET_SMEBI32
|
|
@@ -821,7 +848,7 @@ NK_PUBLIC void nk_dots_pack_f32_smef64(nk_f32_t const *b, nk_size_t width, nk_si
|
|
|
821
848
|
NK_PUBLIC void nk_dots_packed_f32_smef64(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
822
849
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
823
850
|
/** @copydoc nk_dots_symmetric_f32 */
|
|
824
|
-
NK_PUBLIC void nk_dots_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t
|
|
851
|
+
NK_PUBLIC void nk_dots_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
825
852
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
826
853
|
nk_size_t row_start, nk_size_t row_count);
|
|
827
854
|
|
|
@@ -834,7 +861,7 @@ NK_PUBLIC void nk_dots_pack_f64_smef64(nk_f64_t const *b, nk_size_t width, nk_si
|
|
|
834
861
|
NK_PUBLIC void nk_dots_packed_f64_smef64(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
835
862
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
836
863
|
/** @copydoc nk_dots_symmetric_f64 */
|
|
837
|
-
NK_PUBLIC void nk_dots_symmetric_f64_smef64(nk_f64_t const *vectors, nk_size_t
|
|
864
|
+
NK_PUBLIC void nk_dots_symmetric_f64_smef64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
838
865
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
839
866
|
nk_size_t row_start, nk_size_t row_count);
|
|
840
867
|
#endif // NK_TARGET_SMEF64
|
|
@@ -852,7 +879,7 @@ NK_PUBLIC void nk_dots_pack_f32_haswell(nk_f32_t const *b, nk_size_t width, nk_s
|
|
|
852
879
|
NK_PUBLIC void nk_dots_packed_f32_haswell(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
853
880
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
854
881
|
/** @copydoc nk_dots_symmetric_f32 */
|
|
855
|
-
NK_PUBLIC void nk_dots_symmetric_f32_haswell(nk_f32_t const *vectors, nk_size_t
|
|
882
|
+
NK_PUBLIC void nk_dots_symmetric_f32_haswell(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
856
883
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
857
884
|
nk_size_t row_start, nk_size_t row_count);
|
|
858
885
|
/** @copydoc nk_dots_packed_size_f64 */
|
|
@@ -864,7 +891,7 @@ NK_PUBLIC void nk_dots_pack_f64_haswell(nk_f64_t const *b, nk_size_t width, nk_s
|
|
|
864
891
|
NK_PUBLIC void nk_dots_packed_f64_haswell(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
865
892
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
866
893
|
/** @copydoc nk_dots_symmetric_f64 */
|
|
867
|
-
NK_PUBLIC void nk_dots_symmetric_f64_haswell(nk_f64_t const *vectors, nk_size_t
|
|
894
|
+
NK_PUBLIC void nk_dots_symmetric_f64_haswell(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
868
895
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
869
896
|
nk_size_t row_start, nk_size_t row_count);
|
|
870
897
|
/** @copydoc nk_dots_packed_size_f16 */
|
|
@@ -876,7 +903,7 @@ NK_PUBLIC void nk_dots_pack_f16_haswell(nk_f16_t const *b, nk_size_t width, nk_s
|
|
|
876
903
|
NK_PUBLIC void nk_dots_packed_f16_haswell(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
877
904
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
878
905
|
/** @copydoc nk_dots_symmetric_f16 */
|
|
879
|
-
NK_PUBLIC void nk_dots_symmetric_f16_haswell(nk_f16_t const *vectors, nk_size_t
|
|
906
|
+
NK_PUBLIC void nk_dots_symmetric_f16_haswell(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
880
907
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
881
908
|
nk_size_t row_start, nk_size_t row_count);
|
|
882
909
|
/** @copydoc nk_dots_packed_size_bf16 */
|
|
@@ -888,7 +915,7 @@ NK_PUBLIC void nk_dots_pack_bf16_haswell(nk_bf16_t const *b, nk_size_t width, nk
|
|
|
888
915
|
NK_PUBLIC void nk_dots_packed_bf16_haswell(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
889
916
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
890
917
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
891
|
-
NK_PUBLIC void nk_dots_symmetric_bf16_haswell(nk_bf16_t const *vectors, nk_size_t
|
|
918
|
+
NK_PUBLIC void nk_dots_symmetric_bf16_haswell(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
892
919
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
893
920
|
nk_size_t row_start, nk_size_t row_count);
|
|
894
921
|
/** @copydoc nk_dots_packed_size_e4m3 */
|
|
@@ -900,7 +927,7 @@ NK_PUBLIC void nk_dots_pack_e4m3_haswell(nk_e4m3_t const *b, nk_size_t width, nk
|
|
|
900
927
|
NK_PUBLIC void nk_dots_packed_e4m3_haswell(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
901
928
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
902
929
|
/** @copydoc nk_dots_symmetric_e4m3 */
|
|
903
|
-
NK_PUBLIC void nk_dots_symmetric_e4m3_haswell(nk_e4m3_t const *vectors, nk_size_t
|
|
930
|
+
NK_PUBLIC void nk_dots_symmetric_e4m3_haswell(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
904
931
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
905
932
|
nk_size_t row_start, nk_size_t row_count);
|
|
906
933
|
/** @copydoc nk_dots_packed_size_e5m2 */
|
|
@@ -912,7 +939,7 @@ NK_PUBLIC void nk_dots_pack_e5m2_haswell(nk_e5m2_t const *b, nk_size_t width, nk
|
|
|
912
939
|
NK_PUBLIC void nk_dots_packed_e5m2_haswell(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
913
940
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
914
941
|
/** @copydoc nk_dots_symmetric_e5m2 */
|
|
915
|
-
NK_PUBLIC void nk_dots_symmetric_e5m2_haswell(nk_e5m2_t const *vectors, nk_size_t
|
|
942
|
+
NK_PUBLIC void nk_dots_symmetric_e5m2_haswell(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
916
943
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
917
944
|
nk_size_t row_start, nk_size_t row_count);
|
|
918
945
|
/** @copydoc nk_dots_packed_size_e2m3 */
|
|
@@ -924,7 +951,7 @@ NK_PUBLIC void nk_dots_pack_e2m3_haswell(nk_e2m3_t const *b, nk_size_t width, nk
|
|
|
924
951
|
NK_PUBLIC void nk_dots_packed_e2m3_haswell(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
925
952
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
926
953
|
/** @copydoc nk_dots_symmetric_e2m3 */
|
|
927
|
-
NK_PUBLIC void nk_dots_symmetric_e2m3_haswell(nk_e2m3_t const *vectors, nk_size_t
|
|
954
|
+
NK_PUBLIC void nk_dots_symmetric_e2m3_haswell(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
928
955
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
929
956
|
nk_size_t row_start, nk_size_t row_count);
|
|
930
957
|
/** @copydoc nk_dots_packed_size_e3m2 */
|
|
@@ -936,7 +963,7 @@ NK_PUBLIC void nk_dots_pack_e3m2_haswell(nk_e3m2_t const *b, nk_size_t width, nk
|
|
|
936
963
|
NK_PUBLIC void nk_dots_packed_e3m2_haswell(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
937
964
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
938
965
|
/** @copydoc nk_dots_symmetric_e3m2 */
|
|
939
|
-
NK_PUBLIC void nk_dots_symmetric_e3m2_haswell(nk_e3m2_t const *vectors, nk_size_t
|
|
966
|
+
NK_PUBLIC void nk_dots_symmetric_e3m2_haswell(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
940
967
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
941
968
|
nk_size_t row_start, nk_size_t row_count);
|
|
942
969
|
/** @copydoc nk_dots_packed_size_i8 */
|
|
@@ -948,7 +975,7 @@ NK_PUBLIC void nk_dots_pack_i8_haswell(nk_i8_t const *b, nk_size_t width, nk_siz
|
|
|
948
975
|
NK_PUBLIC void nk_dots_packed_i8_haswell(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
949
976
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
950
977
|
/** @copydoc nk_dots_symmetric_i8 */
|
|
951
|
-
NK_PUBLIC void nk_dots_symmetric_i8_haswell(nk_i8_t const *vectors, nk_size_t
|
|
978
|
+
NK_PUBLIC void nk_dots_symmetric_i8_haswell(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
952
979
|
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
953
980
|
nk_size_t row_start, nk_size_t row_count);
|
|
954
981
|
/** @copydoc nk_dots_packed_size_u8 */
|
|
@@ -960,7 +987,7 @@ NK_PUBLIC void nk_dots_pack_u8_haswell(nk_u8_t const *b, nk_size_t width, nk_siz
|
|
|
960
987
|
NK_PUBLIC void nk_dots_packed_u8_haswell(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
961
988
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
962
989
|
/** @copydoc nk_dots_symmetric_u8 */
|
|
963
|
-
NK_PUBLIC void nk_dots_symmetric_u8_haswell(nk_u8_t const *vectors, nk_size_t
|
|
990
|
+
NK_PUBLIC void nk_dots_symmetric_u8_haswell(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
964
991
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
965
992
|
nk_size_t row_start, nk_size_t row_count);
|
|
966
993
|
/** @copydoc nk_dots_packed_size_u1 */
|
|
@@ -972,7 +999,7 @@ NK_PUBLIC void nk_dots_pack_u1_haswell(nk_u1x8_t const *b, nk_size_t width, nk_s
|
|
|
972
999
|
NK_PUBLIC void nk_dots_packed_u1_haswell(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
973
1000
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
974
1001
|
/** @copydoc nk_dots_symmetric_u1 */
|
|
975
|
-
NK_PUBLIC void nk_dots_symmetric_u1_haswell(nk_u1x8_t const *vectors, nk_size_t
|
|
1002
|
+
NK_PUBLIC void nk_dots_symmetric_u1_haswell(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
976
1003
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
977
1004
|
nk_size_t row_start, nk_size_t row_count);
|
|
978
1005
|
/** @copydoc nk_dots_packed_size_i4 */
|
|
@@ -984,7 +1011,7 @@ NK_PUBLIC void nk_dots_pack_i4_haswell(nk_i4x2_t const *b, nk_size_t width, nk_s
|
|
|
984
1011
|
NK_PUBLIC void nk_dots_packed_i4_haswell(nk_i4x2_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
985
1012
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
986
1013
|
/** @copydoc nk_dots_symmetric_i4 */
|
|
987
|
-
NK_PUBLIC void nk_dots_symmetric_i4_haswell(nk_i4x2_t const *vectors, nk_size_t
|
|
1014
|
+
NK_PUBLIC void nk_dots_symmetric_i4_haswell(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
988
1015
|
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
989
1016
|
nk_size_t row_start, nk_size_t row_count);
|
|
990
1017
|
/** @copydoc nk_dots_packed_size_u4 */
|
|
@@ -996,7 +1023,7 @@ NK_PUBLIC void nk_dots_pack_u4_haswell(nk_u4x2_t const *b, nk_size_t width, nk_s
|
|
|
996
1023
|
NK_PUBLIC void nk_dots_packed_u4_haswell(nk_u4x2_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
997
1024
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
998
1025
|
/** @copydoc nk_dots_symmetric_u4 */
|
|
999
|
-
NK_PUBLIC void nk_dots_symmetric_u4_haswell(nk_u4x2_t const *vectors, nk_size_t
|
|
1026
|
+
NK_PUBLIC void nk_dots_symmetric_u4_haswell(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1000
1027
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1001
1028
|
nk_size_t row_start, nk_size_t row_count);
|
|
1002
1029
|
#endif // NK_TARGET_HASWELL
|
|
@@ -1014,7 +1041,7 @@ NK_PUBLIC void nk_dots_pack_f64_skylake(nk_f64_t const *b, nk_size_t width, nk_s
|
|
|
1014
1041
|
NK_PUBLIC void nk_dots_packed_f64_skylake(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
1015
1042
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1016
1043
|
/** @copydoc nk_dots_symmetric_f64 */
|
|
1017
|
-
NK_PUBLIC void nk_dots_symmetric_f64_skylake(nk_f64_t const *vectors, nk_size_t
|
|
1044
|
+
NK_PUBLIC void nk_dots_symmetric_f64_skylake(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1018
1045
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1019
1046
|
nk_size_t row_start, nk_size_t row_count);
|
|
1020
1047
|
/** @copydoc nk_dots_packed_size_f32 */
|
|
@@ -1026,7 +1053,7 @@ NK_PUBLIC void nk_dots_pack_f32_skylake(nk_f32_t const *b, nk_size_t width, nk_s
|
|
|
1026
1053
|
NK_PUBLIC void nk_dots_packed_f32_skylake(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
1027
1054
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1028
1055
|
/** @copydoc nk_dots_symmetric_f32 */
|
|
1029
|
-
NK_PUBLIC void nk_dots_symmetric_f32_skylake(nk_f32_t const *vectors, nk_size_t
|
|
1056
|
+
NK_PUBLIC void nk_dots_symmetric_f32_skylake(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1030
1057
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1031
1058
|
nk_size_t row_start, nk_size_t row_count);
|
|
1032
1059
|
/** @copydoc nk_dots_packed_size_bf16 */
|
|
@@ -1038,7 +1065,7 @@ NK_PUBLIC void nk_dots_pack_bf16_skylake(nk_bf16_t const *b, nk_size_t width, nk
|
|
|
1038
1065
|
NK_PUBLIC void nk_dots_packed_bf16_skylake(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1039
1066
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1040
1067
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
1041
|
-
NK_PUBLIC void nk_dots_symmetric_bf16_skylake(nk_bf16_t const *vectors, nk_size_t
|
|
1068
|
+
NK_PUBLIC void nk_dots_symmetric_bf16_skylake(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1042
1069
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1043
1070
|
nk_size_t row_start, nk_size_t row_count);
|
|
1044
1071
|
/** @copydoc nk_dots_packed_size_f16 */
|
|
@@ -1050,7 +1077,7 @@ NK_PUBLIC void nk_dots_pack_f16_skylake(nk_f16_t const *b, nk_size_t width, nk_s
|
|
|
1050
1077
|
NK_PUBLIC void nk_dots_packed_f16_skylake(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1051
1078
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1052
1079
|
/** @copydoc nk_dots_symmetric_f16 */
|
|
1053
|
-
NK_PUBLIC void nk_dots_symmetric_f16_skylake(nk_f16_t const *vectors, nk_size_t
|
|
1080
|
+
NK_PUBLIC void nk_dots_symmetric_f16_skylake(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1054
1081
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1055
1082
|
nk_size_t row_start, nk_size_t row_count);
|
|
1056
1083
|
/** @copydoc nk_dots_packed_size_e4m3 */
|
|
@@ -1062,7 +1089,7 @@ NK_PUBLIC void nk_dots_pack_e4m3_skylake(nk_e4m3_t const *b, nk_size_t width, nk
|
|
|
1062
1089
|
NK_PUBLIC void nk_dots_packed_e4m3_skylake(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1063
1090
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1064
1091
|
/** @copydoc nk_dots_symmetric_e4m3 */
|
|
1065
|
-
NK_PUBLIC void nk_dots_symmetric_e4m3_skylake(nk_e4m3_t const *vectors, nk_size_t
|
|
1092
|
+
NK_PUBLIC void nk_dots_symmetric_e4m3_skylake(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1066
1093
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1067
1094
|
nk_size_t row_start, nk_size_t row_count);
|
|
1068
1095
|
/** @copydoc nk_dots_packed_size_e5m2 */
|
|
@@ -1074,7 +1101,7 @@ NK_PUBLIC void nk_dots_pack_e5m2_skylake(nk_e5m2_t const *b, nk_size_t width, nk
|
|
|
1074
1101
|
NK_PUBLIC void nk_dots_packed_e5m2_skylake(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1075
1102
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1076
1103
|
/** @copydoc nk_dots_symmetric_e5m2 */
|
|
1077
|
-
NK_PUBLIC void nk_dots_symmetric_e5m2_skylake(nk_e5m2_t const *vectors, nk_size_t
|
|
1104
|
+
NK_PUBLIC void nk_dots_symmetric_e5m2_skylake(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1078
1105
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1079
1106
|
nk_size_t row_start, nk_size_t row_count);
|
|
1080
1107
|
/** @copydoc nk_dots_packed_size_e2m3 */
|
|
@@ -1086,7 +1113,7 @@ NK_PUBLIC void nk_dots_pack_e2m3_skylake(nk_e2m3_t const *b, nk_size_t width, nk
|
|
|
1086
1113
|
NK_PUBLIC void nk_dots_packed_e2m3_skylake(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1087
1114
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1088
1115
|
/** @copydoc nk_dots_symmetric_e2m3 */
|
|
1089
|
-
NK_PUBLIC void nk_dots_symmetric_e2m3_skylake(nk_e2m3_t const *vectors, nk_size_t
|
|
1116
|
+
NK_PUBLIC void nk_dots_symmetric_e2m3_skylake(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1090
1117
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1091
1118
|
nk_size_t row_start, nk_size_t row_count);
|
|
1092
1119
|
/** @copydoc nk_dots_packed_size_e3m2 */
|
|
@@ -1098,7 +1125,7 @@ NK_PUBLIC void nk_dots_pack_e3m2_skylake(nk_e3m2_t const *b, nk_size_t width, nk
|
|
|
1098
1125
|
NK_PUBLIC void nk_dots_packed_e3m2_skylake(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1099
1126
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1100
1127
|
/** @copydoc nk_dots_symmetric_e3m2 */
|
|
1101
|
-
NK_PUBLIC void nk_dots_symmetric_e3m2_skylake(nk_e3m2_t const *vectors, nk_size_t
|
|
1128
|
+
NK_PUBLIC void nk_dots_symmetric_e3m2_skylake(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1102
1129
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1103
1130
|
nk_size_t row_start, nk_size_t row_count);
|
|
1104
1131
|
#endif // NK_TARGET_SKYLAKE
|
|
@@ -1116,7 +1143,7 @@ NK_PUBLIC void nk_dots_pack_i8_icelake(nk_i8_t const *b, nk_size_t width, nk_siz
|
|
|
1116
1143
|
NK_PUBLIC void nk_dots_packed_i8_icelake(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
1117
1144
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1118
1145
|
/** @copydoc nk_dots_symmetric_i8 */
|
|
1119
|
-
NK_PUBLIC void nk_dots_symmetric_i8_icelake(nk_i8_t const *vectors, nk_size_t
|
|
1146
|
+
NK_PUBLIC void nk_dots_symmetric_i8_icelake(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1120
1147
|
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
1121
1148
|
nk_size_t row_start, nk_size_t row_count);
|
|
1122
1149
|
/** @copydoc nk_dots_packed_size_u8 */
|
|
@@ -1128,7 +1155,7 @@ NK_PUBLIC void nk_dots_pack_u8_icelake(nk_u8_t const *b, nk_size_t width, nk_siz
|
|
|
1128
1155
|
NK_PUBLIC void nk_dots_packed_u8_icelake(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
1129
1156
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1130
1157
|
/** @copydoc nk_dots_symmetric_u8 */
|
|
1131
|
-
NK_PUBLIC void nk_dots_symmetric_u8_icelake(nk_u8_t const *vectors, nk_size_t
|
|
1158
|
+
NK_PUBLIC void nk_dots_symmetric_u8_icelake(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1132
1159
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1133
1160
|
nk_size_t row_start, nk_size_t row_count);
|
|
1134
1161
|
/** @copydoc nk_dots_packed_size_i4 */
|
|
@@ -1140,7 +1167,7 @@ NK_PUBLIC void nk_dots_pack_i4_icelake(nk_i4x2_t const *b, nk_size_t width, nk_s
|
|
|
1140
1167
|
NK_PUBLIC void nk_dots_packed_i4_icelake(nk_i4x2_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
1141
1168
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1142
1169
|
/** @copydoc nk_dots_symmetric_i4 */
|
|
1143
|
-
NK_PUBLIC void nk_dots_symmetric_i4_icelake(nk_i4x2_t const *vectors, nk_size_t
|
|
1170
|
+
NK_PUBLIC void nk_dots_symmetric_i4_icelake(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1144
1171
|
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
1145
1172
|
nk_size_t row_start, nk_size_t row_count);
|
|
1146
1173
|
/** @copydoc nk_dots_packed_size_u4 */
|
|
@@ -1152,7 +1179,7 @@ NK_PUBLIC void nk_dots_pack_u4_icelake(nk_u4x2_t const *b, nk_size_t width, nk_s
|
|
|
1152
1179
|
NK_PUBLIC void nk_dots_packed_u4_icelake(nk_u4x2_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
1153
1180
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1154
1181
|
/** @copydoc nk_dots_symmetric_u4 */
|
|
1155
|
-
NK_PUBLIC void nk_dots_symmetric_u4_icelake(nk_u4x2_t const *vectors, nk_size_t
|
|
1182
|
+
NK_PUBLIC void nk_dots_symmetric_u4_icelake(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1156
1183
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1157
1184
|
nk_size_t row_start, nk_size_t row_count);
|
|
1158
1185
|
/** @copydoc nk_dots_packed_size_u1 */
|
|
@@ -1164,7 +1191,7 @@ NK_PUBLIC void nk_dots_pack_u1_icelake(nk_u1x8_t const *b, nk_size_t width, nk_s
|
|
|
1164
1191
|
NK_PUBLIC void nk_dots_packed_u1_icelake(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
1165
1192
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1166
1193
|
/** @copydoc nk_dots_symmetric_u1 */
|
|
1167
|
-
NK_PUBLIC void nk_dots_symmetric_u1_icelake(nk_u1x8_t const *vectors, nk_size_t
|
|
1194
|
+
NK_PUBLIC void nk_dots_symmetric_u1_icelake(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1168
1195
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1169
1196
|
nk_size_t row_start, nk_size_t row_count);
|
|
1170
1197
|
#endif // NK_TARGET_ICELAKE
|
|
@@ -1182,7 +1209,7 @@ NK_PUBLIC void nk_dots_pack_i8_alder(nk_i8_t const *b, nk_size_t width, nk_size_
|
|
|
1182
1209
|
NK_PUBLIC void nk_dots_packed_i8_alder(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
1183
1210
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1184
1211
|
/** @copydoc nk_dots_symmetric_i8 */
|
|
1185
|
-
NK_PUBLIC void nk_dots_symmetric_i8_alder(nk_i8_t const *vectors, nk_size_t
|
|
1212
|
+
NK_PUBLIC void nk_dots_symmetric_i8_alder(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1186
1213
|
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
1187
1214
|
nk_size_t row_start, nk_size_t row_count);
|
|
1188
1215
|
/** @copydoc nk_dots_packed_size_u8 */
|
|
@@ -1194,7 +1221,7 @@ NK_PUBLIC void nk_dots_pack_u8_alder(nk_u8_t const *b, nk_size_t width, nk_size_
|
|
|
1194
1221
|
NK_PUBLIC void nk_dots_packed_u8_alder(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
1195
1222
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1196
1223
|
/** @copydoc nk_dots_symmetric_u8 */
|
|
1197
|
-
NK_PUBLIC void nk_dots_symmetric_u8_alder(nk_u8_t const *vectors, nk_size_t
|
|
1224
|
+
NK_PUBLIC void nk_dots_symmetric_u8_alder(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1198
1225
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1199
1226
|
nk_size_t row_start, nk_size_t row_count);
|
|
1200
1227
|
/** @copydoc nk_dots_packed_size_e2m3 */
|
|
@@ -1206,7 +1233,7 @@ NK_PUBLIC void nk_dots_pack_e2m3_alder(nk_e2m3_t const *b, nk_size_t width, nk_s
|
|
|
1206
1233
|
NK_PUBLIC void nk_dots_packed_e2m3_alder(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1207
1234
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1208
1235
|
/** @copydoc nk_dots_symmetric_e2m3 */
|
|
1209
|
-
NK_PUBLIC void nk_dots_symmetric_e2m3_alder(nk_e2m3_t const *vectors, nk_size_t
|
|
1236
|
+
NK_PUBLIC void nk_dots_symmetric_e2m3_alder(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1210
1237
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1211
1238
|
nk_size_t row_start, nk_size_t row_count);
|
|
1212
1239
|
#endif // NK_TARGET_ALDER
|
|
@@ -1224,7 +1251,7 @@ NK_PUBLIC void nk_dots_pack_i8_sierra(nk_i8_t const *b, nk_size_t width, nk_size
|
|
|
1224
1251
|
NK_PUBLIC void nk_dots_packed_i8_sierra(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
1225
1252
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1226
1253
|
/** @copydoc nk_dots_symmetric_i8 */
|
|
1227
|
-
NK_PUBLIC void nk_dots_symmetric_i8_sierra(nk_i8_t const *vectors, nk_size_t
|
|
1254
|
+
NK_PUBLIC void nk_dots_symmetric_i8_sierra(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1228
1255
|
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
1229
1256
|
nk_size_t row_start, nk_size_t row_count);
|
|
1230
1257
|
/** @copydoc nk_dots_packed_size_u8 */
|
|
@@ -1236,7 +1263,7 @@ NK_PUBLIC void nk_dots_pack_u8_sierra(nk_u8_t const *b, nk_size_t width, nk_size
|
|
|
1236
1263
|
NK_PUBLIC void nk_dots_packed_u8_sierra(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
1237
1264
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1238
1265
|
/** @copydoc nk_dots_symmetric_u8 */
|
|
1239
|
-
NK_PUBLIC void nk_dots_symmetric_u8_sierra(nk_u8_t const *vectors, nk_size_t
|
|
1266
|
+
NK_PUBLIC void nk_dots_symmetric_u8_sierra(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1240
1267
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1241
1268
|
nk_size_t row_start, nk_size_t row_count);
|
|
1242
1269
|
/** @copydoc nk_dots_packed_size_e2m3 */
|
|
@@ -1248,7 +1275,7 @@ NK_PUBLIC void nk_dots_pack_e2m3_sierra(nk_e2m3_t const *b, nk_size_t width, nk_
|
|
|
1248
1275
|
NK_PUBLIC void nk_dots_packed_e2m3_sierra(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1249
1276
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1250
1277
|
/** @copydoc nk_dots_symmetric_e2m3 */
|
|
1251
|
-
NK_PUBLIC void nk_dots_symmetric_e2m3_sierra(nk_e2m3_t const *vectors, nk_size_t
|
|
1278
|
+
NK_PUBLIC void nk_dots_symmetric_e2m3_sierra(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1252
1279
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1253
1280
|
nk_size_t row_start, nk_size_t row_count);
|
|
1254
1281
|
#endif // NK_TARGET_SIERRA
|
|
@@ -1266,7 +1293,7 @@ NK_PUBLIC void nk_dots_pack_i8_v128relaxed(nk_i8_t const *b, nk_size_t width, nk
|
|
|
1266
1293
|
NK_PUBLIC void nk_dots_packed_i8_v128relaxed(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
1267
1294
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1268
1295
|
/** @copydoc nk_dots_symmetric_i8 */
|
|
1269
|
-
NK_PUBLIC void nk_dots_symmetric_i8_v128relaxed(nk_i8_t const *vectors, nk_size_t
|
|
1296
|
+
NK_PUBLIC void nk_dots_symmetric_i8_v128relaxed(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1270
1297
|
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
1271
1298
|
nk_size_t row_start, nk_size_t row_count);
|
|
1272
1299
|
/** @copydoc nk_dots_packed_size_u8 */
|
|
@@ -1278,7 +1305,7 @@ NK_PUBLIC void nk_dots_pack_u8_v128relaxed(nk_u8_t const *b, nk_size_t width, nk
|
|
|
1278
1305
|
NK_PUBLIC void nk_dots_packed_u8_v128relaxed(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
1279
1306
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1280
1307
|
/** @copydoc nk_dots_symmetric_u8 */
|
|
1281
|
-
NK_PUBLIC void nk_dots_symmetric_u8_v128relaxed(nk_u8_t const *vectors, nk_size_t
|
|
1308
|
+
NK_PUBLIC void nk_dots_symmetric_u8_v128relaxed(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1282
1309
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1283
1310
|
nk_size_t row_start, nk_size_t row_count);
|
|
1284
1311
|
/** @copydoc nk_dots_packed_size_e2m3 */
|
|
@@ -1291,7 +1318,7 @@ NK_PUBLIC void nk_dots_packed_e2m3_v128relaxed(nk_e2m3_t const *a, void const *b
|
|
|
1291
1318
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride,
|
|
1292
1319
|
nk_size_t c_stride);
|
|
1293
1320
|
/** @copydoc nk_dots_symmetric_e2m3 */
|
|
1294
|
-
NK_PUBLIC void nk_dots_symmetric_e2m3_v128relaxed(nk_e2m3_t const *vectors, nk_size_t
|
|
1321
|
+
NK_PUBLIC void nk_dots_symmetric_e2m3_v128relaxed(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1295
1322
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1296
1323
|
nk_size_t row_start, nk_size_t row_count);
|
|
1297
1324
|
/** @copydoc nk_dots_packed_size_bf16 */
|
|
@@ -1304,7 +1331,7 @@ NK_PUBLIC void nk_dots_packed_bf16_v128relaxed(nk_bf16_t const *a, void const *b
|
|
|
1304
1331
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride,
|
|
1305
1332
|
nk_size_t c_stride);
|
|
1306
1333
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
1307
|
-
NK_PUBLIC void nk_dots_symmetric_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t
|
|
1334
|
+
NK_PUBLIC void nk_dots_symmetric_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1308
1335
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1309
1336
|
nk_size_t row_start, nk_size_t row_count);
|
|
1310
1337
|
/** @copydoc nk_dots_packed_size_f32 */
|
|
@@ -1316,7 +1343,7 @@ NK_PUBLIC void nk_dots_pack_f32_v128relaxed(nk_f32_t const *b, nk_size_t width,
|
|
|
1316
1343
|
NK_PUBLIC void nk_dots_packed_f32_v128relaxed(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
1317
1344
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1318
1345
|
/** @copydoc nk_dots_symmetric_f32 */
|
|
1319
|
-
NK_PUBLIC void nk_dots_symmetric_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t
|
|
1346
|
+
NK_PUBLIC void nk_dots_symmetric_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1320
1347
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1321
1348
|
nk_size_t row_start, nk_size_t row_count);
|
|
1322
1349
|
/** @copydoc nk_dots_packed_size_f64 */
|
|
@@ -1328,7 +1355,7 @@ NK_PUBLIC void nk_dots_pack_f64_v128relaxed(nk_f64_t const *b, nk_size_t width,
|
|
|
1328
1355
|
NK_PUBLIC void nk_dots_packed_f64_v128relaxed(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
1329
1356
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1330
1357
|
/** @copydoc nk_dots_symmetric_f64 */
|
|
1331
|
-
NK_PUBLIC void nk_dots_symmetric_f64_v128relaxed(nk_f64_t const *vectors, nk_size_t
|
|
1358
|
+
NK_PUBLIC void nk_dots_symmetric_f64_v128relaxed(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1332
1359
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1333
1360
|
nk_size_t row_start, nk_size_t row_count);
|
|
1334
1361
|
/** @copydoc nk_dots_packed_size_e4m3 */
|
|
@@ -1341,7 +1368,7 @@ NK_PUBLIC void nk_dots_packed_e4m3_v128relaxed(nk_e4m3_t const *a, void const *b
|
|
|
1341
1368
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride,
|
|
1342
1369
|
nk_size_t c_stride);
|
|
1343
1370
|
/** @copydoc nk_dots_symmetric_e4m3 */
|
|
1344
|
-
NK_PUBLIC void nk_dots_symmetric_e4m3_v128relaxed(nk_e4m3_t const *vectors, nk_size_t
|
|
1371
|
+
NK_PUBLIC void nk_dots_symmetric_e4m3_v128relaxed(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1345
1372
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1346
1373
|
nk_size_t row_start, nk_size_t row_count);
|
|
1347
1374
|
/** @copydoc nk_dots_packed_size_e5m2 */
|
|
@@ -1354,7 +1381,7 @@ NK_PUBLIC void nk_dots_packed_e5m2_v128relaxed(nk_e5m2_t const *a, void const *b
|
|
|
1354
1381
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride,
|
|
1355
1382
|
nk_size_t c_stride);
|
|
1356
1383
|
/** @copydoc nk_dots_symmetric_e5m2 */
|
|
1357
|
-
NK_PUBLIC void nk_dots_symmetric_e5m2_v128relaxed(nk_e5m2_t const *vectors, nk_size_t
|
|
1384
|
+
NK_PUBLIC void nk_dots_symmetric_e5m2_v128relaxed(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1358
1385
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1359
1386
|
nk_size_t row_start, nk_size_t row_count);
|
|
1360
1387
|
/** @copydoc nk_dots_packed_size_u4 */
|
|
@@ -1366,7 +1393,7 @@ NK_PUBLIC void nk_dots_pack_u4_v128relaxed(nk_u4x2_t const *b, nk_size_t width,
|
|
|
1366
1393
|
NK_PUBLIC void nk_dots_packed_u4_v128relaxed(nk_u4x2_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
1367
1394
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1368
1395
|
/** @copydoc nk_dots_symmetric_u4 */
|
|
1369
|
-
NK_PUBLIC void nk_dots_symmetric_u4_v128relaxed(nk_u4x2_t const *vectors, nk_size_t
|
|
1396
|
+
NK_PUBLIC void nk_dots_symmetric_u4_v128relaxed(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1370
1397
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1371
1398
|
nk_size_t row_start, nk_size_t row_count);
|
|
1372
1399
|
/** @copydoc nk_dots_packed_size_i4 */
|
|
@@ -1378,7 +1405,7 @@ NK_PUBLIC void nk_dots_pack_i4_v128relaxed(nk_i4x2_t const *b, nk_size_t width,
|
|
|
1378
1405
|
NK_PUBLIC void nk_dots_packed_i4_v128relaxed(nk_i4x2_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
1379
1406
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1380
1407
|
/** @copydoc nk_dots_symmetric_i4 */
|
|
1381
|
-
NK_PUBLIC void nk_dots_symmetric_i4_v128relaxed(nk_i4x2_t const *vectors, nk_size_t
|
|
1408
|
+
NK_PUBLIC void nk_dots_symmetric_i4_v128relaxed(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1382
1409
|
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
1383
1410
|
nk_size_t row_start, nk_size_t row_count);
|
|
1384
1411
|
/** @copydoc nk_dots_packed_size_u1 */
|
|
@@ -1390,7 +1417,7 @@ NK_PUBLIC void nk_dots_pack_u1_v128relaxed(nk_u1x8_t const *b, nk_size_t width,
|
|
|
1390
1417
|
NK_PUBLIC void nk_dots_packed_u1_v128relaxed(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
1391
1418
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1392
1419
|
/** @copydoc nk_dots_symmetric_u1 */
|
|
1393
|
-
NK_PUBLIC void nk_dots_symmetric_u1_v128relaxed(nk_u1x8_t const *vectors, nk_size_t
|
|
1420
|
+
NK_PUBLIC void nk_dots_symmetric_u1_v128relaxed(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1394
1421
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1395
1422
|
nk_size_t row_start, nk_size_t row_count);
|
|
1396
1423
|
#endif // NK_TARGET_V128RELAXED
|
|
@@ -1408,7 +1435,7 @@ NK_PUBLIC void nk_dots_pack_f32_neon(nk_f32_t const *b, nk_size_t width, nk_size
|
|
|
1408
1435
|
NK_PUBLIC void nk_dots_packed_f32_neon(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
1409
1436
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1410
1437
|
/** @copydoc nk_dots_symmetric_f32 */
|
|
1411
|
-
NK_PUBLIC void nk_dots_symmetric_f32_neon(nk_f32_t const *vectors, nk_size_t
|
|
1438
|
+
NK_PUBLIC void nk_dots_symmetric_f32_neon(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1412
1439
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1413
1440
|
nk_size_t row_start, nk_size_t row_count);
|
|
1414
1441
|
/** @copydoc nk_dots_packed_size_f64 */
|
|
@@ -1420,7 +1447,7 @@ NK_PUBLIC void nk_dots_pack_f64_neon(nk_f64_t const *b, nk_size_t width, nk_size
|
|
|
1420
1447
|
NK_PUBLIC void nk_dots_packed_f64_neon(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
1421
1448
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1422
1449
|
/** @copydoc nk_dots_symmetric_f64 */
|
|
1423
|
-
NK_PUBLIC void nk_dots_symmetric_f64_neon(nk_f64_t const *vectors, nk_size_t
|
|
1450
|
+
NK_PUBLIC void nk_dots_symmetric_f64_neon(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1424
1451
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1425
1452
|
nk_size_t row_start, nk_size_t row_count);
|
|
1426
1453
|
/** @copydoc nk_dots_packed_size_u1 */
|
|
@@ -1432,7 +1459,7 @@ NK_PUBLIC void nk_dots_pack_u1_neon(nk_u1x8_t const *b, nk_size_t width, nk_size
|
|
|
1432
1459
|
NK_PUBLIC void nk_dots_packed_u1_neon(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
1433
1460
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1434
1461
|
/** @copydoc nk_dots_symmetric_u1 */
|
|
1435
|
-
NK_PUBLIC void nk_dots_symmetric_u1_neon(nk_u1x8_t const *vectors, nk_size_t
|
|
1462
|
+
NK_PUBLIC void nk_dots_symmetric_u1_neon(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1436
1463
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1437
1464
|
nk_size_t row_start, nk_size_t row_count);
|
|
1438
1465
|
/** @copydoc nk_dots_packed_size_f16 */
|
|
@@ -1444,7 +1471,7 @@ NK_PUBLIC void nk_dots_pack_f16_neon(nk_f16_t const *b, nk_size_t width, nk_size
|
|
|
1444
1471
|
NK_PUBLIC void nk_dots_packed_f16_neon(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1445
1472
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1446
1473
|
/** @copydoc nk_dots_symmetric_f16 */
|
|
1447
|
-
NK_PUBLIC void nk_dots_symmetric_f16_neon(nk_f16_t const *vectors, nk_size_t
|
|
1474
|
+
NK_PUBLIC void nk_dots_symmetric_f16_neon(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1448
1475
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1449
1476
|
nk_size_t row_start, nk_size_t row_count);
|
|
1450
1477
|
/** @copydoc nk_dots_packed_size_bf16 */
|
|
@@ -1456,29 +1483,11 @@ NK_PUBLIC void nk_dots_pack_bf16_neon(nk_bf16_t const *b, nk_size_t width, nk_si
|
|
|
1456
1483
|
NK_PUBLIC void nk_dots_packed_bf16_neon(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1457
1484
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1458
1485
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
1459
|
-
NK_PUBLIC void nk_dots_symmetric_bf16_neon(nk_bf16_t const *vectors, nk_size_t
|
|
1486
|
+
NK_PUBLIC void nk_dots_symmetric_bf16_neon(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1460
1487
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1461
1488
|
nk_size_t row_start, nk_size_t row_count);
|
|
1462
1489
|
#endif // NK_TARGET_NEON
|
|
1463
1490
|
|
|
1464
|
-
/* ARM NEON with F16 arithmetic (ARMv8.2-A FP16).
|
|
1465
|
-
* Provides native F16 FMLA for half-precision dot products.
|
|
1466
|
-
*/
|
|
1467
|
-
#if NK_TARGET_NEONHALF
|
|
1468
|
-
/** @copydoc nk_dots_packed_size_f16 */
|
|
1469
|
-
NK_PUBLIC nk_size_t nk_dots_packed_size_f16_neonhalf(nk_size_t width, nk_size_t depth);
|
|
1470
|
-
/** @copydoc nk_dots_pack_f16 */
|
|
1471
|
-
NK_PUBLIC void nk_dots_pack_f16_neonhalf(nk_f16_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
1472
|
-
void *b_packed);
|
|
1473
|
-
/** @copydoc nk_dots_packed_f16 */
|
|
1474
|
-
NK_PUBLIC void nk_dots_packed_f16_neonhalf(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1475
|
-
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1476
|
-
/** @copydoc nk_dots_symmetric_f16 */
|
|
1477
|
-
NK_PUBLIC void nk_dots_symmetric_f16_neonhalf(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
|
|
1478
|
-
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1479
|
-
nk_size_t row_start, nk_size_t row_count);
|
|
1480
|
-
#endif // NK_TARGET_NEONHALF
|
|
1481
|
-
|
|
1482
1491
|
/* ARM NEON with BF16 dot product (ARMv8.6-A BF16).
|
|
1483
1492
|
* Uses BFDOT/BFMMLA for efficient BF16 matrix operations.
|
|
1484
1493
|
*/
|
|
@@ -1492,7 +1501,7 @@ NK_PUBLIC void nk_dots_pack_bf16_neonbfdot(nk_bf16_t const *b, nk_size_t width,
|
|
|
1492
1501
|
NK_PUBLIC void nk_dots_packed_bf16_neonbfdot(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1493
1502
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1494
1503
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
1495
|
-
NK_PUBLIC void nk_dots_symmetric_bf16_neonbfdot(nk_bf16_t const *vectors, nk_size_t
|
|
1504
|
+
NK_PUBLIC void nk_dots_symmetric_bf16_neonbfdot(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1496
1505
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1497
1506
|
nk_size_t row_start, nk_size_t row_count);
|
|
1498
1507
|
#endif // NK_TARGET_NEONBFDOT
|
|
@@ -1510,7 +1519,7 @@ NK_PUBLIC void nk_dots_pack_i8_neonsdot(nk_i8_t const *b, nk_size_t width, nk_si
|
|
|
1510
1519
|
NK_PUBLIC void nk_dots_packed_i8_neonsdot(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
1511
1520
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1512
1521
|
/** @copydoc nk_dots_symmetric_i8 */
|
|
1513
|
-
NK_PUBLIC void nk_dots_symmetric_i8_neonsdot(nk_i8_t const *vectors, nk_size_t
|
|
1522
|
+
NK_PUBLIC void nk_dots_symmetric_i8_neonsdot(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1514
1523
|
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
1515
1524
|
nk_size_t row_start, nk_size_t row_count);
|
|
1516
1525
|
/** @copydoc nk_dots_packed_size_u8 */
|
|
@@ -1522,7 +1531,7 @@ NK_PUBLIC void nk_dots_pack_u8_neonsdot(nk_u8_t const *b, nk_size_t width, nk_si
|
|
|
1522
1531
|
NK_PUBLIC void nk_dots_packed_u8_neonsdot(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
1523
1532
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1524
1533
|
/** @copydoc nk_dots_symmetric_u8 */
|
|
1525
|
-
NK_PUBLIC void nk_dots_symmetric_u8_neonsdot(nk_u8_t const *vectors, nk_size_t
|
|
1534
|
+
NK_PUBLIC void nk_dots_symmetric_u8_neonsdot(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1526
1535
|
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1527
1536
|
nk_size_t row_start, nk_size_t row_count);
|
|
1528
1537
|
#endif // NK_TARGET_NEONSDOT
|
|
@@ -1540,7 +1549,7 @@ NK_PUBLIC void nk_dots_pack_f16_neonfhm(nk_f16_t const *b, nk_size_t width, nk_s
|
|
|
1540
1549
|
NK_PUBLIC void nk_dots_packed_f16_neonfhm(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1541
1550
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1542
1551
|
/** @copydoc nk_dots_symmetric_f16 */
|
|
1543
|
-
NK_PUBLIC void nk_dots_symmetric_f16_neonfhm(nk_f16_t const *vectors, nk_size_t
|
|
1552
|
+
NK_PUBLIC void nk_dots_symmetric_f16_neonfhm(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1544
1553
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1545
1554
|
nk_size_t row_start, nk_size_t row_count);
|
|
1546
1555
|
/** @copydoc nk_dots_packed_size_e4m3 */
|
|
@@ -1552,7 +1561,7 @@ NK_PUBLIC void nk_dots_pack_e4m3_neonfhm(nk_e4m3_t const *b, nk_size_t width, nk
|
|
|
1552
1561
|
NK_PUBLIC void nk_dots_packed_e4m3_neonfhm(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1553
1562
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1554
1563
|
/** @copydoc nk_dots_symmetric_e4m3 */
|
|
1555
|
-
NK_PUBLIC void nk_dots_symmetric_e4m3_neonfhm(nk_e4m3_t const *vectors, nk_size_t
|
|
1564
|
+
NK_PUBLIC void nk_dots_symmetric_e4m3_neonfhm(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1556
1565
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1557
1566
|
nk_size_t row_start, nk_size_t row_count);
|
|
1558
1567
|
/** @copydoc nk_dots_packed_size_e5m2 */
|
|
@@ -1564,11 +1573,65 @@ NK_PUBLIC void nk_dots_pack_e5m2_neonfhm(nk_e5m2_t const *b, nk_size_t width, nk
|
|
|
1564
1573
|
NK_PUBLIC void nk_dots_packed_e5m2_neonfhm(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1565
1574
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1566
1575
|
/** @copydoc nk_dots_symmetric_e5m2 */
|
|
1567
|
-
NK_PUBLIC void nk_dots_symmetric_e5m2_neonfhm(nk_e5m2_t const *vectors, nk_size_t
|
|
1576
|
+
NK_PUBLIC void nk_dots_symmetric_e5m2_neonfhm(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1568
1577
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1569
1578
|
nk_size_t row_start, nk_size_t row_count);
|
|
1570
1579
|
#endif // NK_TARGET_NEONFHM
|
|
1571
1580
|
|
|
1581
|
+
/* ARM NEON with FP8 (ARMv9.2-A FP8).
|
|
1582
|
+
* Uses native FP8 dot-product instructions for E4M3/E5M2/E2M3/E3M2 operations.
|
|
1583
|
+
*/
|
|
1584
|
+
#if NK_TARGET_NEONFP8
|
|
1585
|
+
/** @copydoc nk_dots_packed_size_e4m3 */
|
|
1586
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_e4m3_neonfp8(nk_size_t width, nk_size_t depth);
|
|
1587
|
+
/** @copydoc nk_dots_pack_e4m3 */
|
|
1588
|
+
NK_PUBLIC void nk_dots_pack_e4m3_neonfp8(nk_e4m3_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
1589
|
+
void *b_packed);
|
|
1590
|
+
/** @copydoc nk_dots_packed_e4m3 */
|
|
1591
|
+
NK_PUBLIC void nk_dots_packed_e4m3_neonfp8(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1592
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1593
|
+
/** @copydoc nk_dots_symmetric_e4m3 */
|
|
1594
|
+
NK_PUBLIC void nk_dots_symmetric_e4m3_neonfp8(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1595
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1596
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1597
|
+
/** @copydoc nk_dots_packed_size_e5m2 */
|
|
1598
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_e5m2_neonfp8(nk_size_t width, nk_size_t depth);
|
|
1599
|
+
/** @copydoc nk_dots_pack_e5m2 */
|
|
1600
|
+
NK_PUBLIC void nk_dots_pack_e5m2_neonfp8(nk_e5m2_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
1601
|
+
void *b_packed);
|
|
1602
|
+
/** @copydoc nk_dots_packed_e5m2 */
|
|
1603
|
+
NK_PUBLIC void nk_dots_packed_e5m2_neonfp8(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1604
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1605
|
+
/** @copydoc nk_dots_symmetric_e5m2 */
|
|
1606
|
+
NK_PUBLIC void nk_dots_symmetric_e5m2_neonfp8(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1607
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1608
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1609
|
+
/** @copydoc nk_dots_packed_size_e2m3 */
|
|
1610
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_e2m3_neonfp8(nk_size_t width, nk_size_t depth);
|
|
1611
|
+
/** @copydoc nk_dots_pack_e2m3 */
|
|
1612
|
+
NK_PUBLIC void nk_dots_pack_e2m3_neonfp8(nk_e2m3_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
1613
|
+
void *b_packed);
|
|
1614
|
+
/** @copydoc nk_dots_packed_e2m3 */
|
|
1615
|
+
NK_PUBLIC void nk_dots_packed_e2m3_neonfp8(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1616
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1617
|
+
/** @copydoc nk_dots_symmetric_e2m3 */
|
|
1618
|
+
NK_PUBLIC void nk_dots_symmetric_e2m3_neonfp8(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1619
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1620
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1621
|
+
/** @copydoc nk_dots_packed_size_e3m2 */
|
|
1622
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_e3m2_neonfp8(nk_size_t width, nk_size_t depth);
|
|
1623
|
+
/** @copydoc nk_dots_pack_e3m2 */
|
|
1624
|
+
NK_PUBLIC void nk_dots_pack_e3m2_neonfp8(nk_e3m2_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
1625
|
+
void *b_packed);
|
|
1626
|
+
/** @copydoc nk_dots_packed_e3m2 */
|
|
1627
|
+
NK_PUBLIC void nk_dots_packed_e3m2_neonfp8(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1628
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1629
|
+
/** @copydoc nk_dots_symmetric_e3m2 */
|
|
1630
|
+
NK_PUBLIC void nk_dots_symmetric_e3m2_neonfp8(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1631
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1632
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1633
|
+
#endif // NK_TARGET_NEONFP8
|
|
1634
|
+
|
|
1572
1635
|
#if NK_TARGET_RVV
|
|
1573
1636
|
/** @copydoc nk_dots_packed_size_e2m3 */
|
|
1574
1637
|
NK_PUBLIC nk_size_t nk_dots_packed_size_e2m3_rvv(nk_size_t width, nk_size_t depth);
|
|
@@ -1579,7 +1642,7 @@ NK_PUBLIC void nk_dots_pack_e2m3_rvv(nk_e2m3_t const *b, nk_size_t width, nk_siz
|
|
|
1579
1642
|
NK_PUBLIC void nk_dots_packed_e2m3_rvv(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1580
1643
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1581
1644
|
/** @copydoc nk_dots_symmetric_e2m3 */
|
|
1582
|
-
NK_PUBLIC void nk_dots_symmetric_e2m3_rvv(nk_e2m3_t const *vectors, nk_size_t
|
|
1645
|
+
NK_PUBLIC void nk_dots_symmetric_e2m3_rvv(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1583
1646
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1584
1647
|
nk_size_t row_start, nk_size_t row_count);
|
|
1585
1648
|
/** @copydoc nk_dots_packed_size_e3m2 */
|
|
@@ -1591,7 +1654,7 @@ NK_PUBLIC void nk_dots_pack_e3m2_rvv(nk_e3m2_t const *b, nk_size_t width, nk_siz
|
|
|
1591
1654
|
NK_PUBLIC void nk_dots_packed_e3m2_rvv(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1592
1655
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1593
1656
|
/** @copydoc nk_dots_symmetric_e3m2 */
|
|
1594
|
-
NK_PUBLIC void nk_dots_symmetric_e3m2_rvv(nk_e3m2_t const *vectors, nk_size_t
|
|
1657
|
+
NK_PUBLIC void nk_dots_symmetric_e3m2_rvv(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1595
1658
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1596
1659
|
nk_size_t row_start, nk_size_t row_count);
|
|
1597
1660
|
/** @copydoc nk_dots_packed_size_f32 */
|
|
@@ -1603,7 +1666,7 @@ NK_PUBLIC void nk_dots_pack_f32_rvv(nk_f32_t const *b, nk_size_t width, nk_size_
|
|
|
1603
1666
|
NK_PUBLIC void nk_dots_packed_f32_rvv(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
1604
1667
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1605
1668
|
/** @copydoc nk_dots_symmetric_f32 */
|
|
1606
|
-
NK_PUBLIC void nk_dots_symmetric_f32_rvv(nk_f32_t const *vectors, nk_size_t
|
|
1669
|
+
NK_PUBLIC void nk_dots_symmetric_f32_rvv(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1607
1670
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1608
1671
|
nk_size_t row_start, nk_size_t row_count);
|
|
1609
1672
|
/** @copydoc nk_dots_packed_size_f64 */
|
|
@@ -1615,7 +1678,7 @@ NK_PUBLIC void nk_dots_pack_f64_rvv(nk_f64_t const *b, nk_size_t width, nk_size_
|
|
|
1615
1678
|
NK_PUBLIC void nk_dots_packed_f64_rvv(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
1616
1679
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1617
1680
|
/** @copydoc nk_dots_symmetric_f64 */
|
|
1618
|
-
NK_PUBLIC void nk_dots_symmetric_f64_rvv(nk_f64_t const *vectors, nk_size_t
|
|
1681
|
+
NK_PUBLIC void nk_dots_symmetric_f64_rvv(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1619
1682
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1620
1683
|
nk_size_t row_start, nk_size_t row_count);
|
|
1621
1684
|
/** @copydoc nk_dots_packed_size_bf16 */
|
|
@@ -1627,7 +1690,7 @@ NK_PUBLIC void nk_dots_pack_bf16_rvv(nk_bf16_t const *b, nk_size_t width, nk_siz
|
|
|
1627
1690
|
NK_PUBLIC void nk_dots_packed_bf16_rvv(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1628
1691
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1629
1692
|
/** @copydoc nk_dots_symmetric_bf16 */
|
|
1630
|
-
NK_PUBLIC void nk_dots_symmetric_bf16_rvv(nk_bf16_t const *vectors, nk_size_t
|
|
1693
|
+
NK_PUBLIC void nk_dots_symmetric_bf16_rvv(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1631
1694
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1632
1695
|
nk_size_t row_start, nk_size_t row_count);
|
|
1633
1696
|
/** @copydoc nk_dots_packed_size_f16 */
|
|
@@ -1639,7 +1702,7 @@ NK_PUBLIC void nk_dots_pack_f16_rvv(nk_f16_t const *b, nk_size_t width, nk_size_
|
|
|
1639
1702
|
NK_PUBLIC void nk_dots_packed_f16_rvv(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1640
1703
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1641
1704
|
/** @copydoc nk_dots_symmetric_f16 */
|
|
1642
|
-
NK_PUBLIC void nk_dots_symmetric_f16_rvv(nk_f16_t const *vectors, nk_size_t
|
|
1705
|
+
NK_PUBLIC void nk_dots_symmetric_f16_rvv(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1643
1706
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1644
1707
|
nk_size_t row_start, nk_size_t row_count);
|
|
1645
1708
|
/** @copydoc nk_dots_packed_size_i8 */
|
|
@@ -1651,9 +1714,9 @@ NK_PUBLIC void nk_dots_pack_i8_rvv(nk_i8_t const *b, nk_size_t width, nk_size_t
|
|
|
1651
1714
|
NK_PUBLIC void nk_dots_packed_i8_rvv(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
1652
1715
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1653
1716
|
/** @copydoc nk_dots_symmetric_i8 */
|
|
1654
|
-
NK_PUBLIC void nk_dots_symmetric_i8_rvv(nk_i8_t const *vectors, nk_size_t
|
|
1655
|
-
nk_i32_t *result, nk_size_t result_stride,
|
|
1656
|
-
nk_size_t row_count);
|
|
1717
|
+
NK_PUBLIC void nk_dots_symmetric_i8_rvv(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1718
|
+
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
1719
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1657
1720
|
/** @copydoc nk_dots_packed_size_u8 */
|
|
1658
1721
|
NK_PUBLIC nk_size_t nk_dots_packed_size_u8_rvv(nk_size_t width, nk_size_t depth);
|
|
1659
1722
|
/** @copydoc nk_dots_pack_u8 */
|
|
@@ -1663,9 +1726,9 @@ NK_PUBLIC void nk_dots_pack_u8_rvv(nk_u8_t const *b, nk_size_t width, nk_size_t
|
|
|
1663
1726
|
NK_PUBLIC void nk_dots_packed_u8_rvv(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
1664
1727
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1665
1728
|
/** @copydoc nk_dots_symmetric_u8 */
|
|
1666
|
-
NK_PUBLIC void nk_dots_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t
|
|
1667
|
-
nk_u32_t *result, nk_size_t result_stride,
|
|
1668
|
-
nk_size_t row_count);
|
|
1729
|
+
NK_PUBLIC void nk_dots_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1730
|
+
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1731
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1669
1732
|
/** @copydoc nk_dots_packed_size_e4m3 */
|
|
1670
1733
|
NK_PUBLIC nk_size_t nk_dots_packed_size_e4m3_rvv(nk_size_t width, nk_size_t depth);
|
|
1671
1734
|
/** @copydoc nk_dots_pack_e4m3 */
|
|
@@ -1675,7 +1738,7 @@ NK_PUBLIC void nk_dots_pack_e4m3_rvv(nk_e4m3_t const *b, nk_size_t width, nk_siz
|
|
|
1675
1738
|
NK_PUBLIC void nk_dots_packed_e4m3_rvv(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1676
1739
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1677
1740
|
/** @copydoc nk_dots_symmetric_e4m3 */
|
|
1678
|
-
NK_PUBLIC void nk_dots_symmetric_e4m3_rvv(nk_e4m3_t const *vectors, nk_size_t
|
|
1741
|
+
NK_PUBLIC void nk_dots_symmetric_e4m3_rvv(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1679
1742
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1680
1743
|
nk_size_t row_start, nk_size_t row_count);
|
|
1681
1744
|
/** @copydoc nk_dots_packed_size_e5m2 */
|
|
@@ -1687,11 +1750,101 @@ NK_PUBLIC void nk_dots_pack_e5m2_rvv(nk_e5m2_t const *b, nk_size_t width, nk_siz
|
|
|
1687
1750
|
NK_PUBLIC void nk_dots_packed_e5m2_rvv(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1688
1751
|
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1689
1752
|
/** @copydoc nk_dots_symmetric_e5m2 */
|
|
1690
|
-
NK_PUBLIC void nk_dots_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t
|
|
1753
|
+
NK_PUBLIC void nk_dots_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1691
1754
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1692
1755
|
nk_size_t row_start, nk_size_t row_count);
|
|
1693
1756
|
#endif // NK_TARGET_RVV
|
|
1694
1757
|
|
|
1758
|
+
/* Loongson LASX backends using 256-bit SIMD (LoongArch).
|
|
1759
|
+
*/
|
|
1760
|
+
#if NK_TARGET_LOONGSONASX
|
|
1761
|
+
/** @copydoc nk_dots_packed_size_f32 */
|
|
1762
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_f32_loongsonasx(nk_size_t width, nk_size_t depth);
|
|
1763
|
+
/** @copydoc nk_dots_pack_f32 */
|
|
1764
|
+
NK_PUBLIC void nk_dots_pack_f32_loongsonasx(nk_f32_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
1765
|
+
void *b_packed);
|
|
1766
|
+
/** @copydoc nk_dots_packed_f32 */
|
|
1767
|
+
NK_PUBLIC void nk_dots_packed_f32_loongsonasx(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
1768
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1769
|
+
/** @copydoc nk_dots_symmetric_f32 */
|
|
1770
|
+
NK_PUBLIC void nk_dots_symmetric_f32_loongsonasx(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1771
|
+
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1772
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1773
|
+
/** @copydoc nk_dots_packed_size_f64 */
|
|
1774
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_f64_loongsonasx(nk_size_t width, nk_size_t depth);
|
|
1775
|
+
/** @copydoc nk_dots_pack_f64 */
|
|
1776
|
+
NK_PUBLIC void nk_dots_pack_f64_loongsonasx(nk_f64_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
1777
|
+
void *b_packed);
|
|
1778
|
+
/** @copydoc nk_dots_packed_f64 */
|
|
1779
|
+
NK_PUBLIC void nk_dots_packed_f64_loongsonasx(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
|
|
1780
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1781
|
+
/** @copydoc nk_dots_symmetric_f64 */
|
|
1782
|
+
NK_PUBLIC void nk_dots_symmetric_f64_loongsonasx(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1783
|
+
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1784
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1785
|
+
/** @copydoc nk_dots_packed_size_f16 */
|
|
1786
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_f16_loongsonasx(nk_size_t width, nk_size_t depth);
|
|
1787
|
+
/** @copydoc nk_dots_pack_f16 */
|
|
1788
|
+
NK_PUBLIC void nk_dots_pack_f16_loongsonasx(nk_f16_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
1789
|
+
void *b_packed);
|
|
1790
|
+
/** @copydoc nk_dots_packed_f16 */
|
|
1791
|
+
NK_PUBLIC void nk_dots_packed_f16_loongsonasx(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1792
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1793
|
+
/** @copydoc nk_dots_symmetric_f16 */
|
|
1794
|
+
NK_PUBLIC void nk_dots_symmetric_f16_loongsonasx(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1795
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1796
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1797
|
+
/** @copydoc nk_dots_packed_size_bf16 */
|
|
1798
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_bf16_loongsonasx(nk_size_t width, nk_size_t depth);
|
|
1799
|
+
/** @copydoc nk_dots_pack_bf16 */
|
|
1800
|
+
NK_PUBLIC void nk_dots_pack_bf16_loongsonasx(nk_bf16_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
1801
|
+
void *b_packed);
|
|
1802
|
+
/** @copydoc nk_dots_packed_bf16 */
|
|
1803
|
+
NK_PUBLIC void nk_dots_packed_bf16_loongsonasx(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
1804
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride,
|
|
1805
|
+
nk_size_t c_stride);
|
|
1806
|
+
/** @copydoc nk_dots_symmetric_bf16 */
|
|
1807
|
+
NK_PUBLIC void nk_dots_symmetric_bf16_loongsonasx(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1808
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1809
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1810
|
+
/** @copydoc nk_dots_packed_size_i8 */
|
|
1811
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_i8_loongsonasx(nk_size_t width, nk_size_t depth);
|
|
1812
|
+
/** @copydoc nk_dots_pack_i8 */
|
|
1813
|
+
NK_PUBLIC void nk_dots_pack_i8_loongsonasx(nk_i8_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
1814
|
+
void *b_packed);
|
|
1815
|
+
/** @copydoc nk_dots_packed_i8 */
|
|
1816
|
+
NK_PUBLIC void nk_dots_packed_i8_loongsonasx(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
|
|
1817
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1818
|
+
/** @copydoc nk_dots_symmetric_i8 */
|
|
1819
|
+
NK_PUBLIC void nk_dots_symmetric_i8_loongsonasx(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1820
|
+
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
|
|
1821
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1822
|
+
/** @copydoc nk_dots_packed_size_u8 */
|
|
1823
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_u8_loongsonasx(nk_size_t width, nk_size_t depth);
|
|
1824
|
+
/** @copydoc nk_dots_pack_u8 */
|
|
1825
|
+
NK_PUBLIC void nk_dots_pack_u8_loongsonasx(nk_u8_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
1826
|
+
void *b_packed);
|
|
1827
|
+
/** @copydoc nk_dots_packed_u8 */
|
|
1828
|
+
NK_PUBLIC void nk_dots_packed_u8_loongsonasx(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
1829
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1830
|
+
/** @copydoc nk_dots_symmetric_u8 */
|
|
1831
|
+
NK_PUBLIC void nk_dots_symmetric_u8_loongsonasx(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1832
|
+
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1833
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1834
|
+
/** @copydoc nk_dots_packed_size_u1 */
|
|
1835
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_u1_loongsonasx(nk_size_t width, nk_size_t depth);
|
|
1836
|
+
/** @copydoc nk_dots_pack_u1 */
|
|
1837
|
+
NK_PUBLIC void nk_dots_pack_u1_loongsonasx(nk_u1x8_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
1838
|
+
void *b_packed);
|
|
1839
|
+
/** @copydoc nk_dots_packed_u1 */
|
|
1840
|
+
NK_PUBLIC void nk_dots_packed_u1_loongsonasx(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
|
|
1841
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
1842
|
+
/** @copydoc nk_dots_symmetric_u1 */
|
|
1843
|
+
NK_PUBLIC void nk_dots_symmetric_u1_loongsonasx(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1844
|
+
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
|
|
1845
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1846
|
+
#endif // NK_TARGET_LOONGSONASX
|
|
1847
|
+
|
|
1695
1848
|
#if defined(__cplusplus)
|
|
1696
1849
|
} // extern "C"
|
|
1697
1850
|
#endif
|
|
@@ -1703,17 +1856,20 @@ NK_PUBLIC void nk_dots_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t n_
|
|
|
1703
1856
|
#include "numkong/dots/alder.h"
|
|
1704
1857
|
#include "numkong/dots/sierra.h"
|
|
1705
1858
|
#include "numkong/dots/genoa.h"
|
|
1859
|
+
#include "numkong/dots/diamond.h"
|
|
1706
1860
|
#include "numkong/dots/sapphireamx.h"
|
|
1707
1861
|
#include "numkong/dots/neon.h"
|
|
1708
1862
|
#include "numkong/dots/neonsdot.h"
|
|
1709
|
-
#include "numkong/dots/neonhalf.h"
|
|
1710
1863
|
#include "numkong/dots/neonfhm.h"
|
|
1864
|
+
#include "numkong/dots/neonfp8.h"
|
|
1711
1865
|
#include "numkong/dots/neonbfdot.h"
|
|
1712
1866
|
#include "numkong/dots/sme.h"
|
|
1713
1867
|
#include "numkong/dots/smef64.h"
|
|
1714
1868
|
#include "numkong/dots/smebi32.h"
|
|
1715
1869
|
#include "numkong/dots/rvv.h"
|
|
1870
|
+
#include "numkong/dots/powervsx.h"
|
|
1716
1871
|
#include "numkong/dots/v128relaxed.h"
|
|
1872
|
+
#include "numkong/dots/loongsonasx.h"
|
|
1717
1873
|
|
|
1718
1874
|
#if defined(__cplusplus)
|
|
1719
1875
|
extern "C" {
|
|
@@ -1730,6 +1886,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_f32(nk_size_t width, nk_size_t depth) {
|
|
|
1730
1886
|
return nk_dots_packed_size_f32_haswell(width, depth);
|
|
1731
1887
|
#elif NK_TARGET_NEON
|
|
1732
1888
|
return nk_dots_packed_size_f32_neon(width, depth);
|
|
1889
|
+
#elif NK_TARGET_POWERVSX
|
|
1890
|
+
return nk_dots_packed_size_f32_powervsx(width, depth);
|
|
1733
1891
|
#elif NK_TARGET_RVV
|
|
1734
1892
|
return nk_dots_packed_size_f32_rvv(width, depth);
|
|
1735
1893
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -1749,6 +1907,8 @@ NK_PUBLIC void nk_dots_pack_f32(nk_f32_t const *b, nk_size_t width, nk_size_t de
|
|
|
1749
1907
|
nk_dots_pack_f32_haswell(b, width, depth, b_stride, b_packed);
|
|
1750
1908
|
#elif NK_TARGET_NEON
|
|
1751
1909
|
nk_dots_pack_f32_neon(b, width, depth, b_stride, b_packed);
|
|
1910
|
+
#elif NK_TARGET_POWERVSX
|
|
1911
|
+
nk_dots_pack_f32_powervsx(b, width, depth, b_stride, b_packed);
|
|
1752
1912
|
#elif NK_TARGET_RVV
|
|
1753
1913
|
nk_dots_pack_f32_rvv(b, width, depth, b_stride, b_packed);
|
|
1754
1914
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -1768,6 +1928,8 @@ NK_PUBLIC void nk_dots_packed_f32(nk_f32_t const *a, void const *b_packed, nk_f6
|
|
|
1768
1928
|
nk_dots_packed_f32_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1769
1929
|
#elif NK_TARGET_NEON
|
|
1770
1930
|
nk_dots_packed_f32_neon(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1931
|
+
#elif NK_TARGET_POWERVSX
|
|
1932
|
+
nk_dots_packed_f32_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1771
1933
|
#elif NK_TARGET_RVV
|
|
1772
1934
|
nk_dots_packed_f32_rvv(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1773
1935
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -1786,6 +1948,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_f64(nk_size_t width, nk_size_t depth) {
|
|
|
1786
1948
|
return nk_dots_packed_size_f64_haswell(width, depth);
|
|
1787
1949
|
#elif NK_TARGET_NEON
|
|
1788
1950
|
return nk_dots_packed_size_f64_neon(width, depth);
|
|
1951
|
+
#elif NK_TARGET_POWERVSX
|
|
1952
|
+
return nk_dots_packed_size_f64_powervsx(width, depth);
|
|
1789
1953
|
#elif NK_TARGET_RVV
|
|
1790
1954
|
return nk_dots_packed_size_f64_rvv(width, depth);
|
|
1791
1955
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -1805,6 +1969,8 @@ NK_PUBLIC void nk_dots_pack_f64(nk_f64_t const *b, nk_size_t width, nk_size_t de
|
|
|
1805
1969
|
nk_dots_pack_f64_haswell(b, width, depth, b_stride, b_packed);
|
|
1806
1970
|
#elif NK_TARGET_NEON
|
|
1807
1971
|
nk_dots_pack_f64_neon(b, width, depth, b_stride, b_packed);
|
|
1972
|
+
#elif NK_TARGET_POWERVSX
|
|
1973
|
+
nk_dots_pack_f64_powervsx(b, width, depth, b_stride, b_packed);
|
|
1808
1974
|
#elif NK_TARGET_RVV
|
|
1809
1975
|
nk_dots_pack_f64_rvv(b, width, depth, b_stride, b_packed);
|
|
1810
1976
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -1824,6 +1990,8 @@ NK_PUBLIC void nk_dots_packed_f64(nk_f64_t const *a, void const *b_packed, nk_f6
|
|
|
1824
1990
|
nk_dots_packed_f64_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1825
1991
|
#elif NK_TARGET_NEON
|
|
1826
1992
|
nk_dots_packed_f64_neon(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1993
|
+
#elif NK_TARGET_POWERVSX
|
|
1994
|
+
nk_dots_packed_f64_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1827
1995
|
#elif NK_TARGET_RVV
|
|
1828
1996
|
nk_dots_packed_f64_rvv(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1829
1997
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -1838,14 +2006,14 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_f16(nk_size_t width, nk_size_t depth) {
|
|
|
1838
2006
|
return nk_dots_packed_size_f16_sme(width, depth);
|
|
1839
2007
|
#elif NK_TARGET_NEONFHM
|
|
1840
2008
|
return nk_dots_packed_size_f16_neonfhm(width, depth);
|
|
1841
|
-
#elif NK_TARGET_NEONHALF
|
|
1842
|
-
return nk_dots_packed_size_f16_neonhalf(width, depth);
|
|
1843
2009
|
#elif NK_TARGET_NEON
|
|
1844
2010
|
return nk_dots_packed_size_f16_neon(width, depth);
|
|
1845
2011
|
#elif NK_TARGET_SKYLAKE
|
|
1846
2012
|
return nk_dots_packed_size_f16_skylake(width, depth);
|
|
1847
2013
|
#elif NK_TARGET_HASWELL
|
|
1848
2014
|
return nk_dots_packed_size_f16_haswell(width, depth);
|
|
2015
|
+
#elif NK_TARGET_POWERVSX
|
|
2016
|
+
return nk_dots_packed_size_f16_powervsx(width, depth);
|
|
1849
2017
|
#elif NK_TARGET_RVV
|
|
1850
2018
|
return nk_dots_packed_size_f16_rvv(width, depth);
|
|
1851
2019
|
#else
|
|
@@ -1859,14 +2027,14 @@ NK_PUBLIC void nk_dots_pack_f16(nk_f16_t const *b, nk_size_t width, nk_size_t de
|
|
|
1859
2027
|
nk_dots_pack_f16_sme(b, width, depth, b_stride, b_packed);
|
|
1860
2028
|
#elif NK_TARGET_NEONFHM
|
|
1861
2029
|
nk_dots_pack_f16_neonfhm(b, width, depth, b_stride, b_packed);
|
|
1862
|
-
#elif NK_TARGET_NEONHALF
|
|
1863
|
-
nk_dots_pack_f16_neonhalf(b, width, depth, b_stride, b_packed);
|
|
1864
2030
|
#elif NK_TARGET_NEON
|
|
1865
2031
|
nk_dots_pack_f16_neon(b, width, depth, b_stride, b_packed);
|
|
1866
2032
|
#elif NK_TARGET_SKYLAKE
|
|
1867
2033
|
nk_dots_pack_f16_skylake(b, width, depth, b_stride, b_packed);
|
|
1868
2034
|
#elif NK_TARGET_HASWELL
|
|
1869
2035
|
nk_dots_pack_f16_haswell(b, width, depth, b_stride, b_packed);
|
|
2036
|
+
#elif NK_TARGET_POWERVSX
|
|
2037
|
+
nk_dots_pack_f16_powervsx(b, width, depth, b_stride, b_packed);
|
|
1870
2038
|
#elif NK_TARGET_RVV
|
|
1871
2039
|
nk_dots_pack_f16_rvv(b, width, depth, b_stride, b_packed);
|
|
1872
2040
|
#else
|
|
@@ -1880,14 +2048,14 @@ NK_PUBLIC void nk_dots_packed_f16(nk_f16_t const *a, void const *b_packed, nk_f3
|
|
|
1880
2048
|
nk_dots_packed_f16_sme(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1881
2049
|
#elif NK_TARGET_NEONFHM
|
|
1882
2050
|
nk_dots_packed_f16_neonfhm(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1883
|
-
#elif NK_TARGET_NEONHALF
|
|
1884
|
-
nk_dots_packed_f16_neonhalf(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1885
2051
|
#elif NK_TARGET_NEON
|
|
1886
2052
|
nk_dots_packed_f16_neon(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1887
2053
|
#elif NK_TARGET_SKYLAKE
|
|
1888
2054
|
nk_dots_packed_f16_skylake(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1889
2055
|
#elif NK_TARGET_HASWELL
|
|
1890
2056
|
nk_dots_packed_f16_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2057
|
+
#elif NK_TARGET_POWERVSX
|
|
2058
|
+
nk_dots_packed_f16_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1891
2059
|
#elif NK_TARGET_RVV
|
|
1892
2060
|
nk_dots_packed_f16_rvv(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1893
2061
|
#else
|
|
@@ -1908,6 +2076,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_bf16(nk_size_t width, nk_size_t depth) {
|
|
|
1908
2076
|
return nk_dots_packed_size_bf16_skylake(width, depth);
|
|
1909
2077
|
#elif NK_TARGET_HASWELL
|
|
1910
2078
|
return nk_dots_packed_size_bf16_haswell(width, depth);
|
|
2079
|
+
#elif NK_TARGET_POWERVSX
|
|
2080
|
+
return nk_dots_packed_size_bf16_powervsx(width, depth);
|
|
1911
2081
|
#elif NK_TARGET_RVV
|
|
1912
2082
|
return nk_dots_packed_size_bf16_rvv(width, depth);
|
|
1913
2083
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -1931,6 +2101,8 @@ NK_PUBLIC void nk_dots_pack_bf16(nk_bf16_t const *b, nk_size_t width, nk_size_t
|
|
|
1931
2101
|
nk_dots_pack_bf16_skylake(b, width, depth, b_stride, b_packed);
|
|
1932
2102
|
#elif NK_TARGET_HASWELL
|
|
1933
2103
|
nk_dots_pack_bf16_haswell(b, width, depth, b_stride, b_packed);
|
|
2104
|
+
#elif NK_TARGET_POWERVSX
|
|
2105
|
+
nk_dots_pack_bf16_powervsx(b, width, depth, b_stride, b_packed);
|
|
1934
2106
|
#elif NK_TARGET_RVV
|
|
1935
2107
|
nk_dots_pack_bf16_rvv(b, width, depth, b_stride, b_packed);
|
|
1936
2108
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -1954,6 +2126,8 @@ NK_PUBLIC void nk_dots_packed_bf16(nk_bf16_t const *a, void const *b_packed, nk_
|
|
|
1954
2126
|
nk_dots_packed_bf16_skylake(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1955
2127
|
#elif NK_TARGET_HASWELL
|
|
1956
2128
|
nk_dots_packed_bf16_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2129
|
+
#elif NK_TARGET_POWERVSX
|
|
2130
|
+
nk_dots_packed_bf16_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1957
2131
|
#elif NK_TARGET_RVV
|
|
1958
2132
|
nk_dots_packed_bf16_rvv(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
1959
2133
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -1978,6 +2152,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_i8(nk_size_t width, nk_size_t depth) {
|
|
|
1978
2152
|
return nk_dots_packed_size_i8_alder(width, depth);
|
|
1979
2153
|
#elif NK_TARGET_HASWELL
|
|
1980
2154
|
return nk_dots_packed_size_i8_haswell(width, depth);
|
|
2155
|
+
#elif NK_TARGET_POWERVSX
|
|
2156
|
+
return nk_dots_packed_size_i8_powervsx(width, depth);
|
|
1981
2157
|
#elif NK_TARGET_RVV
|
|
1982
2158
|
return nk_dots_packed_size_i8_rvv(width, depth);
|
|
1983
2159
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2002,6 +2178,8 @@ NK_PUBLIC void nk_dots_pack_i8(nk_i8_t const *b, nk_size_t width, nk_size_t dept
|
|
|
2002
2178
|
nk_dots_pack_i8_alder(b, width, depth, b_stride, b_packed);
|
|
2003
2179
|
#elif NK_TARGET_HASWELL
|
|
2004
2180
|
nk_dots_pack_i8_haswell(b, width, depth, b_stride, b_packed);
|
|
2181
|
+
#elif NK_TARGET_POWERVSX
|
|
2182
|
+
nk_dots_pack_i8_powervsx(b, width, depth, b_stride, b_packed);
|
|
2005
2183
|
#elif NK_TARGET_RVV
|
|
2006
2184
|
nk_dots_pack_i8_rvv(b, width, depth, b_stride, b_packed);
|
|
2007
2185
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2027,6 +2205,8 @@ NK_PUBLIC void nk_dots_packed_i8(nk_i8_t const *a, void const *b_packed, nk_i32_
|
|
|
2027
2205
|
nk_dots_packed_i8_alder(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2028
2206
|
#elif NK_TARGET_HASWELL
|
|
2029
2207
|
nk_dots_packed_i8_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2208
|
+
#elif NK_TARGET_POWERVSX
|
|
2209
|
+
nk_dots_packed_i8_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2030
2210
|
#elif NK_TARGET_RVV
|
|
2031
2211
|
nk_dots_packed_i8_rvv(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2032
2212
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2051,6 +2231,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_u8(nk_size_t width, nk_size_t depth) {
|
|
|
2051
2231
|
return nk_dots_packed_size_u8_alder(width, depth);
|
|
2052
2232
|
#elif NK_TARGET_HASWELL
|
|
2053
2233
|
return nk_dots_packed_size_u8_haswell(width, depth);
|
|
2234
|
+
#elif NK_TARGET_POWERVSX
|
|
2235
|
+
return nk_dots_packed_size_u8_powervsx(width, depth);
|
|
2054
2236
|
#elif NK_TARGET_RVV
|
|
2055
2237
|
return nk_dots_packed_size_u8_rvv(width, depth);
|
|
2056
2238
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2075,6 +2257,8 @@ NK_PUBLIC void nk_dots_pack_u8(nk_u8_t const *b, nk_size_t width, nk_size_t dept
|
|
|
2075
2257
|
nk_dots_pack_u8_alder(b, width, depth, b_stride, b_packed);
|
|
2076
2258
|
#elif NK_TARGET_HASWELL
|
|
2077
2259
|
nk_dots_pack_u8_haswell(b, width, depth, b_stride, b_packed);
|
|
2260
|
+
#elif NK_TARGET_POWERVSX
|
|
2261
|
+
nk_dots_pack_u8_powervsx(b, width, depth, b_stride, b_packed);
|
|
2078
2262
|
#elif NK_TARGET_RVV
|
|
2079
2263
|
nk_dots_pack_u8_rvv(b, width, depth, b_stride, b_packed);
|
|
2080
2264
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2100,6 +2284,8 @@ NK_PUBLIC void nk_dots_packed_u8(nk_u8_t const *a, void const *b_packed, nk_u32_
|
|
|
2100
2284
|
nk_dots_packed_u8_alder(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2101
2285
|
#elif NK_TARGET_HASWELL
|
|
2102
2286
|
nk_dots_packed_u8_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2287
|
+
#elif NK_TARGET_POWERVSX
|
|
2288
|
+
nk_dots_packed_u8_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2103
2289
|
#elif NK_TARGET_RVV
|
|
2104
2290
|
nk_dots_packed_u8_rvv(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2105
2291
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2114,8 +2300,12 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_e4m3(nk_size_t width, nk_size_t depth) {
|
|
|
2114
2300
|
return nk_dots_packed_size_e4m3_sme(width, depth);
|
|
2115
2301
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2116
2302
|
return nk_dots_packed_size_e4m3_sapphireamx(width, depth);
|
|
2303
|
+
#elif NK_TARGET_NEONFP8
|
|
2304
|
+
return nk_dots_packed_size_e4m3_neonfp8(width, depth);
|
|
2117
2305
|
#elif NK_TARGET_NEONFHM
|
|
2118
2306
|
return nk_dots_packed_size_e4m3_neonfhm(width, depth);
|
|
2307
|
+
#elif NK_TARGET_DIAMOND
|
|
2308
|
+
return nk_dots_packed_size_e4m3_diamond(width, depth);
|
|
2119
2309
|
#elif NK_TARGET_GENOA
|
|
2120
2310
|
return nk_dots_packed_size_e4m3_genoa(width, depth);
|
|
2121
2311
|
#elif NK_TARGET_SKYLAKE
|
|
@@ -2137,8 +2327,12 @@ NK_PUBLIC void nk_dots_pack_e4m3(nk_e4m3_t const *b, nk_size_t width, nk_size_t
|
|
|
2137
2327
|
nk_dots_pack_e4m3_sme(b, width, depth, b_stride, b_packed);
|
|
2138
2328
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2139
2329
|
nk_dots_pack_e4m3_sapphireamx(b, width, depth, b_stride, b_packed);
|
|
2330
|
+
#elif NK_TARGET_NEONFP8
|
|
2331
|
+
nk_dots_pack_e4m3_neonfp8(b, width, depth, b_stride, b_packed);
|
|
2140
2332
|
#elif NK_TARGET_NEONFHM
|
|
2141
2333
|
nk_dots_pack_e4m3_neonfhm(b, width, depth, b_stride, b_packed);
|
|
2334
|
+
#elif NK_TARGET_DIAMOND
|
|
2335
|
+
nk_dots_pack_e4m3_diamond(b, width, depth, b_stride, b_packed);
|
|
2142
2336
|
#elif NK_TARGET_GENOA
|
|
2143
2337
|
nk_dots_pack_e4m3_genoa(b, width, depth, b_stride, b_packed);
|
|
2144
2338
|
#elif NK_TARGET_SKYLAKE
|
|
@@ -2160,8 +2354,12 @@ NK_PUBLIC void nk_dots_packed_e4m3(nk_e4m3_t const *a, void const *b_packed, nk_
|
|
|
2160
2354
|
nk_dots_packed_e4m3_sme(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2161
2355
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2162
2356
|
nk_dots_packed_e4m3_sapphireamx(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2357
|
+
#elif NK_TARGET_NEONFP8
|
|
2358
|
+
nk_dots_packed_e4m3_neonfp8(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2163
2359
|
#elif NK_TARGET_NEONFHM
|
|
2164
2360
|
nk_dots_packed_e4m3_neonfhm(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2361
|
+
#elif NK_TARGET_DIAMOND
|
|
2362
|
+
nk_dots_packed_e4m3_diamond(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2165
2363
|
#elif NK_TARGET_GENOA
|
|
2166
2364
|
nk_dots_packed_e4m3_genoa(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2167
2365
|
#elif NK_TARGET_SKYLAKE
|
|
@@ -2182,8 +2380,12 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_e5m2(nk_size_t width, nk_size_t depth) {
|
|
|
2182
2380
|
return nk_dots_packed_size_e5m2_sme(width, depth);
|
|
2183
2381
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2184
2382
|
return nk_dots_packed_size_e5m2_sapphireamx(width, depth);
|
|
2383
|
+
#elif NK_TARGET_NEONFP8
|
|
2384
|
+
return nk_dots_packed_size_e5m2_neonfp8(width, depth);
|
|
2185
2385
|
#elif NK_TARGET_NEONFHM
|
|
2186
2386
|
return nk_dots_packed_size_e5m2_neonfhm(width, depth);
|
|
2387
|
+
#elif NK_TARGET_DIAMOND
|
|
2388
|
+
return nk_dots_packed_size_e5m2_diamond(width, depth);
|
|
2187
2389
|
#elif NK_TARGET_GENOA
|
|
2188
2390
|
return nk_dots_packed_size_e5m2_genoa(width, depth);
|
|
2189
2391
|
#elif NK_TARGET_SKYLAKE
|
|
@@ -2205,8 +2407,12 @@ NK_PUBLIC void nk_dots_pack_e5m2(nk_e5m2_t const *b, nk_size_t width, nk_size_t
|
|
|
2205
2407
|
nk_dots_pack_e5m2_sme(b, width, depth, b_stride, b_packed);
|
|
2206
2408
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2207
2409
|
nk_dots_pack_e5m2_sapphireamx(b, width, depth, b_stride, b_packed);
|
|
2410
|
+
#elif NK_TARGET_NEONFP8
|
|
2411
|
+
nk_dots_pack_e5m2_neonfp8(b, width, depth, b_stride, b_packed);
|
|
2208
2412
|
#elif NK_TARGET_NEONFHM
|
|
2209
2413
|
nk_dots_pack_e5m2_neonfhm(b, width, depth, b_stride, b_packed);
|
|
2414
|
+
#elif NK_TARGET_DIAMOND
|
|
2415
|
+
nk_dots_pack_e5m2_diamond(b, width, depth, b_stride, b_packed);
|
|
2210
2416
|
#elif NK_TARGET_GENOA
|
|
2211
2417
|
nk_dots_pack_e5m2_genoa(b, width, depth, b_stride, b_packed);
|
|
2212
2418
|
#elif NK_TARGET_SKYLAKE
|
|
@@ -2228,8 +2434,12 @@ NK_PUBLIC void nk_dots_packed_e5m2(nk_e5m2_t const *a, void const *b_packed, nk_
|
|
|
2228
2434
|
nk_dots_packed_e5m2_sme(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2229
2435
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2230
2436
|
nk_dots_packed_e5m2_sapphireamx(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2437
|
+
#elif NK_TARGET_NEONFP8
|
|
2438
|
+
nk_dots_packed_e5m2_neonfp8(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2231
2439
|
#elif NK_TARGET_NEONFHM
|
|
2232
2440
|
nk_dots_packed_e5m2_neonfhm(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2441
|
+
#elif NK_TARGET_DIAMOND
|
|
2442
|
+
nk_dots_packed_e5m2_diamond(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2233
2443
|
#elif NK_TARGET_GENOA
|
|
2234
2444
|
nk_dots_packed_e5m2_genoa(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2235
2445
|
#elif NK_TARGET_SKYLAKE
|
|
@@ -2250,6 +2460,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_e2m3(nk_size_t width, nk_size_t depth) {
|
|
|
2250
2460
|
return nk_dots_packed_size_e2m3_sme(width, depth);
|
|
2251
2461
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2252
2462
|
return nk_dots_packed_size_e2m3_sapphireamx(width, depth);
|
|
2463
|
+
#elif NK_TARGET_NEONFP8
|
|
2464
|
+
return nk_dots_packed_size_e2m3_neonfp8(width, depth);
|
|
2253
2465
|
#elif NK_TARGET_SKYLAKE
|
|
2254
2466
|
return nk_dots_packed_size_e2m3_skylake(width, depth);
|
|
2255
2467
|
#elif NK_TARGET_SIERRA
|
|
@@ -2273,6 +2485,8 @@ NK_PUBLIC void nk_dots_pack_e2m3(nk_e2m3_t const *b, nk_size_t width, nk_size_t
|
|
|
2273
2485
|
nk_dots_pack_e2m3_sme(b, width, depth, b_stride, b_packed);
|
|
2274
2486
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2275
2487
|
nk_dots_pack_e2m3_sapphireamx(b, width, depth, b_stride, b_packed);
|
|
2488
|
+
#elif NK_TARGET_NEONFP8
|
|
2489
|
+
nk_dots_pack_e2m3_neonfp8(b, width, depth, b_stride, b_packed);
|
|
2276
2490
|
#elif NK_TARGET_SKYLAKE
|
|
2277
2491
|
nk_dots_pack_e2m3_skylake(b, width, depth, b_stride, b_packed);
|
|
2278
2492
|
#elif NK_TARGET_SIERRA
|
|
@@ -2296,6 +2510,8 @@ NK_PUBLIC void nk_dots_packed_e2m3(nk_e2m3_t const *a, void const *b_packed, nk_
|
|
|
2296
2510
|
nk_dots_packed_e2m3_sme(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2297
2511
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2298
2512
|
nk_dots_packed_e2m3_sapphireamx(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2513
|
+
#elif NK_TARGET_NEONFP8
|
|
2514
|
+
nk_dots_packed_e2m3_neonfp8(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2299
2515
|
#elif NK_TARGET_SKYLAKE
|
|
2300
2516
|
nk_dots_packed_e2m3_skylake(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2301
2517
|
#elif NK_TARGET_SIERRA
|
|
@@ -2318,6 +2534,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_e3m2(nk_size_t width, nk_size_t depth) {
|
|
|
2318
2534
|
return nk_dots_packed_size_e3m2_sme(width, depth);
|
|
2319
2535
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2320
2536
|
return nk_dots_packed_size_e3m2_sapphireamx(width, depth);
|
|
2537
|
+
#elif NK_TARGET_NEONFP8
|
|
2538
|
+
return nk_dots_packed_size_e3m2_neonfp8(width, depth);
|
|
2321
2539
|
#elif NK_TARGET_SKYLAKE
|
|
2322
2540
|
return nk_dots_packed_size_e3m2_skylake(width, depth);
|
|
2323
2541
|
#elif NK_TARGET_HASWELL
|
|
@@ -2335,6 +2553,8 @@ NK_PUBLIC void nk_dots_pack_e3m2(nk_e3m2_t const *b, nk_size_t width, nk_size_t
|
|
|
2335
2553
|
nk_dots_pack_e3m2_sme(b, width, depth, b_stride, b_packed);
|
|
2336
2554
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2337
2555
|
nk_dots_pack_e3m2_sapphireamx(b, width, depth, b_stride, b_packed);
|
|
2556
|
+
#elif NK_TARGET_NEONFP8
|
|
2557
|
+
nk_dots_pack_e3m2_neonfp8(b, width, depth, b_stride, b_packed);
|
|
2338
2558
|
#elif NK_TARGET_SKYLAKE
|
|
2339
2559
|
nk_dots_pack_e3m2_skylake(b, width, depth, b_stride, b_packed);
|
|
2340
2560
|
#elif NK_TARGET_HASWELL
|
|
@@ -2352,6 +2572,8 @@ NK_PUBLIC void nk_dots_packed_e3m2(nk_e3m2_t const *a, void const *b_packed, nk_
|
|
|
2352
2572
|
nk_dots_packed_e3m2_sme(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2353
2573
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2354
2574
|
nk_dots_packed_e3m2_sapphireamx(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2575
|
+
#elif NK_TARGET_NEONFP8
|
|
2576
|
+
nk_dots_packed_e3m2_neonfp8(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2355
2577
|
#elif NK_TARGET_SKYLAKE
|
|
2356
2578
|
nk_dots_packed_e3m2_skylake(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2357
2579
|
#elif NK_TARGET_HASWELL
|
|
@@ -2422,6 +2644,10 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_u1(nk_size_t width, nk_size_t depth) {
|
|
|
2422
2644
|
return nk_dots_packed_size_u1_haswell(width, depth);
|
|
2423
2645
|
#elif NK_TARGET_NEON
|
|
2424
2646
|
return nk_dots_packed_size_u1_neon(width, depth);
|
|
2647
|
+
#elif NK_TARGET_POWERVSX
|
|
2648
|
+
return nk_dots_packed_size_u1_powervsx(width, depth);
|
|
2649
|
+
#elif NK_TARGET_LOONGSONASX
|
|
2650
|
+
return nk_dots_packed_size_u1_loongsonasx(width, depth);
|
|
2425
2651
|
#elif NK_TARGET_V128RELAXED
|
|
2426
2652
|
return nk_dots_packed_size_u1_v128relaxed(width, depth);
|
|
2427
2653
|
#else
|
|
@@ -2439,6 +2665,10 @@ NK_PUBLIC void nk_dots_pack_u1(nk_u1x8_t const *b, nk_size_t width, nk_size_t de
|
|
|
2439
2665
|
nk_dots_pack_u1_haswell(b, width, depth, b_stride, b_packed);
|
|
2440
2666
|
#elif NK_TARGET_NEON
|
|
2441
2667
|
nk_dots_pack_u1_neon(b, width, depth, b_stride, b_packed);
|
|
2668
|
+
#elif NK_TARGET_POWERVSX
|
|
2669
|
+
nk_dots_pack_u1_powervsx(b, width, depth, b_stride, b_packed);
|
|
2670
|
+
#elif NK_TARGET_LOONGSONASX
|
|
2671
|
+
nk_dots_pack_u1_loongsonasx(b, width, depth, b_stride, b_packed);
|
|
2442
2672
|
#elif NK_TARGET_V128RELAXED
|
|
2443
2673
|
nk_dots_pack_u1_v128relaxed(b, width, depth, b_stride, b_packed);
|
|
2444
2674
|
#else
|
|
@@ -2456,6 +2686,10 @@ NK_PUBLIC void nk_dots_packed_u1(nk_u1x8_t const *a, void const *b_packed, nk_u3
|
|
|
2456
2686
|
nk_dots_packed_u1_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2457
2687
|
#elif NK_TARGET_NEON
|
|
2458
2688
|
nk_dots_packed_u1_neon(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2689
|
+
#elif NK_TARGET_POWERVSX
|
|
2690
|
+
nk_dots_packed_u1_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2691
|
+
#elif NK_TARGET_LOONGSONASX
|
|
2692
|
+
nk_dots_packed_u1_loongsonasx(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2459
2693
|
#elif NK_TARGET_V128RELAXED
|
|
2460
2694
|
nk_dots_packed_u1_v128relaxed(a, b_packed, c, height, width, depth, a_stride, c_stride);
|
|
2461
2695
|
#else
|
|
@@ -2513,285 +2747,331 @@ NK_PUBLIC void nk_dots_packed_i4(nk_i4x2_t const *a, void const *b_packed, nk_i3
|
|
|
2513
2747
|
#endif
|
|
2514
2748
|
}
|
|
2515
2749
|
|
|
2516
|
-
NK_PUBLIC void nk_dots_symmetric_f16(nk_f16_t const *vectors, nk_size_t
|
|
2517
|
-
nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2750
|
+
NK_PUBLIC void nk_dots_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2751
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2518
2752
|
nk_size_t row_count) {
|
|
2519
2753
|
#if NK_TARGET_SME
|
|
2520
|
-
nk_dots_symmetric_f16_sme(vectors,
|
|
2521
|
-
#elif NK_TARGET_NEONHALF
|
|
2522
|
-
nk_dots_symmetric_f16_neonhalf(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
|
|
2754
|
+
nk_dots_symmetric_f16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2523
2755
|
#elif NK_TARGET_NEON
|
|
2524
|
-
nk_dots_symmetric_f16_neon(vectors,
|
|
2756
|
+
nk_dots_symmetric_f16_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2525
2757
|
#elif NK_TARGET_NEONFHM
|
|
2526
|
-
nk_dots_symmetric_f16_neonfhm(vectors,
|
|
2758
|
+
nk_dots_symmetric_f16_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2527
2759
|
#elif NK_TARGET_SKYLAKE
|
|
2528
|
-
nk_dots_symmetric_f16_skylake(vectors,
|
|
2760
|
+
nk_dots_symmetric_f16_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2529
2761
|
#elif NK_TARGET_HASWELL
|
|
2530
|
-
nk_dots_symmetric_f16_haswell(vectors,
|
|
2762
|
+
nk_dots_symmetric_f16_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2763
|
+
#elif NK_TARGET_POWERVSX
|
|
2764
|
+
nk_dots_symmetric_f16_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2531
2765
|
#elif NK_TARGET_RVV
|
|
2532
|
-
nk_dots_symmetric_f16_rvv(vectors,
|
|
2766
|
+
nk_dots_symmetric_f16_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2533
2767
|
#else
|
|
2534
|
-
nk_dots_symmetric_f16_serial(vectors,
|
|
2768
|
+
nk_dots_symmetric_f16_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2535
2769
|
#endif
|
|
2536
2770
|
}
|
|
2537
2771
|
|
|
2538
|
-
NK_PUBLIC void nk_dots_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t
|
|
2539
|
-
nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2772
|
+
NK_PUBLIC void nk_dots_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2773
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2540
2774
|
nk_size_t row_count) {
|
|
2541
2775
|
#if NK_TARGET_SME
|
|
2542
|
-
nk_dots_symmetric_bf16_sme(vectors,
|
|
2776
|
+
nk_dots_symmetric_bf16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2543
2777
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2544
|
-
nk_dots_symmetric_bf16_sapphireamx(vectors,
|
|
2778
|
+
nk_dots_symmetric_bf16_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2779
|
+
row_count);
|
|
2545
2780
|
#elif NK_TARGET_NEONBFDOT
|
|
2546
|
-
nk_dots_symmetric_bf16_neonbfdot(vectors,
|
|
2781
|
+
nk_dots_symmetric_bf16_neonbfdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2782
|
+
row_count);
|
|
2547
2783
|
#elif NK_TARGET_GENOA
|
|
2548
|
-
nk_dots_symmetric_bf16_genoa(vectors,
|
|
2784
|
+
nk_dots_symmetric_bf16_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2549
2785
|
#elif NK_TARGET_SKYLAKE
|
|
2550
|
-
nk_dots_symmetric_bf16_skylake(vectors,
|
|
2786
|
+
nk_dots_symmetric_bf16_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2551
2787
|
#elif NK_TARGET_HASWELL
|
|
2552
|
-
nk_dots_symmetric_bf16_haswell(vectors,
|
|
2788
|
+
nk_dots_symmetric_bf16_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2789
|
+
#elif NK_TARGET_POWERVSX
|
|
2790
|
+
nk_dots_symmetric_bf16_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2553
2791
|
#elif NK_TARGET_RVV
|
|
2554
|
-
nk_dots_symmetric_bf16_rvv(vectors,
|
|
2792
|
+
nk_dots_symmetric_bf16_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2555
2793
|
#elif NK_TARGET_V128RELAXED
|
|
2556
|
-
nk_dots_symmetric_bf16_v128relaxed(vectors,
|
|
2794
|
+
nk_dots_symmetric_bf16_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2795
|
+
row_count);
|
|
2557
2796
|
#else
|
|
2558
|
-
nk_dots_symmetric_bf16_serial(vectors,
|
|
2797
|
+
nk_dots_symmetric_bf16_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2559
2798
|
#endif
|
|
2560
2799
|
}
|
|
2561
2800
|
|
|
2562
|
-
NK_PUBLIC void nk_dots_symmetric_i8(nk_i8_t const *vectors, nk_size_t
|
|
2801
|
+
NK_PUBLIC void nk_dots_symmetric_i8(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride,
|
|
2563
2802
|
nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2564
2803
|
nk_size_t row_count) {
|
|
2565
2804
|
#if NK_TARGET_SME
|
|
2566
|
-
nk_dots_symmetric_i8_sme(vectors,
|
|
2805
|
+
nk_dots_symmetric_i8_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2567
2806
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2568
|
-
nk_dots_symmetric_i8_sapphireamx(vectors,
|
|
2807
|
+
nk_dots_symmetric_i8_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2808
|
+
row_count);
|
|
2569
2809
|
#elif NK_TARGET_NEONSDOT
|
|
2570
|
-
nk_dots_symmetric_i8_neonsdot(vectors,
|
|
2810
|
+
nk_dots_symmetric_i8_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2571
2811
|
#elif NK_TARGET_ICELAKE
|
|
2572
|
-
nk_dots_symmetric_i8_icelake(vectors,
|
|
2812
|
+
nk_dots_symmetric_i8_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2573
2813
|
#elif NK_TARGET_SIERRA
|
|
2574
|
-
nk_dots_symmetric_i8_sierra(vectors,
|
|
2814
|
+
nk_dots_symmetric_i8_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2575
2815
|
#elif NK_TARGET_ALDER
|
|
2576
|
-
nk_dots_symmetric_i8_alder(vectors,
|
|
2816
|
+
nk_dots_symmetric_i8_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2577
2817
|
#elif NK_TARGET_HASWELL
|
|
2578
|
-
nk_dots_symmetric_i8_haswell(vectors,
|
|
2818
|
+
nk_dots_symmetric_i8_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2819
|
+
#elif NK_TARGET_POWERVSX
|
|
2820
|
+
nk_dots_symmetric_i8_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2579
2821
|
#elif NK_TARGET_RVV
|
|
2580
|
-
nk_dots_symmetric_i8_rvv(vectors,
|
|
2822
|
+
nk_dots_symmetric_i8_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2581
2823
|
#elif NK_TARGET_V128RELAXED
|
|
2582
|
-
nk_dots_symmetric_i8_v128relaxed(vectors,
|
|
2824
|
+
nk_dots_symmetric_i8_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2825
|
+
row_count);
|
|
2583
2826
|
#else
|
|
2584
|
-
nk_dots_symmetric_i8_serial(vectors,
|
|
2827
|
+
nk_dots_symmetric_i8_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2585
2828
|
#endif
|
|
2586
2829
|
}
|
|
2587
2830
|
|
|
2588
|
-
NK_PUBLIC void nk_dots_symmetric_u8(nk_u8_t const *vectors, nk_size_t
|
|
2831
|
+
NK_PUBLIC void nk_dots_symmetric_u8(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride,
|
|
2589
2832
|
nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2590
2833
|
nk_size_t row_count) {
|
|
2591
2834
|
#if NK_TARGET_SME
|
|
2592
|
-
nk_dots_symmetric_u8_sme(vectors,
|
|
2835
|
+
nk_dots_symmetric_u8_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2593
2836
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2594
|
-
nk_dots_symmetric_u8_sapphireamx(vectors,
|
|
2837
|
+
nk_dots_symmetric_u8_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2838
|
+
row_count);
|
|
2595
2839
|
#elif NK_TARGET_ICELAKE
|
|
2596
|
-
nk_dots_symmetric_u8_icelake(vectors,
|
|
2840
|
+
nk_dots_symmetric_u8_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2597
2841
|
#elif NK_TARGET_SIERRA
|
|
2598
|
-
nk_dots_symmetric_u8_sierra(vectors,
|
|
2842
|
+
nk_dots_symmetric_u8_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2599
2843
|
#elif NK_TARGET_ALDER
|
|
2600
|
-
nk_dots_symmetric_u8_alder(vectors,
|
|
2844
|
+
nk_dots_symmetric_u8_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2601
2845
|
#elif NK_TARGET_NEONSDOT
|
|
2602
|
-
nk_dots_symmetric_u8_neonsdot(vectors,
|
|
2846
|
+
nk_dots_symmetric_u8_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2603
2847
|
#elif NK_TARGET_HASWELL
|
|
2604
|
-
nk_dots_symmetric_u8_haswell(vectors,
|
|
2848
|
+
nk_dots_symmetric_u8_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2849
|
+
#elif NK_TARGET_POWERVSX
|
|
2850
|
+
nk_dots_symmetric_u8_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2605
2851
|
#elif NK_TARGET_RVV
|
|
2606
|
-
nk_dots_symmetric_u8_rvv(vectors,
|
|
2852
|
+
nk_dots_symmetric_u8_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2607
2853
|
#elif NK_TARGET_V128RELAXED
|
|
2608
|
-
nk_dots_symmetric_u8_v128relaxed(vectors,
|
|
2854
|
+
nk_dots_symmetric_u8_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2855
|
+
row_count);
|
|
2609
2856
|
#else
|
|
2610
|
-
nk_dots_symmetric_u8_serial(vectors,
|
|
2857
|
+
nk_dots_symmetric_u8_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2611
2858
|
#endif
|
|
2612
2859
|
}
|
|
2613
2860
|
|
|
2614
|
-
NK_PUBLIC void nk_dots_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t
|
|
2615
|
-
nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2861
|
+
NK_PUBLIC void nk_dots_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2862
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2616
2863
|
nk_size_t row_count) {
|
|
2617
2864
|
#if NK_TARGET_SME
|
|
2618
|
-
nk_dots_symmetric_e4m3_sme(vectors,
|
|
2865
|
+
nk_dots_symmetric_e4m3_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2866
|
+
#elif NK_TARGET_NEONFP8
|
|
2867
|
+
nk_dots_symmetric_e4m3_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2619
2868
|
#elif NK_TARGET_NEONFHM
|
|
2620
|
-
nk_dots_symmetric_e4m3_neonfhm(vectors,
|
|
2869
|
+
nk_dots_symmetric_e4m3_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2621
2870
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2622
|
-
nk_dots_symmetric_e4m3_sapphireamx(vectors,
|
|
2871
|
+
nk_dots_symmetric_e4m3_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2872
|
+
row_count);
|
|
2873
|
+
#elif NK_TARGET_DIAMOND
|
|
2874
|
+
nk_dots_symmetric_e4m3_diamond(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2623
2875
|
#elif NK_TARGET_GENOA
|
|
2624
|
-
nk_dots_symmetric_e4m3_genoa(vectors,
|
|
2876
|
+
nk_dots_symmetric_e4m3_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2625
2877
|
#elif NK_TARGET_SKYLAKE
|
|
2626
|
-
nk_dots_symmetric_e4m3_skylake(vectors,
|
|
2878
|
+
nk_dots_symmetric_e4m3_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2627
2879
|
#elif NK_TARGET_HASWELL
|
|
2628
|
-
nk_dots_symmetric_e4m3_haswell(vectors,
|
|
2880
|
+
nk_dots_symmetric_e4m3_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2629
2881
|
#elif NK_TARGET_RVV
|
|
2630
|
-
nk_dots_symmetric_e4m3_rvv(vectors,
|
|
2882
|
+
nk_dots_symmetric_e4m3_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2631
2883
|
#elif NK_TARGET_V128RELAXED
|
|
2632
|
-
nk_dots_symmetric_e4m3_v128relaxed(vectors,
|
|
2884
|
+
nk_dots_symmetric_e4m3_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2885
|
+
row_count);
|
|
2633
2886
|
#else
|
|
2634
|
-
nk_dots_symmetric_e4m3_serial(vectors,
|
|
2887
|
+
nk_dots_symmetric_e4m3_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2635
2888
|
#endif
|
|
2636
2889
|
}
|
|
2637
2890
|
|
|
2638
|
-
NK_PUBLIC void nk_dots_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t
|
|
2639
|
-
nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2891
|
+
NK_PUBLIC void nk_dots_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2892
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2640
2893
|
nk_size_t row_count) {
|
|
2641
2894
|
#if NK_TARGET_SME
|
|
2642
|
-
nk_dots_symmetric_e5m2_sme(vectors,
|
|
2895
|
+
nk_dots_symmetric_e5m2_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2896
|
+
#elif NK_TARGET_NEONFP8
|
|
2897
|
+
nk_dots_symmetric_e5m2_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2643
2898
|
#elif NK_TARGET_NEONFHM
|
|
2644
|
-
nk_dots_symmetric_e5m2_neonfhm(vectors,
|
|
2899
|
+
nk_dots_symmetric_e5m2_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2645
2900
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2646
|
-
nk_dots_symmetric_e5m2_sapphireamx(vectors,
|
|
2901
|
+
nk_dots_symmetric_e5m2_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2902
|
+
row_count);
|
|
2903
|
+
#elif NK_TARGET_DIAMOND
|
|
2904
|
+
nk_dots_symmetric_e5m2_diamond(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2647
2905
|
#elif NK_TARGET_GENOA
|
|
2648
|
-
nk_dots_symmetric_e5m2_genoa(vectors,
|
|
2906
|
+
nk_dots_symmetric_e5m2_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2649
2907
|
#elif NK_TARGET_SKYLAKE
|
|
2650
|
-
nk_dots_symmetric_e5m2_skylake(vectors,
|
|
2908
|
+
nk_dots_symmetric_e5m2_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2651
2909
|
#elif NK_TARGET_HASWELL
|
|
2652
|
-
nk_dots_symmetric_e5m2_haswell(vectors,
|
|
2910
|
+
nk_dots_symmetric_e5m2_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2653
2911
|
#elif NK_TARGET_RVV
|
|
2654
|
-
nk_dots_symmetric_e5m2_rvv(vectors,
|
|
2912
|
+
nk_dots_symmetric_e5m2_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2655
2913
|
#elif NK_TARGET_V128RELAXED
|
|
2656
|
-
nk_dots_symmetric_e5m2_v128relaxed(vectors,
|
|
2914
|
+
nk_dots_symmetric_e5m2_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2915
|
+
row_count);
|
|
2657
2916
|
#else
|
|
2658
|
-
nk_dots_symmetric_e5m2_serial(vectors,
|
|
2917
|
+
nk_dots_symmetric_e5m2_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2659
2918
|
#endif
|
|
2660
2919
|
}
|
|
2661
2920
|
|
|
2662
|
-
NK_PUBLIC void nk_dots_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t
|
|
2663
|
-
nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2921
|
+
NK_PUBLIC void nk_dots_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2922
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2664
2923
|
nk_size_t row_count) {
|
|
2665
2924
|
#if NK_TARGET_SME
|
|
2666
|
-
nk_dots_symmetric_e2m3_sme(vectors,
|
|
2925
|
+
nk_dots_symmetric_e2m3_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2667
2926
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2668
|
-
nk_dots_symmetric_e2m3_sapphireamx(vectors,
|
|
2927
|
+
nk_dots_symmetric_e2m3_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2928
|
+
row_count);
|
|
2929
|
+
#elif NK_TARGET_NEONFP8
|
|
2930
|
+
nk_dots_symmetric_e2m3_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2669
2931
|
#elif NK_TARGET_SKYLAKE
|
|
2670
|
-
nk_dots_symmetric_e2m3_skylake(vectors,
|
|
2932
|
+
nk_dots_symmetric_e2m3_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2671
2933
|
#elif NK_TARGET_SIERRA
|
|
2672
|
-
nk_dots_symmetric_e2m3_sierra(vectors,
|
|
2934
|
+
nk_dots_symmetric_e2m3_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2673
2935
|
#elif NK_TARGET_ALDER
|
|
2674
|
-
nk_dots_symmetric_e2m3_alder(vectors,
|
|
2936
|
+
nk_dots_symmetric_e2m3_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2675
2937
|
#elif NK_TARGET_HASWELL
|
|
2676
|
-
nk_dots_symmetric_e2m3_haswell(vectors,
|
|
2938
|
+
nk_dots_symmetric_e2m3_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2677
2939
|
#elif NK_TARGET_RVV
|
|
2678
|
-
nk_dots_symmetric_e2m3_rvv(vectors,
|
|
2940
|
+
nk_dots_symmetric_e2m3_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2679
2941
|
#elif NK_TARGET_V128RELAXED
|
|
2680
|
-
nk_dots_symmetric_e2m3_v128relaxed(vectors,
|
|
2942
|
+
nk_dots_symmetric_e2m3_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2943
|
+
row_count);
|
|
2681
2944
|
#else
|
|
2682
|
-
nk_dots_symmetric_e2m3_serial(vectors,
|
|
2945
|
+
nk_dots_symmetric_e2m3_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2683
2946
|
#endif
|
|
2684
2947
|
}
|
|
2685
2948
|
|
|
2686
|
-
NK_PUBLIC void nk_dots_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t
|
|
2687
|
-
nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2949
|
+
NK_PUBLIC void nk_dots_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2950
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2688
2951
|
nk_size_t row_count) {
|
|
2689
2952
|
#if NK_TARGET_SME
|
|
2690
|
-
nk_dots_symmetric_e3m2_sme(vectors,
|
|
2953
|
+
nk_dots_symmetric_e3m2_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2691
2954
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2692
|
-
nk_dots_symmetric_e3m2_sapphireamx(vectors,
|
|
2955
|
+
nk_dots_symmetric_e3m2_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2956
|
+
row_count);
|
|
2957
|
+
#elif NK_TARGET_NEONFP8
|
|
2958
|
+
nk_dots_symmetric_e3m2_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2693
2959
|
#elif NK_TARGET_SKYLAKE
|
|
2694
|
-
nk_dots_symmetric_e3m2_skylake(vectors,
|
|
2960
|
+
nk_dots_symmetric_e3m2_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2695
2961
|
#elif NK_TARGET_HASWELL
|
|
2696
|
-
nk_dots_symmetric_e3m2_haswell(vectors,
|
|
2962
|
+
nk_dots_symmetric_e3m2_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2697
2963
|
#elif NK_TARGET_RVV
|
|
2698
|
-
nk_dots_symmetric_e3m2_rvv(vectors,
|
|
2964
|
+
nk_dots_symmetric_e3m2_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2699
2965
|
#else
|
|
2700
|
-
nk_dots_symmetric_e3m2_serial(vectors,
|
|
2966
|
+
nk_dots_symmetric_e3m2_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2701
2967
|
#endif
|
|
2702
2968
|
}
|
|
2703
2969
|
|
|
2704
|
-
NK_PUBLIC void nk_dots_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t
|
|
2705
|
-
nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2970
|
+
NK_PUBLIC void nk_dots_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2971
|
+
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2706
2972
|
nk_size_t row_count) {
|
|
2707
2973
|
#if NK_TARGET_SME
|
|
2708
|
-
nk_dots_symmetric_u4_sme(vectors,
|
|
2974
|
+
nk_dots_symmetric_u4_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2709
2975
|
#elif NK_TARGET_ICELAKE
|
|
2710
|
-
nk_dots_symmetric_u4_icelake(vectors,
|
|
2976
|
+
nk_dots_symmetric_u4_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2711
2977
|
#elif NK_TARGET_NEONSDOT
|
|
2712
|
-
nk_dots_symmetric_u4_neonsdot(vectors,
|
|
2978
|
+
nk_dots_symmetric_u4_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2713
2979
|
#elif NK_TARGET_HASWELL
|
|
2714
|
-
nk_dots_symmetric_u4_haswell(vectors,
|
|
2980
|
+
nk_dots_symmetric_u4_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2715
2981
|
#elif NK_TARGET_V128RELAXED
|
|
2716
|
-
nk_dots_symmetric_u4_v128relaxed(vectors,
|
|
2982
|
+
nk_dots_symmetric_u4_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2983
|
+
row_count);
|
|
2717
2984
|
#else
|
|
2718
|
-
nk_dots_symmetric_u4_serial(vectors,
|
|
2985
|
+
nk_dots_symmetric_u4_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2719
2986
|
#endif
|
|
2720
2987
|
}
|
|
2721
2988
|
|
|
2722
|
-
NK_PUBLIC void nk_dots_symmetric_u1(nk_u1x8_t const *vectors, nk_size_t
|
|
2723
|
-
nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2989
|
+
NK_PUBLIC void nk_dots_symmetric_u1(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2990
|
+
nk_size_t stride, nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2724
2991
|
nk_size_t row_count) {
|
|
2725
2992
|
#if NK_TARGET_SMEBI32
|
|
2726
|
-
nk_dots_symmetric_u1_smebi32(vectors,
|
|
2993
|
+
nk_dots_symmetric_u1_smebi32(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2727
2994
|
#elif NK_TARGET_ICELAKE
|
|
2728
|
-
nk_dots_symmetric_u1_icelake(vectors,
|
|
2995
|
+
nk_dots_symmetric_u1_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2729
2996
|
#elif NK_TARGET_HASWELL
|
|
2730
|
-
nk_dots_symmetric_u1_haswell(vectors,
|
|
2997
|
+
nk_dots_symmetric_u1_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2731
2998
|
#elif NK_TARGET_NEON
|
|
2732
|
-
nk_dots_symmetric_u1_neon(vectors,
|
|
2999
|
+
nk_dots_symmetric_u1_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
3000
|
+
#elif NK_TARGET_POWERVSX
|
|
3001
|
+
nk_dots_symmetric_u1_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
3002
|
+
#elif NK_TARGET_LOONGSONASX
|
|
3003
|
+
nk_dots_symmetric_u1_loongsonasx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3004
|
+
row_count);
|
|
2733
3005
|
#elif NK_TARGET_V128RELAXED
|
|
2734
|
-
nk_dots_symmetric_u1_v128relaxed(vectors,
|
|
3006
|
+
nk_dots_symmetric_u1_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3007
|
+
row_count);
|
|
2735
3008
|
#else
|
|
2736
|
-
nk_dots_symmetric_u1_serial(vectors,
|
|
3009
|
+
nk_dots_symmetric_u1_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2737
3010
|
#endif
|
|
2738
3011
|
}
|
|
2739
3012
|
|
|
2740
|
-
NK_PUBLIC void nk_dots_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t
|
|
2741
|
-
nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
3013
|
+
NK_PUBLIC void nk_dots_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
3014
|
+
nk_size_t stride, nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2742
3015
|
nk_size_t row_count) {
|
|
2743
3016
|
#if NK_TARGET_SME
|
|
2744
|
-
nk_dots_symmetric_i4_sme(vectors,
|
|
3017
|
+
nk_dots_symmetric_i4_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2745
3018
|
#elif NK_TARGET_ICELAKE
|
|
2746
|
-
nk_dots_symmetric_i4_icelake(vectors,
|
|
3019
|
+
nk_dots_symmetric_i4_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2747
3020
|
#elif NK_TARGET_NEONSDOT
|
|
2748
|
-
nk_dots_symmetric_i4_neonsdot(vectors,
|
|
3021
|
+
nk_dots_symmetric_i4_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2749
3022
|
#elif NK_TARGET_HASWELL
|
|
2750
|
-
nk_dots_symmetric_i4_haswell(vectors,
|
|
3023
|
+
nk_dots_symmetric_i4_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2751
3024
|
#elif NK_TARGET_V128RELAXED
|
|
2752
|
-
nk_dots_symmetric_i4_v128relaxed(vectors,
|
|
3025
|
+
nk_dots_symmetric_i4_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3026
|
+
row_count);
|
|
2753
3027
|
#else
|
|
2754
|
-
nk_dots_symmetric_i4_serial(vectors,
|
|
3028
|
+
nk_dots_symmetric_i4_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2755
3029
|
#endif
|
|
2756
3030
|
}
|
|
2757
3031
|
|
|
2758
|
-
NK_PUBLIC void nk_dots_symmetric_f32(nk_f32_t const *vectors, nk_size_t
|
|
2759
|
-
nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
3032
|
+
NK_PUBLIC void nk_dots_symmetric_f32(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
3033
|
+
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2760
3034
|
nk_size_t row_count) {
|
|
2761
3035
|
#if NK_TARGET_SMEF64
|
|
2762
|
-
nk_dots_symmetric_f32_smef64(vectors,
|
|
3036
|
+
nk_dots_symmetric_f32_smef64(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2763
3037
|
#elif NK_TARGET_SKYLAKE
|
|
2764
|
-
nk_dots_symmetric_f32_skylake(vectors,
|
|
3038
|
+
nk_dots_symmetric_f32_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2765
3039
|
#elif NK_TARGET_HASWELL
|
|
2766
|
-
nk_dots_symmetric_f32_haswell(vectors,
|
|
3040
|
+
nk_dots_symmetric_f32_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2767
3041
|
#elif NK_TARGET_NEON
|
|
2768
|
-
nk_dots_symmetric_f32_neon(vectors,
|
|
3042
|
+
nk_dots_symmetric_f32_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
3043
|
+
#elif NK_TARGET_POWERVSX
|
|
3044
|
+
nk_dots_symmetric_f32_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2769
3045
|
#elif NK_TARGET_RVV
|
|
2770
|
-
nk_dots_symmetric_f32_rvv(vectors,
|
|
3046
|
+
nk_dots_symmetric_f32_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2771
3047
|
#elif NK_TARGET_V128RELAXED
|
|
2772
|
-
nk_dots_symmetric_f32_v128relaxed(vectors,
|
|
3048
|
+
nk_dots_symmetric_f32_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3049
|
+
row_count);
|
|
2773
3050
|
#else
|
|
2774
|
-
nk_dots_symmetric_f32_serial(vectors,
|
|
3051
|
+
nk_dots_symmetric_f32_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2775
3052
|
#endif
|
|
2776
3053
|
}
|
|
2777
3054
|
|
|
2778
|
-
NK_PUBLIC void nk_dots_symmetric_f64(nk_f64_t const *vectors, nk_size_t
|
|
2779
|
-
nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
3055
|
+
NK_PUBLIC void nk_dots_symmetric_f64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
3056
|
+
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
|
|
2780
3057
|
nk_size_t row_count) {
|
|
2781
3058
|
#if NK_TARGET_SMEF64
|
|
2782
|
-
nk_dots_symmetric_f64_smef64(vectors,
|
|
3059
|
+
nk_dots_symmetric_f64_smef64(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2783
3060
|
#elif NK_TARGET_SKYLAKE
|
|
2784
|
-
nk_dots_symmetric_f64_skylake(vectors,
|
|
3061
|
+
nk_dots_symmetric_f64_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2785
3062
|
#elif NK_TARGET_HASWELL
|
|
2786
|
-
nk_dots_symmetric_f64_haswell(vectors,
|
|
3063
|
+
nk_dots_symmetric_f64_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2787
3064
|
#elif NK_TARGET_NEON
|
|
2788
|
-
nk_dots_symmetric_f64_neon(vectors,
|
|
3065
|
+
nk_dots_symmetric_f64_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
3066
|
+
#elif NK_TARGET_POWERVSX
|
|
3067
|
+
nk_dots_symmetric_f64_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2789
3068
|
#elif NK_TARGET_RVV
|
|
2790
|
-
nk_dots_symmetric_f64_rvv(vectors,
|
|
3069
|
+
nk_dots_symmetric_f64_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2791
3070
|
#elif NK_TARGET_V128RELAXED
|
|
2792
|
-
nk_dots_symmetric_f64_v128relaxed(vectors,
|
|
3071
|
+
nk_dots_symmetric_f64_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3072
|
+
row_count);
|
|
2793
3073
|
#else
|
|
2794
|
-
nk_dots_symmetric_f64_serial(vectors,
|
|
3074
|
+
nk_dots_symmetric_f64_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2795
3075
|
#endif
|
|
2796
3076
|
}
|
|
2797
3077
|
|