numkong 7.0.0 → 7.4.2
This diff compares the contents of two publicly available versions of the package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -76,15 +76,15 @@ NK_DYNAMIC void nk_angulars_packed_f32(nk_f32_t const *a, void const *b_packed,
|
|
|
76
76
|
/**
|
|
77
77
|
* @brief Computes symmetric angular distance matrix (Gram-style) for a set of vectors.
|
|
78
78
|
* @param[in] vectors Input matrix of row vectors in row-major order.
|
|
79
|
-
* @param[in]
|
|
79
|
+
* @param[in] vectors_count Number of vectors (rows) in the input matrix.
|
|
80
80
|
* @param[in] depth Dimension of each vector (columns).
|
|
81
81
|
* @param[in] stride Row stride in bytes for the input matrix.
|
|
82
|
-
* @param[out] result Output symmetric matrix (
|
|
82
|
+
* @param[out] result Output symmetric matrix (vectors_count x vectors_count).
|
|
83
83
|
* @param[in] result_stride Row stride in bytes for the result matrix.
|
|
84
84
|
* @param[in] row_start Starting row offset of results to compute (for parallelism).
|
|
85
85
|
* @param[in] row_count Number of rows of results to compute (for parallelism).
|
|
86
86
|
*/
|
|
87
|
-
NK_DYNAMIC void nk_angulars_symmetric_f32(nk_f32_t const *vectors, nk_size_t
|
|
87
|
+
NK_DYNAMIC void nk_angulars_symmetric_f32(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
88
88
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
89
89
|
nk_size_t row_start, nk_size_t row_count);
|
|
90
90
|
|
|
@@ -106,15 +106,15 @@ NK_DYNAMIC void nk_euclideans_packed_f32(nk_f32_t const *a, void const *b_packed
|
|
|
106
106
|
/**
|
|
107
107
|
* @brief Computes symmetric euclidean distance matrix (Gram-style) for a set of vectors.
|
|
108
108
|
* @param[in] vectors Input matrix of row vectors in row-major order.
|
|
109
|
-
* @param[in]
|
|
109
|
+
* @param[in] vectors_count Number of vectors (rows) in the input matrix.
|
|
110
110
|
* @param[in] depth Dimension of each vector (columns).
|
|
111
111
|
* @param[in] stride Row stride in bytes for the input matrix.
|
|
112
|
-
* @param[out] result Output symmetric matrix (
|
|
112
|
+
* @param[out] result Output symmetric matrix (vectors_count x vectors_count).
|
|
113
113
|
* @param[in] result_stride Row stride in bytes for the result matrix.
|
|
114
114
|
* @param[in] row_start Starting row offset of results to compute (for parallelism).
|
|
115
115
|
* @param[in] row_count Number of rows of results to compute (for parallelism).
|
|
116
116
|
*/
|
|
117
|
-
NK_DYNAMIC void nk_euclideans_symmetric_f32(nk_f32_t const *vectors, nk_size_t
|
|
117
|
+
NK_DYNAMIC void nk_euclideans_symmetric_f32(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
118
118
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
119
119
|
nk_size_t row_start, nk_size_t row_count);
|
|
120
120
|
|
|
@@ -123,7 +123,7 @@ NK_DYNAMIC void nk_angulars_packed_f64(nk_f64_t const *a, void const *b_packed,
|
|
|
123
123
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
124
124
|
nk_size_t r_stride_in_bytes);
|
|
125
125
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
126
|
-
NK_DYNAMIC void nk_angulars_symmetric_f64(nk_f64_t const *vectors, nk_size_t
|
|
126
|
+
NK_DYNAMIC void nk_angulars_symmetric_f64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
127
127
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
128
128
|
nk_size_t row_start, nk_size_t row_count);
|
|
129
129
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -131,7 +131,7 @@ NK_DYNAMIC void nk_euclideans_packed_f64(nk_f64_t const *a, void const *b_packed
|
|
|
131
131
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
132
132
|
nk_size_t r_stride_in_bytes);
|
|
133
133
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
134
|
-
NK_DYNAMIC void nk_euclideans_symmetric_f64(nk_f64_t const *vectors, nk_size_t
|
|
134
|
+
NK_DYNAMIC void nk_euclideans_symmetric_f64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
135
135
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
136
136
|
nk_size_t row_start, nk_size_t row_count);
|
|
137
137
|
|
|
@@ -140,7 +140,7 @@ NK_DYNAMIC void nk_angulars_packed_f16(nk_f16_t const *a, void const *b_packed,
|
|
|
140
140
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
141
141
|
nk_size_t r_stride_in_bytes);
|
|
142
142
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
143
|
-
NK_DYNAMIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t
|
|
143
|
+
NK_DYNAMIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
144
144
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
145
145
|
nk_size_t row_start, nk_size_t row_count);
|
|
146
146
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -148,7 +148,7 @@ NK_DYNAMIC void nk_euclideans_packed_f16(nk_f16_t const *a, void const *b_packed
|
|
|
148
148
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
149
149
|
nk_size_t r_stride_in_bytes);
|
|
150
150
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
151
|
-
NK_DYNAMIC void nk_euclideans_symmetric_f16(nk_f16_t const *vectors, nk_size_t
|
|
151
|
+
NK_DYNAMIC void nk_euclideans_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
152
152
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
153
153
|
nk_size_t row_start, nk_size_t row_count);
|
|
154
154
|
|
|
@@ -157,7 +157,7 @@ NK_DYNAMIC void nk_angulars_packed_bf16(nk_bf16_t const *a, void const *b_packed
|
|
|
157
157
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
158
158
|
nk_size_t r_stride_in_bytes);
|
|
159
159
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
160
|
-
NK_DYNAMIC void nk_angulars_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t
|
|
160
|
+
NK_DYNAMIC void nk_angulars_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
161
161
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
162
162
|
nk_size_t row_start, nk_size_t row_count);
|
|
163
163
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -165,7 +165,7 @@ NK_DYNAMIC void nk_euclideans_packed_bf16(nk_bf16_t const *a, void const *b_pack
|
|
|
165
165
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
166
166
|
nk_size_t r_stride_in_bytes);
|
|
167
167
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
168
|
-
NK_DYNAMIC void nk_euclideans_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t
|
|
168
|
+
NK_DYNAMIC void nk_euclideans_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
169
169
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
170
170
|
nk_size_t row_start, nk_size_t row_count);
|
|
171
171
|
|
|
@@ -174,7 +174,7 @@ NK_DYNAMIC void nk_angulars_packed_e4m3(nk_e4m3_t const *a, void const *b_packed
|
|
|
174
174
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
175
175
|
nk_size_t r_stride_in_bytes);
|
|
176
176
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
177
|
-
NK_DYNAMIC void nk_angulars_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t
|
|
177
|
+
NK_DYNAMIC void nk_angulars_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
178
178
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
179
179
|
nk_size_t row_start, nk_size_t row_count);
|
|
180
180
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -182,7 +182,7 @@ NK_DYNAMIC void nk_euclideans_packed_e4m3(nk_e4m3_t const *a, void const *b_pack
|
|
|
182
182
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
183
183
|
nk_size_t r_stride_in_bytes);
|
|
184
184
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
185
|
-
NK_DYNAMIC void nk_euclideans_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t
|
|
185
|
+
NK_DYNAMIC void nk_euclideans_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
186
186
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
187
187
|
nk_size_t row_start, nk_size_t row_count);
|
|
188
188
|
|
|
@@ -191,7 +191,7 @@ NK_DYNAMIC void nk_angulars_packed_e5m2(nk_e5m2_t const *a, void const *b_packed
|
|
|
191
191
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
192
192
|
nk_size_t r_stride_in_bytes);
|
|
193
193
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
194
|
-
NK_DYNAMIC void nk_angulars_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t
|
|
194
|
+
NK_DYNAMIC void nk_angulars_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
195
195
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
196
196
|
nk_size_t row_start, nk_size_t row_count);
|
|
197
197
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -199,7 +199,7 @@ NK_DYNAMIC void nk_euclideans_packed_e5m2(nk_e5m2_t const *a, void const *b_pack
|
|
|
199
199
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
200
200
|
nk_size_t r_stride_in_bytes);
|
|
201
201
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
202
|
-
NK_DYNAMIC void nk_euclideans_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t
|
|
202
|
+
NK_DYNAMIC void nk_euclideans_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
203
203
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
204
204
|
nk_size_t row_start, nk_size_t row_count);
|
|
205
205
|
|
|
@@ -208,7 +208,7 @@ NK_DYNAMIC void nk_angulars_packed_e2m3(nk_e2m3_t const *a, void const *b_packed
|
|
|
208
208
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
209
209
|
nk_size_t r_stride_in_bytes);
|
|
210
210
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
211
|
-
NK_DYNAMIC void nk_angulars_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t
|
|
211
|
+
NK_DYNAMIC void nk_angulars_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
212
212
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
213
213
|
nk_size_t row_start, nk_size_t row_count);
|
|
214
214
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -216,7 +216,7 @@ NK_DYNAMIC void nk_euclideans_packed_e2m3(nk_e2m3_t const *a, void const *b_pack
|
|
|
216
216
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
217
217
|
nk_size_t r_stride_in_bytes);
|
|
218
218
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
219
|
-
NK_DYNAMIC void nk_euclideans_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t
|
|
219
|
+
NK_DYNAMIC void nk_euclideans_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
220
220
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
221
221
|
nk_size_t row_start, nk_size_t row_count);
|
|
222
222
|
|
|
@@ -225,7 +225,7 @@ NK_DYNAMIC void nk_angulars_packed_e3m2(nk_e3m2_t const *a, void const *b_packed
|
|
|
225
225
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
226
226
|
nk_size_t r_stride_in_bytes);
|
|
227
227
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
228
|
-
NK_DYNAMIC void nk_angulars_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t
|
|
228
|
+
NK_DYNAMIC void nk_angulars_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
229
229
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
230
230
|
nk_size_t row_start, nk_size_t row_count);
|
|
231
231
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -233,7 +233,7 @@ NK_DYNAMIC void nk_euclideans_packed_e3m2(nk_e3m2_t const *a, void const *b_pack
|
|
|
233
233
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
234
234
|
nk_size_t r_stride_in_bytes);
|
|
235
235
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
236
|
-
NK_DYNAMIC void nk_euclideans_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t
|
|
236
|
+
NK_DYNAMIC void nk_euclideans_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
237
237
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
238
238
|
nk_size_t row_start, nk_size_t row_count);
|
|
239
239
|
|
|
@@ -242,15 +242,15 @@ NK_DYNAMIC void nk_angulars_packed_i8(nk_i8_t const *a, void const *b_packed, nk
|
|
|
242
242
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
243
243
|
nk_size_t r_stride_in_bytes);
|
|
244
244
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
245
|
-
NK_DYNAMIC void nk_angulars_symmetric_i8(nk_i8_t const *vectors, nk_size_t
|
|
246
|
-
nk_f32_t *result, nk_size_t result_stride,
|
|
247
|
-
nk_size_t row_count);
|
|
245
|
+
NK_DYNAMIC void nk_angulars_symmetric_i8(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
246
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
247
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
248
248
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
249
249
|
NK_DYNAMIC void nk_euclideans_packed_i8(nk_i8_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
250
250
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
251
251
|
nk_size_t r_stride_in_bytes);
|
|
252
252
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
253
|
-
NK_DYNAMIC void nk_euclideans_symmetric_i8(nk_i8_t const *vectors, nk_size_t
|
|
253
|
+
NK_DYNAMIC void nk_euclideans_symmetric_i8(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
254
254
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
255
255
|
nk_size_t row_start, nk_size_t row_count);
|
|
256
256
|
|
|
@@ -259,15 +259,15 @@ NK_DYNAMIC void nk_angulars_packed_u8(nk_u8_t const *a, void const *b_packed, nk
|
|
|
259
259
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
260
260
|
nk_size_t r_stride_in_bytes);
|
|
261
261
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
262
|
-
NK_DYNAMIC void nk_angulars_symmetric_u8(nk_u8_t const *vectors, nk_size_t
|
|
263
|
-
nk_f32_t *result, nk_size_t result_stride,
|
|
264
|
-
nk_size_t row_count);
|
|
262
|
+
NK_DYNAMIC void nk_angulars_symmetric_u8(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
263
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
264
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
265
265
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
266
266
|
NK_DYNAMIC void nk_euclideans_packed_u8(nk_u8_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
267
267
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
268
268
|
nk_size_t r_stride_in_bytes);
|
|
269
269
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
270
|
-
NK_DYNAMIC void nk_euclideans_symmetric_u8(nk_u8_t const *vectors, nk_size_t
|
|
270
|
+
NK_DYNAMIC void nk_euclideans_symmetric_u8(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
271
271
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
272
272
|
nk_size_t row_start, nk_size_t row_count);
|
|
273
273
|
|
|
@@ -276,7 +276,7 @@ NK_DYNAMIC void nk_angulars_packed_i4(nk_i4x2_t const *a, void const *b_packed,
|
|
|
276
276
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
277
277
|
nk_size_t r_stride_in_bytes);
|
|
278
278
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
279
|
-
NK_DYNAMIC void nk_angulars_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t
|
|
279
|
+
NK_DYNAMIC void nk_angulars_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
280
280
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
281
281
|
nk_size_t row_start, nk_size_t row_count);
|
|
282
282
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -284,7 +284,7 @@ NK_DYNAMIC void nk_euclideans_packed_i4(nk_i4x2_t const *a, void const *b_packed
|
|
|
284
284
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
285
285
|
nk_size_t r_stride_in_bytes);
|
|
286
286
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
287
|
-
NK_DYNAMIC void nk_euclideans_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t
|
|
287
|
+
NK_DYNAMIC void nk_euclideans_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
288
288
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
289
289
|
nk_size_t row_start, nk_size_t row_count);
|
|
290
290
|
|
|
@@ -293,7 +293,7 @@ NK_DYNAMIC void nk_angulars_packed_u4(nk_u4x2_t const *a, void const *b_packed,
|
|
|
293
293
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
294
294
|
nk_size_t r_stride_in_bytes);
|
|
295
295
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
296
|
-
NK_DYNAMIC void nk_angulars_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t
|
|
296
|
+
NK_DYNAMIC void nk_angulars_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
297
297
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
298
298
|
nk_size_t row_start, nk_size_t row_count);
|
|
299
299
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -301,7 +301,7 @@ NK_DYNAMIC void nk_euclideans_packed_u4(nk_u4x2_t const *a, void const *b_packed
|
|
|
301
301
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
302
302
|
nk_size_t r_stride_in_bytes);
|
|
303
303
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
304
|
-
NK_DYNAMIC void nk_euclideans_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t
|
|
304
|
+
NK_DYNAMIC void nk_euclideans_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
305
305
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
306
306
|
nk_size_t row_start, nk_size_t row_count);
|
|
307
307
|
|
|
@@ -310,7 +310,7 @@ NK_PUBLIC void nk_angulars_packed_f32_serial(nk_f32_t const *a, void const *b_pa
|
|
|
310
310
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
311
311
|
nk_size_t r_stride_in_bytes);
|
|
312
312
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
313
|
-
NK_PUBLIC void nk_angulars_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t
|
|
313
|
+
NK_PUBLIC void nk_angulars_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
314
314
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
315
315
|
nk_size_t row_start, nk_size_t row_count);
|
|
316
316
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -318,7 +318,7 @@ NK_PUBLIC void nk_euclideans_packed_f32_serial(nk_f32_t const *a, void const *b_
|
|
|
318
318
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
319
319
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
320
320
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
321
|
-
NK_PUBLIC void nk_euclideans_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t
|
|
321
|
+
NK_PUBLIC void nk_euclideans_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
322
322
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
323
323
|
nk_size_t row_start, nk_size_t row_count);
|
|
324
324
|
|
|
@@ -327,7 +327,7 @@ NK_PUBLIC void nk_angulars_packed_f64_serial(nk_f64_t const *a, void const *b_pa
|
|
|
327
327
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
328
328
|
nk_size_t r_stride_in_bytes);
|
|
329
329
|
/** @copydoc nk_angulars_symmetric_f64 */
|
|
330
|
-
NK_PUBLIC void nk_angulars_symmetric_f64_serial(nk_f64_t const *vectors, nk_size_t
|
|
330
|
+
NK_PUBLIC void nk_angulars_symmetric_f64_serial(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
331
331
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
332
332
|
nk_size_t row_start, nk_size_t row_count);
|
|
333
333
|
/** @copydoc nk_euclideans_packed_f64 */
|
|
@@ -335,7 +335,7 @@ NK_PUBLIC void nk_euclideans_packed_f64_serial(nk_f64_t const *a, void const *b_
|
|
|
335
335
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
336
336
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
337
337
|
/** @copydoc nk_euclideans_symmetric_f64 */
|
|
338
|
-
NK_PUBLIC void nk_euclideans_symmetric_f64_serial(nk_f64_t const *vectors, nk_size_t
|
|
338
|
+
NK_PUBLIC void nk_euclideans_symmetric_f64_serial(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
339
339
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
340
340
|
nk_size_t row_start, nk_size_t row_count);
|
|
341
341
|
|
|
@@ -344,7 +344,7 @@ NK_PUBLIC void nk_angulars_packed_f16_serial(nk_f16_t const *a, void const *b_pa
|
|
|
344
344
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
345
345
|
nk_size_t r_stride_in_bytes);
|
|
346
346
|
/** @copydoc nk_angulars_symmetric_f16 */
|
|
347
|
-
NK_PUBLIC void nk_angulars_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t
|
|
347
|
+
NK_PUBLIC void nk_angulars_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
348
348
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
349
349
|
nk_size_t row_start, nk_size_t row_count);
|
|
350
350
|
/** @copydoc nk_euclideans_packed_f16 */
|
|
@@ -352,7 +352,7 @@ NK_PUBLIC void nk_euclideans_packed_f16_serial(nk_f16_t const *a, void const *b_
|
|
|
352
352
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
353
353
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
354
354
|
/** @copydoc nk_euclideans_symmetric_f16 */
|
|
355
|
-
NK_PUBLIC void nk_euclideans_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t
|
|
355
|
+
NK_PUBLIC void nk_euclideans_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
356
356
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
357
357
|
nk_size_t row_start, nk_size_t row_count);
|
|
358
358
|
|
|
@@ -361,7 +361,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_serial(nk_bf16_t const *a, void const *b_
|
|
|
361
361
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
362
362
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
363
363
|
/** @copydoc nk_angulars_symmetric_bf16 */
|
|
364
|
-
NK_PUBLIC void nk_angulars_symmetric_bf16_serial(nk_bf16_t const *vectors, nk_size_t
|
|
364
|
+
NK_PUBLIC void nk_angulars_symmetric_bf16_serial(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
365
365
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
366
366
|
nk_size_t row_start, nk_size_t row_count);
|
|
367
367
|
/** @copydoc nk_euclideans_packed_bf16 */
|
|
@@ -369,7 +369,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_serial(nk_bf16_t const *a, void const *
|
|
|
369
369
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
370
370
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
371
371
|
/** @copydoc nk_euclideans_symmetric_bf16 */
|
|
372
|
-
NK_PUBLIC void nk_euclideans_symmetric_bf16_serial(nk_bf16_t const *vectors, nk_size_t
|
|
372
|
+
NK_PUBLIC void nk_euclideans_symmetric_bf16_serial(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
373
373
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
374
374
|
nk_size_t row_start, nk_size_t row_count);
|
|
375
375
|
|
|
@@ -378,7 +378,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_serial(nk_e4m3_t const *a, void const *b_
|
|
|
378
378
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
379
379
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
380
380
|
/** @copydoc nk_angulars_symmetric_e4m3 */
|
|
381
|
-
NK_PUBLIC void nk_angulars_symmetric_e4m3_serial(nk_e4m3_t const *vectors, nk_size_t
|
|
381
|
+
NK_PUBLIC void nk_angulars_symmetric_e4m3_serial(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
382
382
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
383
383
|
nk_size_t row_start, nk_size_t row_count);
|
|
384
384
|
/** @copydoc nk_euclideans_packed_e4m3 */
|
|
@@ -386,7 +386,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_serial(nk_e4m3_t const *a, void const *
|
|
|
386
386
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
387
387
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
388
388
|
/** @copydoc nk_euclideans_symmetric_e4m3 */
|
|
389
|
-
NK_PUBLIC void nk_euclideans_symmetric_e4m3_serial(nk_e4m3_t const *vectors, nk_size_t
|
|
389
|
+
NK_PUBLIC void nk_euclideans_symmetric_e4m3_serial(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
390
390
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
391
391
|
nk_size_t row_start, nk_size_t row_count);
|
|
392
392
|
|
|
@@ -395,7 +395,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_serial(nk_e5m2_t const *a, void const *b_
|
|
|
395
395
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
396
396
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
397
397
|
/** @copydoc nk_angulars_symmetric_e5m2 */
|
|
398
|
-
NK_PUBLIC void nk_angulars_symmetric_e5m2_serial(nk_e5m2_t const *vectors, nk_size_t
|
|
398
|
+
NK_PUBLIC void nk_angulars_symmetric_e5m2_serial(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
399
399
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
400
400
|
nk_size_t row_start, nk_size_t row_count);
|
|
401
401
|
/** @copydoc nk_euclideans_packed_e5m2 */
|
|
@@ -403,7 +403,7 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_serial(nk_e5m2_t const *a, void const *
|
|
|
403
403
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
404
404
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
405
405
|
/** @copydoc nk_euclideans_symmetric_e5m2 */
|
|
406
|
-
NK_PUBLIC void nk_euclideans_symmetric_e5m2_serial(nk_e5m2_t const *vectors, nk_size_t
|
|
406
|
+
NK_PUBLIC void nk_euclideans_symmetric_e5m2_serial(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
407
407
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
408
408
|
nk_size_t row_start, nk_size_t row_count);
|
|
409
409
|
|
|
@@ -412,7 +412,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_serial(nk_e2m3_t const *a, void const *b_
|
|
|
412
412
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
413
413
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
414
414
|
/** @copydoc nk_angulars_symmetric_e2m3 */
|
|
415
|
-
NK_PUBLIC void nk_angulars_symmetric_e2m3_serial(nk_e2m3_t const *vectors, nk_size_t
|
|
415
|
+
NK_PUBLIC void nk_angulars_symmetric_e2m3_serial(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
416
416
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
417
417
|
nk_size_t row_start, nk_size_t row_count);
|
|
418
418
|
/** @copydoc nk_euclideans_packed_e2m3 */
|
|
@@ -420,7 +420,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_serial(nk_e2m3_t const *a, void const *
|
|
|
420
420
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
421
421
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
422
422
|
/** @copydoc nk_euclideans_symmetric_e2m3 */
|
|
423
|
-
NK_PUBLIC void nk_euclideans_symmetric_e2m3_serial(nk_e2m3_t const *vectors, nk_size_t
|
|
423
|
+
NK_PUBLIC void nk_euclideans_symmetric_e2m3_serial(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
424
424
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
425
425
|
nk_size_t row_start, nk_size_t row_count);
|
|
426
426
|
|
|
@@ -429,7 +429,7 @@ NK_PUBLIC void nk_angulars_packed_e3m2_serial(nk_e3m2_t const *a, void const *b_
|
|
|
429
429
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
430
430
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
431
431
|
/** @copydoc nk_angulars_symmetric_e3m2 */
|
|
432
|
-
NK_PUBLIC void nk_angulars_symmetric_e3m2_serial(nk_e3m2_t const *vectors, nk_size_t
|
|
432
|
+
NK_PUBLIC void nk_angulars_symmetric_e3m2_serial(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
433
433
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
434
434
|
nk_size_t row_start, nk_size_t row_count);
|
|
435
435
|
/** @copydoc nk_euclideans_packed_e3m2 */
|
|
@@ -437,7 +437,7 @@ NK_PUBLIC void nk_euclideans_packed_e3m2_serial(nk_e3m2_t const *a, void const *
|
|
|
437
437
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
438
438
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
439
439
|
/** @copydoc nk_euclideans_symmetric_e3m2 */
|
|
440
|
-
NK_PUBLIC void nk_euclideans_symmetric_e3m2_serial(nk_e3m2_t const *vectors, nk_size_t
|
|
440
|
+
NK_PUBLIC void nk_euclideans_symmetric_e3m2_serial(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
441
441
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
442
442
|
nk_size_t row_start, nk_size_t row_count);
|
|
443
443
|
|
|
@@ -446,7 +446,7 @@ NK_PUBLIC void nk_angulars_packed_i8_serial(nk_i8_t const *a, void const *b_pack
|
|
|
446
446
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
447
447
|
nk_size_t r_stride_in_bytes);
|
|
448
448
|
/** @copydoc nk_angulars_symmetric_i8 */
|
|
449
|
-
NK_PUBLIC void nk_angulars_symmetric_i8_serial(nk_i8_t const *vectors, nk_size_t
|
|
449
|
+
NK_PUBLIC void nk_angulars_symmetric_i8_serial(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
450
450
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
451
451
|
nk_size_t row_start, nk_size_t row_count);
|
|
452
452
|
/** @copydoc nk_euclideans_packed_i8 */
|
|
@@ -454,7 +454,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_serial(nk_i8_t const *a, void const *b_pa
|
|
|
454
454
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
455
455
|
nk_size_t r_stride_in_bytes);
|
|
456
456
|
/** @copydoc nk_euclideans_symmetric_i8 */
|
|
457
|
-
NK_PUBLIC void nk_euclideans_symmetric_i8_serial(nk_i8_t const *vectors, nk_size_t
|
|
457
|
+
NK_PUBLIC void nk_euclideans_symmetric_i8_serial(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
458
458
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
459
459
|
nk_size_t row_start, nk_size_t row_count);
|
|
460
460
|
|
|
@@ -463,7 +463,7 @@ NK_PUBLIC void nk_angulars_packed_u8_serial(nk_u8_t const *a, void const *b_pack
|
|
|
463
463
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
464
464
|
nk_size_t r_stride_in_bytes);
|
|
465
465
|
/** @copydoc nk_angulars_symmetric_u8 */
|
|
466
|
-
NK_PUBLIC void nk_angulars_symmetric_u8_serial(nk_u8_t const *vectors, nk_size_t
|
|
466
|
+
NK_PUBLIC void nk_angulars_symmetric_u8_serial(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
467
467
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
468
468
|
nk_size_t row_start, nk_size_t row_count);
|
|
469
469
|
/** @copydoc nk_euclideans_packed_u8 */
|
|
@@ -471,7 +471,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_serial(nk_u8_t const *a, void const *b_pa
|
|
|
471
471
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
472
472
|
nk_size_t r_stride_in_bytes);
|
|
473
473
|
/** @copydoc nk_euclideans_symmetric_u8 */
|
|
474
|
-
NK_PUBLIC void nk_euclideans_symmetric_u8_serial(nk_u8_t const *vectors, nk_size_t
|
|
474
|
+
NK_PUBLIC void nk_euclideans_symmetric_u8_serial(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
475
475
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
476
476
|
nk_size_t row_start, nk_size_t row_count);
|
|
477
477
|
|
|
@@ -480,7 +480,7 @@ NK_PUBLIC void nk_angulars_packed_i4_serial(nk_i4x2_t const *a, void const *b_pa
|
|
|
480
480
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
481
481
|
nk_size_t r_stride_in_bytes);
|
|
482
482
|
/** @copydoc nk_angulars_symmetric_i4 */
|
|
483
|
-
NK_PUBLIC void nk_angulars_symmetric_i4_serial(nk_i4x2_t const *vectors, nk_size_t
|
|
483
|
+
NK_PUBLIC void nk_angulars_symmetric_i4_serial(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
484
484
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
485
485
|
nk_size_t row_start, nk_size_t row_count);
|
|
486
486
|
/** @copydoc nk_euclideans_packed_i4 */
|
|
@@ -488,7 +488,7 @@ NK_PUBLIC void nk_euclideans_packed_i4_serial(nk_i4x2_t const *a, void const *b_
|
|
|
488
488
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
489
489
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
490
490
|
/** @copydoc nk_euclideans_symmetric_i4 */
|
|
491
|
-
NK_PUBLIC void nk_euclideans_symmetric_i4_serial(nk_i4x2_t const *vectors, nk_size_t
|
|
491
|
+
NK_PUBLIC void nk_euclideans_symmetric_i4_serial(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
492
492
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
493
493
|
nk_size_t row_start, nk_size_t row_count);
|
|
494
494
|
|
|
@@ -497,7 +497,7 @@ NK_PUBLIC void nk_angulars_packed_u4_serial(nk_u4x2_t const *a, void const *b_pa
|
|
|
497
497
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
498
498
|
nk_size_t r_stride_in_bytes);
|
|
499
499
|
/** @copydoc nk_angulars_symmetric_u4 */
|
|
500
|
-
NK_PUBLIC void nk_angulars_symmetric_u4_serial(nk_u4x2_t const *vectors, nk_size_t
|
|
500
|
+
NK_PUBLIC void nk_angulars_symmetric_u4_serial(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
501
501
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
502
502
|
nk_size_t row_start, nk_size_t row_count);
|
|
503
503
|
/** @copydoc nk_euclideans_packed_u4 */
|
|
@@ -505,7 +505,7 @@ NK_PUBLIC void nk_euclideans_packed_u4_serial(nk_u4x2_t const *a, void const *b_
|
|
|
505
505
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
506
506
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
507
507
|
/** @copydoc nk_euclideans_symmetric_u4 */
|
|
508
|
-
NK_PUBLIC void nk_euclideans_symmetric_u4_serial(nk_u4x2_t const *vectors, nk_size_t
|
|
508
|
+
NK_PUBLIC void nk_euclideans_symmetric_u4_serial(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
509
509
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
510
510
|
nk_size_t row_start, nk_size_t row_count);
|
|
511
511
|
|
|
@@ -519,7 +519,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_genoa(nk_bf16_t const *a, void const *b_p
|
|
|
519
519
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
520
520
|
nk_size_t r_stride_in_bytes);
|
|
521
521
|
/** @copydoc nk_angulars_symmetric_bf16 */
|
|
522
|
-
NK_PUBLIC void nk_angulars_symmetric_bf16_genoa(nk_bf16_t const *vectors, nk_size_t
|
|
522
|
+
NK_PUBLIC void nk_angulars_symmetric_bf16_genoa(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
523
523
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
524
524
|
nk_size_t row_start, nk_size_t row_count);
|
|
525
525
|
/** @copydoc nk_euclideans_packed_bf16 */
|
|
@@ -527,7 +527,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_genoa(nk_bf16_t const *a, void const *b
|
|
|
527
527
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
528
528
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
529
529
|
/** @copydoc nk_euclideans_symmetric_bf16 */
|
|
530
|
-
NK_PUBLIC void nk_euclideans_symmetric_bf16_genoa(nk_bf16_t const *vectors, nk_size_t
|
|
530
|
+
NK_PUBLIC void nk_euclideans_symmetric_bf16_genoa(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
531
531
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
532
532
|
nk_size_t row_start, nk_size_t row_count);
|
|
533
533
|
|
|
@@ -536,7 +536,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_genoa(nk_e4m3_t const *a, void const *b_p
|
|
|
536
536
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
537
537
|
nk_size_t r_stride_in_bytes);
|
|
538
538
|
/** @copydoc nk_angulars_symmetric_e4m3 */
|
|
539
|
-
NK_PUBLIC void nk_angulars_symmetric_e4m3_genoa(nk_e4m3_t const *vectors, nk_size_t
|
|
539
|
+
NK_PUBLIC void nk_angulars_symmetric_e4m3_genoa(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
540
540
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
541
541
|
nk_size_t row_start, nk_size_t row_count);
|
|
542
542
|
/** @copydoc nk_euclideans_packed_e4m3 */
|
|
@@ -544,7 +544,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_genoa(nk_e4m3_t const *a, void const *b
|
|
|
544
544
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
545
545
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
546
546
|
/** @copydoc nk_euclideans_symmetric_e4m3 */
|
|
547
|
-
NK_PUBLIC void nk_euclideans_symmetric_e4m3_genoa(nk_e4m3_t const *vectors, nk_size_t
|
|
547
|
+
NK_PUBLIC void nk_euclideans_symmetric_e4m3_genoa(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
548
548
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
549
549
|
nk_size_t row_start, nk_size_t row_count);
|
|
550
550
|
|
|
@@ -553,7 +553,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_genoa(nk_e5m2_t const *a, void const *b_p
|
|
|
553
553
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
554
554
|
nk_size_t r_stride_in_bytes);
|
|
555
555
|
/** @copydoc nk_angulars_symmetric_e5m2 */
|
|
556
|
-
NK_PUBLIC void nk_angulars_symmetric_e5m2_genoa(nk_e5m2_t const *vectors, nk_size_t
|
|
556
|
+
NK_PUBLIC void nk_angulars_symmetric_e5m2_genoa(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
557
557
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
558
558
|
nk_size_t row_start, nk_size_t row_count);
|
|
559
559
|
/** @copydoc nk_euclideans_packed_e5m2 */
|
|
@@ -561,12 +561,48 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_genoa(nk_e5m2_t const *a, void const *b
|
|
|
561
561
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
562
562
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
563
563
|
/** @copydoc nk_euclideans_symmetric_e5m2 */
|
|
564
|
-
NK_PUBLIC void nk_euclideans_symmetric_e5m2_genoa(nk_e5m2_t const *vectors, nk_size_t
|
|
564
|
+
NK_PUBLIC void nk_euclideans_symmetric_e5m2_genoa(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
565
565
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
566
566
|
nk_size_t row_start, nk_size_t row_count);
|
|
567
567
|
|
|
568
568
|
#endif // NK_TARGET_GENOA
|
|
569
569
|
|
|
570
|
+
#if NK_TARGET_DIAMOND
|
|
571
|
+
/** @copydoc nk_angulars_packed_e4m3 */
|
|
572
|
+
NK_PUBLIC void nk_angulars_packed_e4m3_diamond(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
|
|
573
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
574
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
575
|
+
/** @copydoc nk_angulars_symmetric_e4m3 */
|
|
576
|
+
NK_PUBLIC void nk_angulars_symmetric_e4m3_diamond(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
577
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
578
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
579
|
+
/** @copydoc nk_euclideans_packed_e4m3 */
|
|
580
|
+
NK_PUBLIC void nk_euclideans_packed_e4m3_diamond(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
|
|
581
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
582
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
583
|
+
/** @copydoc nk_euclideans_symmetric_e4m3 */
|
|
584
|
+
NK_PUBLIC void nk_euclideans_symmetric_e4m3_diamond(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
585
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
586
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
587
|
+
|
|
588
|
+
/** @copydoc nk_angulars_packed_e5m2 */
|
|
589
|
+
NK_PUBLIC void nk_angulars_packed_e5m2_diamond(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
|
|
590
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
591
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
592
|
+
/** @copydoc nk_angulars_symmetric_e5m2 */
|
|
593
|
+
NK_PUBLIC void nk_angulars_symmetric_e5m2_diamond(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
594
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
595
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
596
|
+
/** @copydoc nk_euclideans_packed_e5m2 */
|
|
597
|
+
NK_PUBLIC void nk_euclideans_packed_e5m2_diamond(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
|
|
598
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
599
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
600
|
+
/** @copydoc nk_euclideans_symmetric_e5m2 */
|
|
601
|
+
NK_PUBLIC void nk_euclideans_symmetric_e5m2_diamond(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
602
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
603
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
604
|
+
#endif // NK_TARGET_DIAMOND
|
|
605
|
+
|
|
570
606
|
/* Sapphire Rapids backends using Intel AMX (Advanced Matrix Extensions).
|
|
571
607
|
* AMX provides 8 tile registers (TMM0-TMM7), each holding up to 1KB of data.
|
|
572
608
|
* Tiles are configured as 16 rows x 64 bytes, enabling (16 x 32) BF16 or (16 x 64) INT8 tiles.
|
|
@@ -578,92 +614,102 @@ NK_PUBLIC void nk_angulars_packed_bf16_sapphireamx(nk_bf16_t const *a, void cons
|
|
|
578
614
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
579
615
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
580
616
|
/** @copydoc nk_angulars_symmetric_bf16 */
|
|
581
|
-
NK_PUBLIC void nk_angulars_symmetric_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t
|
|
582
|
-
nk_size_t stride, nk_f32_t *result,
|
|
583
|
-
nk_size_t
|
|
617
|
+
NK_PUBLIC void nk_angulars_symmetric_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t vectors_count,
|
|
618
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
619
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
620
|
+
nk_size_t row_count);
|
|
584
621
|
/** @copydoc nk_euclideans_packed_bf16 */
|
|
585
622
|
NK_PUBLIC void nk_euclideans_packed_bf16_sapphireamx(nk_bf16_t const *a, void const *b_packed, nk_f32_t *result,
|
|
586
623
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
587
624
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
588
625
|
/** @copydoc nk_euclideans_symmetric_bf16 */
|
|
589
|
-
NK_PUBLIC void nk_euclideans_symmetric_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t
|
|
590
|
-
nk_size_t stride, nk_f32_t *result,
|
|
591
|
-
nk_size_t
|
|
626
|
+
NK_PUBLIC void nk_euclideans_symmetric_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t vectors_count,
|
|
627
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
628
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
629
|
+
nk_size_t row_count);
|
|
592
630
|
|
|
593
631
|
/** @copydoc nk_angulars_packed_e4m3 */
|
|
594
632
|
NK_PUBLIC void nk_angulars_packed_e4m3_sapphireamx(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
|
|
595
633
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
596
634
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
597
635
|
/** @copydoc nk_angulars_symmetric_e4m3 */
|
|
598
|
-
NK_PUBLIC void nk_angulars_symmetric_e4m3_sapphireamx(nk_e4m3_t const *vectors, nk_size_t
|
|
599
|
-
nk_size_t stride, nk_f32_t *result,
|
|
600
|
-
nk_size_t
|
|
636
|
+
NK_PUBLIC void nk_angulars_symmetric_e4m3_sapphireamx(nk_e4m3_t const *vectors, nk_size_t vectors_count,
|
|
637
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
638
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
639
|
+
nk_size_t row_count);
|
|
601
640
|
/** @copydoc nk_euclideans_packed_e4m3 */
|
|
602
641
|
NK_PUBLIC void nk_euclideans_packed_e4m3_sapphireamx(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
|
|
603
642
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
604
643
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
605
644
|
/** @copydoc nk_euclideans_symmetric_e4m3 */
|
|
606
|
-
NK_PUBLIC void nk_euclideans_symmetric_e4m3_sapphireamx(nk_e4m3_t const *vectors, nk_size_t
|
|
607
|
-
nk_size_t stride, nk_f32_t *result,
|
|
608
|
-
nk_size_t
|
|
645
|
+
NK_PUBLIC void nk_euclideans_symmetric_e4m3_sapphireamx(nk_e4m3_t const *vectors, nk_size_t vectors_count,
|
|
646
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
647
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
648
|
+
nk_size_t row_count);
|
|
609
649
|
|
|
610
650
|
/** @copydoc nk_angulars_packed_e5m2 */
|
|
611
651
|
NK_PUBLIC void nk_angulars_packed_e5m2_sapphireamx(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
|
|
612
652
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
613
653
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
614
654
|
/** @copydoc nk_angulars_symmetric_e5m2 */
|
|
615
|
-
NK_PUBLIC void nk_angulars_symmetric_e5m2_sapphireamx(nk_e5m2_t const *vectors, nk_size_t
|
|
616
|
-
nk_size_t stride, nk_f32_t *result,
|
|
617
|
-
nk_size_t
|
|
655
|
+
NK_PUBLIC void nk_angulars_symmetric_e5m2_sapphireamx(nk_e5m2_t const *vectors, nk_size_t vectors_count,
|
|
656
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
657
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
658
|
+
nk_size_t row_count);
|
|
618
659
|
/** @copydoc nk_euclideans_packed_e5m2 */
|
|
619
660
|
NK_PUBLIC void nk_euclideans_packed_e5m2_sapphireamx(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
|
|
620
661
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
621
662
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
622
663
|
/** @copydoc nk_euclideans_symmetric_e5m2 */
|
|
623
|
-
NK_PUBLIC void nk_euclideans_symmetric_e5m2_sapphireamx(nk_e5m2_t const *vectors, nk_size_t
|
|
624
|
-
nk_size_t stride, nk_f32_t *result,
|
|
625
|
-
nk_size_t
|
|
664
|
+
NK_PUBLIC void nk_euclideans_symmetric_e5m2_sapphireamx(nk_e5m2_t const *vectors, nk_size_t vectors_count,
|
|
665
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
666
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
667
|
+
nk_size_t row_count);
|
|
626
668
|
|
|
627
669
|
/** @copydoc nk_angulars_packed_e2m3 */
|
|
628
670
|
NK_PUBLIC void nk_angulars_packed_e2m3_sapphireamx(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *result,
|
|
629
671
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
630
672
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
631
673
|
/** @copydoc nk_angulars_symmetric_e2m3 */
|
|
632
|
-
NK_PUBLIC void nk_angulars_symmetric_e2m3_sapphireamx(nk_e2m3_t const *vectors, nk_size_t
|
|
633
|
-
nk_size_t stride, nk_f32_t *result,
|
|
634
|
-
nk_size_t
|
|
674
|
+
NK_PUBLIC void nk_angulars_symmetric_e2m3_sapphireamx(nk_e2m3_t const *vectors, nk_size_t vectors_count,
|
|
675
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
676
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
677
|
+
nk_size_t row_count);
|
|
635
678
|
/** @copydoc nk_euclideans_packed_e2m3 */
|
|
636
679
|
NK_PUBLIC void nk_euclideans_packed_e2m3_sapphireamx(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *result,
|
|
637
680
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
638
681
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
639
682
|
/** @copydoc nk_euclideans_symmetric_e2m3 */
|
|
640
|
-
NK_PUBLIC void nk_euclideans_symmetric_e2m3_sapphireamx(nk_e2m3_t const *vectors, nk_size_t
|
|
641
|
-
nk_size_t stride, nk_f32_t *result,
|
|
642
|
-
nk_size_t
|
|
683
|
+
NK_PUBLIC void nk_euclideans_symmetric_e2m3_sapphireamx(nk_e2m3_t const *vectors, nk_size_t vectors_count,
|
|
684
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
685
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
686
|
+
nk_size_t row_count);
|
|
643
687
|
|
|
644
688
|
/** @copydoc nk_angulars_packed_e3m2 */
|
|
645
689
|
NK_PUBLIC void nk_angulars_packed_e3m2_sapphireamx(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *result,
|
|
646
690
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
647
691
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
648
692
|
/** @copydoc nk_angulars_symmetric_e3m2 */
|
|
649
|
-
NK_PUBLIC void nk_angulars_symmetric_e3m2_sapphireamx(nk_e3m2_t const *vectors, nk_size_t
|
|
650
|
-
nk_size_t stride, nk_f32_t *result,
|
|
651
|
-
nk_size_t
|
|
693
|
+
NK_PUBLIC void nk_angulars_symmetric_e3m2_sapphireamx(nk_e3m2_t const *vectors, nk_size_t vectors_count,
|
|
694
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
695
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
696
|
+
nk_size_t row_count);
|
|
652
697
|
/** @copydoc nk_euclideans_packed_e3m2 */
|
|
653
698
|
NK_PUBLIC void nk_euclideans_packed_e3m2_sapphireamx(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *result,
|
|
654
699
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
655
700
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
656
701
|
/** @copydoc nk_euclideans_symmetric_e3m2 */
|
|
657
|
-
NK_PUBLIC void nk_euclideans_symmetric_e3m2_sapphireamx(nk_e3m2_t const *vectors, nk_size_t
|
|
658
|
-
nk_size_t stride, nk_f32_t *result,
|
|
659
|
-
nk_size_t
|
|
702
|
+
NK_PUBLIC void nk_euclideans_symmetric_e3m2_sapphireamx(nk_e3m2_t const *vectors, nk_size_t vectors_count,
|
|
703
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
704
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
705
|
+
nk_size_t row_count);
|
|
660
706
|
|
|
661
707
|
/** @copydoc nk_angulars_packed_i8 */
|
|
662
708
|
NK_PUBLIC void nk_angulars_packed_i8_sapphireamx(nk_i8_t const *a, void const *b_packed, nk_f32_t *result,
|
|
663
709
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
664
710
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
665
711
|
/** @copydoc nk_angulars_symmetric_i8 */
|
|
666
|
-
NK_PUBLIC void nk_angulars_symmetric_i8_sapphireamx(nk_i8_t const *vectors, nk_size_t
|
|
712
|
+
NK_PUBLIC void nk_angulars_symmetric_i8_sapphireamx(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
667
713
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
668
714
|
nk_size_t row_start, nk_size_t row_count);
|
|
669
715
|
/** @copydoc nk_euclideans_packed_i8 */
|
|
@@ -671,7 +717,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_sapphireamx(nk_i8_t const *a, void const
|
|
|
671
717
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
672
718
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
673
719
|
/** @copydoc nk_euclideans_symmetric_i8 */
|
|
674
|
-
NK_PUBLIC void nk_euclideans_symmetric_i8_sapphireamx(nk_i8_t const *vectors, nk_size_t
|
|
720
|
+
NK_PUBLIC void nk_euclideans_symmetric_i8_sapphireamx(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
675
721
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
676
722
|
nk_size_t row_start, nk_size_t row_count);
|
|
677
723
|
|
|
@@ -680,7 +726,7 @@ NK_PUBLIC void nk_angulars_packed_u8_sapphireamx(nk_u8_t const *a, void const *b
|
|
|
680
726
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
681
727
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
682
728
|
/** @copydoc nk_angulars_symmetric_u8 */
|
|
683
|
-
NK_PUBLIC void nk_angulars_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk_size_t
|
|
729
|
+
NK_PUBLIC void nk_angulars_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
684
730
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
685
731
|
nk_size_t row_start, nk_size_t row_count);
|
|
686
732
|
/** @copydoc nk_euclideans_packed_u8 */
|
|
@@ -688,7 +734,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_sapphireamx(nk_u8_t const *a, void const
|
|
|
688
734
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
689
735
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
690
736
|
/** @copydoc nk_euclideans_symmetric_u8 */
|
|
691
|
-
NK_PUBLIC void nk_euclideans_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk_size_t
|
|
737
|
+
NK_PUBLIC void nk_euclideans_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
692
738
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
693
739
|
nk_size_t row_start, nk_size_t row_count);
|
|
694
740
|
#endif // NK_TARGET_SAPPHIREAMX
|
|
@@ -703,7 +749,7 @@ NK_PUBLIC void nk_angulars_packed_f16_sme(nk_f16_t const *a, void const *b_packe
|
|
|
703
749
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
704
750
|
nk_size_t r_stride_in_bytes);
|
|
705
751
|
/** @copydoc nk_angulars_symmetric_f16 */
|
|
706
|
-
NK_PUBLIC void nk_angulars_symmetric_f16_sme(nk_f16_t const *vectors, nk_size_t
|
|
752
|
+
NK_PUBLIC void nk_angulars_symmetric_f16_sme(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
707
753
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
708
754
|
nk_size_t row_start, nk_size_t row_count);
|
|
709
755
|
/** @copydoc nk_euclideans_packed_f16 */
|
|
@@ -711,7 +757,7 @@ NK_PUBLIC void nk_euclideans_packed_f16_sme(nk_f16_t const *a, void const *b_pac
|
|
|
711
757
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
712
758
|
nk_size_t r_stride_in_bytes);
|
|
713
759
|
/** @copydoc nk_euclideans_symmetric_f16 */
|
|
714
|
-
NK_PUBLIC void nk_euclideans_symmetric_f16_sme(nk_f16_t const *vectors, nk_size_t
|
|
760
|
+
NK_PUBLIC void nk_euclideans_symmetric_f16_sme(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
715
761
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
716
762
|
nk_size_t row_start, nk_size_t row_count);
|
|
717
763
|
|
|
@@ -720,7 +766,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_sme(nk_bf16_t const *a, void const *b_pac
|
|
|
720
766
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
721
767
|
nk_size_t r_stride_in_bytes);
|
|
722
768
|
/** @copydoc nk_angulars_symmetric_bf16 */
|
|
723
|
-
NK_PUBLIC void nk_angulars_symmetric_bf16_sme(nk_bf16_t const *vectors, nk_size_t
|
|
769
|
+
NK_PUBLIC void nk_angulars_symmetric_bf16_sme(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
724
770
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
725
771
|
nk_size_t row_start, nk_size_t row_count);
|
|
726
772
|
/** @copydoc nk_euclideans_packed_bf16 */
|
|
@@ -728,7 +774,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_sme(nk_bf16_t const *a, void const *b_p
|
|
|
728
774
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
729
775
|
nk_size_t r_stride_in_bytes);
|
|
730
776
|
/** @copydoc nk_euclideans_symmetric_bf16 */
|
|
731
|
-
NK_PUBLIC void nk_euclideans_symmetric_bf16_sme(nk_bf16_t const *vectors, nk_size_t
|
|
777
|
+
NK_PUBLIC void nk_euclideans_symmetric_bf16_sme(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
732
778
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
733
779
|
nk_size_t row_start, nk_size_t row_count);
|
|
734
780
|
|
|
@@ -737,7 +783,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_sme(nk_e4m3_t const *a, void const *b_pac
|
|
|
737
783
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
738
784
|
nk_size_t r_stride_in_bytes);
|
|
739
785
|
/** @copydoc nk_angulars_symmetric_e4m3 */
|
|
740
|
-
NK_PUBLIC void nk_angulars_symmetric_e4m3_sme(nk_e4m3_t const *vectors, nk_size_t
|
|
786
|
+
NK_PUBLIC void nk_angulars_symmetric_e4m3_sme(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
741
787
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
742
788
|
nk_size_t row_start, nk_size_t row_count);
|
|
743
789
|
/** @copydoc nk_euclideans_packed_e4m3 */
|
|
@@ -745,7 +791,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_sme(nk_e4m3_t const *a, void const *b_p
|
|
|
745
791
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
746
792
|
nk_size_t r_stride_in_bytes);
|
|
747
793
|
/** @copydoc nk_euclideans_symmetric_e4m3 */
|
|
748
|
-
NK_PUBLIC void nk_euclideans_symmetric_e4m3_sme(nk_e4m3_t const *vectors, nk_size_t
|
|
794
|
+
NK_PUBLIC void nk_euclideans_symmetric_e4m3_sme(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
749
795
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
750
796
|
nk_size_t row_start, nk_size_t row_count);
|
|
751
797
|
|
|
@@ -754,7 +800,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_sme(nk_e5m2_t const *a, void const *b_pac
|
|
|
754
800
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
755
801
|
nk_size_t r_stride_in_bytes);
|
|
756
802
|
/** @copydoc nk_angulars_symmetric_e5m2 */
|
|
757
|
-
NK_PUBLIC void nk_angulars_symmetric_e5m2_sme(nk_e5m2_t const *vectors, nk_size_t
|
|
803
|
+
NK_PUBLIC void nk_angulars_symmetric_e5m2_sme(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
758
804
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
759
805
|
nk_size_t row_start, nk_size_t row_count);
|
|
760
806
|
/** @copydoc nk_euclideans_packed_e5m2 */
|
|
@@ -762,7 +808,7 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_sme(nk_e5m2_t const *a, void const *b_p
|
|
|
762
808
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
763
809
|
nk_size_t r_stride_in_bytes);
|
|
764
810
|
/** @copydoc nk_euclideans_symmetric_e5m2 */
|
|
765
|
-
NK_PUBLIC void nk_euclideans_symmetric_e5m2_sme(nk_e5m2_t const *vectors, nk_size_t
|
|
811
|
+
NK_PUBLIC void nk_euclideans_symmetric_e5m2_sme(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
766
812
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
767
813
|
nk_size_t row_start, nk_size_t row_count);
|
|
768
814
|
|
|
@@ -771,7 +817,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_sme(nk_e2m3_t const *a, void const *b_pac
|
|
|
771
817
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
772
818
|
nk_size_t r_stride_in_bytes);
|
|
773
819
|
/** @copydoc nk_angulars_symmetric_e2m3 */
|
|
774
|
-
NK_PUBLIC void nk_angulars_symmetric_e2m3_sme(nk_e2m3_t const *vectors, nk_size_t
|
|
820
|
+
NK_PUBLIC void nk_angulars_symmetric_e2m3_sme(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
775
821
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
776
822
|
nk_size_t row_start, nk_size_t row_count);
|
|
777
823
|
/** @copydoc nk_euclideans_packed_e2m3 */
|
|
@@ -779,7 +825,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_sme(nk_e2m3_t const *a, void const *b_p
|
|
|
779
825
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
780
826
|
nk_size_t r_stride_in_bytes);
|
|
781
827
|
/** @copydoc nk_euclideans_symmetric_e2m3 */
|
|
782
|
-
NK_PUBLIC void nk_euclideans_symmetric_e2m3_sme(nk_e2m3_t const *vectors, nk_size_t
|
|
828
|
+
NK_PUBLIC void nk_euclideans_symmetric_e2m3_sme(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
783
829
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
784
830
|
nk_size_t row_start, nk_size_t row_count);
|
|
785
831
|
|
|
@@ -788,7 +834,7 @@ NK_PUBLIC void nk_angulars_packed_e3m2_sme(nk_e3m2_t const *a, void const *b_pac
|
|
|
788
834
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
789
835
|
nk_size_t r_stride_in_bytes);
|
|
790
836
|
/** @copydoc nk_angulars_symmetric_e3m2 */
|
|
791
|
-
NK_PUBLIC void nk_angulars_symmetric_e3m2_sme(nk_e3m2_t const *vectors, nk_size_t
|
|
837
|
+
NK_PUBLIC void nk_angulars_symmetric_e3m2_sme(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
792
838
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
793
839
|
nk_size_t row_start, nk_size_t row_count);
|
|
794
840
|
/** @copydoc nk_euclideans_packed_e3m2 */
|
|
@@ -796,7 +842,7 @@ NK_PUBLIC void nk_euclideans_packed_e3m2_sme(nk_e3m2_t const *a, void const *b_p
|
|
|
796
842
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
797
843
|
nk_size_t r_stride_in_bytes);
|
|
798
844
|
/** @copydoc nk_euclideans_symmetric_e3m2 */
|
|
799
|
-
NK_PUBLIC void nk_euclideans_symmetric_e3m2_sme(nk_e3m2_t const *vectors, nk_size_t
|
|
845
|
+
NK_PUBLIC void nk_euclideans_symmetric_e3m2_sme(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
800
846
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
801
847
|
nk_size_t row_start, nk_size_t row_count);
|
|
802
848
|
|
|
@@ -805,7 +851,7 @@ NK_PUBLIC void nk_angulars_packed_i8_sme(nk_i8_t const *a, void const *b_packed,
|
|
|
805
851
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
806
852
|
nk_size_t r_stride_in_bytes);
|
|
807
853
|
/** @copydoc nk_angulars_symmetric_i8 */
|
|
808
|
-
NK_PUBLIC void nk_angulars_symmetric_i8_sme(nk_i8_t const *vectors, nk_size_t
|
|
854
|
+
NK_PUBLIC void nk_angulars_symmetric_i8_sme(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
809
855
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
810
856
|
nk_size_t row_start, nk_size_t row_count);
|
|
811
857
|
/** @copydoc nk_euclideans_packed_i8 */
|
|
@@ -813,7 +859,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_sme(nk_i8_t const *a, void const *b_packe
|
|
|
813
859
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
814
860
|
nk_size_t r_stride_in_bytes);
|
|
815
861
|
/** @copydoc nk_euclideans_symmetric_i8 */
|
|
816
|
-
NK_PUBLIC void nk_euclideans_symmetric_i8_sme(nk_i8_t const *vectors, nk_size_t
|
|
862
|
+
NK_PUBLIC void nk_euclideans_symmetric_i8_sme(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
817
863
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
818
864
|
nk_size_t row_start, nk_size_t row_count);
|
|
819
865
|
|
|
@@ -822,7 +868,7 @@ NK_PUBLIC void nk_angulars_packed_u8_sme(nk_u8_t const *a, void const *b_packed,
|
|
|
822
868
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
823
869
|
nk_size_t r_stride_in_bytes);
|
|
824
870
|
/** @copydoc nk_angulars_symmetric_u8 */
|
|
825
|
-
NK_PUBLIC void nk_angulars_symmetric_u8_sme(nk_u8_t const *vectors, nk_size_t
|
|
871
|
+
NK_PUBLIC void nk_angulars_symmetric_u8_sme(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
826
872
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
827
873
|
nk_size_t row_start, nk_size_t row_count);
|
|
828
874
|
/** @copydoc nk_euclideans_packed_u8 */
|
|
@@ -830,7 +876,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_sme(nk_u8_t const *a, void const *b_packe
|
|
|
830
876
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
831
877
|
nk_size_t r_stride_in_bytes);
|
|
832
878
|
/** @copydoc nk_euclideans_symmetric_u8 */
|
|
833
|
-
NK_PUBLIC void nk_euclideans_symmetric_u8_sme(nk_u8_t const *vectors, nk_size_t
|
|
879
|
+
NK_PUBLIC void nk_euclideans_symmetric_u8_sme(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
834
880
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
835
881
|
nk_size_t row_start, nk_size_t row_count);
|
|
836
882
|
|
|
@@ -839,7 +885,7 @@ NK_PUBLIC void nk_angulars_packed_i4_sme(nk_i4x2_t const *a, void const *b_packe
|
|
|
839
885
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
840
886
|
nk_size_t r_stride_in_bytes);
|
|
841
887
|
/** @copydoc nk_angulars_symmetric_i4 */
|
|
842
|
-
NK_PUBLIC void nk_angulars_symmetric_i4_sme(nk_i4x2_t const *vectors, nk_size_t
|
|
888
|
+
NK_PUBLIC void nk_angulars_symmetric_i4_sme(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
843
889
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
844
890
|
nk_size_t row_start, nk_size_t row_count);
|
|
845
891
|
/** @copydoc nk_euclideans_packed_i4 */
|
|
@@ -847,7 +893,7 @@ NK_PUBLIC void nk_euclideans_packed_i4_sme(nk_i4x2_t const *a, void const *b_pac
|
|
|
847
893
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
848
894
|
nk_size_t r_stride_in_bytes);
|
|
849
895
|
/** @copydoc nk_euclideans_symmetric_i4 */
|
|
850
|
-
NK_PUBLIC void nk_euclideans_symmetric_i4_sme(nk_i4x2_t const *vectors, nk_size_t
|
|
896
|
+
NK_PUBLIC void nk_euclideans_symmetric_i4_sme(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
851
897
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
852
898
|
nk_size_t row_start, nk_size_t row_count);
|
|
853
899
|
|
|
@@ -856,7 +902,7 @@ NK_PUBLIC void nk_angulars_packed_u4_sme(nk_u4x2_t const *a, void const *b_packe
|
|
|
856
902
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
857
903
|
nk_size_t r_stride_in_bytes);
|
|
858
904
|
/** @copydoc nk_angulars_symmetric_u4 */
|
|
859
|
-
NK_PUBLIC void nk_angulars_symmetric_u4_sme(nk_u4x2_t const *vectors, nk_size_t
|
|
905
|
+
NK_PUBLIC void nk_angulars_symmetric_u4_sme(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
860
906
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
861
907
|
nk_size_t row_start, nk_size_t row_count);
|
|
862
908
|
/** @copydoc nk_euclideans_packed_u4 */
|
|
@@ -864,7 +910,7 @@ NK_PUBLIC void nk_euclideans_packed_u4_sme(nk_u4x2_t const *a, void const *b_pac
|
|
|
864
910
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
865
911
|
nk_size_t r_stride_in_bytes);
|
|
866
912
|
/** @copydoc nk_euclideans_symmetric_u4 */
|
|
867
|
-
NK_PUBLIC void nk_euclideans_symmetric_u4_sme(nk_u4x2_t const *vectors, nk_size_t
|
|
913
|
+
NK_PUBLIC void nk_euclideans_symmetric_u4_sme(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
868
914
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
869
915
|
nk_size_t row_start, nk_size_t row_count);
|
|
870
916
|
#endif // NK_TARGET_SME
|
|
@@ -878,7 +924,7 @@ NK_PUBLIC void nk_angulars_packed_f32_smef64(nk_f32_t const *a, void const *b_pa
|
|
|
878
924
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
879
925
|
nk_size_t r_stride_in_bytes);
|
|
880
926
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
881
|
-
NK_PUBLIC void nk_angulars_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t
|
|
927
|
+
NK_PUBLIC void nk_angulars_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
882
928
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
883
929
|
nk_size_t row_start, nk_size_t row_count);
|
|
884
930
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -886,7 +932,7 @@ NK_PUBLIC void nk_euclideans_packed_f32_smef64(nk_f32_t const *a, void const *b_
|
|
|
886
932
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
887
933
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
888
934
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
889
|
-
NK_PUBLIC void nk_euclideans_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t
|
|
935
|
+
NK_PUBLIC void nk_euclideans_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
890
936
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
891
937
|
nk_size_t row_start, nk_size_t row_count);
|
|
892
938
|
|
|
@@ -895,7 +941,7 @@ NK_PUBLIC void nk_angulars_packed_f64_smef64(nk_f64_t const *a, void const *b_pa
|
|
|
895
941
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
896
942
|
nk_size_t r_stride_in_bytes);
|
|
897
943
|
/** @copydoc nk_angulars_symmetric_f64 */
|
|
898
|
-
NK_PUBLIC void nk_angulars_symmetric_f64_smef64(nk_f64_t const *vectors, nk_size_t
|
|
944
|
+
NK_PUBLIC void nk_angulars_symmetric_f64_smef64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
899
945
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
900
946
|
nk_size_t row_start, nk_size_t row_count);
|
|
901
947
|
/** @copydoc nk_euclideans_packed_f64 */
|
|
@@ -903,7 +949,7 @@ NK_PUBLIC void nk_euclideans_packed_f64_smef64(nk_f64_t const *a, void const *b_
|
|
|
903
949
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
904
950
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
905
951
|
/** @copydoc nk_euclideans_symmetric_f64 */
|
|
906
|
-
NK_PUBLIC void nk_euclideans_symmetric_f64_smef64(nk_f64_t const *vectors, nk_size_t
|
|
952
|
+
NK_PUBLIC void nk_euclideans_symmetric_f64_smef64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
907
953
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
908
954
|
nk_size_t row_start, nk_size_t row_count);
|
|
909
955
|
#endif // NK_TARGET_SMEF64
|
|
@@ -917,7 +963,7 @@ NK_PUBLIC void nk_angulars_packed_f32_haswell(nk_f32_t const *a, void const *b_p
|
|
|
917
963
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
918
964
|
nk_size_t r_stride_in_bytes);
|
|
919
965
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
920
|
-
NK_PUBLIC void nk_angulars_symmetric_f32_haswell(nk_f32_t const *vectors, nk_size_t
|
|
966
|
+
NK_PUBLIC void nk_angulars_symmetric_f32_haswell(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
921
967
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
922
968
|
nk_size_t row_start, nk_size_t row_count);
|
|
923
969
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -925,7 +971,7 @@ NK_PUBLIC void nk_euclideans_packed_f32_haswell(nk_f32_t const *a, void const *b
|
|
|
925
971
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
926
972
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
927
973
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
928
|
-
NK_PUBLIC void nk_euclideans_symmetric_f32_haswell(nk_f32_t const *vectors, nk_size_t
|
|
974
|
+
NK_PUBLIC void nk_euclideans_symmetric_f32_haswell(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
929
975
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
930
976
|
nk_size_t row_start, nk_size_t row_count);
|
|
931
977
|
|
|
@@ -934,7 +980,7 @@ NK_PUBLIC void nk_angulars_packed_f64_haswell(nk_f64_t const *a, void const *b_p
|
|
|
934
980
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
935
981
|
nk_size_t r_stride_in_bytes);
|
|
936
982
|
/** @copydoc nk_angulars_symmetric_f64 */
|
|
937
|
-
NK_PUBLIC void nk_angulars_symmetric_f64_haswell(nk_f64_t const *vectors, nk_size_t
|
|
983
|
+
NK_PUBLIC void nk_angulars_symmetric_f64_haswell(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
938
984
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
939
985
|
nk_size_t row_start, nk_size_t row_count);
|
|
940
986
|
/** @copydoc nk_euclideans_packed_f64 */
|
|
@@ -942,7 +988,7 @@ NK_PUBLIC void nk_euclideans_packed_f64_haswell(nk_f64_t const *a, void const *b
|
|
|
942
988
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
943
989
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
944
990
|
/** @copydoc nk_euclideans_symmetric_f64 */
|
|
945
|
-
NK_PUBLIC void nk_euclideans_symmetric_f64_haswell(nk_f64_t const *vectors, nk_size_t
|
|
991
|
+
NK_PUBLIC void nk_euclideans_symmetric_f64_haswell(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
946
992
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
947
993
|
nk_size_t row_start, nk_size_t row_count);
|
|
948
994
|
|
|
@@ -951,7 +997,7 @@ NK_PUBLIC void nk_angulars_packed_f16_haswell(nk_f16_t const *a, void const *b_p
|
|
|
951
997
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
952
998
|
nk_size_t r_stride_in_bytes);
|
|
953
999
|
/** @copydoc nk_angulars_symmetric_f16 */
|
|
954
|
-
NK_PUBLIC void nk_angulars_symmetric_f16_haswell(nk_f16_t const *vectors, nk_size_t
|
|
1000
|
+
NK_PUBLIC void nk_angulars_symmetric_f16_haswell(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
955
1001
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
956
1002
|
nk_size_t row_start, nk_size_t row_count);
|
|
957
1003
|
/** @copydoc nk_euclideans_packed_f16 */
|
|
@@ -959,7 +1005,7 @@ NK_PUBLIC void nk_euclideans_packed_f16_haswell(nk_f16_t const *a, void const *b
|
|
|
959
1005
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
960
1006
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
961
1007
|
/** @copydoc nk_euclideans_symmetric_f16 */
|
|
962
|
-
NK_PUBLIC void nk_euclideans_symmetric_f16_haswell(nk_f16_t const *vectors, nk_size_t
|
|
1008
|
+
NK_PUBLIC void nk_euclideans_symmetric_f16_haswell(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
963
1009
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
964
1010
|
nk_size_t row_start, nk_size_t row_count);
|
|
965
1011
|
|
|
@@ -968,7 +1014,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_haswell(nk_bf16_t const *a, void const *b
|
|
|
968
1014
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
969
1015
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
970
1016
|
/** @copydoc nk_angulars_symmetric_bf16 */
|
|
971
|
-
NK_PUBLIC void nk_angulars_symmetric_bf16_haswell(nk_bf16_t const *vectors, nk_size_t
|
|
1017
|
+
NK_PUBLIC void nk_angulars_symmetric_bf16_haswell(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
972
1018
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
973
1019
|
nk_size_t row_start, nk_size_t row_count);
|
|
974
1020
|
/** @copydoc nk_euclideans_packed_bf16 */
|
|
@@ -976,7 +1022,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_haswell(nk_bf16_t const *a, void const
|
|
|
976
1022
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
977
1023
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
978
1024
|
/** @copydoc nk_euclideans_symmetric_bf16 */
|
|
979
|
-
NK_PUBLIC void nk_euclideans_symmetric_bf16_haswell(nk_bf16_t const *vectors, nk_size_t
|
|
1025
|
+
NK_PUBLIC void nk_euclideans_symmetric_bf16_haswell(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
980
1026
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
981
1027
|
nk_size_t row_start, nk_size_t row_count);
|
|
982
1028
|
|
|
@@ -985,7 +1031,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_haswell(nk_e4m3_t const *a, void const *b
|
|
|
985
1031
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
986
1032
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
987
1033
|
/** @copydoc nk_angulars_symmetric_e4m3 */
|
|
988
|
-
NK_PUBLIC void nk_angulars_symmetric_e4m3_haswell(nk_e4m3_t const *vectors, nk_size_t
|
|
1034
|
+
NK_PUBLIC void nk_angulars_symmetric_e4m3_haswell(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
989
1035
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
990
1036
|
nk_size_t row_start, nk_size_t row_count);
|
|
991
1037
|
/** @copydoc nk_euclideans_packed_e4m3 */
|
|
@@ -993,7 +1039,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_haswell(nk_e4m3_t const *a, void const
|
|
|
993
1039
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
994
1040
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
995
1041
|
/** @copydoc nk_euclideans_symmetric_e4m3 */
|
|
996
|
-
NK_PUBLIC void nk_euclideans_symmetric_e4m3_haswell(nk_e4m3_t const *vectors, nk_size_t
|
|
1042
|
+
NK_PUBLIC void nk_euclideans_symmetric_e4m3_haswell(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
997
1043
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
998
1044
|
nk_size_t row_start, nk_size_t row_count);
|
|
999
1045
|
|
|
@@ -1002,7 +1048,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_haswell(nk_e5m2_t const *a, void const *b
|
|
|
1002
1048
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1003
1049
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1004
1050
|
/** @copydoc nk_angulars_symmetric_e5m2 */
|
|
1005
|
-
NK_PUBLIC void nk_angulars_symmetric_e5m2_haswell(nk_e5m2_t const *vectors, nk_size_t
|
|
1051
|
+
NK_PUBLIC void nk_angulars_symmetric_e5m2_haswell(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1006
1052
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1007
1053
|
nk_size_t row_start, nk_size_t row_count);
|
|
1008
1054
|
/** @copydoc nk_euclideans_packed_e5m2 */
|
|
@@ -1010,7 +1056,7 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_haswell(nk_e5m2_t const *a, void const
|
|
|
1010
1056
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1011
1057
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1012
1058
|
/** @copydoc nk_euclideans_symmetric_e5m2 */
|
|
1013
|
-
NK_PUBLIC void nk_euclideans_symmetric_e5m2_haswell(nk_e5m2_t const *vectors, nk_size_t
|
|
1059
|
+
NK_PUBLIC void nk_euclideans_symmetric_e5m2_haswell(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1014
1060
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1015
1061
|
nk_size_t row_start, nk_size_t row_count);
|
|
1016
1062
|
|
|
@@ -1019,7 +1065,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_haswell(nk_e2m3_t const *a, void const *b
|
|
|
1019
1065
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1020
1066
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1021
1067
|
/** @copydoc nk_angulars_symmetric_e2m3 */
|
|
1022
|
-
NK_PUBLIC void nk_angulars_symmetric_e2m3_haswell(nk_e2m3_t const *vectors, nk_size_t
|
|
1068
|
+
NK_PUBLIC void nk_angulars_symmetric_e2m3_haswell(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1023
1069
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1024
1070
|
nk_size_t row_start, nk_size_t row_count);
|
|
1025
1071
|
/** @copydoc nk_euclideans_packed_e2m3 */
|
|
@@ -1027,7 +1073,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_haswell(nk_e2m3_t const *a, void const
|
|
|
1027
1073
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1028
1074
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1029
1075
|
/** @copydoc nk_euclideans_symmetric_e2m3 */
|
|
1030
|
-
NK_PUBLIC void nk_euclideans_symmetric_e2m3_haswell(nk_e2m3_t const *vectors, nk_size_t
|
|
1076
|
+
NK_PUBLIC void nk_euclideans_symmetric_e2m3_haswell(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1031
1077
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1032
1078
|
nk_size_t row_start, nk_size_t row_count);
|
|
1033
1079
|
|
|
@@ -1036,7 +1082,7 @@ NK_PUBLIC void nk_angulars_packed_e3m2_haswell(nk_e3m2_t const *a, void const *b
|
|
|
1036
1082
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1037
1083
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1038
1084
|
/** @copydoc nk_angulars_symmetric_e3m2 */
|
|
1039
|
-
NK_PUBLIC void nk_angulars_symmetric_e3m2_haswell(nk_e3m2_t const *vectors, nk_size_t
|
|
1085
|
+
NK_PUBLIC void nk_angulars_symmetric_e3m2_haswell(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1040
1086
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1041
1087
|
nk_size_t row_start, nk_size_t row_count);
|
|
1042
1088
|
/** @copydoc nk_euclideans_packed_e3m2 */
|
|
@@ -1044,7 +1090,7 @@ NK_PUBLIC void nk_euclideans_packed_e3m2_haswell(nk_e3m2_t const *a, void const
|
|
|
1044
1090
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1045
1091
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1046
1092
|
/** @copydoc nk_euclideans_symmetric_e3m2 */
|
|
1047
|
-
NK_PUBLIC void nk_euclideans_symmetric_e3m2_haswell(nk_e3m2_t const *vectors, nk_size_t
|
|
1093
|
+
NK_PUBLIC void nk_euclideans_symmetric_e3m2_haswell(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1048
1094
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1049
1095
|
nk_size_t row_start, nk_size_t row_count);
|
|
1050
1096
|
/** @copydoc nk_angulars_packed_i8 */
|
|
@@ -1052,7 +1098,7 @@ NK_PUBLIC void nk_angulars_packed_i8_haswell(nk_i8_t const *a, void const *b_pac
|
|
|
1052
1098
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1053
1099
|
nk_size_t r_stride_in_bytes);
|
|
1054
1100
|
/** @copydoc nk_angulars_symmetric_i8 */
|
|
1055
|
-
NK_PUBLIC void nk_angulars_symmetric_i8_haswell(nk_i8_t const *vectors, nk_size_t
|
|
1101
|
+
NK_PUBLIC void nk_angulars_symmetric_i8_haswell(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1056
1102
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1057
1103
|
nk_size_t row_start, nk_size_t row_count);
|
|
1058
1104
|
/** @copydoc nk_euclideans_packed_i8 */
|
|
@@ -1060,7 +1106,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_haswell(nk_i8_t const *a, void const *b_p
|
|
|
1060
1106
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1061
1107
|
nk_size_t r_stride_in_bytes);
|
|
1062
1108
|
/** @copydoc nk_euclideans_symmetric_i8 */
|
|
1063
|
-
NK_PUBLIC void nk_euclideans_symmetric_i8_haswell(nk_i8_t const *vectors, nk_size_t
|
|
1109
|
+
NK_PUBLIC void nk_euclideans_symmetric_i8_haswell(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1064
1110
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1065
1111
|
nk_size_t row_start, nk_size_t row_count);
|
|
1066
1112
|
/** @copydoc nk_angulars_packed_u8 */
|
|
@@ -1068,7 +1114,7 @@ NK_PUBLIC void nk_angulars_packed_u8_haswell(nk_u8_t const *a, void const *b_pac
|
|
|
1068
1114
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1069
1115
|
nk_size_t r_stride_in_bytes);
|
|
1070
1116
|
/** @copydoc nk_angulars_symmetric_u8 */
|
|
1071
|
-
NK_PUBLIC void nk_angulars_symmetric_u8_haswell(nk_u8_t const *vectors, nk_size_t
|
|
1117
|
+
NK_PUBLIC void nk_angulars_symmetric_u8_haswell(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1072
1118
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1073
1119
|
nk_size_t row_start, nk_size_t row_count);
|
|
1074
1120
|
/** @copydoc nk_euclideans_packed_u8 */
|
|
@@ -1076,7 +1122,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_haswell(nk_u8_t const *a, void const *b_p
|
|
|
1076
1122
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1077
1123
|
nk_size_t r_stride_in_bytes);
|
|
1078
1124
|
/** @copydoc nk_euclideans_symmetric_u8 */
|
|
1079
|
-
NK_PUBLIC void nk_euclideans_symmetric_u8_haswell(nk_u8_t const *vectors, nk_size_t
|
|
1125
|
+
NK_PUBLIC void nk_euclideans_symmetric_u8_haswell(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1080
1126
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1081
1127
|
nk_size_t row_start, nk_size_t row_count);
|
|
1082
1128
|
#endif // NK_TARGET_HASWELL
|
|
@@ -1090,7 +1136,7 @@ NK_PUBLIC void nk_angulars_packed_f32_skylake(nk_f32_t const *a, void const *b_p
|
|
|
1090
1136
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1091
1137
|
nk_size_t r_stride_in_bytes);
|
|
1092
1138
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
1093
|
-
NK_PUBLIC void nk_angulars_symmetric_f32_skylake(nk_f32_t const *vectors, nk_size_t
|
|
1139
|
+
NK_PUBLIC void nk_angulars_symmetric_f32_skylake(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1094
1140
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1095
1141
|
nk_size_t row_start, nk_size_t row_count);
|
|
1096
1142
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -1098,7 +1144,7 @@ NK_PUBLIC void nk_euclideans_packed_f32_skylake(nk_f32_t const *a, void const *b
|
|
|
1098
1144
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1099
1145
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1100
1146
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
1101
|
-
NK_PUBLIC void nk_euclideans_symmetric_f32_skylake(nk_f32_t const *vectors, nk_size_t
|
|
1147
|
+
NK_PUBLIC void nk_euclideans_symmetric_f32_skylake(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1102
1148
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1103
1149
|
nk_size_t row_start, nk_size_t row_count);
|
|
1104
1150
|
|
|
@@ -1107,7 +1153,7 @@ NK_PUBLIC void nk_angulars_packed_f64_skylake(nk_f64_t const *a, void const *b_p
|
|
|
1107
1153
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1108
1154
|
nk_size_t r_stride_in_bytes);
|
|
1109
1155
|
/** @copydoc nk_angulars_symmetric_f64 */
|
|
1110
|
-
NK_PUBLIC void nk_angulars_symmetric_f64_skylake(nk_f64_t const *vectors, nk_size_t
|
|
1156
|
+
NK_PUBLIC void nk_angulars_symmetric_f64_skylake(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1111
1157
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1112
1158
|
nk_size_t row_start, nk_size_t row_count);
|
|
1113
1159
|
/** @copydoc nk_euclideans_packed_f64 */
|
|
@@ -1115,7 +1161,7 @@ NK_PUBLIC void nk_euclideans_packed_f64_skylake(nk_f64_t const *a, void const *b
|
|
|
1115
1161
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1116
1162
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1117
1163
|
/** @copydoc nk_euclideans_symmetric_f64 */
|
|
1118
|
-
NK_PUBLIC void nk_euclideans_symmetric_f64_skylake(nk_f64_t const *vectors, nk_size_t
|
|
1164
|
+
NK_PUBLIC void nk_euclideans_symmetric_f64_skylake(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1119
1165
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1120
1166
|
nk_size_t row_start, nk_size_t row_count);
|
|
1121
1167
|
|
|
@@ -1124,7 +1170,7 @@ NK_PUBLIC void nk_angulars_packed_f16_skylake(nk_f16_t const *a, void const *b_p
|
|
|
1124
1170
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1125
1171
|
nk_size_t r_stride_in_bytes);
|
|
1126
1172
|
/** @copydoc nk_angulars_symmetric_f16 */
|
|
1127
|
-
NK_PUBLIC void nk_angulars_symmetric_f16_skylake(nk_f16_t const *vectors, nk_size_t
|
|
1173
|
+
NK_PUBLIC void nk_angulars_symmetric_f16_skylake(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1128
1174
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1129
1175
|
nk_size_t row_start, nk_size_t row_count);
|
|
1130
1176
|
/** @copydoc nk_euclideans_packed_f16 */
|
|
@@ -1132,7 +1178,7 @@ NK_PUBLIC void nk_euclideans_packed_f16_skylake(nk_f16_t const *a, void const *b
|
|
|
1132
1178
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1133
1179
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1134
1180
|
/** @copydoc nk_euclideans_symmetric_f16 */
|
|
1135
|
-
NK_PUBLIC void nk_euclideans_symmetric_f16_skylake(nk_f16_t const *vectors, nk_size_t
|
|
1181
|
+
NK_PUBLIC void nk_euclideans_symmetric_f16_skylake(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1136
1182
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1137
1183
|
nk_size_t row_start, nk_size_t row_count);
|
|
1138
1184
|
|
|
@@ -1141,7 +1187,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_skylake(nk_bf16_t const *a, void const *b
|
|
|
1141
1187
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1142
1188
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1143
1189
|
/** @copydoc nk_angulars_symmetric_bf16 */
|
|
1144
|
-
NK_PUBLIC void nk_angulars_symmetric_bf16_skylake(nk_bf16_t const *vectors, nk_size_t
|
|
1190
|
+
NK_PUBLIC void nk_angulars_symmetric_bf16_skylake(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1145
1191
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1146
1192
|
nk_size_t row_start, nk_size_t row_count);
|
|
1147
1193
|
/** @copydoc nk_euclideans_packed_bf16 */
|
|
@@ -1149,7 +1195,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_skylake(nk_bf16_t const *a, void const
|
|
|
1149
1195
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1150
1196
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1151
1197
|
/** @copydoc nk_euclideans_symmetric_bf16 */
|
|
1152
|
-
NK_PUBLIC void nk_euclideans_symmetric_bf16_skylake(nk_bf16_t const *vectors, nk_size_t
|
|
1198
|
+
NK_PUBLIC void nk_euclideans_symmetric_bf16_skylake(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1153
1199
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1154
1200
|
nk_size_t row_start, nk_size_t row_count);
|
|
1155
1201
|
|
|
@@ -1158,7 +1204,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_skylake(nk_e4m3_t const *a, void const *b
|
|
|
1158
1204
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1159
1205
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1160
1206
|
/** @copydoc nk_angulars_symmetric_e4m3 */
|
|
1161
|
-
NK_PUBLIC void nk_angulars_symmetric_e4m3_skylake(nk_e4m3_t const *vectors, nk_size_t
|
|
1207
|
+
NK_PUBLIC void nk_angulars_symmetric_e4m3_skylake(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1162
1208
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1163
1209
|
nk_size_t row_start, nk_size_t row_count);
|
|
1164
1210
|
/** @copydoc nk_euclideans_packed_e4m3 */
|
|
@@ -1166,7 +1212,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_skylake(nk_e4m3_t const *a, void const
|
|
|
1166
1212
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1167
1213
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1168
1214
|
/** @copydoc nk_euclideans_symmetric_e4m3 */
|
|
1169
|
-
NK_PUBLIC void nk_euclideans_symmetric_e4m3_skylake(nk_e4m3_t const *vectors, nk_size_t
|
|
1215
|
+
NK_PUBLIC void nk_euclideans_symmetric_e4m3_skylake(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1170
1216
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1171
1217
|
nk_size_t row_start, nk_size_t row_count);
|
|
1172
1218
|
|
|
@@ -1175,7 +1221,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_skylake(nk_e5m2_t const *a, void const *b
|
|
|
1175
1221
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1176
1222
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1177
1223
|
/** @copydoc nk_angulars_symmetric_e5m2 */
|
|
1178
|
-
NK_PUBLIC void nk_angulars_symmetric_e5m2_skylake(nk_e5m2_t const *vectors, nk_size_t
|
|
1224
|
+
NK_PUBLIC void nk_angulars_symmetric_e5m2_skylake(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1179
1225
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1180
1226
|
nk_size_t row_start, nk_size_t row_count);
|
|
1181
1227
|
/** @copydoc nk_euclideans_packed_e5m2 */
|
|
@@ -1183,7 +1229,7 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_skylake(nk_e5m2_t const *a, void const
|
|
|
1183
1229
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1184
1230
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1185
1231
|
/** @copydoc nk_euclideans_symmetric_e5m2 */
|
|
1186
|
-
NK_PUBLIC void nk_euclideans_symmetric_e5m2_skylake(nk_e5m2_t const *vectors, nk_size_t
|
|
1232
|
+
NK_PUBLIC void nk_euclideans_symmetric_e5m2_skylake(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1187
1233
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1188
1234
|
nk_size_t row_start, nk_size_t row_count);
|
|
1189
1235
|
|
|
@@ -1192,7 +1238,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_skylake(nk_e2m3_t const *a, void const *b
|
|
|
1192
1238
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1193
1239
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1194
1240
|
/** @copydoc nk_angulars_symmetric_e2m3 */
|
|
1195
|
-
NK_PUBLIC void nk_angulars_symmetric_e2m3_skylake(nk_e2m3_t const *vectors, nk_size_t
|
|
1241
|
+
NK_PUBLIC void nk_angulars_symmetric_e2m3_skylake(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1196
1242
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1197
1243
|
nk_size_t row_start, nk_size_t row_count);
|
|
1198
1244
|
/** @copydoc nk_euclideans_packed_e2m3 */
|
|
@@ -1200,7 +1246,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_skylake(nk_e2m3_t const *a, void const
|
|
|
1200
1246
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1201
1247
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1202
1248
|
/** @copydoc nk_euclideans_symmetric_e2m3 */
|
|
1203
|
-
NK_PUBLIC void nk_euclideans_symmetric_e2m3_skylake(nk_e2m3_t const *vectors, nk_size_t
|
|
1249
|
+
NK_PUBLIC void nk_euclideans_symmetric_e2m3_skylake(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1204
1250
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1205
1251
|
nk_size_t row_start, nk_size_t row_count);
|
|
1206
1252
|
|
|
@@ -1209,7 +1255,7 @@ NK_PUBLIC void nk_angulars_packed_e3m2_skylake(nk_e3m2_t const *a, void const *b
|
|
|
1209
1255
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1210
1256
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1211
1257
|
/** @copydoc nk_angulars_symmetric_e3m2 */
|
|
1212
|
-
NK_PUBLIC void nk_angulars_symmetric_e3m2_skylake(nk_e3m2_t const *vectors, nk_size_t
|
|
1258
|
+
NK_PUBLIC void nk_angulars_symmetric_e3m2_skylake(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1213
1259
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1214
1260
|
nk_size_t row_start, nk_size_t row_count);
|
|
1215
1261
|
/** @copydoc nk_euclideans_packed_e3m2 */
|
|
@@ -1217,7 +1263,7 @@ NK_PUBLIC void nk_euclideans_packed_e3m2_skylake(nk_e3m2_t const *a, void const
|
|
|
1217
1263
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1218
1264
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1219
1265
|
/** @copydoc nk_euclideans_symmetric_e3m2 */
|
|
1220
|
-
NK_PUBLIC void nk_euclideans_symmetric_e3m2_skylake(nk_e3m2_t const *vectors, nk_size_t
|
|
1266
|
+
NK_PUBLIC void nk_euclideans_symmetric_e3m2_skylake(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1221
1267
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1222
1268
|
nk_size_t row_start, nk_size_t row_count);
|
|
1223
1269
|
#endif // NK_TARGET_SKYLAKE
|
|
@@ -1231,7 +1277,7 @@ NK_PUBLIC void nk_angulars_packed_i8_icelake(nk_i8_t const *a, void const *b_pac
|
|
|
1231
1277
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1232
1278
|
nk_size_t r_stride_in_bytes);
|
|
1233
1279
|
/** @copydoc nk_angulars_symmetric_i8 */
|
|
1234
|
-
NK_PUBLIC void nk_angulars_symmetric_i8_icelake(nk_i8_t const *vectors, nk_size_t
|
|
1280
|
+
NK_PUBLIC void nk_angulars_symmetric_i8_icelake(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1235
1281
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1236
1282
|
nk_size_t row_start, nk_size_t row_count);
|
|
1237
1283
|
/** @copydoc nk_euclideans_packed_i8 */
|
|
@@ -1239,7 +1285,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_icelake(nk_i8_t const *a, void const *b_p
|
|
|
1239
1285
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1240
1286
|
nk_size_t r_stride_in_bytes);
|
|
1241
1287
|
/** @copydoc nk_euclideans_symmetric_i8 */
|
|
1242
|
-
NK_PUBLIC void nk_euclideans_symmetric_i8_icelake(nk_i8_t const *vectors, nk_size_t
|
|
1288
|
+
NK_PUBLIC void nk_euclideans_symmetric_i8_icelake(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1243
1289
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1244
1290
|
nk_size_t row_start, nk_size_t row_count);
|
|
1245
1291
|
|
|
@@ -1248,7 +1294,7 @@ NK_PUBLIC void nk_angulars_packed_u8_icelake(nk_u8_t const *a, void const *b_pac
|
|
|
1248
1294
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1249
1295
|
nk_size_t r_stride_in_bytes);
|
|
1250
1296
|
/** @copydoc nk_angulars_symmetric_u8 */
|
|
1251
|
-
NK_PUBLIC void nk_angulars_symmetric_u8_icelake(nk_u8_t const *vectors, nk_size_t
|
|
1297
|
+
NK_PUBLIC void nk_angulars_symmetric_u8_icelake(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1252
1298
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1253
1299
|
nk_size_t row_start, nk_size_t row_count);
|
|
1254
1300
|
/** @copydoc nk_euclideans_packed_u8 */
|
|
@@ -1256,7 +1302,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_icelake(nk_u8_t const *a, void const *b_p
|
|
|
1256
1302
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1257
1303
|
nk_size_t r_stride_in_bytes);
|
|
1258
1304
|
/** @copydoc nk_euclideans_symmetric_u8 */
|
|
1259
|
-
NK_PUBLIC void nk_euclideans_symmetric_u8_icelake(nk_u8_t const *vectors, nk_size_t
|
|
1305
|
+
NK_PUBLIC void nk_euclideans_symmetric_u8_icelake(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1260
1306
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1261
1307
|
nk_size_t row_start, nk_size_t row_count);
|
|
1262
1308
|
|
|
@@ -1265,7 +1311,7 @@ NK_PUBLIC void nk_angulars_packed_i4_icelake(nk_i4x2_t const *a, void const *b_p
|
|
|
1265
1311
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1266
1312
|
nk_size_t r_stride_in_bytes);
|
|
1267
1313
|
/** @copydoc nk_angulars_symmetric_i4 */
|
|
1268
|
-
NK_PUBLIC void nk_angulars_symmetric_i4_icelake(nk_i4x2_t const *vectors, nk_size_t
|
|
1314
|
+
NK_PUBLIC void nk_angulars_symmetric_i4_icelake(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1269
1315
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1270
1316
|
nk_size_t row_start, nk_size_t row_count);
|
|
1271
1317
|
/** @copydoc nk_euclideans_packed_i4 */
|
|
@@ -1273,7 +1319,7 @@ NK_PUBLIC void nk_euclideans_packed_i4_icelake(nk_i4x2_t const *a, void const *b
|
|
|
1273
1319
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1274
1320
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1275
1321
|
/** @copydoc nk_euclideans_symmetric_i4 */
|
|
1276
|
-
NK_PUBLIC void nk_euclideans_symmetric_i4_icelake(nk_i4x2_t const *vectors, nk_size_t
|
|
1322
|
+
NK_PUBLIC void nk_euclideans_symmetric_i4_icelake(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1277
1323
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1278
1324
|
nk_size_t row_start, nk_size_t row_count);
|
|
1279
1325
|
|
|
@@ -1282,7 +1328,7 @@ NK_PUBLIC void nk_angulars_packed_u4_icelake(nk_u4x2_t const *a, void const *b_p
|
|
|
1282
1328
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1283
1329
|
nk_size_t r_stride_in_bytes);
|
|
1284
1330
|
/** @copydoc nk_angulars_symmetric_u4 */
|
|
1285
|
-
NK_PUBLIC void nk_angulars_symmetric_u4_icelake(nk_u4x2_t const *vectors, nk_size_t
|
|
1331
|
+
NK_PUBLIC void nk_angulars_symmetric_u4_icelake(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1286
1332
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1287
1333
|
nk_size_t row_start, nk_size_t row_count);
|
|
1288
1334
|
/** @copydoc nk_euclideans_packed_u4 */
|
|
@@ -1290,7 +1336,7 @@ NK_PUBLIC void nk_euclideans_packed_u4_icelake(nk_u4x2_t const *a, void const *b
|
|
|
1290
1336
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1291
1337
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1292
1338
|
/** @copydoc nk_euclideans_symmetric_u4 */
|
|
1293
|
-
NK_PUBLIC void nk_euclideans_symmetric_u4_icelake(nk_u4x2_t const *vectors, nk_size_t
|
|
1339
|
+
NK_PUBLIC void nk_euclideans_symmetric_u4_icelake(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1294
1340
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1295
1341
|
nk_size_t row_start, nk_size_t row_count);
|
|
1296
1342
|
#endif // NK_TARGET_ICELAKE
|
|
@@ -1301,7 +1347,7 @@ NK_PUBLIC void nk_angulars_packed_i8_alder(nk_i8_t const *a, void const *b_packe
|
|
|
1301
1347
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1302
1348
|
nk_size_t r_stride_in_bytes);
|
|
1303
1349
|
/** @copydoc nk_angulars_symmetric_i8 */
|
|
1304
|
-
NK_PUBLIC void nk_angulars_symmetric_i8_alder(nk_i8_t const *vectors, nk_size_t
|
|
1350
|
+
NK_PUBLIC void nk_angulars_symmetric_i8_alder(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1305
1351
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1306
1352
|
nk_size_t row_start, nk_size_t row_count);
|
|
1307
1353
|
/** @copydoc nk_euclideans_packed_i8 */
|
|
@@ -1309,7 +1355,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_alder(nk_i8_t const *a, void const *b_pac
|
|
|
1309
1355
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1310
1356
|
nk_size_t r_stride_in_bytes);
|
|
1311
1357
|
/** @copydoc nk_euclideans_symmetric_i8 */
|
|
1312
|
-
NK_PUBLIC void nk_euclideans_symmetric_i8_alder(nk_i8_t const *vectors, nk_size_t
|
|
1358
|
+
NK_PUBLIC void nk_euclideans_symmetric_i8_alder(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1313
1359
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1314
1360
|
nk_size_t row_start, nk_size_t row_count);
|
|
1315
1361
|
/** @copydoc nk_angulars_packed_u8 */
|
|
@@ -1317,7 +1363,7 @@ NK_PUBLIC void nk_angulars_packed_u8_alder(nk_u8_t const *a, void const *b_packe
|
|
|
1317
1363
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1318
1364
|
nk_size_t r_stride_in_bytes);
|
|
1319
1365
|
/** @copydoc nk_angulars_symmetric_u8 */
|
|
1320
|
-
NK_PUBLIC void nk_angulars_symmetric_u8_alder(nk_u8_t const *vectors, nk_size_t
|
|
1366
|
+
NK_PUBLIC void nk_angulars_symmetric_u8_alder(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1321
1367
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1322
1368
|
nk_size_t row_start, nk_size_t row_count);
|
|
1323
1369
|
/** @copydoc nk_euclideans_packed_u8 */
|
|
@@ -1325,7 +1371,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_alder(nk_u8_t const *a, void const *b_pac
|
|
|
1325
1371
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1326
1372
|
nk_size_t r_stride_in_bytes);
|
|
1327
1373
|
/** @copydoc nk_euclideans_symmetric_u8 */
|
|
1328
|
-
NK_PUBLIC void nk_euclideans_symmetric_u8_alder(nk_u8_t const *vectors, nk_size_t
|
|
1374
|
+
NK_PUBLIC void nk_euclideans_symmetric_u8_alder(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1329
1375
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1330
1376
|
nk_size_t row_start, nk_size_t row_count);
|
|
1331
1377
|
/** @copydoc nk_angulars_packed_e2m3 */
|
|
@@ -1333,7 +1379,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_alder(nk_e2m3_t const *a, void const *b_p
|
|
|
1333
1379
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1334
1380
|
nk_size_t r_stride_in_bytes);
|
|
1335
1381
|
/** @copydoc nk_angulars_symmetric_e2m3 */
|
|
1336
|
-
NK_PUBLIC void nk_angulars_symmetric_e2m3_alder(nk_e2m3_t const *vectors, nk_size_t
|
|
1382
|
+
NK_PUBLIC void nk_angulars_symmetric_e2m3_alder(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1337
1383
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1338
1384
|
nk_size_t row_start, nk_size_t row_count);
|
|
1339
1385
|
/** @copydoc nk_euclideans_packed_e2m3 */
|
|
@@ -1341,7 +1387,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_alder(nk_e2m3_t const *a, void const *b
|
|
|
1341
1387
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1342
1388
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1343
1389
|
/** @copydoc nk_euclideans_symmetric_e2m3 */
|
|
1344
|
-
NK_PUBLIC void nk_euclideans_symmetric_e2m3_alder(nk_e2m3_t const *vectors, nk_size_t
|
|
1390
|
+
NK_PUBLIC void nk_euclideans_symmetric_e2m3_alder(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1345
1391
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1346
1392
|
nk_size_t row_start, nk_size_t row_count);
|
|
1347
1393
|
#endif // NK_TARGET_ALDER
|
|
@@ -1355,7 +1401,7 @@ NK_PUBLIC void nk_angulars_packed_i8_sierra(nk_i8_t const *a, void const *b_pack
|
|
|
1355
1401
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1356
1402
|
nk_size_t r_stride_in_bytes);
|
|
1357
1403
|
/** @copydoc nk_angulars_symmetric_i8 */
|
|
1358
|
-
NK_PUBLIC void nk_angulars_symmetric_i8_sierra(nk_i8_t const *vectors, nk_size_t
|
|
1404
|
+
NK_PUBLIC void nk_angulars_symmetric_i8_sierra(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1359
1405
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1360
1406
|
nk_size_t row_start, nk_size_t row_count);
|
|
1361
1407
|
/** @copydoc nk_euclideans_packed_i8 */
|
|
@@ -1363,7 +1409,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_sierra(nk_i8_t const *a, void const *b_pa
|
|
|
1363
1409
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1364
1410
|
nk_size_t r_stride_in_bytes);
|
|
1365
1411
|
/** @copydoc nk_euclideans_symmetric_i8 */
|
|
1366
|
-
NK_PUBLIC void nk_euclideans_symmetric_i8_sierra(nk_i8_t const *vectors, nk_size_t
|
|
1412
|
+
NK_PUBLIC void nk_euclideans_symmetric_i8_sierra(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1367
1413
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1368
1414
|
nk_size_t row_start, nk_size_t row_count);
|
|
1369
1415
|
/** @copydoc nk_angulars_packed_u8 */
|
|
@@ -1371,7 +1417,7 @@ NK_PUBLIC void nk_angulars_packed_u8_sierra(nk_u8_t const *a, void const *b_pack
|
|
|
1371
1417
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1372
1418
|
nk_size_t r_stride_in_bytes);
|
|
1373
1419
|
/** @copydoc nk_angulars_symmetric_u8 */
|
|
1374
|
-
NK_PUBLIC void nk_angulars_symmetric_u8_sierra(nk_u8_t const *vectors, nk_size_t
|
|
1420
|
+
NK_PUBLIC void nk_angulars_symmetric_u8_sierra(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1375
1421
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1376
1422
|
nk_size_t row_start, nk_size_t row_count);
|
|
1377
1423
|
/** @copydoc nk_euclideans_packed_u8 */
|
|
@@ -1379,7 +1425,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_sierra(nk_u8_t const *a, void const *b_pa
|
|
|
1379
1425
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1380
1426
|
nk_size_t r_stride_in_bytes);
|
|
1381
1427
|
/** @copydoc nk_euclideans_symmetric_u8 */
|
|
1382
|
-
NK_PUBLIC void nk_euclideans_symmetric_u8_sierra(nk_u8_t const *vectors, nk_size_t
|
|
1428
|
+
NK_PUBLIC void nk_euclideans_symmetric_u8_sierra(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1383
1429
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1384
1430
|
nk_size_t row_start, nk_size_t row_count);
|
|
1385
1431
|
/** @copydoc nk_angulars_packed_e2m3 */
|
|
@@ -1387,7 +1433,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_sierra(nk_e2m3_t const *a, void const *b_
|
|
|
1387
1433
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1388
1434
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1389
1435
|
/** @copydoc nk_angulars_symmetric_e2m3 */
|
|
1390
|
-
NK_PUBLIC void nk_angulars_symmetric_e2m3_sierra(nk_e2m3_t const *vectors, nk_size_t
|
|
1436
|
+
NK_PUBLIC void nk_angulars_symmetric_e2m3_sierra(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1391
1437
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1392
1438
|
nk_size_t row_start, nk_size_t row_count);
|
|
1393
1439
|
/** @copydoc nk_euclideans_packed_e2m3 */
|
|
@@ -1395,7 +1441,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_sierra(nk_e2m3_t const *a, void const *
|
|
|
1395
1441
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1396
1442
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1397
1443
|
/** @copydoc nk_euclideans_symmetric_e2m3 */
|
|
1398
|
-
NK_PUBLIC void nk_euclideans_symmetric_e2m3_sierra(nk_e2m3_t const *vectors, nk_size_t
|
|
1444
|
+
NK_PUBLIC void nk_euclideans_symmetric_e2m3_sierra(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1399
1445
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1400
1446
|
nk_size_t row_start, nk_size_t row_count);
|
|
1401
1447
|
#endif // NK_TARGET_SIERRA
|
|
@@ -1409,7 +1455,7 @@ NK_PUBLIC void nk_angulars_packed_i8_v128relaxed(nk_i8_t const *a, void const *b
|
|
|
1409
1455
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1410
1456
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1411
1457
|
/** @copydoc nk_angulars_symmetric_i8 */
|
|
1412
|
-
NK_PUBLIC void nk_angulars_symmetric_i8_v128relaxed(nk_i8_t const *vectors, nk_size_t
|
|
1458
|
+
NK_PUBLIC void nk_angulars_symmetric_i8_v128relaxed(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1413
1459
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1414
1460
|
nk_size_t row_start, nk_size_t row_count);
|
|
1415
1461
|
/** @copydoc nk_euclideans_packed_i8 */
|
|
@@ -1417,7 +1463,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_v128relaxed(nk_i8_t const *a, void const
|
|
|
1417
1463
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1418
1464
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1419
1465
|
/** @copydoc nk_euclideans_symmetric_i8 */
|
|
1420
|
-
NK_PUBLIC void nk_euclideans_symmetric_i8_v128relaxed(nk_i8_t const *vectors, nk_size_t
|
|
1466
|
+
NK_PUBLIC void nk_euclideans_symmetric_i8_v128relaxed(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1421
1467
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1422
1468
|
nk_size_t row_start, nk_size_t row_count);
|
|
1423
1469
|
/** @copydoc nk_angulars_packed_u8 */
|
|
@@ -1425,7 +1471,7 @@ NK_PUBLIC void nk_angulars_packed_u8_v128relaxed(nk_u8_t const *a, void const *b
|
|
|
1425
1471
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1426
1472
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1427
1473
|
/** @copydoc nk_angulars_symmetric_u8 */
|
|
1428
|
-
NK_PUBLIC void nk_angulars_symmetric_u8_v128relaxed(nk_u8_t const *vectors, nk_size_t
|
|
1474
|
+
NK_PUBLIC void nk_angulars_symmetric_u8_v128relaxed(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1429
1475
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1430
1476
|
nk_size_t row_start, nk_size_t row_count);
|
|
1431
1477
|
/** @copydoc nk_euclideans_packed_u8 */
|
|
@@ -1433,7 +1479,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_v128relaxed(nk_u8_t const *a, void const
|
|
|
1433
1479
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1434
1480
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1435
1481
|
/** @copydoc nk_euclideans_symmetric_u8 */
|
|
1436
|
-
NK_PUBLIC void nk_euclideans_symmetric_u8_v128relaxed(nk_u8_t const *vectors, nk_size_t
|
|
1482
|
+
NK_PUBLIC void nk_euclideans_symmetric_u8_v128relaxed(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1437
1483
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1438
1484
|
nk_size_t row_start, nk_size_t row_count);
|
|
1439
1485
|
/** @copydoc nk_angulars_packed_e2m3 */
|
|
@@ -1441,71 +1487,79 @@ NK_PUBLIC void nk_angulars_packed_e2m3_v128relaxed(nk_e2m3_t const *a, void cons
|
|
|
1441
1487
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1442
1488
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1443
1489
|
/** @copydoc nk_angulars_symmetric_e2m3 */
|
|
1444
|
-
NK_PUBLIC void nk_angulars_symmetric_e2m3_v128relaxed(nk_e2m3_t const *vectors, nk_size_t
|
|
1445
|
-
nk_size_t stride, nk_f32_t *result,
|
|
1446
|
-
nk_size_t
|
|
1490
|
+
NK_PUBLIC void nk_angulars_symmetric_e2m3_v128relaxed(nk_e2m3_t const *vectors, nk_size_t vectors_count,
|
|
1491
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
1492
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
1493
|
+
nk_size_t row_count);
|
|
1447
1494
|
/** @copydoc nk_euclideans_packed_e2m3 */
|
|
1448
1495
|
NK_PUBLIC void nk_euclideans_packed_e2m3_v128relaxed(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1449
1496
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1450
1497
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1451
1498
|
/** @copydoc nk_euclideans_symmetric_e2m3 */
|
|
1452
|
-
NK_PUBLIC void nk_euclideans_symmetric_e2m3_v128relaxed(nk_e2m3_t const *vectors, nk_size_t
|
|
1453
|
-
nk_size_t stride, nk_f32_t *result,
|
|
1454
|
-
nk_size_t
|
|
1499
|
+
NK_PUBLIC void nk_euclideans_symmetric_e2m3_v128relaxed(nk_e2m3_t const *vectors, nk_size_t vectors_count,
|
|
1500
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
1501
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
1502
|
+
nk_size_t row_count);
|
|
1455
1503
|
/** @copydoc nk_angulars_packed_e4m3 */
|
|
1456
1504
|
NK_PUBLIC void nk_angulars_packed_e4m3_v128relaxed(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1457
1505
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1458
1506
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1459
1507
|
/** @copydoc nk_angulars_symmetric_e4m3 */
|
|
1460
|
-
NK_PUBLIC void nk_angulars_symmetric_e4m3_v128relaxed(nk_e4m3_t const *vectors, nk_size_t
|
|
1461
|
-
nk_size_t stride, nk_f32_t *result,
|
|
1462
|
-
nk_size_t
|
|
1508
|
+
NK_PUBLIC void nk_angulars_symmetric_e4m3_v128relaxed(nk_e4m3_t const *vectors, nk_size_t vectors_count,
|
|
1509
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
1510
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
1511
|
+
nk_size_t row_count);
|
|
1463
1512
|
/** @copydoc nk_euclideans_packed_e4m3 */
|
|
1464
1513
|
NK_PUBLIC void nk_euclideans_packed_e4m3_v128relaxed(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1465
1514
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1466
1515
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1467
1516
|
/** @copydoc nk_euclideans_symmetric_e4m3 */
|
|
1468
|
-
NK_PUBLIC void nk_euclideans_symmetric_e4m3_v128relaxed(nk_e4m3_t const *vectors, nk_size_t
|
|
1469
|
-
nk_size_t stride, nk_f32_t *result,
|
|
1470
|
-
nk_size_t
|
|
1517
|
+
NK_PUBLIC void nk_euclideans_symmetric_e4m3_v128relaxed(nk_e4m3_t const *vectors, nk_size_t vectors_count,
|
|
1518
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
1519
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
1520
|
+
nk_size_t row_count);
|
|
1471
1521
|
/** @copydoc nk_angulars_packed_e5m2 */
|
|
1472
1522
|
NK_PUBLIC void nk_angulars_packed_e5m2_v128relaxed(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1473
1523
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1474
1524
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1475
1525
|
/** @copydoc nk_angulars_symmetric_e5m2 */
|
|
1476
|
-
NK_PUBLIC void nk_angulars_symmetric_e5m2_v128relaxed(nk_e5m2_t const *vectors, nk_size_t
|
|
1477
|
-
nk_size_t stride, nk_f32_t *result,
|
|
1478
|
-
nk_size_t
|
|
1526
|
+
NK_PUBLIC void nk_angulars_symmetric_e5m2_v128relaxed(nk_e5m2_t const *vectors, nk_size_t vectors_count,
|
|
1527
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
1528
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
1529
|
+
nk_size_t row_count);
|
|
1479
1530
|
/** @copydoc nk_euclideans_packed_e5m2 */
|
|
1480
1531
|
NK_PUBLIC void nk_euclideans_packed_e5m2_v128relaxed(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1481
1532
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1482
1533
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1483
1534
|
/** @copydoc nk_euclideans_symmetric_e5m2 */
|
|
1484
|
-
NK_PUBLIC void nk_euclideans_symmetric_e5m2_v128relaxed(nk_e5m2_t const *vectors, nk_size_t
|
|
1485
|
-
nk_size_t stride, nk_f32_t *result,
|
|
1486
|
-
nk_size_t
|
|
1535
|
+
NK_PUBLIC void nk_euclideans_symmetric_e5m2_v128relaxed(nk_e5m2_t const *vectors, nk_size_t vectors_count,
|
|
1536
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
1537
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
1538
|
+
nk_size_t row_count);
|
|
1487
1539
|
/** @copydoc nk_angulars_packed_bf16 */
|
|
1488
1540
|
NK_PUBLIC void nk_angulars_packed_bf16_v128relaxed(nk_bf16_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1489
1541
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1490
1542
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1491
1543
|
/** @copydoc nk_angulars_symmetric_bf16 */
|
|
1492
|
-
NK_PUBLIC void nk_angulars_symmetric_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t
|
|
1493
|
-
nk_size_t stride, nk_f32_t *result,
|
|
1494
|
-
nk_size_t
|
|
1544
|
+
NK_PUBLIC void nk_angulars_symmetric_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t vectors_count,
|
|
1545
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
1546
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
1547
|
+
nk_size_t row_count);
|
|
1495
1548
|
/** @copydoc nk_euclideans_packed_bf16 */
|
|
1496
1549
|
NK_PUBLIC void nk_euclideans_packed_bf16_v128relaxed(nk_bf16_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1497
1550
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1498
1551
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1499
1552
|
/** @copydoc nk_euclideans_symmetric_bf16 */
|
|
1500
|
-
NK_PUBLIC void nk_euclideans_symmetric_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t
|
|
1501
|
-
nk_size_t stride, nk_f32_t *result,
|
|
1502
|
-
nk_size_t
|
|
1553
|
+
NK_PUBLIC void nk_euclideans_symmetric_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t vectors_count,
|
|
1554
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
1555
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
1556
|
+
nk_size_t row_count);
|
|
1503
1557
|
/** @copydoc nk_angulars_packed_f32 */
|
|
1504
1558
|
NK_PUBLIC void nk_angulars_packed_f32_v128relaxed(nk_f32_t const *a, void const *b_packed, nk_f64_t *result,
|
|
1505
1559
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1506
1560
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1507
1561
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
1508
|
-
NK_PUBLIC void nk_angulars_symmetric_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t
|
|
1562
|
+
NK_PUBLIC void nk_angulars_symmetric_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1509
1563
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1510
1564
|
nk_size_t row_start, nk_size_t row_count);
|
|
1511
1565
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -1513,15 +1567,16 @@ NK_PUBLIC void nk_euclideans_packed_f32_v128relaxed(nk_f32_t const *a, void cons
|
|
|
1513
1567
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1514
1568
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1515
1569
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
1516
|
-
NK_PUBLIC void nk_euclideans_symmetric_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t
|
|
1517
|
-
nk_size_t stride, nk_f64_t *result,
|
|
1518
|
-
nk_size_t
|
|
1570
|
+
NK_PUBLIC void nk_euclideans_symmetric_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t vectors_count,
|
|
1571
|
+
nk_size_t depth, nk_size_t stride, nk_f64_t *result,
|
|
1572
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
1573
|
+
nk_size_t row_count);
|
|
1519
1574
|
/** @copydoc nk_angulars_packed_f64 */
|
|
1520
1575
|
NK_PUBLIC void nk_angulars_packed_f64_v128relaxed(nk_f64_t const *a, void const *b_packed, nk_f64_t *result,
|
|
1521
1576
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1522
1577
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1523
1578
|
/** @copydoc nk_angulars_symmetric_f64 */
|
|
1524
|
-
NK_PUBLIC void nk_angulars_symmetric_f64_v128relaxed(nk_f64_t const *vectors, nk_size_t
|
|
1579
|
+
NK_PUBLIC void nk_angulars_symmetric_f64_v128relaxed(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1525
1580
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1526
1581
|
nk_size_t row_start, nk_size_t row_count);
|
|
1527
1582
|
/** @copydoc nk_euclideans_packed_f64 */
|
|
@@ -1529,9 +1584,10 @@ NK_PUBLIC void nk_euclideans_packed_f64_v128relaxed(nk_f64_t const *a, void cons
|
|
|
1529
1584
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1530
1585
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1531
1586
|
/** @copydoc nk_euclideans_symmetric_f64 */
|
|
1532
|
-
NK_PUBLIC void nk_euclideans_symmetric_f64_v128relaxed(nk_f64_t const *vectors, nk_size_t
|
|
1533
|
-
nk_size_t stride, nk_f64_t *result,
|
|
1534
|
-
nk_size_t
|
|
1587
|
+
NK_PUBLIC void nk_euclideans_symmetric_f64_v128relaxed(nk_f64_t const *vectors, nk_size_t vectors_count,
|
|
1588
|
+
nk_size_t depth, nk_size_t stride, nk_f64_t *result,
|
|
1589
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
1590
|
+
nk_size_t row_count);
|
|
1535
1591
|
#endif // NK_TARGET_V128RELAXED
|
|
1536
1592
|
|
|
1537
1593
|
/* ARM NEON backends (base NEON with F32/F64 support).
|
|
@@ -1543,7 +1599,7 @@ NK_PUBLIC void nk_angulars_packed_f32_neon(nk_f32_t const *a, void const *b_pack
|
|
|
1543
1599
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1544
1600
|
nk_size_t r_stride_in_bytes);
|
|
1545
1601
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
1546
|
-
NK_PUBLIC void nk_angulars_symmetric_f32_neon(nk_f32_t const *vectors, nk_size_t
|
|
1602
|
+
NK_PUBLIC void nk_angulars_symmetric_f32_neon(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1547
1603
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1548
1604
|
nk_size_t row_start, nk_size_t row_count);
|
|
1549
1605
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -1551,7 +1607,7 @@ NK_PUBLIC void nk_euclideans_packed_f32_neon(nk_f32_t const *a, void const *b_pa
|
|
|
1551
1607
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1552
1608
|
nk_size_t r_stride_in_bytes);
|
|
1553
1609
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
1554
|
-
NK_PUBLIC void nk_euclideans_symmetric_f32_neon(nk_f32_t const *vectors, nk_size_t
|
|
1610
|
+
NK_PUBLIC void nk_euclideans_symmetric_f32_neon(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1555
1611
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1556
1612
|
nk_size_t row_start, nk_size_t row_count);
|
|
1557
1613
|
|
|
@@ -1560,7 +1616,7 @@ NK_PUBLIC void nk_angulars_packed_f64_neon(nk_f64_t const *a, void const *b_pack
|
|
|
1560
1616
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1561
1617
|
nk_size_t r_stride_in_bytes);
|
|
1562
1618
|
/** @copydoc nk_angulars_symmetric_f64 */
|
|
1563
|
-
NK_PUBLIC void nk_angulars_symmetric_f64_neon(nk_f64_t const *vectors, nk_size_t
|
|
1619
|
+
NK_PUBLIC void nk_angulars_symmetric_f64_neon(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1564
1620
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1565
1621
|
nk_size_t row_start, nk_size_t row_count);
|
|
1566
1622
|
/** @copydoc nk_euclideans_packed_f64 */
|
|
@@ -1568,7 +1624,7 @@ NK_PUBLIC void nk_euclideans_packed_f64_neon(nk_f64_t const *a, void const *b_pa
|
|
|
1568
1624
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1569
1625
|
nk_size_t r_stride_in_bytes);
|
|
1570
1626
|
/** @copydoc nk_euclideans_symmetric_f64 */
|
|
1571
|
-
NK_PUBLIC void nk_euclideans_symmetric_f64_neon(nk_f64_t const *vectors, nk_size_t
|
|
1627
|
+
NK_PUBLIC void nk_euclideans_symmetric_f64_neon(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1572
1628
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1573
1629
|
nk_size_t row_start, nk_size_t row_count);
|
|
1574
1630
|
/** @copydoc nk_angulars_packed_bf16 */
|
|
@@ -1576,7 +1632,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_neon(nk_bf16_t const *a, void const *b_pa
|
|
|
1576
1632
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1577
1633
|
nk_size_t r_stride_in_bytes);
|
|
1578
1634
|
/** @copydoc nk_angulars_symmetric_bf16 */
|
|
1579
|
-
NK_PUBLIC void nk_angulars_symmetric_bf16_neon(nk_bf16_t const *vectors, nk_size_t
|
|
1635
|
+
NK_PUBLIC void nk_angulars_symmetric_bf16_neon(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1580
1636
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1581
1637
|
nk_size_t row_start, nk_size_t row_count);
|
|
1582
1638
|
/** @copydoc nk_euclideans_packed_bf16 */
|
|
@@ -1584,7 +1640,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_neon(nk_bf16_t const *a, void const *b_
|
|
|
1584
1640
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1585
1641
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1586
1642
|
/** @copydoc nk_euclideans_symmetric_bf16 */
|
|
1587
|
-
NK_PUBLIC void nk_euclideans_symmetric_bf16_neon(nk_bf16_t const *vectors, nk_size_t
|
|
1643
|
+
NK_PUBLIC void nk_euclideans_symmetric_bf16_neon(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1588
1644
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1589
1645
|
nk_size_t row_start, nk_size_t row_count);
|
|
1590
1646
|
/** @copydoc nk_angulars_packed_f16 */
|
|
@@ -1592,7 +1648,7 @@ NK_PUBLIC void nk_angulars_packed_f16_neon(nk_f16_t const *a, void const *b_pack
|
|
|
1592
1648
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1593
1649
|
nk_size_t r_stride_in_bytes);
|
|
1594
1650
|
/** @copydoc nk_angulars_symmetric_f16 */
|
|
1595
|
-
NK_PUBLIC void nk_angulars_symmetric_f16_neon(nk_f16_t const *vectors, nk_size_t
|
|
1651
|
+
NK_PUBLIC void nk_angulars_symmetric_f16_neon(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1596
1652
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1597
1653
|
nk_size_t row_start, nk_size_t row_count);
|
|
1598
1654
|
/** @copydoc nk_euclideans_packed_f16 */
|
|
@@ -1600,33 +1656,11 @@ NK_PUBLIC void nk_euclideans_packed_f16_neon(nk_f16_t const *a, void const *b_pa
|
|
|
1600
1656
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1601
1657
|
nk_size_t r_stride_in_bytes);
|
|
1602
1658
|
/** @copydoc nk_euclideans_symmetric_f16 */
|
|
1603
|
-
NK_PUBLIC void nk_euclideans_symmetric_f16_neon(nk_f16_t const *vectors, nk_size_t
|
|
1659
|
+
NK_PUBLIC void nk_euclideans_symmetric_f16_neon(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1604
1660
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1605
1661
|
nk_size_t row_start, nk_size_t row_count);
|
|
1606
1662
|
#endif // NK_TARGET_NEON
|
|
1607
1663
|
|
|
1608
|
-
/* ARM NEON with F16 arithmetic (ARMv8.2-A FP16).
|
|
1609
|
-
* Provides native F16 FMLA for half-precision dot products.
|
|
1610
|
-
*/
|
|
1611
|
-
#if NK_TARGET_NEONHALF
|
|
1612
|
-
/** @copydoc nk_angulars_packed_f16 */
|
|
1613
|
-
NK_PUBLIC void nk_angulars_packed_f16_neonhalf(nk_f16_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1614
|
-
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1615
|
-
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1616
|
-
/** @copydoc nk_angulars_symmetric_f16 */
|
|
1617
|
-
NK_PUBLIC void nk_angulars_symmetric_f16_neonhalf(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
|
|
1618
|
-
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1619
|
-
nk_size_t row_start, nk_size_t row_count);
|
|
1620
|
-
/** @copydoc nk_euclideans_packed_f16 */
|
|
1621
|
-
NK_PUBLIC void nk_euclideans_packed_f16_neonhalf(nk_f16_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1622
|
-
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1623
|
-
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1624
|
-
/** @copydoc nk_euclideans_symmetric_f16 */
|
|
1625
|
-
NK_PUBLIC void nk_euclideans_symmetric_f16_neonhalf(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
|
|
1626
|
-
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1627
|
-
nk_size_t row_start, nk_size_t row_count);
|
|
1628
|
-
#endif // NK_TARGET_NEONHALF
|
|
1629
|
-
|
|
1630
1664
|
/* ARM NEON with BF16 dot product (ARMv8.6-A BF16).
|
|
1631
1665
|
* Uses BFDOT/BFMMLA for efficient BF16 matrix operations.
|
|
1632
1666
|
*/
|
|
@@ -1636,7 +1670,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_neonbfdot(nk_bf16_t const *a, void const
|
|
|
1636
1670
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1637
1671
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1638
1672
|
/** @copydoc nk_angulars_symmetric_bf16 */
|
|
1639
|
-
NK_PUBLIC void nk_angulars_symmetric_bf16_neonbfdot(nk_bf16_t const *vectors, nk_size_t
|
|
1673
|
+
NK_PUBLIC void nk_angulars_symmetric_bf16_neonbfdot(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1640
1674
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1641
1675
|
nk_size_t row_start, nk_size_t row_count);
|
|
1642
1676
|
/** @copydoc nk_euclideans_packed_bf16 */
|
|
@@ -1644,9 +1678,10 @@ NK_PUBLIC void nk_euclideans_packed_bf16_neonbfdot(nk_bf16_t const *a, void cons
|
|
|
1644
1678
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1645
1679
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1646
1680
|
/** @copydoc nk_euclideans_symmetric_bf16 */
|
|
1647
|
-
NK_PUBLIC void nk_euclideans_symmetric_bf16_neonbfdot(nk_bf16_t const *vectors, nk_size_t
|
|
1648
|
-
nk_size_t stride, nk_f32_t *result,
|
|
1649
|
-
nk_size_t
|
|
1681
|
+
NK_PUBLIC void nk_euclideans_symmetric_bf16_neonbfdot(nk_bf16_t const *vectors, nk_size_t vectors_count,
|
|
1682
|
+
nk_size_t depth, nk_size_t stride, nk_f32_t *result,
|
|
1683
|
+
nk_size_t result_stride, nk_size_t row_start,
|
|
1684
|
+
nk_size_t row_count);
|
|
1650
1685
|
#endif // NK_TARGET_NEONBFDOT
|
|
1651
1686
|
|
|
1652
1687
|
/* ARM NEON with signed/unsigned dot product (ARMv8.2-A DotProd).
|
|
@@ -1658,7 +1693,7 @@ NK_PUBLIC void nk_angulars_packed_i8_neonsdot(nk_i8_t const *a, void const *b_pa
|
|
|
1658
1693
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1659
1694
|
nk_size_t r_stride_in_bytes);
|
|
1660
1695
|
/** @copydoc nk_angulars_symmetric_i8 */
|
|
1661
|
-
NK_PUBLIC void nk_angulars_symmetric_i8_neonsdot(nk_i8_t const *vectors, nk_size_t
|
|
1696
|
+
NK_PUBLIC void nk_angulars_symmetric_i8_neonsdot(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1662
1697
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1663
1698
|
nk_size_t row_start, nk_size_t row_count);
|
|
1664
1699
|
/** @copydoc nk_euclideans_packed_i8 */
|
|
@@ -1666,7 +1701,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_neonsdot(nk_i8_t const *a, void const *b_
|
|
|
1666
1701
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1667
1702
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1668
1703
|
/** @copydoc nk_euclideans_symmetric_i8 */
|
|
1669
|
-
NK_PUBLIC void nk_euclideans_symmetric_i8_neonsdot(nk_i8_t const *vectors, nk_size_t
|
|
1704
|
+
NK_PUBLIC void nk_euclideans_symmetric_i8_neonsdot(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1670
1705
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1671
1706
|
nk_size_t row_start, nk_size_t row_count);
|
|
1672
1707
|
|
|
@@ -1675,7 +1710,7 @@ NK_PUBLIC void nk_angulars_packed_u8_neonsdot(nk_u8_t const *a, void const *b_pa
|
|
|
1675
1710
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1676
1711
|
nk_size_t r_stride_in_bytes);
|
|
1677
1712
|
/** @copydoc nk_angulars_symmetric_u8 */
|
|
1678
|
-
NK_PUBLIC void nk_angulars_symmetric_u8_neonsdot(nk_u8_t const *vectors, nk_size_t
|
|
1713
|
+
NK_PUBLIC void nk_angulars_symmetric_u8_neonsdot(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1679
1714
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1680
1715
|
nk_size_t row_start, nk_size_t row_count);
|
|
1681
1716
|
/** @copydoc nk_euclideans_packed_u8 */
|
|
@@ -1683,7 +1718,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_neonsdot(nk_u8_t const *a, void const *b_
|
|
|
1683
1718
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1684
1719
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1685
1720
|
/** @copydoc nk_euclideans_symmetric_u8 */
|
|
1686
|
-
NK_PUBLIC void nk_euclideans_symmetric_u8_neonsdot(nk_u8_t const *vectors, nk_size_t
|
|
1721
|
+
NK_PUBLIC void nk_euclideans_symmetric_u8_neonsdot(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1687
1722
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1688
1723
|
nk_size_t row_start, nk_size_t row_count);
|
|
1689
1724
|
|
|
@@ -1692,7 +1727,7 @@ NK_PUBLIC void nk_angulars_packed_i4_neonsdot(nk_i4x2_t const *a, void const *b_
|
|
|
1692
1727
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1693
1728
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1694
1729
|
/** @copydoc nk_angulars_symmetric_i4 */
|
|
1695
|
-
NK_PUBLIC void nk_angulars_symmetric_i4_neonsdot(nk_i4x2_t const *vectors, nk_size_t
|
|
1730
|
+
NK_PUBLIC void nk_angulars_symmetric_i4_neonsdot(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1696
1731
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1697
1732
|
nk_size_t row_start, nk_size_t row_count);
|
|
1698
1733
|
/** @copydoc nk_euclideans_packed_i4 */
|
|
@@ -1700,7 +1735,7 @@ NK_PUBLIC void nk_euclideans_packed_i4_neonsdot(nk_i4x2_t const *a, void const *
|
|
|
1700
1735
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1701
1736
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1702
1737
|
/** @copydoc nk_euclideans_symmetric_i4 */
|
|
1703
|
-
NK_PUBLIC void nk_euclideans_symmetric_i4_neonsdot(nk_i4x2_t const *vectors, nk_size_t
|
|
1738
|
+
NK_PUBLIC void nk_euclideans_symmetric_i4_neonsdot(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1704
1739
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1705
1740
|
nk_size_t row_start, nk_size_t row_count);
|
|
1706
1741
|
|
|
@@ -1709,7 +1744,7 @@ NK_PUBLIC void nk_angulars_packed_u4_neonsdot(nk_u4x2_t const *a, void const *b_
|
|
|
1709
1744
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1710
1745
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1711
1746
|
/** @copydoc nk_angulars_symmetric_u4 */
|
|
1712
|
-
NK_PUBLIC void nk_angulars_symmetric_u4_neonsdot(nk_u4x2_t const *vectors, nk_size_t
|
|
1747
|
+
NK_PUBLIC void nk_angulars_symmetric_u4_neonsdot(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1713
1748
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1714
1749
|
nk_size_t row_start, nk_size_t row_count);
|
|
1715
1750
|
/** @copydoc nk_euclideans_packed_u4 */
|
|
@@ -1717,7 +1752,7 @@ NK_PUBLIC void nk_euclideans_packed_u4_neonsdot(nk_u4x2_t const *a, void const *
|
|
|
1717
1752
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1718
1753
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1719
1754
|
/** @copydoc nk_euclideans_symmetric_u4 */
|
|
1720
|
-
NK_PUBLIC void nk_euclideans_symmetric_u4_neonsdot(nk_u4x2_t const *vectors, nk_size_t
|
|
1755
|
+
NK_PUBLIC void nk_euclideans_symmetric_u4_neonsdot(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1721
1756
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1722
1757
|
nk_size_t row_start, nk_size_t row_count);
|
|
1723
1758
|
#endif // NK_TARGET_NEONSDOT
|
|
@@ -1731,7 +1766,7 @@ NK_PUBLIC void nk_angulars_packed_f16_neonfhm(nk_f16_t const *a, void const *b_p
|
|
|
1731
1766
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1732
1767
|
nk_size_t r_stride_in_bytes);
|
|
1733
1768
|
/** @copydoc nk_angulars_symmetric_f16 */
|
|
1734
|
-
NK_PUBLIC void nk_angulars_symmetric_f16_neonfhm(nk_f16_t const *vectors, nk_size_t
|
|
1769
|
+
NK_PUBLIC void nk_angulars_symmetric_f16_neonfhm(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1735
1770
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1736
1771
|
nk_size_t row_start, nk_size_t row_count);
|
|
1737
1772
|
/** @copydoc nk_euclideans_packed_f16 */
|
|
@@ -1739,7 +1774,7 @@ NK_PUBLIC void nk_euclideans_packed_f16_neonfhm(nk_f16_t const *a, void const *b
|
|
|
1739
1774
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1740
1775
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1741
1776
|
/** @copydoc nk_euclideans_symmetric_f16 */
|
|
1742
|
-
NK_PUBLIC void nk_euclideans_symmetric_f16_neonfhm(nk_f16_t const *vectors, nk_size_t
|
|
1777
|
+
NK_PUBLIC void nk_euclideans_symmetric_f16_neonfhm(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1743
1778
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1744
1779
|
nk_size_t row_start, nk_size_t row_count);
|
|
1745
1780
|
|
|
@@ -1748,7 +1783,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_neonfhm(nk_e4m3_t const *a, void const *b
|
|
|
1748
1783
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1749
1784
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1750
1785
|
/** @copydoc nk_angulars_symmetric_e4m3 */
|
|
1751
|
-
NK_PUBLIC void nk_angulars_symmetric_e4m3_neonfhm(nk_e4m3_t const *vectors, nk_size_t
|
|
1786
|
+
NK_PUBLIC void nk_angulars_symmetric_e4m3_neonfhm(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1752
1787
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1753
1788
|
nk_size_t row_start, nk_size_t row_count);
|
|
1754
1789
|
/** @copydoc nk_euclideans_packed_e4m3 */
|
|
@@ -1756,7 +1791,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_neonfhm(nk_e4m3_t const *a, void const
|
|
|
1756
1791
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1757
1792
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1758
1793
|
/** @copydoc nk_euclideans_symmetric_e4m3 */
|
|
1759
|
-
NK_PUBLIC void nk_euclideans_symmetric_e4m3_neonfhm(nk_e4m3_t const *vectors, nk_size_t
|
|
1794
|
+
NK_PUBLIC void nk_euclideans_symmetric_e4m3_neonfhm(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1760
1795
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1761
1796
|
nk_size_t row_start, nk_size_t row_count);
|
|
1762
1797
|
|
|
@@ -1765,7 +1800,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_neonfhm(nk_e5m2_t const *a, void const *b
|
|
|
1765
1800
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1766
1801
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1767
1802
|
/** @copydoc nk_angulars_symmetric_e5m2 */
|
|
1768
|
-
NK_PUBLIC void nk_angulars_symmetric_e5m2_neonfhm(nk_e5m2_t const *vectors, nk_size_t
|
|
1803
|
+
NK_PUBLIC void nk_angulars_symmetric_e5m2_neonfhm(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1769
1804
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1770
1805
|
nk_size_t row_start, nk_size_t row_count);
|
|
1771
1806
|
/** @copydoc nk_euclideans_packed_e5m2 */
|
|
@@ -1773,19 +1808,93 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_neonfhm(nk_e5m2_t const *a, void const
|
|
|
1773
1808
|
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1774
1809
|
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1775
1810
|
/** @copydoc nk_euclideans_symmetric_e5m2 */
|
|
1776
|
-
NK_PUBLIC void nk_euclideans_symmetric_e5m2_neonfhm(nk_e5m2_t const *vectors, nk_size_t
|
|
1811
|
+
NK_PUBLIC void nk_euclideans_symmetric_e5m2_neonfhm(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1777
1812
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1778
1813
|
nk_size_t row_start, nk_size_t row_count);
|
|
1779
1814
|
|
|
1780
1815
|
#endif // NK_TARGET_NEONFHM
|
|
1781
1816
|
|
|
1817
|
+
/* ARM NEON with FP8 (ARMv9.2-A FP8).
|
|
1818
|
+
* Uses native FP8 dot-product instructions for E4M3/E5M2/E2M3/E3M2 operations.
|
|
1819
|
+
*/
|
|
1820
|
+
#if NK_TARGET_NEONFP8
|
|
1821
|
+
/** @copydoc nk_angulars_packed_e4m3 */
|
|
1822
|
+
NK_PUBLIC void nk_angulars_packed_e4m3_neonfp8(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1823
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1824
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1825
|
+
/** @copydoc nk_angulars_symmetric_e4m3 */
|
|
1826
|
+
NK_PUBLIC void nk_angulars_symmetric_e4m3_neonfp8(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1827
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1828
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1829
|
+
/** @copydoc nk_euclideans_packed_e4m3 */
|
|
1830
|
+
NK_PUBLIC void nk_euclideans_packed_e4m3_neonfp8(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1831
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1832
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1833
|
+
/** @copydoc nk_euclideans_symmetric_e4m3 */
|
|
1834
|
+
NK_PUBLIC void nk_euclideans_symmetric_e4m3_neonfp8(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1835
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1836
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1837
|
+
|
|
1838
|
+
/** @copydoc nk_angulars_packed_e5m2 */
|
|
1839
|
+
NK_PUBLIC void nk_angulars_packed_e5m2_neonfp8(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1840
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1841
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1842
|
+
/** @copydoc nk_angulars_symmetric_e5m2 */
|
|
1843
|
+
NK_PUBLIC void nk_angulars_symmetric_e5m2_neonfp8(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1844
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1845
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1846
|
+
/** @copydoc nk_euclideans_packed_e5m2 */
|
|
1847
|
+
NK_PUBLIC void nk_euclideans_packed_e5m2_neonfp8(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1848
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1849
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1850
|
+
/** @copydoc nk_euclideans_symmetric_e5m2 */
|
|
1851
|
+
NK_PUBLIC void nk_euclideans_symmetric_e5m2_neonfp8(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1852
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1853
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1854
|
+
|
|
1855
|
+
/** @copydoc nk_angulars_packed_e2m3 */
|
|
1856
|
+
NK_PUBLIC void nk_angulars_packed_e2m3_neonfp8(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1857
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1858
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1859
|
+
/** @copydoc nk_angulars_symmetric_e2m3 */
|
|
1860
|
+
NK_PUBLIC void nk_angulars_symmetric_e2m3_neonfp8(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1861
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1862
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1863
|
+
/** @copydoc nk_euclideans_packed_e2m3 */
|
|
1864
|
+
NK_PUBLIC void nk_euclideans_packed_e2m3_neonfp8(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1865
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1866
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1867
|
+
/** @copydoc nk_euclideans_symmetric_e2m3 */
|
|
1868
|
+
NK_PUBLIC void nk_euclideans_symmetric_e2m3_neonfp8(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1869
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1870
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1871
|
+
|
|
1872
|
+
/** @copydoc nk_angulars_packed_e3m2 */
|
|
1873
|
+
NK_PUBLIC void nk_angulars_packed_e3m2_neonfp8(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1874
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1875
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1876
|
+
/** @copydoc nk_angulars_symmetric_e3m2 */
|
|
1877
|
+
NK_PUBLIC void nk_angulars_symmetric_e3m2_neonfp8(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1878
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1879
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1880
|
+
/** @copydoc nk_euclideans_packed_e3m2 */
|
|
1881
|
+
NK_PUBLIC void nk_euclideans_packed_e3m2_neonfp8(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *result,
|
|
1882
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
1883
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
1884
|
+
/** @copydoc nk_euclideans_symmetric_e3m2 */
|
|
1885
|
+
NK_PUBLIC void nk_euclideans_symmetric_e3m2_neonfp8(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1886
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1887
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
1888
|
+
|
|
1889
|
+
#endif // NK_TARGET_NEONFP8
|
|
1890
|
+
|
|
1782
1891
|
#if NK_TARGET_RVV
|
|
1783
1892
|
/** @copydoc nk_angulars_packed_f32 */
|
|
1784
1893
|
NK_PUBLIC void nk_angulars_packed_f32_rvv(nk_f32_t const *a, void const *b_packed, nk_f64_t *result, nk_size_t rows,
|
|
1785
1894
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1786
1895
|
nk_size_t r_stride_in_bytes);
|
|
1787
1896
|
/** @copydoc nk_angulars_symmetric_f32 */
|
|
1788
|
-
NK_PUBLIC void nk_angulars_symmetric_f32_rvv(nk_f32_t const *vectors, nk_size_t
|
|
1897
|
+
NK_PUBLIC void nk_angulars_symmetric_f32_rvv(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1789
1898
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1790
1899
|
nk_size_t row_start, nk_size_t row_count);
|
|
1791
1900
|
/** @copydoc nk_euclideans_packed_f32 */
|
|
@@ -1793,7 +1902,7 @@ NK_PUBLIC void nk_euclideans_packed_f32_rvv(nk_f32_t const *a, void const *b_pac
|
|
|
1793
1902
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1794
1903
|
nk_size_t r_stride_in_bytes);
|
|
1795
1904
|
/** @copydoc nk_euclideans_symmetric_f32 */
|
|
1796
|
-
NK_PUBLIC void nk_euclideans_symmetric_f32_rvv(nk_f32_t const *vectors, nk_size_t
|
|
1905
|
+
NK_PUBLIC void nk_euclideans_symmetric_f32_rvv(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1797
1906
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1798
1907
|
nk_size_t row_start, nk_size_t row_count);
|
|
1799
1908
|
|
|
@@ -1802,7 +1911,7 @@ NK_PUBLIC void nk_angulars_packed_f64_rvv(nk_f64_t const *a, void const *b_packe
|
|
|
1802
1911
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1803
1912
|
nk_size_t r_stride_in_bytes);
|
|
1804
1913
|
/** @copydoc nk_angulars_symmetric_f64 */
|
|
1805
|
-
NK_PUBLIC void nk_angulars_symmetric_f64_rvv(nk_f64_t const *vectors, nk_size_t
|
|
1914
|
+
NK_PUBLIC void nk_angulars_symmetric_f64_rvv(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1806
1915
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1807
1916
|
nk_size_t row_start, nk_size_t row_count);
|
|
1808
1917
|
/** @copydoc nk_euclideans_packed_f64 */
|
|
@@ -1810,7 +1919,7 @@ NK_PUBLIC void nk_euclideans_packed_f64_rvv(nk_f64_t const *a, void const *b_pac
|
|
|
1810
1919
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1811
1920
|
nk_size_t r_stride_in_bytes);
|
|
1812
1921
|
/** @copydoc nk_euclideans_symmetric_f64 */
|
|
1813
|
-
NK_PUBLIC void nk_euclideans_symmetric_f64_rvv(nk_f64_t const *vectors, nk_size_t
|
|
1922
|
+
NK_PUBLIC void nk_euclideans_symmetric_f64_rvv(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1814
1923
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
1815
1924
|
nk_size_t row_start, nk_size_t row_count);
|
|
1816
1925
|
|
|
@@ -1819,7 +1928,7 @@ NK_PUBLIC void nk_angulars_packed_f16_rvv(nk_f16_t const *a, void const *b_packe
|
|
|
1819
1928
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1820
1929
|
nk_size_t r_stride_in_bytes);
|
|
1821
1930
|
/** @copydoc nk_angulars_symmetric_f16 */
|
|
1822
|
-
NK_PUBLIC void nk_angulars_symmetric_f16_rvv(nk_f16_t const *vectors, nk_size_t
|
|
1931
|
+
NK_PUBLIC void nk_angulars_symmetric_f16_rvv(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1823
1932
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1824
1933
|
nk_size_t row_start, nk_size_t row_count);
|
|
1825
1934
|
/** @copydoc nk_euclideans_packed_f16 */
|
|
@@ -1827,7 +1936,7 @@ NK_PUBLIC void nk_euclideans_packed_f16_rvv(nk_f16_t const *a, void const *b_pac
|
|
|
1827
1936
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1828
1937
|
nk_size_t r_stride_in_bytes);
|
|
1829
1938
|
/** @copydoc nk_euclideans_symmetric_f16 */
|
|
1830
|
-
NK_PUBLIC void nk_euclideans_symmetric_f16_rvv(nk_f16_t const *vectors, nk_size_t
|
|
1939
|
+
NK_PUBLIC void nk_euclideans_symmetric_f16_rvv(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1831
1940
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1832
1941
|
nk_size_t row_start, nk_size_t row_count);
|
|
1833
1942
|
|
|
@@ -1836,7 +1945,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_rvv(nk_bf16_t const *a, void const *b_pac
|
|
|
1836
1945
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1837
1946
|
nk_size_t r_stride_in_bytes);
|
|
1838
1947
|
/** @copydoc nk_angulars_symmetric_bf16 */
|
|
1839
|
-
NK_PUBLIC void nk_angulars_symmetric_bf16_rvv(nk_bf16_t const *vectors, nk_size_t
|
|
1948
|
+
NK_PUBLIC void nk_angulars_symmetric_bf16_rvv(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1840
1949
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1841
1950
|
nk_size_t row_start, nk_size_t row_count);
|
|
1842
1951
|
/** @copydoc nk_euclideans_packed_bf16 */
|
|
@@ -1844,7 +1953,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_rvv(nk_bf16_t const *a, void const *b_p
|
|
|
1844
1953
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1845
1954
|
nk_size_t r_stride_in_bytes);
|
|
1846
1955
|
/** @copydoc nk_euclideans_symmetric_bf16 */
|
|
1847
|
-
NK_PUBLIC void nk_euclideans_symmetric_bf16_rvv(nk_bf16_t const *vectors, nk_size_t
|
|
1956
|
+
NK_PUBLIC void nk_euclideans_symmetric_bf16_rvv(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1848
1957
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1849
1958
|
nk_size_t row_start, nk_size_t row_count);
|
|
1850
1959
|
|
|
@@ -1853,7 +1962,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_rvv(nk_e4m3_t const *a, void const *b_pac
|
|
|
1853
1962
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1854
1963
|
nk_size_t r_stride_in_bytes);
|
|
1855
1964
|
/** @copydoc nk_angulars_symmetric_e4m3 */
|
|
1856
|
-
NK_PUBLIC void nk_angulars_symmetric_e4m3_rvv(nk_e4m3_t const *vectors, nk_size_t
|
|
1965
|
+
NK_PUBLIC void nk_angulars_symmetric_e4m3_rvv(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1857
1966
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1858
1967
|
nk_size_t row_start, nk_size_t row_count);
|
|
1859
1968
|
/** @copydoc nk_euclideans_packed_e4m3 */
|
|
@@ -1861,7 +1970,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_rvv(nk_e4m3_t const *a, void const *b_p
|
|
|
1861
1970
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1862
1971
|
nk_size_t r_stride_in_bytes);
|
|
1863
1972
|
/** @copydoc nk_euclideans_symmetric_e4m3 */
|
|
1864
|
-
NK_PUBLIC void nk_euclideans_symmetric_e4m3_rvv(nk_e4m3_t const *vectors, nk_size_t
|
|
1973
|
+
NK_PUBLIC void nk_euclideans_symmetric_e4m3_rvv(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1865
1974
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1866
1975
|
nk_size_t row_start, nk_size_t row_count);
|
|
1867
1976
|
|
|
@@ -1870,7 +1979,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_rvv(nk_e5m2_t const *a, void const *b_pac
|
|
|
1870
1979
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1871
1980
|
nk_size_t r_stride_in_bytes);
|
|
1872
1981
|
/** @copydoc nk_angulars_symmetric_e5m2 */
|
|
1873
|
-
NK_PUBLIC void nk_angulars_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t
|
|
1982
|
+
NK_PUBLIC void nk_angulars_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1874
1983
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1875
1984
|
nk_size_t row_start, nk_size_t row_count);
|
|
1876
1985
|
/** @copydoc nk_euclideans_packed_e5m2 */
|
|
@@ -1878,7 +1987,7 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_rvv(nk_e5m2_t const *a, void const *b_p
|
|
|
1878
1987
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1879
1988
|
nk_size_t r_stride_in_bytes);
|
|
1880
1989
|
/** @copydoc nk_euclideans_symmetric_e5m2 */
|
|
1881
|
-
NK_PUBLIC void nk_euclideans_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t
|
|
1990
|
+
NK_PUBLIC void nk_euclideans_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1882
1991
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1883
1992
|
nk_size_t row_start, nk_size_t row_count);
|
|
1884
1993
|
|
|
@@ -1887,7 +1996,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_rvv(nk_e2m3_t const *a, void const *b_pac
|
|
|
1887
1996
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1888
1997
|
nk_size_t r_stride_in_bytes);
|
|
1889
1998
|
/** @copydoc nk_angulars_symmetric_e2m3 */
|
|
1890
|
-
NK_PUBLIC void nk_angulars_symmetric_e2m3_rvv(nk_e2m3_t const *vectors, nk_size_t
|
|
1999
|
+
NK_PUBLIC void nk_angulars_symmetric_e2m3_rvv(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1891
2000
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1892
2001
|
nk_size_t row_start, nk_size_t row_count);
|
|
1893
2002
|
/** @copydoc nk_euclideans_packed_e2m3 */
|
|
@@ -1895,7 +2004,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_rvv(nk_e2m3_t const *a, void const *b_p
|
|
|
1895
2004
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1896
2005
|
nk_size_t r_stride_in_bytes);
|
|
1897
2006
|
/** @copydoc nk_euclideans_symmetric_e2m3 */
|
|
1898
|
-
NK_PUBLIC void nk_euclideans_symmetric_e2m3_rvv(nk_e2m3_t const *vectors, nk_size_t
|
|
2007
|
+
NK_PUBLIC void nk_euclideans_symmetric_e2m3_rvv(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1899
2008
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1900
2009
|
nk_size_t row_start, nk_size_t row_count);
|
|
1901
2010
|
|
|
@@ -1904,7 +2013,7 @@ NK_PUBLIC void nk_angulars_packed_e3m2_rvv(nk_e3m2_t const *a, void const *b_pac
|
|
|
1904
2013
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1905
2014
|
nk_size_t r_stride_in_bytes);
|
|
1906
2015
|
/** @copydoc nk_angulars_symmetric_e3m2 */
|
|
1907
|
-
NK_PUBLIC void nk_angulars_symmetric_e3m2_rvv(nk_e3m2_t const *vectors, nk_size_t
|
|
2016
|
+
NK_PUBLIC void nk_angulars_symmetric_e3m2_rvv(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1908
2017
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1909
2018
|
nk_size_t row_start, nk_size_t row_count);
|
|
1910
2019
|
/** @copydoc nk_euclideans_packed_e3m2 */
|
|
@@ -1912,7 +2021,7 @@ NK_PUBLIC void nk_euclideans_packed_e3m2_rvv(nk_e3m2_t const *a, void const *b_p
|
|
|
1912
2021
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1913
2022
|
nk_size_t r_stride_in_bytes);
|
|
1914
2023
|
/** @copydoc nk_euclideans_symmetric_e3m2 */
|
|
1915
|
-
NK_PUBLIC void nk_euclideans_symmetric_e3m2_rvv(nk_e3m2_t const *vectors, nk_size_t
|
|
2024
|
+
NK_PUBLIC void nk_euclideans_symmetric_e3m2_rvv(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1916
2025
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1917
2026
|
nk_size_t row_start, nk_size_t row_count);
|
|
1918
2027
|
|
|
@@ -1921,7 +2030,7 @@ NK_PUBLIC void nk_angulars_packed_i8_rvv(nk_i8_t const *a, void const *b_packed,
|
|
|
1921
2030
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1922
2031
|
nk_size_t r_stride_in_bytes);
|
|
1923
2032
|
/** @copydoc nk_angulars_symmetric_i8 */
|
|
1924
|
-
NK_PUBLIC void nk_angulars_symmetric_i8_rvv(nk_i8_t const *vectors, nk_size_t
|
|
2033
|
+
NK_PUBLIC void nk_angulars_symmetric_i8_rvv(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1925
2034
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1926
2035
|
nk_size_t row_start, nk_size_t row_count);
|
|
1927
2036
|
/** @copydoc nk_euclideans_packed_i8 */
|
|
@@ -1929,7 +2038,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_rvv(nk_i8_t const *a, void const *b_packe
|
|
|
1929
2038
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1930
2039
|
nk_size_t r_stride_in_bytes);
|
|
1931
2040
|
/** @copydoc nk_euclideans_symmetric_i8 */
|
|
1932
|
-
NK_PUBLIC void nk_euclideans_symmetric_i8_rvv(nk_i8_t const *vectors, nk_size_t
|
|
2041
|
+
NK_PUBLIC void nk_euclideans_symmetric_i8_rvv(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1933
2042
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1934
2043
|
nk_size_t row_start, nk_size_t row_count);
|
|
1935
2044
|
|
|
@@ -1938,7 +2047,7 @@ NK_PUBLIC void nk_angulars_packed_u8_rvv(nk_u8_t const *a, void const *b_packed,
|
|
|
1938
2047
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1939
2048
|
nk_size_t r_stride_in_bytes);
|
|
1940
2049
|
/** @copydoc nk_angulars_symmetric_u8 */
|
|
1941
|
-
NK_PUBLIC void nk_angulars_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t
|
|
2050
|
+
NK_PUBLIC void nk_angulars_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1942
2051
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1943
2052
|
nk_size_t row_start, nk_size_t row_count);
|
|
1944
2053
|
/** @copydoc nk_euclideans_packed_u8 */
|
|
@@ -1946,7 +2055,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_rvv(nk_u8_t const *a, void const *b_packe
|
|
|
1946
2055
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
1947
2056
|
nk_size_t r_stride_in_bytes);
|
|
1948
2057
|
/** @copydoc nk_euclideans_symmetric_u8 */
|
|
1949
|
-
NK_PUBLIC void nk_euclideans_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t
|
|
2058
|
+
NK_PUBLIC void nk_euclideans_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
1950
2059
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
1951
2060
|
nk_size_t row_start, nk_size_t row_count);
|
|
1952
2061
|
#endif // NK_TARGET_RVV
|
|
@@ -1957,13 +2066,14 @@ NK_PUBLIC void nk_euclideans_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t
|
|
|
1957
2066
|
|
|
1958
2067
|
#include "numkong/spatials/serial.h"
|
|
1959
2068
|
#include "numkong/spatials/neon.h"
|
|
1960
|
-
#include "numkong/spatials/neonhalf.h"
|
|
1961
2069
|
#include "numkong/spatials/neonfhm.h"
|
|
2070
|
+
#include "numkong/spatials/neonfp8.h"
|
|
1962
2071
|
#include "numkong/spatials/neonbfdot.h"
|
|
1963
2072
|
#include "numkong/spatials/neonsdot.h"
|
|
1964
2073
|
#include "numkong/spatials/haswell.h"
|
|
1965
2074
|
#include "numkong/spatials/skylake.h"
|
|
1966
2075
|
#include "numkong/spatials/genoa.h"
|
|
2076
|
+
#include "numkong/spatials/diamond.h"
|
|
1967
2077
|
#include "numkong/spatials/icelake.h"
|
|
1968
2078
|
#include "numkong/spatials/alder.h"
|
|
1969
2079
|
#include "numkong/spatials/sierra.h"
|
|
@@ -1972,6 +2082,8 @@ NK_PUBLIC void nk_euclideans_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t
|
|
|
1972
2082
|
#include "numkong/spatials/v128relaxed.h"
|
|
1973
2083
|
#include "numkong/spatials/sme.h"
|
|
1974
2084
|
#include "numkong/spatials/smef64.h"
|
|
2085
|
+
#include "numkong/spatials/powervsx.h"
|
|
2086
|
+
#include "numkong/spatials/loongsonasx.h"
|
|
1975
2087
|
|
|
1976
2088
|
#if defined(__cplusplus)
|
|
1977
2089
|
extern "C" {
|
|
@@ -1990,6 +2102,8 @@ NK_PUBLIC void nk_angulars_packed_f64(nk_f64_t const *a, void const *b_packed, n
|
|
|
1990
2102
|
nk_angulars_packed_f64_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
1991
2103
|
#elif NK_TARGET_HASWELL
|
|
1992
2104
|
nk_angulars_packed_f64_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2105
|
+
#elif NK_TARGET_POWERVSX
|
|
2106
|
+
nk_angulars_packed_f64_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
1993
2107
|
#elif NK_TARGET_RVV
|
|
1994
2108
|
nk_angulars_packed_f64_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
1995
2109
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -1998,24 +2112,31 @@ NK_PUBLIC void nk_angulars_packed_f64(nk_f64_t const *a, void const *b_packed, n
|
|
|
1998
2112
|
nk_angulars_packed_f64_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
1999
2113
|
#endif
|
|
2000
2114
|
}
|
|
2001
|
-
NK_PUBLIC void nk_angulars_symmetric_f64(nk_f64_t const *vectors, nk_size_t
|
|
2115
|
+
NK_PUBLIC void nk_angulars_symmetric_f64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2002
2116
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
2003
2117
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2004
2118
|
#if NK_TARGET_SMEF64
|
|
2005
|
-
nk_angulars_symmetric_f64_smef64(vectors,
|
|
2119
|
+
nk_angulars_symmetric_f64_smef64(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2120
|
+
row_count);
|
|
2006
2121
|
#elif NK_TARGET_NEON
|
|
2007
|
-
nk_angulars_symmetric_f64_neon(vectors,
|
|
2122
|
+
nk_angulars_symmetric_f64_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2008
2123
|
#elif NK_TARGET_SKYLAKE
|
|
2009
|
-
nk_angulars_symmetric_f64_skylake(vectors,
|
|
2124
|
+
nk_angulars_symmetric_f64_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2125
|
+
row_count);
|
|
2010
2126
|
#elif NK_TARGET_HASWELL
|
|
2011
|
-
nk_angulars_symmetric_f64_haswell(vectors,
|
|
2127
|
+
nk_angulars_symmetric_f64_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2128
|
+
row_count);
|
|
2129
|
+
#elif NK_TARGET_POWERVSX
|
|
2130
|
+
nk_angulars_symmetric_f64_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2131
|
+
row_count);
|
|
2012
2132
|
#elif NK_TARGET_RVV
|
|
2013
|
-
nk_angulars_symmetric_f64_rvv(vectors,
|
|
2133
|
+
nk_angulars_symmetric_f64_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2014
2134
|
#elif NK_TARGET_V128RELAXED
|
|
2015
|
-
nk_angulars_symmetric_f64_v128relaxed(vectors,
|
|
2135
|
+
nk_angulars_symmetric_f64_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2016
2136
|
row_count);
|
|
2017
2137
|
#else
|
|
2018
|
-
nk_angulars_symmetric_f64_serial(vectors,
|
|
2138
|
+
nk_angulars_symmetric_f64_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2139
|
+
row_count);
|
|
2019
2140
|
#endif
|
|
2020
2141
|
}
|
|
2021
2142
|
NK_PUBLIC void nk_euclideans_packed_f64(nk_f64_t const *a, void const *b_packed, nk_f64_t *result, nk_size_t rows,
|
|
@@ -2029,6 +2150,8 @@ NK_PUBLIC void nk_euclideans_packed_f64(nk_f64_t const *a, void const *b_packed,
|
|
|
2029
2150
|
nk_euclideans_packed_f64_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2030
2151
|
#elif NK_TARGET_HASWELL
|
|
2031
2152
|
nk_euclideans_packed_f64_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2153
|
+
#elif NK_TARGET_POWERVSX
|
|
2154
|
+
nk_euclideans_packed_f64_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2032
2155
|
#elif NK_TARGET_RVV
|
|
2033
2156
|
nk_euclideans_packed_f64_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2034
2157
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2037,24 +2160,32 @@ NK_PUBLIC void nk_euclideans_packed_f64(nk_f64_t const *a, void const *b_packed,
|
|
|
2037
2160
|
nk_euclideans_packed_f64_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2038
2161
|
#endif
|
|
2039
2162
|
}
|
|
2040
|
-
NK_PUBLIC void nk_euclideans_symmetric_f64(nk_f64_t const *vectors, nk_size_t
|
|
2163
|
+
NK_PUBLIC void nk_euclideans_symmetric_f64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2041
2164
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
2042
2165
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2043
2166
|
#if NK_TARGET_SMEF64
|
|
2044
|
-
nk_euclideans_symmetric_f64_smef64(vectors,
|
|
2167
|
+
nk_euclideans_symmetric_f64_smef64(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2168
|
+
row_count);
|
|
2045
2169
|
#elif NK_TARGET_NEON
|
|
2046
|
-
nk_euclideans_symmetric_f64_neon(vectors,
|
|
2170
|
+
nk_euclideans_symmetric_f64_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2171
|
+
row_count);
|
|
2047
2172
|
#elif NK_TARGET_SKYLAKE
|
|
2048
|
-
nk_euclideans_symmetric_f64_skylake(vectors,
|
|
2173
|
+
nk_euclideans_symmetric_f64_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2174
|
+
row_count);
|
|
2049
2175
|
#elif NK_TARGET_HASWELL
|
|
2050
|
-
nk_euclideans_symmetric_f64_haswell(vectors,
|
|
2176
|
+
nk_euclideans_symmetric_f64_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2177
|
+
row_count);
|
|
2178
|
+
#elif NK_TARGET_POWERVSX
|
|
2179
|
+
nk_euclideans_symmetric_f64_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2180
|
+
row_count);
|
|
2051
2181
|
#elif NK_TARGET_RVV
|
|
2052
|
-
nk_euclideans_symmetric_f64_rvv(vectors,
|
|
2182
|
+
nk_euclideans_symmetric_f64_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2053
2183
|
#elif NK_TARGET_V128RELAXED
|
|
2054
|
-
nk_euclideans_symmetric_f64_v128relaxed(vectors,
|
|
2184
|
+
nk_euclideans_symmetric_f64_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2055
2185
|
row_count);
|
|
2056
2186
|
#else
|
|
2057
|
-
nk_euclideans_symmetric_f64_serial(vectors,
|
|
2187
|
+
nk_euclideans_symmetric_f64_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2188
|
+
row_count);
|
|
2058
2189
|
#endif
|
|
2059
2190
|
}
|
|
2060
2191
|
|
|
@@ -2069,6 +2200,8 @@ NK_PUBLIC void nk_angulars_packed_f32(nk_f32_t const *a, void const *b_packed, n
|
|
|
2069
2200
|
nk_angulars_packed_f32_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2070
2201
|
#elif NK_TARGET_HASWELL
|
|
2071
2202
|
nk_angulars_packed_f32_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2203
|
+
#elif NK_TARGET_POWERVSX
|
|
2204
|
+
nk_angulars_packed_f32_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2072
2205
|
#elif NK_TARGET_RVV
|
|
2073
2206
|
nk_angulars_packed_f32_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2074
2207
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2077,24 +2210,31 @@ NK_PUBLIC void nk_angulars_packed_f32(nk_f32_t const *a, void const *b_packed, n
|
|
|
2077
2210
|
nk_angulars_packed_f32_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2078
2211
|
#endif
|
|
2079
2212
|
}
|
|
2080
|
-
NK_PUBLIC void nk_angulars_symmetric_f32(nk_f32_t const *vectors, nk_size_t
|
|
2213
|
+
NK_PUBLIC void nk_angulars_symmetric_f32(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2081
2214
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
2082
2215
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2083
2216
|
#if NK_TARGET_SMEF64
|
|
2084
|
-
nk_angulars_symmetric_f32_smef64(vectors,
|
|
2217
|
+
nk_angulars_symmetric_f32_smef64(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2218
|
+
row_count);
|
|
2085
2219
|
#elif NK_TARGET_NEON
|
|
2086
|
-
nk_angulars_symmetric_f32_neon(vectors,
|
|
2220
|
+
nk_angulars_symmetric_f32_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2087
2221
|
#elif NK_TARGET_SKYLAKE
|
|
2088
|
-
nk_angulars_symmetric_f32_skylake(vectors,
|
|
2222
|
+
nk_angulars_symmetric_f32_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2223
|
+
row_count);
|
|
2089
2224
|
#elif NK_TARGET_HASWELL
|
|
2090
|
-
nk_angulars_symmetric_f32_haswell(vectors,
|
|
2225
|
+
nk_angulars_symmetric_f32_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2226
|
+
row_count);
|
|
2227
|
+
#elif NK_TARGET_POWERVSX
|
|
2228
|
+
nk_angulars_symmetric_f32_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2229
|
+
row_count);
|
|
2091
2230
|
#elif NK_TARGET_RVV
|
|
2092
|
-
nk_angulars_symmetric_f32_rvv(vectors,
|
|
2231
|
+
nk_angulars_symmetric_f32_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2093
2232
|
#elif NK_TARGET_V128RELAXED
|
|
2094
|
-
nk_angulars_symmetric_f32_v128relaxed(vectors,
|
|
2233
|
+
nk_angulars_symmetric_f32_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2095
2234
|
row_count);
|
|
2096
2235
|
#else
|
|
2097
|
-
nk_angulars_symmetric_f32_serial(vectors,
|
|
2236
|
+
nk_angulars_symmetric_f32_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2237
|
+
row_count);
|
|
2098
2238
|
#endif
|
|
2099
2239
|
}
|
|
2100
2240
|
NK_PUBLIC void nk_euclideans_packed_f32(nk_f32_t const *a, void const *b_packed, nk_f64_t *result, nk_size_t rows,
|
|
@@ -2108,6 +2248,8 @@ NK_PUBLIC void nk_euclideans_packed_f32(nk_f32_t const *a, void const *b_packed,
|
|
|
2108
2248
|
nk_euclideans_packed_f32_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2109
2249
|
#elif NK_TARGET_HASWELL
|
|
2110
2250
|
nk_euclideans_packed_f32_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2251
|
+
#elif NK_TARGET_POWERVSX
|
|
2252
|
+
nk_euclideans_packed_f32_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2111
2253
|
#elif NK_TARGET_RVV
|
|
2112
2254
|
nk_euclideans_packed_f32_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2113
2255
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2116,24 +2258,32 @@ NK_PUBLIC void nk_euclideans_packed_f32(nk_f32_t const *a, void const *b_packed,
|
|
|
2116
2258
|
nk_euclideans_packed_f32_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2117
2259
|
#endif
|
|
2118
2260
|
}
|
|
2119
|
-
NK_PUBLIC void nk_euclideans_symmetric_f32(nk_f32_t const *vectors, nk_size_t
|
|
2261
|
+
NK_PUBLIC void nk_euclideans_symmetric_f32(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2120
2262
|
nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
|
|
2121
2263
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2122
2264
|
#if NK_TARGET_SMEF64
|
|
2123
|
-
nk_euclideans_symmetric_f32_smef64(vectors,
|
|
2265
|
+
nk_euclideans_symmetric_f32_smef64(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2266
|
+
row_count);
|
|
2124
2267
|
#elif NK_TARGET_NEON
|
|
2125
|
-
nk_euclideans_symmetric_f32_neon(vectors,
|
|
2268
|
+
nk_euclideans_symmetric_f32_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2269
|
+
row_count);
|
|
2126
2270
|
#elif NK_TARGET_SKYLAKE
|
|
2127
|
-
nk_euclideans_symmetric_f32_skylake(vectors,
|
|
2271
|
+
nk_euclideans_symmetric_f32_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2272
|
+
row_count);
|
|
2128
2273
|
#elif NK_TARGET_HASWELL
|
|
2129
|
-
nk_euclideans_symmetric_f32_haswell(vectors,
|
|
2274
|
+
nk_euclideans_symmetric_f32_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2275
|
+
row_count);
|
|
2276
|
+
#elif NK_TARGET_POWERVSX
|
|
2277
|
+
nk_euclideans_symmetric_f32_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2278
|
+
row_count);
|
|
2130
2279
|
#elif NK_TARGET_RVV
|
|
2131
|
-
nk_euclideans_symmetric_f32_rvv(vectors,
|
|
2280
|
+
nk_euclideans_symmetric_f32_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2132
2281
|
#elif NK_TARGET_V128RELAXED
|
|
2133
|
-
nk_euclideans_symmetric_f32_v128relaxed(vectors,
|
|
2282
|
+
nk_euclideans_symmetric_f32_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2134
2283
|
row_count);
|
|
2135
2284
|
#else
|
|
2136
|
-
nk_euclideans_symmetric_f32_serial(vectors,
|
|
2285
|
+
nk_euclideans_symmetric_f32_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2286
|
+
row_count);
|
|
2137
2287
|
#endif
|
|
2138
2288
|
}
|
|
2139
2289
|
|
|
@@ -2144,39 +2294,44 @@ NK_PUBLIC void nk_angulars_packed_f16(nk_f16_t const *a, void const *b_packed, n
|
|
|
2144
2294
|
nk_angulars_packed_f16_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2145
2295
|
#elif NK_TARGET_NEONFHM
|
|
2146
2296
|
nk_angulars_packed_f16_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2147
|
-
#elif NK_TARGET_NEONHALF
|
|
2148
|
-
nk_angulars_packed_f16_neonhalf(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2149
2297
|
#elif NK_TARGET_NEON
|
|
2150
2298
|
nk_angulars_packed_f16_neon(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2151
2299
|
#elif NK_TARGET_SKYLAKE
|
|
2152
2300
|
nk_angulars_packed_f16_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2153
2301
|
#elif NK_TARGET_HASWELL
|
|
2154
2302
|
nk_angulars_packed_f16_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2303
|
+
#elif NK_TARGET_POWERVSX
|
|
2304
|
+
nk_angulars_packed_f16_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2155
2305
|
#elif NK_TARGET_RVV
|
|
2156
2306
|
nk_angulars_packed_f16_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2157
2307
|
#else
|
|
2158
2308
|
nk_angulars_packed_f16_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2159
2309
|
#endif
|
|
2160
2310
|
}
|
|
2161
|
-
NK_PUBLIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t
|
|
2311
|
+
NK_PUBLIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2162
2312
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2163
2313
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2164
2314
|
#if NK_TARGET_SME
|
|
2165
|
-
nk_angulars_symmetric_f16_sme(vectors,
|
|
2315
|
+
nk_angulars_symmetric_f16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2166
2316
|
#elif NK_TARGET_NEONFHM
|
|
2167
|
-
nk_angulars_symmetric_f16_neonfhm(vectors,
|
|
2168
|
-
|
|
2169
|
-
nk_angulars_symmetric_f16_neonhalf(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
|
|
2317
|
+
nk_angulars_symmetric_f16_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2318
|
+
row_count);
|
|
2170
2319
|
#elif NK_TARGET_NEON
|
|
2171
|
-
nk_angulars_symmetric_f16_neon(vectors,
|
|
2320
|
+
nk_angulars_symmetric_f16_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2172
2321
|
#elif NK_TARGET_SKYLAKE
|
|
2173
|
-
nk_angulars_symmetric_f16_skylake(vectors,
|
|
2322
|
+
nk_angulars_symmetric_f16_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2323
|
+
row_count);
|
|
2174
2324
|
#elif NK_TARGET_HASWELL
|
|
2175
|
-
nk_angulars_symmetric_f16_haswell(vectors,
|
|
2325
|
+
nk_angulars_symmetric_f16_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2326
|
+
row_count);
|
|
2327
|
+
#elif NK_TARGET_POWERVSX
|
|
2328
|
+
nk_angulars_symmetric_f16_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2329
|
+
row_count);
|
|
2176
2330
|
#elif NK_TARGET_RVV
|
|
2177
|
-
nk_angulars_symmetric_f16_rvv(vectors,
|
|
2331
|
+
nk_angulars_symmetric_f16_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2178
2332
|
#else
|
|
2179
|
-
nk_angulars_symmetric_f16_serial(vectors,
|
|
2333
|
+
nk_angulars_symmetric_f16_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2334
|
+
row_count);
|
|
2180
2335
|
#endif
|
|
2181
2336
|
}
|
|
2182
2337
|
NK_PUBLIC void nk_euclideans_packed_f16(nk_f16_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
@@ -2186,40 +2341,45 @@ NK_PUBLIC void nk_euclideans_packed_f16(nk_f16_t const *a, void const *b_packed,
|
|
|
2186
2341
|
nk_euclideans_packed_f16_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2187
2342
|
#elif NK_TARGET_NEONFHM
|
|
2188
2343
|
nk_euclideans_packed_f16_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2189
|
-
#elif NK_TARGET_NEONHALF
|
|
2190
|
-
nk_euclideans_packed_f16_neonhalf(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2191
2344
|
#elif NK_TARGET_NEON
|
|
2192
2345
|
nk_euclideans_packed_f16_neon(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2193
2346
|
#elif NK_TARGET_SKYLAKE
|
|
2194
2347
|
nk_euclideans_packed_f16_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2195
2348
|
#elif NK_TARGET_HASWELL
|
|
2196
2349
|
nk_euclideans_packed_f16_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2350
|
+
#elif NK_TARGET_POWERVSX
|
|
2351
|
+
nk_euclideans_packed_f16_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2197
2352
|
#elif NK_TARGET_RVV
|
|
2198
2353
|
nk_euclideans_packed_f16_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2199
2354
|
#else
|
|
2200
2355
|
nk_euclideans_packed_f16_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2201
2356
|
#endif
|
|
2202
2357
|
}
|
|
2203
|
-
NK_PUBLIC void nk_euclideans_symmetric_f16(nk_f16_t const *vectors, nk_size_t
|
|
2358
|
+
NK_PUBLIC void nk_euclideans_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2204
2359
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2205
2360
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2206
2361
|
#if NK_TARGET_SME
|
|
2207
|
-
nk_euclideans_symmetric_f16_sme(vectors,
|
|
2362
|
+
nk_euclideans_symmetric_f16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2208
2363
|
#elif NK_TARGET_NEONFHM
|
|
2209
|
-
nk_euclideans_symmetric_f16_neonfhm(vectors,
|
|
2210
|
-
|
|
2211
|
-
nk_euclideans_symmetric_f16_neonhalf(vectors, n_vectors, depth, stride, result, result_stride, row_start,
|
|
2212
|
-
row_count);
|
|
2364
|
+
nk_euclideans_symmetric_f16_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2365
|
+
row_count);
|
|
2213
2366
|
#elif NK_TARGET_NEON
|
|
2214
|
-
nk_euclideans_symmetric_f16_neon(vectors,
|
|
2367
|
+
nk_euclideans_symmetric_f16_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2368
|
+
row_count);
|
|
2215
2369
|
#elif NK_TARGET_SKYLAKE
|
|
2216
|
-
nk_euclideans_symmetric_f16_skylake(vectors,
|
|
2370
|
+
nk_euclideans_symmetric_f16_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2371
|
+
row_count);
|
|
2217
2372
|
#elif NK_TARGET_HASWELL
|
|
2218
|
-
nk_euclideans_symmetric_f16_haswell(vectors,
|
|
2373
|
+
nk_euclideans_symmetric_f16_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2374
|
+
row_count);
|
|
2375
|
+
#elif NK_TARGET_POWERVSX
|
|
2376
|
+
nk_euclideans_symmetric_f16_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2377
|
+
row_count);
|
|
2219
2378
|
#elif NK_TARGET_RVV
|
|
2220
|
-
nk_euclideans_symmetric_f16_rvv(vectors,
|
|
2379
|
+
nk_euclideans_symmetric_f16_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2221
2380
|
#else
|
|
2222
|
-
nk_euclideans_symmetric_f16_serial(vectors,
|
|
2381
|
+
nk_euclideans_symmetric_f16_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2382
|
+
row_count);
|
|
2223
2383
|
#endif
|
|
2224
2384
|
}
|
|
2225
2385
|
|
|
@@ -2238,6 +2398,8 @@ NK_PUBLIC void nk_angulars_packed_bf16(nk_bf16_t const *a, void const *b_packed,
|
|
|
2238
2398
|
nk_angulars_packed_bf16_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2239
2399
|
#elif NK_TARGET_HASWELL
|
|
2240
2400
|
nk_angulars_packed_bf16_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2401
|
+
#elif NK_TARGET_POWERVSX
|
|
2402
|
+
nk_angulars_packed_bf16_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2241
2403
|
#elif NK_TARGET_RVV
|
|
2242
2404
|
nk_angulars_packed_bf16_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2243
2405
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2246,30 +2408,37 @@ NK_PUBLIC void nk_angulars_packed_bf16(nk_bf16_t const *a, void const *b_packed,
|
|
|
2246
2408
|
nk_angulars_packed_bf16_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2247
2409
|
#endif
|
|
2248
2410
|
}
|
|
2249
|
-
NK_PUBLIC void nk_angulars_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t
|
|
2411
|
+
NK_PUBLIC void nk_angulars_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2250
2412
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2251
2413
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2252
2414
|
#if NK_TARGET_SME
|
|
2253
|
-
nk_angulars_symmetric_bf16_sme(vectors,
|
|
2415
|
+
nk_angulars_symmetric_bf16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2254
2416
|
#elif NK_TARGET_NEONBFDOT
|
|
2255
|
-
nk_angulars_symmetric_bf16_neonbfdot(vectors,
|
|
2417
|
+
nk_angulars_symmetric_bf16_neonbfdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2256
2418
|
row_count);
|
|
2257
2419
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2258
|
-
nk_angulars_symmetric_bf16_sapphireamx(vectors,
|
|
2420
|
+
nk_angulars_symmetric_bf16_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2259
2421
|
row_count);
|
|
2260
2422
|
#elif NK_TARGET_GENOA
|
|
2261
|
-
nk_angulars_symmetric_bf16_genoa(vectors,
|
|
2423
|
+
nk_angulars_symmetric_bf16_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2424
|
+
row_count);
|
|
2262
2425
|
#elif NK_TARGET_SKYLAKE
|
|
2263
|
-
nk_angulars_symmetric_bf16_skylake(vectors,
|
|
2426
|
+
nk_angulars_symmetric_bf16_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2427
|
+
row_count);
|
|
2264
2428
|
#elif NK_TARGET_HASWELL
|
|
2265
|
-
nk_angulars_symmetric_bf16_haswell(vectors,
|
|
2429
|
+
nk_angulars_symmetric_bf16_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2430
|
+
row_count);
|
|
2431
|
+
#elif NK_TARGET_POWERVSX
|
|
2432
|
+
nk_angulars_symmetric_bf16_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2433
|
+
row_count);
|
|
2266
2434
|
#elif NK_TARGET_RVV
|
|
2267
|
-
nk_angulars_symmetric_bf16_rvv(vectors,
|
|
2435
|
+
nk_angulars_symmetric_bf16_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2268
2436
|
#elif NK_TARGET_V128RELAXED
|
|
2269
|
-
nk_angulars_symmetric_bf16_v128relaxed(vectors,
|
|
2437
|
+
nk_angulars_symmetric_bf16_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2270
2438
|
row_count);
|
|
2271
2439
|
#else
|
|
2272
|
-
nk_angulars_symmetric_bf16_serial(vectors,
|
|
2440
|
+
nk_angulars_symmetric_bf16_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2441
|
+
row_count);
|
|
2273
2442
|
#endif
|
|
2274
2443
|
}
|
|
2275
2444
|
NK_PUBLIC void nk_euclideans_packed_bf16(nk_bf16_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
@@ -2287,6 +2456,8 @@ NK_PUBLIC void nk_euclideans_packed_bf16(nk_bf16_t const *a, void const *b_packe
|
|
|
2287
2456
|
nk_euclideans_packed_bf16_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2288
2457
|
#elif NK_TARGET_HASWELL
|
|
2289
2458
|
nk_euclideans_packed_bf16_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2459
|
+
#elif NK_TARGET_POWERVSX
|
|
2460
|
+
nk_euclideans_packed_bf16_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2290
2461
|
#elif NK_TARGET_RVV
|
|
2291
2462
|
nk_euclideans_packed_bf16_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2292
2463
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2295,32 +2466,39 @@ NK_PUBLIC void nk_euclideans_packed_bf16(nk_bf16_t const *a, void const *b_packe
|
|
|
2295
2466
|
nk_euclideans_packed_bf16_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2296
2467
|
#endif
|
|
2297
2468
|
}
|
|
2298
|
-
NK_PUBLIC void nk_euclideans_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t
|
|
2469
|
+
NK_PUBLIC void nk_euclideans_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2299
2470
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2300
2471
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2301
2472
|
#if NK_TARGET_SME
|
|
2302
|
-
nk_euclideans_symmetric_bf16_sme(vectors,
|
|
2473
|
+
nk_euclideans_symmetric_bf16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2474
|
+
row_count);
|
|
2303
2475
|
#elif NK_TARGET_NEONBFDOT
|
|
2304
|
-
nk_euclideans_symmetric_bf16_neonbfdot(vectors,
|
|
2476
|
+
nk_euclideans_symmetric_bf16_neonbfdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2305
2477
|
row_count);
|
|
2306
2478
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2307
|
-
nk_euclideans_symmetric_bf16_sapphireamx(vectors,
|
|
2479
|
+
nk_euclideans_symmetric_bf16_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2308
2480
|
row_count);
|
|
2309
2481
|
#elif NK_TARGET_GENOA
|
|
2310
|
-
nk_euclideans_symmetric_bf16_genoa(vectors,
|
|
2482
|
+
nk_euclideans_symmetric_bf16_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2483
|
+
row_count);
|
|
2311
2484
|
#elif NK_TARGET_SKYLAKE
|
|
2312
|
-
nk_euclideans_symmetric_bf16_skylake(vectors,
|
|
2485
|
+
nk_euclideans_symmetric_bf16_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2313
2486
|
row_count);
|
|
2314
2487
|
#elif NK_TARGET_HASWELL
|
|
2315
|
-
nk_euclideans_symmetric_bf16_haswell(vectors,
|
|
2488
|
+
nk_euclideans_symmetric_bf16_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2316
2489
|
row_count);
|
|
2490
|
+
#elif NK_TARGET_POWERVSX
|
|
2491
|
+
nk_euclideans_symmetric_bf16_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2492
|
+
row_count);
|
|
2317
2493
|
#elif NK_TARGET_RVV
|
|
2318
|
-
nk_euclideans_symmetric_bf16_rvv(vectors,
|
|
2494
|
+
nk_euclideans_symmetric_bf16_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2495
|
+
row_count);
|
|
2319
2496
|
#elif NK_TARGET_V128RELAXED
|
|
2320
|
-
nk_euclideans_symmetric_bf16_v128relaxed(vectors,
|
|
2497
|
+
nk_euclideans_symmetric_bf16_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2321
2498
|
row_count);
|
|
2322
2499
|
#else
|
|
2323
|
-
nk_euclideans_symmetric_bf16_serial(vectors,
|
|
2500
|
+
nk_euclideans_symmetric_bf16_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2501
|
+
row_count);
|
|
2324
2502
|
#endif
|
|
2325
2503
|
}
|
|
2326
2504
|
|
|
@@ -2329,10 +2507,14 @@ NK_PUBLIC void nk_angulars_packed_e4m3(nk_e4m3_t const *a, void const *b_packed,
|
|
|
2329
2507
|
nk_size_t r_stride_in_bytes) {
|
|
2330
2508
|
#if NK_TARGET_SME
|
|
2331
2509
|
nk_angulars_packed_e4m3_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2510
|
+
#elif NK_TARGET_NEONFP8
|
|
2511
|
+
nk_angulars_packed_e4m3_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2332
2512
|
#elif NK_TARGET_NEONFHM
|
|
2333
2513
|
nk_angulars_packed_e4m3_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2334
2514
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2335
2515
|
nk_angulars_packed_e4m3_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2516
|
+
#elif NK_TARGET_DIAMOND
|
|
2517
|
+
nk_angulars_packed_e4m3_diamond(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2336
2518
|
#elif NK_TARGET_GENOA
|
|
2337
2519
|
nk_angulars_packed_e4m3_genoa(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2338
2520
|
#elif NK_TARGET_SKYLAKE
|
|
@@ -2347,29 +2529,40 @@ NK_PUBLIC void nk_angulars_packed_e4m3(nk_e4m3_t const *a, void const *b_packed,
|
|
|
2347
2529
|
nk_angulars_packed_e4m3_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2348
2530
|
#endif
|
|
2349
2531
|
}
|
|
2350
|
-
NK_PUBLIC void nk_angulars_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t
|
|
2532
|
+
NK_PUBLIC void nk_angulars_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2351
2533
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2352
2534
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2353
2535
|
#if NK_TARGET_SME
|
|
2354
|
-
nk_angulars_symmetric_e4m3_sme(vectors,
|
|
2536
|
+
nk_angulars_symmetric_e4m3_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2537
|
+
#elif NK_TARGET_NEONFP8
|
|
2538
|
+
nk_angulars_symmetric_e4m3_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2539
|
+
row_count);
|
|
2355
2540
|
#elif NK_TARGET_NEONFHM
|
|
2356
|
-
nk_angulars_symmetric_e4m3_neonfhm(vectors,
|
|
2541
|
+
nk_angulars_symmetric_e4m3_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2542
|
+
row_count);
|
|
2357
2543
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2358
|
-
nk_angulars_symmetric_e4m3_sapphireamx(vectors,
|
|
2544
|
+
nk_angulars_symmetric_e4m3_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2359
2545
|
row_count);
|
|
2546
|
+
#elif NK_TARGET_DIAMOND
|
|
2547
|
+
nk_angulars_symmetric_e4m3_diamond(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2548
|
+
row_count);
|
|
2360
2549
|
#elif NK_TARGET_GENOA
|
|
2361
|
-
nk_angulars_symmetric_e4m3_genoa(vectors,
|
|
2550
|
+
nk_angulars_symmetric_e4m3_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2551
|
+
row_count);
|
|
2362
2552
|
#elif NK_TARGET_SKYLAKE
|
|
2363
|
-
nk_angulars_symmetric_e4m3_skylake(vectors,
|
|
2553
|
+
nk_angulars_symmetric_e4m3_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2554
|
+
row_count);
|
|
2364
2555
|
#elif NK_TARGET_HASWELL
|
|
2365
|
-
nk_angulars_symmetric_e4m3_haswell(vectors,
|
|
2556
|
+
nk_angulars_symmetric_e4m3_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2557
|
+
row_count);
|
|
2366
2558
|
#elif NK_TARGET_RVV
|
|
2367
|
-
nk_angulars_symmetric_e4m3_rvv(vectors,
|
|
2559
|
+
nk_angulars_symmetric_e4m3_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2368
2560
|
#elif NK_TARGET_V128RELAXED
|
|
2369
|
-
nk_angulars_symmetric_e4m3_v128relaxed(vectors,
|
|
2561
|
+
nk_angulars_symmetric_e4m3_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2370
2562
|
row_count);
|
|
2371
2563
|
#else
|
|
2372
|
-
nk_angulars_symmetric_e4m3_serial(vectors,
|
|
2564
|
+
nk_angulars_symmetric_e4m3_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2565
|
+
row_count);
|
|
2373
2566
|
#endif
|
|
2374
2567
|
}
|
|
2375
2568
|
NK_PUBLIC void nk_euclideans_packed_e4m3(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
@@ -2377,10 +2570,14 @@ NK_PUBLIC void nk_euclideans_packed_e4m3(nk_e4m3_t const *a, void const *b_packe
|
|
|
2377
2570
|
nk_size_t r_stride_in_bytes) {
|
|
2378
2571
|
#if NK_TARGET_SME
|
|
2379
2572
|
nk_euclideans_packed_e4m3_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2573
|
+
#elif NK_TARGET_NEONFP8
|
|
2574
|
+
nk_euclideans_packed_e4m3_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2380
2575
|
#elif NK_TARGET_NEONFHM
|
|
2381
2576
|
nk_euclideans_packed_e4m3_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2382
2577
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2383
2578
|
nk_euclideans_packed_e4m3_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2579
|
+
#elif NK_TARGET_DIAMOND
|
|
2580
|
+
nk_euclideans_packed_e4m3_diamond(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2384
2581
|
#elif NK_TARGET_GENOA
|
|
2385
2582
|
nk_euclideans_packed_e4m3_genoa(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2386
2583
|
#elif NK_TARGET_SKYLAKE
|
|
@@ -2395,32 +2592,42 @@ NK_PUBLIC void nk_euclideans_packed_e4m3(nk_e4m3_t const *a, void const *b_packe
|
|
|
2395
2592
|
nk_euclideans_packed_e4m3_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2396
2593
|
#endif
|
|
2397
2594
|
}
|
|
2398
|
-
NK_PUBLIC void nk_euclideans_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t
|
|
2595
|
+
NK_PUBLIC void nk_euclideans_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2399
2596
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2400
2597
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2401
2598
|
#if NK_TARGET_SME
|
|
2402
|
-
nk_euclideans_symmetric_e4m3_sme(vectors,
|
|
2599
|
+
nk_euclideans_symmetric_e4m3_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2600
|
+
row_count);
|
|
2601
|
+
#elif NK_TARGET_NEONFP8
|
|
2602
|
+
nk_euclideans_symmetric_e4m3_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2603
|
+
row_count);
|
|
2403
2604
|
#elif NK_TARGET_NEONFHM
|
|
2404
|
-
nk_euclideans_symmetric_e4m3_neonfhm(vectors,
|
|
2605
|
+
nk_euclideans_symmetric_e4m3_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2405
2606
|
row_count);
|
|
2406
2607
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2407
|
-
nk_euclideans_symmetric_e4m3_sapphireamx(vectors,
|
|
2608
|
+
nk_euclideans_symmetric_e4m3_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2408
2609
|
row_count);
|
|
2610
|
+
#elif NK_TARGET_DIAMOND
|
|
2611
|
+
nk_euclideans_symmetric_e4m3_diamond(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2612
|
+
row_count);
|
|
2409
2613
|
#elif NK_TARGET_GENOA
|
|
2410
|
-
nk_euclideans_symmetric_e4m3_genoa(vectors,
|
|
2614
|
+
nk_euclideans_symmetric_e4m3_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2615
|
+
row_count);
|
|
2411
2616
|
#elif NK_TARGET_SKYLAKE
|
|
2412
|
-
nk_euclideans_symmetric_e4m3_skylake(vectors,
|
|
2617
|
+
nk_euclideans_symmetric_e4m3_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2413
2618
|
row_count);
|
|
2414
2619
|
#elif NK_TARGET_HASWELL
|
|
2415
|
-
nk_euclideans_symmetric_e4m3_haswell(vectors,
|
|
2620
|
+
nk_euclideans_symmetric_e4m3_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2416
2621
|
row_count);
|
|
2417
2622
|
#elif NK_TARGET_RVV
|
|
2418
|
-
nk_euclideans_symmetric_e4m3_rvv(vectors,
|
|
2623
|
+
nk_euclideans_symmetric_e4m3_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2624
|
+
row_count);
|
|
2419
2625
|
#elif NK_TARGET_V128RELAXED
|
|
2420
|
-
nk_euclideans_symmetric_e4m3_v128relaxed(vectors,
|
|
2626
|
+
nk_euclideans_symmetric_e4m3_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2421
2627
|
row_count);
|
|
2422
2628
|
#else
|
|
2423
|
-
nk_euclideans_symmetric_e4m3_serial(vectors,
|
|
2629
|
+
nk_euclideans_symmetric_e4m3_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2630
|
+
row_count);
|
|
2424
2631
|
#endif
|
|
2425
2632
|
}
|
|
2426
2633
|
|
|
@@ -2429,10 +2636,14 @@ NK_PUBLIC void nk_angulars_packed_e5m2(nk_e5m2_t const *a, void const *b_packed,
|
|
|
2429
2636
|
nk_size_t r_stride_in_bytes) {
|
|
2430
2637
|
#if NK_TARGET_SME
|
|
2431
2638
|
nk_angulars_packed_e5m2_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2639
|
+
#elif NK_TARGET_NEONFP8
|
|
2640
|
+
nk_angulars_packed_e5m2_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2432
2641
|
#elif NK_TARGET_NEONFHM
|
|
2433
2642
|
nk_angulars_packed_e5m2_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2434
2643
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2435
2644
|
nk_angulars_packed_e5m2_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2645
|
+
#elif NK_TARGET_DIAMOND
|
|
2646
|
+
nk_angulars_packed_e5m2_diamond(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2436
2647
|
#elif NK_TARGET_GENOA
|
|
2437
2648
|
nk_angulars_packed_e5m2_genoa(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2438
2649
|
#elif NK_TARGET_SKYLAKE
|
|
@@ -2447,29 +2658,40 @@ NK_PUBLIC void nk_angulars_packed_e5m2(nk_e5m2_t const *a, void const *b_packed,
|
|
|
2447
2658
|
nk_angulars_packed_e5m2_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2448
2659
|
#endif
|
|
2449
2660
|
}
|
|
2450
|
-
NK_PUBLIC void nk_angulars_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t
|
|
2661
|
+
NK_PUBLIC void nk_angulars_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2451
2662
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2452
2663
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2453
2664
|
#if NK_TARGET_SME
|
|
2454
|
-
nk_angulars_symmetric_e5m2_sme(vectors,
|
|
2665
|
+
nk_angulars_symmetric_e5m2_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2666
|
+
#elif NK_TARGET_NEONFP8
|
|
2667
|
+
nk_angulars_symmetric_e5m2_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2668
|
+
row_count);
|
|
2455
2669
|
#elif NK_TARGET_NEONFHM
|
|
2456
|
-
nk_angulars_symmetric_e5m2_neonfhm(vectors,
|
|
2670
|
+
nk_angulars_symmetric_e5m2_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2671
|
+
row_count);
|
|
2457
2672
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2458
|
-
nk_angulars_symmetric_e5m2_sapphireamx(vectors,
|
|
2673
|
+
nk_angulars_symmetric_e5m2_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2459
2674
|
row_count);
|
|
2675
|
+
#elif NK_TARGET_DIAMOND
|
|
2676
|
+
nk_angulars_symmetric_e5m2_diamond(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2677
|
+
row_count);
|
|
2460
2678
|
#elif NK_TARGET_GENOA
|
|
2461
|
-
nk_angulars_symmetric_e5m2_genoa(vectors,
|
|
2679
|
+
nk_angulars_symmetric_e5m2_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2680
|
+
row_count);
|
|
2462
2681
|
#elif NK_TARGET_SKYLAKE
|
|
2463
|
-
nk_angulars_symmetric_e5m2_skylake(vectors,
|
|
2682
|
+
nk_angulars_symmetric_e5m2_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2683
|
+
row_count);
|
|
2464
2684
|
#elif NK_TARGET_HASWELL
|
|
2465
|
-
nk_angulars_symmetric_e5m2_haswell(vectors,
|
|
2685
|
+
nk_angulars_symmetric_e5m2_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2686
|
+
row_count);
|
|
2466
2687
|
#elif NK_TARGET_RVV
|
|
2467
|
-
nk_angulars_symmetric_e5m2_rvv(vectors,
|
|
2688
|
+
nk_angulars_symmetric_e5m2_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2468
2689
|
#elif NK_TARGET_V128RELAXED
|
|
2469
|
-
nk_angulars_symmetric_e5m2_v128relaxed(vectors,
|
|
2690
|
+
nk_angulars_symmetric_e5m2_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2470
2691
|
row_count);
|
|
2471
2692
|
#else
|
|
2472
|
-
nk_angulars_symmetric_e5m2_serial(vectors,
|
|
2693
|
+
nk_angulars_symmetric_e5m2_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2694
|
+
row_count);
|
|
2473
2695
|
#endif
|
|
2474
2696
|
}
|
|
2475
2697
|
NK_PUBLIC void nk_euclideans_packed_e5m2(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
@@ -2477,10 +2699,14 @@ NK_PUBLIC void nk_euclideans_packed_e5m2(nk_e5m2_t const *a, void const *b_packe
|
|
|
2477
2699
|
nk_size_t r_stride_in_bytes) {
|
|
2478
2700
|
#if NK_TARGET_SME
|
|
2479
2701
|
nk_euclideans_packed_e5m2_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2702
|
+
#elif NK_TARGET_NEONFP8
|
|
2703
|
+
nk_euclideans_packed_e5m2_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2480
2704
|
#elif NK_TARGET_NEONFHM
|
|
2481
2705
|
nk_euclideans_packed_e5m2_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2482
2706
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2483
2707
|
nk_euclideans_packed_e5m2_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2708
|
+
#elif NK_TARGET_DIAMOND
|
|
2709
|
+
nk_euclideans_packed_e5m2_diamond(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2484
2710
|
#elif NK_TARGET_GENOA
|
|
2485
2711
|
nk_euclideans_packed_e5m2_genoa(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2486
2712
|
#elif NK_TARGET_SKYLAKE
|
|
@@ -2495,32 +2721,42 @@ NK_PUBLIC void nk_euclideans_packed_e5m2(nk_e5m2_t const *a, void const *b_packe
|
|
|
2495
2721
|
nk_euclideans_packed_e5m2_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2496
2722
|
#endif
|
|
2497
2723
|
}
|
|
2498
|
-
NK_PUBLIC void nk_euclideans_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t
|
|
2724
|
+
NK_PUBLIC void nk_euclideans_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2499
2725
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2500
2726
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2501
2727
|
#if NK_TARGET_SME
|
|
2502
|
-
nk_euclideans_symmetric_e5m2_sme(vectors,
|
|
2728
|
+
nk_euclideans_symmetric_e5m2_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2729
|
+
row_count);
|
|
2730
|
+
#elif NK_TARGET_NEONFP8
|
|
2731
|
+
nk_euclideans_symmetric_e5m2_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2732
|
+
row_count);
|
|
2503
2733
|
#elif NK_TARGET_NEONFHM
|
|
2504
|
-
nk_euclideans_symmetric_e5m2_neonfhm(vectors,
|
|
2734
|
+
nk_euclideans_symmetric_e5m2_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2505
2735
|
row_count);
|
|
2506
2736
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2507
|
-
nk_euclideans_symmetric_e5m2_sapphireamx(vectors,
|
|
2737
|
+
nk_euclideans_symmetric_e5m2_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2508
2738
|
row_count);
|
|
2739
|
+
#elif NK_TARGET_DIAMOND
|
|
2740
|
+
nk_euclideans_symmetric_e5m2_diamond(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2741
|
+
row_count);
|
|
2509
2742
|
#elif NK_TARGET_GENOA
|
|
2510
|
-
nk_euclideans_symmetric_e5m2_genoa(vectors,
|
|
2743
|
+
nk_euclideans_symmetric_e5m2_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2744
|
+
row_count);
|
|
2511
2745
|
#elif NK_TARGET_SKYLAKE
|
|
2512
|
-
nk_euclideans_symmetric_e5m2_skylake(vectors,
|
|
2746
|
+
nk_euclideans_symmetric_e5m2_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2513
2747
|
row_count);
|
|
2514
2748
|
#elif NK_TARGET_HASWELL
|
|
2515
|
-
nk_euclideans_symmetric_e5m2_haswell(vectors,
|
|
2749
|
+
nk_euclideans_symmetric_e5m2_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2516
2750
|
row_count);
|
|
2517
2751
|
#elif NK_TARGET_RVV
|
|
2518
|
-
nk_euclideans_symmetric_e5m2_rvv(vectors,
|
|
2752
|
+
nk_euclideans_symmetric_e5m2_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2753
|
+
row_count);
|
|
2519
2754
|
#elif NK_TARGET_V128RELAXED
|
|
2520
|
-
nk_euclideans_symmetric_e5m2_v128relaxed(vectors,
|
|
2755
|
+
nk_euclideans_symmetric_e5m2_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2521
2756
|
row_count);
|
|
2522
2757
|
#else
|
|
2523
|
-
nk_euclideans_symmetric_e5m2_serial(vectors,
|
|
2758
|
+
nk_euclideans_symmetric_e5m2_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2759
|
+
row_count);
|
|
2524
2760
|
#endif
|
|
2525
2761
|
}
|
|
2526
2762
|
|
|
@@ -2531,6 +2767,8 @@ NK_PUBLIC void nk_angulars_packed_e2m3(nk_e2m3_t const *a, void const *b_packed,
|
|
|
2531
2767
|
nk_angulars_packed_e2m3_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2532
2768
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2533
2769
|
nk_angulars_packed_e2m3_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2770
|
+
#elif NK_TARGET_NEONFP8
|
|
2771
|
+
nk_angulars_packed_e2m3_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2534
2772
|
#elif NK_TARGET_SKYLAKE
|
|
2535
2773
|
nk_angulars_packed_e2m3_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2536
2774
|
#elif NK_TARGET_SIERRA
|
|
@@ -2547,29 +2785,37 @@ NK_PUBLIC void nk_angulars_packed_e2m3(nk_e2m3_t const *a, void const *b_packed,
|
|
|
2547
2785
|
nk_angulars_packed_e2m3_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2548
2786
|
#endif
|
|
2549
2787
|
}
|
|
2550
|
-
NK_PUBLIC void nk_angulars_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t
|
|
2788
|
+
NK_PUBLIC void nk_angulars_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2551
2789
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2552
2790
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2553
2791
|
#if NK_TARGET_SME
|
|
2554
|
-
nk_angulars_symmetric_e2m3_sme(vectors,
|
|
2792
|
+
nk_angulars_symmetric_e2m3_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2555
2793
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2556
|
-
nk_angulars_symmetric_e2m3_sapphireamx(vectors,
|
|
2794
|
+
nk_angulars_symmetric_e2m3_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2557
2795
|
row_count);
|
|
2796
|
+
#elif NK_TARGET_NEONFP8
|
|
2797
|
+
nk_angulars_symmetric_e2m3_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2798
|
+
row_count);
|
|
2558
2799
|
#elif NK_TARGET_SKYLAKE
|
|
2559
|
-
nk_angulars_symmetric_e2m3_skylake(vectors,
|
|
2800
|
+
nk_angulars_symmetric_e2m3_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2801
|
+
row_count);
|
|
2560
2802
|
#elif NK_TARGET_SIERRA
|
|
2561
|
-
nk_angulars_symmetric_e2m3_sierra(vectors,
|
|
2803
|
+
nk_angulars_symmetric_e2m3_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2804
|
+
row_count);
|
|
2562
2805
|
#elif NK_TARGET_ALDER
|
|
2563
|
-
nk_angulars_symmetric_e2m3_alder(vectors,
|
|
2806
|
+
nk_angulars_symmetric_e2m3_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2807
|
+
row_count);
|
|
2564
2808
|
#elif NK_TARGET_HASWELL
|
|
2565
|
-
nk_angulars_symmetric_e2m3_haswell(vectors,
|
|
2809
|
+
nk_angulars_symmetric_e2m3_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2810
|
+
row_count);
|
|
2566
2811
|
#elif NK_TARGET_RVV
|
|
2567
|
-
nk_angulars_symmetric_e2m3_rvv(vectors,
|
|
2812
|
+
nk_angulars_symmetric_e2m3_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2568
2813
|
#elif NK_TARGET_V128RELAXED
|
|
2569
|
-
nk_angulars_symmetric_e2m3_v128relaxed(vectors,
|
|
2814
|
+
nk_angulars_symmetric_e2m3_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2570
2815
|
row_count);
|
|
2571
2816
|
#else
|
|
2572
|
-
nk_angulars_symmetric_e2m3_serial(vectors,
|
|
2817
|
+
nk_angulars_symmetric_e2m3_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2818
|
+
row_count);
|
|
2573
2819
|
#endif
|
|
2574
2820
|
}
|
|
2575
2821
|
NK_PUBLIC void nk_euclideans_packed_e2m3(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
@@ -2579,6 +2825,8 @@ NK_PUBLIC void nk_euclideans_packed_e2m3(nk_e2m3_t const *a, void const *b_packe
|
|
|
2579
2825
|
nk_euclideans_packed_e2m3_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2580
2826
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2581
2827
|
nk_euclideans_packed_e2m3_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2828
|
+
#elif NK_TARGET_NEONFP8
|
|
2829
|
+
nk_euclideans_packed_e2m3_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2582
2830
|
#elif NK_TARGET_SKYLAKE
|
|
2583
2831
|
nk_euclideans_packed_e2m3_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2584
2832
|
#elif NK_TARGET_SIERRA
|
|
@@ -2595,31 +2843,39 @@ NK_PUBLIC void nk_euclideans_packed_e2m3(nk_e2m3_t const *a, void const *b_packe
|
|
|
2595
2843
|
nk_euclideans_packed_e2m3_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2596
2844
|
#endif
|
|
2597
2845
|
}
|
|
2598
|
-
NK_PUBLIC void nk_euclideans_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t
|
|
2846
|
+
NK_PUBLIC void nk_euclideans_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2599
2847
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2600
2848
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2601
2849
|
#if NK_TARGET_SME
|
|
2602
|
-
nk_euclideans_symmetric_e2m3_sme(vectors,
|
|
2850
|
+
nk_euclideans_symmetric_e2m3_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2851
|
+
row_count);
|
|
2603
2852
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2604
|
-
nk_euclideans_symmetric_e2m3_sapphireamx(vectors,
|
|
2853
|
+
nk_euclideans_symmetric_e2m3_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2605
2854
|
row_count);
|
|
2855
|
+
#elif NK_TARGET_NEONFP8
|
|
2856
|
+
nk_euclideans_symmetric_e2m3_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2857
|
+
row_count);
|
|
2606
2858
|
#elif NK_TARGET_SKYLAKE
|
|
2607
|
-
nk_euclideans_symmetric_e2m3_skylake(vectors,
|
|
2859
|
+
nk_euclideans_symmetric_e2m3_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2608
2860
|
row_count);
|
|
2609
2861
|
#elif NK_TARGET_SIERRA
|
|
2610
|
-
nk_euclideans_symmetric_e2m3_sierra(vectors,
|
|
2862
|
+
nk_euclideans_symmetric_e2m3_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2863
|
+
row_count);
|
|
2611
2864
|
#elif NK_TARGET_ALDER
|
|
2612
|
-
nk_euclideans_symmetric_e2m3_alder(vectors,
|
|
2865
|
+
nk_euclideans_symmetric_e2m3_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2866
|
+
row_count);
|
|
2613
2867
|
#elif NK_TARGET_HASWELL
|
|
2614
|
-
nk_euclideans_symmetric_e2m3_haswell(vectors,
|
|
2868
|
+
nk_euclideans_symmetric_e2m3_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2615
2869
|
row_count);
|
|
2616
2870
|
#elif NK_TARGET_RVV
|
|
2617
|
-
nk_euclideans_symmetric_e2m3_rvv(vectors,
|
|
2871
|
+
nk_euclideans_symmetric_e2m3_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2872
|
+
row_count);
|
|
2618
2873
|
#elif NK_TARGET_V128RELAXED
|
|
2619
|
-
nk_euclideans_symmetric_e2m3_v128relaxed(vectors,
|
|
2874
|
+
nk_euclideans_symmetric_e2m3_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2620
2875
|
row_count);
|
|
2621
2876
|
#else
|
|
2622
|
-
nk_euclideans_symmetric_e2m3_serial(vectors,
|
|
2877
|
+
nk_euclideans_symmetric_e2m3_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2878
|
+
row_count);
|
|
2623
2879
|
#endif
|
|
2624
2880
|
}
|
|
2625
2881
|
|
|
@@ -2630,6 +2886,8 @@ NK_PUBLIC void nk_angulars_packed_e3m2(nk_e3m2_t const *a, void const *b_packed,
|
|
|
2630
2886
|
nk_angulars_packed_e3m2_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2631
2887
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2632
2888
|
nk_angulars_packed_e3m2_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2889
|
+
#elif NK_TARGET_NEONFP8
|
|
2890
|
+
nk_angulars_packed_e3m2_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2633
2891
|
#elif NK_TARGET_SKYLAKE
|
|
2634
2892
|
nk_angulars_packed_e3m2_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2635
2893
|
#elif NK_TARGET_HASWELL
|
|
@@ -2640,22 +2898,28 @@ NK_PUBLIC void nk_angulars_packed_e3m2(nk_e3m2_t const *a, void const *b_packed,
|
|
|
2640
2898
|
nk_angulars_packed_e3m2_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2641
2899
|
#endif
|
|
2642
2900
|
}
|
|
2643
|
-
NK_PUBLIC void nk_angulars_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t
|
|
2901
|
+
NK_PUBLIC void nk_angulars_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2644
2902
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2645
2903
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2646
2904
|
#if NK_TARGET_SME
|
|
2647
|
-
nk_angulars_symmetric_e3m2_sme(vectors,
|
|
2905
|
+
nk_angulars_symmetric_e3m2_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2648
2906
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2649
|
-
nk_angulars_symmetric_e3m2_sapphireamx(vectors,
|
|
2907
|
+
nk_angulars_symmetric_e3m2_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2650
2908
|
row_count);
|
|
2909
|
+
#elif NK_TARGET_NEONFP8
|
|
2910
|
+
nk_angulars_symmetric_e3m2_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2911
|
+
row_count);
|
|
2651
2912
|
#elif NK_TARGET_SKYLAKE
|
|
2652
|
-
nk_angulars_symmetric_e3m2_skylake(vectors,
|
|
2913
|
+
nk_angulars_symmetric_e3m2_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2914
|
+
row_count);
|
|
2653
2915
|
#elif NK_TARGET_HASWELL
|
|
2654
|
-
nk_angulars_symmetric_e3m2_haswell(vectors,
|
|
2916
|
+
nk_angulars_symmetric_e3m2_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2917
|
+
row_count);
|
|
2655
2918
|
#elif NK_TARGET_RVV
|
|
2656
|
-
nk_angulars_symmetric_e3m2_rvv(vectors,
|
|
2919
|
+
nk_angulars_symmetric_e3m2_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2657
2920
|
#else
|
|
2658
|
-
nk_angulars_symmetric_e3m2_serial(vectors,
|
|
2921
|
+
nk_angulars_symmetric_e3m2_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2922
|
+
row_count);
|
|
2659
2923
|
#endif
|
|
2660
2924
|
}
|
|
2661
2925
|
NK_PUBLIC void nk_euclideans_packed_e3m2(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
@@ -2665,6 +2929,8 @@ NK_PUBLIC void nk_euclideans_packed_e3m2(nk_e3m2_t const *a, void const *b_packe
|
|
|
2665
2929
|
nk_euclideans_packed_e3m2_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2666
2930
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2667
2931
|
nk_euclideans_packed_e3m2_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2932
|
+
#elif NK_TARGET_NEONFP8
|
|
2933
|
+
nk_euclideans_packed_e3m2_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2668
2934
|
#elif NK_TARGET_SKYLAKE
|
|
2669
2935
|
nk_euclideans_packed_e3m2_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2670
2936
|
#elif NK_TARGET_HASWELL
|
|
@@ -2675,24 +2941,30 @@ NK_PUBLIC void nk_euclideans_packed_e3m2(nk_e3m2_t const *a, void const *b_packe
|
|
|
2675
2941
|
nk_euclideans_packed_e3m2_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2676
2942
|
#endif
|
|
2677
2943
|
}
|
|
2678
|
-
NK_PUBLIC void nk_euclideans_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t
|
|
2944
|
+
NK_PUBLIC void nk_euclideans_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2679
2945
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2680
2946
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2681
2947
|
#if NK_TARGET_SME
|
|
2682
|
-
nk_euclideans_symmetric_e3m2_sme(vectors,
|
|
2948
|
+
nk_euclideans_symmetric_e3m2_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2949
|
+
row_count);
|
|
2683
2950
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2684
|
-
nk_euclideans_symmetric_e3m2_sapphireamx(vectors,
|
|
2951
|
+
nk_euclideans_symmetric_e3m2_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2685
2952
|
row_count);
|
|
2953
|
+
#elif NK_TARGET_NEONFP8
|
|
2954
|
+
nk_euclideans_symmetric_e3m2_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2955
|
+
row_count);
|
|
2686
2956
|
#elif NK_TARGET_SKYLAKE
|
|
2687
|
-
nk_euclideans_symmetric_e3m2_skylake(vectors,
|
|
2957
|
+
nk_euclideans_symmetric_e3m2_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2688
2958
|
row_count);
|
|
2689
2959
|
#elif NK_TARGET_HASWELL
|
|
2690
|
-
nk_euclideans_symmetric_e3m2_haswell(vectors,
|
|
2960
|
+
nk_euclideans_symmetric_e3m2_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2691
2961
|
row_count);
|
|
2692
2962
|
#elif NK_TARGET_RVV
|
|
2693
|
-
nk_euclideans_symmetric_e3m2_rvv(vectors,
|
|
2963
|
+
nk_euclideans_symmetric_e3m2_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2964
|
+
row_count);
|
|
2694
2965
|
#else
|
|
2695
|
-
nk_euclideans_symmetric_e3m2_serial(vectors,
|
|
2966
|
+
nk_euclideans_symmetric_e3m2_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2967
|
+
row_count);
|
|
2696
2968
|
#endif
|
|
2697
2969
|
}
|
|
2698
2970
|
|
|
@@ -2713,6 +2985,8 @@ NK_PUBLIC void nk_angulars_packed_i8(nk_i8_t const *a, void const *b_packed, nk_
|
|
|
2713
2985
|
nk_angulars_packed_i8_alder(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2714
2986
|
#elif NK_TARGET_HASWELL
|
|
2715
2987
|
nk_angulars_packed_i8_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2988
|
+
#elif NK_TARGET_POWERVSX
|
|
2989
|
+
nk_angulars_packed_i8_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2716
2990
|
#elif NK_TARGET_RVV
|
|
2717
2991
|
nk_angulars_packed_i8_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2718
2992
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2721,31 +2995,37 @@ NK_PUBLIC void nk_angulars_packed_i8(nk_i8_t const *a, void const *b_packed, nk_
|
|
|
2721
2995
|
nk_angulars_packed_i8_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2722
2996
|
#endif
|
|
2723
2997
|
}
|
|
2724
|
-
NK_PUBLIC void nk_angulars_symmetric_i8(nk_i8_t const *vectors, nk_size_t
|
|
2725
|
-
nk_f32_t *result, nk_size_t result_stride,
|
|
2726
|
-
nk_size_t row_count) {
|
|
2998
|
+
NK_PUBLIC void nk_angulars_symmetric_i8(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2999
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
3000
|
+
nk_size_t row_start, nk_size_t row_count) {
|
|
2727
3001
|
#if NK_TARGET_SME
|
|
2728
|
-
nk_angulars_symmetric_i8_sme(vectors,
|
|
3002
|
+
nk_angulars_symmetric_i8_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2729
3003
|
#elif NK_TARGET_NEONSDOT
|
|
2730
|
-
nk_angulars_symmetric_i8_neonsdot(vectors,
|
|
3004
|
+
nk_angulars_symmetric_i8_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3005
|
+
row_count);
|
|
2731
3006
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2732
|
-
nk_angulars_symmetric_i8_sapphireamx(vectors,
|
|
3007
|
+
nk_angulars_symmetric_i8_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2733
3008
|
row_count);
|
|
2734
3009
|
#elif NK_TARGET_ICELAKE
|
|
2735
|
-
nk_angulars_symmetric_i8_icelake(vectors,
|
|
3010
|
+
nk_angulars_symmetric_i8_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3011
|
+
row_count);
|
|
2736
3012
|
#elif NK_TARGET_SIERRA
|
|
2737
|
-
nk_angulars_symmetric_i8_sierra(vectors,
|
|
3013
|
+
nk_angulars_symmetric_i8_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2738
3014
|
#elif NK_TARGET_ALDER
|
|
2739
|
-
nk_angulars_symmetric_i8_alder(vectors,
|
|
3015
|
+
nk_angulars_symmetric_i8_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2740
3016
|
#elif NK_TARGET_HASWELL
|
|
2741
|
-
nk_angulars_symmetric_i8_haswell(vectors,
|
|
3017
|
+
nk_angulars_symmetric_i8_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3018
|
+
row_count);
|
|
3019
|
+
#elif NK_TARGET_POWERVSX
|
|
3020
|
+
nk_angulars_symmetric_i8_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3021
|
+
row_count);
|
|
2742
3022
|
#elif NK_TARGET_RVV
|
|
2743
|
-
nk_angulars_symmetric_i8_rvv(vectors,
|
|
3023
|
+
nk_angulars_symmetric_i8_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2744
3024
|
#elif NK_TARGET_V128RELAXED
|
|
2745
|
-
nk_angulars_symmetric_i8_v128relaxed(vectors,
|
|
3025
|
+
nk_angulars_symmetric_i8_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2746
3026
|
row_count);
|
|
2747
3027
|
#else
|
|
2748
|
-
nk_angulars_symmetric_i8_serial(vectors,
|
|
3028
|
+
nk_angulars_symmetric_i8_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2749
3029
|
#endif
|
|
2750
3030
|
}
|
|
2751
3031
|
NK_PUBLIC void nk_euclideans_packed_i8(nk_i8_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
@@ -2765,6 +3045,8 @@ NK_PUBLIC void nk_euclideans_packed_i8(nk_i8_t const *a, void const *b_packed, n
|
|
|
2765
3045
|
nk_euclideans_packed_i8_alder(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2766
3046
|
#elif NK_TARGET_HASWELL
|
|
2767
3047
|
nk_euclideans_packed_i8_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
3048
|
+
#elif NK_TARGET_POWERVSX
|
|
3049
|
+
nk_euclideans_packed_i8_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2768
3050
|
#elif NK_TARGET_RVV
|
|
2769
3051
|
nk_euclideans_packed_i8_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2770
3052
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2773,31 +3055,40 @@ NK_PUBLIC void nk_euclideans_packed_i8(nk_i8_t const *a, void const *b_packed, n
|
|
|
2773
3055
|
nk_euclideans_packed_i8_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2774
3056
|
#endif
|
|
2775
3057
|
}
|
|
2776
|
-
NK_PUBLIC void nk_euclideans_symmetric_i8(nk_i8_t const *vectors, nk_size_t
|
|
3058
|
+
NK_PUBLIC void nk_euclideans_symmetric_i8(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2777
3059
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2778
3060
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2779
3061
|
#if NK_TARGET_SME
|
|
2780
|
-
nk_euclideans_symmetric_i8_sme(vectors,
|
|
3062
|
+
nk_euclideans_symmetric_i8_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2781
3063
|
#elif NK_TARGET_NEONSDOT
|
|
2782
|
-
nk_euclideans_symmetric_i8_neonsdot(vectors,
|
|
3064
|
+
nk_euclideans_symmetric_i8_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3065
|
+
row_count);
|
|
2783
3066
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2784
|
-
nk_euclideans_symmetric_i8_sapphireamx(vectors,
|
|
3067
|
+
nk_euclideans_symmetric_i8_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2785
3068
|
row_count);
|
|
2786
3069
|
#elif NK_TARGET_ICELAKE
|
|
2787
|
-
nk_euclideans_symmetric_i8_icelake(vectors,
|
|
3070
|
+
nk_euclideans_symmetric_i8_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3071
|
+
row_count);
|
|
2788
3072
|
#elif NK_TARGET_SIERRA
|
|
2789
|
-
nk_euclideans_symmetric_i8_sierra(vectors,
|
|
3073
|
+
nk_euclideans_symmetric_i8_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3074
|
+
row_count);
|
|
2790
3075
|
#elif NK_TARGET_ALDER
|
|
2791
|
-
nk_euclideans_symmetric_i8_alder(vectors,
|
|
3076
|
+
nk_euclideans_symmetric_i8_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3077
|
+
row_count);
|
|
2792
3078
|
#elif NK_TARGET_HASWELL
|
|
2793
|
-
nk_euclideans_symmetric_i8_haswell(vectors,
|
|
3079
|
+
nk_euclideans_symmetric_i8_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3080
|
+
row_count);
|
|
3081
|
+
#elif NK_TARGET_POWERVSX
|
|
3082
|
+
nk_euclideans_symmetric_i8_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3083
|
+
row_count);
|
|
2794
3084
|
#elif NK_TARGET_RVV
|
|
2795
|
-
nk_euclideans_symmetric_i8_rvv(vectors,
|
|
3085
|
+
nk_euclideans_symmetric_i8_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2796
3086
|
#elif NK_TARGET_V128RELAXED
|
|
2797
|
-
nk_euclideans_symmetric_i8_v128relaxed(vectors,
|
|
3087
|
+
nk_euclideans_symmetric_i8_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2798
3088
|
row_count);
|
|
2799
3089
|
#else
|
|
2800
|
-
nk_euclideans_symmetric_i8_serial(vectors,
|
|
3090
|
+
nk_euclideans_symmetric_i8_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3091
|
+
row_count);
|
|
2801
3092
|
#endif
|
|
2802
3093
|
}
|
|
2803
3094
|
|
|
@@ -2818,6 +3109,8 @@ NK_PUBLIC void nk_angulars_packed_u8(nk_u8_t const *a, void const *b_packed, nk_
|
|
|
2818
3109
|
nk_angulars_packed_u8_alder(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2819
3110
|
#elif NK_TARGET_HASWELL
|
|
2820
3111
|
nk_angulars_packed_u8_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
3112
|
+
#elif NK_TARGET_POWERVSX
|
|
3113
|
+
nk_angulars_packed_u8_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2821
3114
|
#elif NK_TARGET_RVV
|
|
2822
3115
|
nk_angulars_packed_u8_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2823
3116
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2826,31 +3119,37 @@ NK_PUBLIC void nk_angulars_packed_u8(nk_u8_t const *a, void const *b_packed, nk_
|
|
|
2826
3119
|
nk_angulars_packed_u8_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2827
3120
|
#endif
|
|
2828
3121
|
}
|
|
2829
|
-
NK_PUBLIC void nk_angulars_symmetric_u8(nk_u8_t const *vectors, nk_size_t
|
|
2830
|
-
nk_f32_t *result, nk_size_t result_stride,
|
|
2831
|
-
nk_size_t row_count) {
|
|
3122
|
+
NK_PUBLIC void nk_angulars_symmetric_u8(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
3123
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
3124
|
+
nk_size_t row_start, nk_size_t row_count) {
|
|
2832
3125
|
#if NK_TARGET_SME
|
|
2833
|
-
nk_angulars_symmetric_u8_sme(vectors,
|
|
3126
|
+
nk_angulars_symmetric_u8_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2834
3127
|
#elif NK_TARGET_NEONSDOT
|
|
2835
|
-
nk_angulars_symmetric_u8_neonsdot(vectors,
|
|
3128
|
+
nk_angulars_symmetric_u8_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3129
|
+
row_count);
|
|
2836
3130
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2837
|
-
nk_angulars_symmetric_u8_sapphireamx(vectors,
|
|
3131
|
+
nk_angulars_symmetric_u8_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2838
3132
|
row_count);
|
|
2839
3133
|
#elif NK_TARGET_ICELAKE
|
|
2840
|
-
nk_angulars_symmetric_u8_icelake(vectors,
|
|
3134
|
+
nk_angulars_symmetric_u8_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3135
|
+
row_count);
|
|
2841
3136
|
#elif NK_TARGET_SIERRA
|
|
2842
|
-
nk_angulars_symmetric_u8_sierra(vectors,
|
|
3137
|
+
nk_angulars_symmetric_u8_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2843
3138
|
#elif NK_TARGET_ALDER
|
|
2844
|
-
nk_angulars_symmetric_u8_alder(vectors,
|
|
3139
|
+
nk_angulars_symmetric_u8_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2845
3140
|
#elif NK_TARGET_HASWELL
|
|
2846
|
-
nk_angulars_symmetric_u8_haswell(vectors,
|
|
3141
|
+
nk_angulars_symmetric_u8_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3142
|
+
row_count);
|
|
3143
|
+
#elif NK_TARGET_POWERVSX
|
|
3144
|
+
nk_angulars_symmetric_u8_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3145
|
+
row_count);
|
|
2847
3146
|
#elif NK_TARGET_RVV
|
|
2848
|
-
nk_angulars_symmetric_u8_rvv(vectors,
|
|
3147
|
+
nk_angulars_symmetric_u8_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2849
3148
|
#elif NK_TARGET_V128RELAXED
|
|
2850
|
-
nk_angulars_symmetric_u8_v128relaxed(vectors,
|
|
3149
|
+
nk_angulars_symmetric_u8_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2851
3150
|
row_count);
|
|
2852
3151
|
#else
|
|
2853
|
-
nk_angulars_symmetric_u8_serial(vectors,
|
|
3152
|
+
nk_angulars_symmetric_u8_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2854
3153
|
#endif
|
|
2855
3154
|
}
|
|
2856
3155
|
NK_PUBLIC void nk_euclideans_packed_u8(nk_u8_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
@@ -2870,6 +3169,8 @@ NK_PUBLIC void nk_euclideans_packed_u8(nk_u8_t const *a, void const *b_packed, n
|
|
|
2870
3169
|
nk_euclideans_packed_u8_alder(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2871
3170
|
#elif NK_TARGET_HASWELL
|
|
2872
3171
|
nk_euclideans_packed_u8_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
3172
|
+
#elif NK_TARGET_POWERVSX
|
|
3173
|
+
nk_euclideans_packed_u8_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2873
3174
|
#elif NK_TARGET_RVV
|
|
2874
3175
|
nk_euclideans_packed_u8_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2875
3176
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -2878,31 +3179,40 @@ NK_PUBLIC void nk_euclideans_packed_u8(nk_u8_t const *a, void const *b_packed, n
|
|
|
2878
3179
|
nk_euclideans_packed_u8_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2879
3180
|
#endif
|
|
2880
3181
|
}
|
|
2881
|
-
NK_PUBLIC void nk_euclideans_symmetric_u8(nk_u8_t const *vectors, nk_size_t
|
|
3182
|
+
NK_PUBLIC void nk_euclideans_symmetric_u8(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2882
3183
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2883
3184
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2884
3185
|
#if NK_TARGET_SME
|
|
2885
|
-
nk_euclideans_symmetric_u8_sme(vectors,
|
|
3186
|
+
nk_euclideans_symmetric_u8_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2886
3187
|
#elif NK_TARGET_NEONSDOT
|
|
2887
|
-
nk_euclideans_symmetric_u8_neonsdot(vectors,
|
|
3188
|
+
nk_euclideans_symmetric_u8_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3189
|
+
row_count);
|
|
2888
3190
|
#elif NK_TARGET_SAPPHIREAMX
|
|
2889
|
-
nk_euclideans_symmetric_u8_sapphireamx(vectors,
|
|
3191
|
+
nk_euclideans_symmetric_u8_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2890
3192
|
row_count);
|
|
2891
3193
|
#elif NK_TARGET_ICELAKE
|
|
2892
|
-
nk_euclideans_symmetric_u8_icelake(vectors,
|
|
3194
|
+
nk_euclideans_symmetric_u8_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3195
|
+
row_count);
|
|
2893
3196
|
#elif NK_TARGET_SIERRA
|
|
2894
|
-
nk_euclideans_symmetric_u8_sierra(vectors,
|
|
3197
|
+
nk_euclideans_symmetric_u8_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3198
|
+
row_count);
|
|
2895
3199
|
#elif NK_TARGET_ALDER
|
|
2896
|
-
nk_euclideans_symmetric_u8_alder(vectors,
|
|
3200
|
+
nk_euclideans_symmetric_u8_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3201
|
+
row_count);
|
|
2897
3202
|
#elif NK_TARGET_HASWELL
|
|
2898
|
-
nk_euclideans_symmetric_u8_haswell(vectors,
|
|
3203
|
+
nk_euclideans_symmetric_u8_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3204
|
+
row_count);
|
|
3205
|
+
#elif NK_TARGET_POWERVSX
|
|
3206
|
+
nk_euclideans_symmetric_u8_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3207
|
+
row_count);
|
|
2899
3208
|
#elif NK_TARGET_RVV
|
|
2900
|
-
nk_euclideans_symmetric_u8_rvv(vectors,
|
|
3209
|
+
nk_euclideans_symmetric_u8_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2901
3210
|
#elif NK_TARGET_V128RELAXED
|
|
2902
|
-
nk_euclideans_symmetric_u8_v128relaxed(vectors,
|
|
3211
|
+
nk_euclideans_symmetric_u8_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2903
3212
|
row_count);
|
|
2904
3213
|
#else
|
|
2905
|
-
nk_euclideans_symmetric_u8_serial(vectors,
|
|
3214
|
+
nk_euclideans_symmetric_u8_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3215
|
+
row_count);
|
|
2906
3216
|
#endif
|
|
2907
3217
|
}
|
|
2908
3218
|
|
|
@@ -2919,17 +3229,19 @@ NK_PUBLIC void nk_angulars_packed_i4(nk_i4x2_t const *a, void const *b_packed, n
|
|
|
2919
3229
|
nk_angulars_packed_i4_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2920
3230
|
#endif
|
|
2921
3231
|
}
|
|
2922
|
-
NK_PUBLIC void nk_angulars_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t
|
|
3232
|
+
NK_PUBLIC void nk_angulars_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2923
3233
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2924
3234
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2925
3235
|
#if NK_TARGET_SME
|
|
2926
|
-
nk_angulars_symmetric_i4_sme(vectors,
|
|
3236
|
+
nk_angulars_symmetric_i4_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2927
3237
|
#elif NK_TARGET_NEONSDOT
|
|
2928
|
-
nk_angulars_symmetric_i4_neonsdot(vectors,
|
|
3238
|
+
nk_angulars_symmetric_i4_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3239
|
+
row_count);
|
|
2929
3240
|
#elif NK_TARGET_ICELAKE
|
|
2930
|
-
nk_angulars_symmetric_i4_icelake(vectors,
|
|
3241
|
+
nk_angulars_symmetric_i4_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3242
|
+
row_count);
|
|
2931
3243
|
#else
|
|
2932
|
-
nk_angulars_symmetric_i4_serial(vectors,
|
|
3244
|
+
nk_angulars_symmetric_i4_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2933
3245
|
#endif
|
|
2934
3246
|
}
|
|
2935
3247
|
NK_PUBLIC void nk_euclideans_packed_i4(nk_i4x2_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
@@ -2945,17 +3257,20 @@ NK_PUBLIC void nk_euclideans_packed_i4(nk_i4x2_t const *a, void const *b_packed,
|
|
|
2945
3257
|
nk_euclideans_packed_i4_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2946
3258
|
#endif
|
|
2947
3259
|
}
|
|
2948
|
-
NK_PUBLIC void nk_euclideans_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t
|
|
3260
|
+
NK_PUBLIC void nk_euclideans_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2949
3261
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2950
3262
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2951
3263
|
#if NK_TARGET_SME
|
|
2952
|
-
nk_euclideans_symmetric_i4_sme(vectors,
|
|
3264
|
+
nk_euclideans_symmetric_i4_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2953
3265
|
#elif NK_TARGET_NEONSDOT
|
|
2954
|
-
nk_euclideans_symmetric_i4_neonsdot(vectors,
|
|
3266
|
+
nk_euclideans_symmetric_i4_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3267
|
+
row_count);
|
|
2955
3268
|
#elif NK_TARGET_ICELAKE
|
|
2956
|
-
nk_euclideans_symmetric_i4_icelake(vectors,
|
|
3269
|
+
nk_euclideans_symmetric_i4_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3270
|
+
row_count);
|
|
2957
3271
|
#else
|
|
2958
|
-
nk_euclideans_symmetric_i4_serial(vectors,
|
|
3272
|
+
nk_euclideans_symmetric_i4_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3273
|
+
row_count);
|
|
2959
3274
|
#endif
|
|
2960
3275
|
}
|
|
2961
3276
|
|
|
@@ -2972,17 +3287,19 @@ NK_PUBLIC void nk_angulars_packed_u4(nk_u4x2_t const *a, void const *b_packed, n
|
|
|
2972
3287
|
nk_angulars_packed_u4_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2973
3288
|
#endif
|
|
2974
3289
|
}
|
|
2975
|
-
NK_PUBLIC void nk_angulars_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t
|
|
3290
|
+
NK_PUBLIC void nk_angulars_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2976
3291
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2977
3292
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2978
3293
|
#if NK_TARGET_SME
|
|
2979
|
-
nk_angulars_symmetric_u4_sme(vectors,
|
|
3294
|
+
nk_angulars_symmetric_u4_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2980
3295
|
#elif NK_TARGET_NEONSDOT
|
|
2981
|
-
nk_angulars_symmetric_u4_neonsdot(vectors,
|
|
3296
|
+
nk_angulars_symmetric_u4_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3297
|
+
row_count);
|
|
2982
3298
|
#elif NK_TARGET_ICELAKE
|
|
2983
|
-
nk_angulars_symmetric_u4_icelake(vectors,
|
|
3299
|
+
nk_angulars_symmetric_u4_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3300
|
+
row_count);
|
|
2984
3301
|
#else
|
|
2985
|
-
nk_angulars_symmetric_u4_serial(vectors,
|
|
3302
|
+
nk_angulars_symmetric_u4_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2986
3303
|
#endif
|
|
2987
3304
|
}
|
|
2988
3305
|
NK_PUBLIC void nk_euclideans_packed_u4(nk_u4x2_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
@@ -2998,17 +3315,20 @@ NK_PUBLIC void nk_euclideans_packed_u4(nk_u4x2_t const *a, void const *b_packed,
|
|
|
2998
3315
|
nk_euclideans_packed_u4_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2999
3316
|
#endif
|
|
3000
3317
|
}
|
|
3001
|
-
NK_PUBLIC void nk_euclideans_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t
|
|
3318
|
+
NK_PUBLIC void nk_euclideans_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
3002
3319
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
3003
3320
|
nk_size_t row_start, nk_size_t row_count) {
|
|
3004
3321
|
#if NK_TARGET_SME
|
|
3005
|
-
nk_euclideans_symmetric_u4_sme(vectors,
|
|
3322
|
+
nk_euclideans_symmetric_u4_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
3006
3323
|
#elif NK_TARGET_NEONSDOT
|
|
3007
|
-
nk_euclideans_symmetric_u4_neonsdot(vectors,
|
|
3324
|
+
nk_euclideans_symmetric_u4_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3325
|
+
row_count);
|
|
3008
3326
|
#elif NK_TARGET_ICELAKE
|
|
3009
|
-
nk_euclideans_symmetric_u4_icelake(vectors,
|
|
3327
|
+
nk_euclideans_symmetric_u4_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3328
|
+
row_count);
|
|
3010
3329
|
#else
|
|
3011
|
-
nk_euclideans_symmetric_u4_serial(vectors,
|
|
3330
|
+
nk_euclideans_symmetric_u4_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
3331
|
+
row_count);
|
|
3012
3332
|
#endif
|
|
3013
3333
|
}
|
|
3014
3334
|
|