numkong 7.0.0 → 7.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -122
- package/binding.gyp +25 -491
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -77,10 +77,10 @@
|
|
|
77
77
|
*
|
|
78
78
|
* Relevant instructions and caveats:
|
|
79
79
|
*
|
|
80
|
-
* Intrinsic
|
|
81
|
-
* _mm_rsqrt_ps
|
|
82
|
-
* _mm_maskz_rsqrt14_pd
|
|
83
|
-
* _mm_sqrt_ps/_mm_sqrt_pd
|
|
80
|
+
* Intrinsic Instruction Notes
|
|
81
|
+
* _mm_rsqrt_ps VRSQRTPS fast approx; refine with NR
|
|
82
|
+
* _mm_maskz_rsqrt14_pd VRSQRT14PD higher-precision approx; MSVC masked-only
|
|
83
|
+
* _mm_sqrt_ps/_mm_sqrt_pd VSQRTPS/VSQRTPD higher latency, sqrt/div unit
|
|
84
84
|
*
|
|
85
85
|
* Latency/port notes (rule of thumb):
|
|
86
86
|
* - On Intel client cores, sqrt/rsqrt execute on the divide/sqrt unit (often
|
|
@@ -96,15 +96,15 @@
|
|
|
96
96
|
* AVX-512 VNNI replaces that with VPDPWSSD. BF16 uses VDPBF16PS where available to avoid
|
|
97
97
|
* convert+FMA sequences; if the ISA lacks it, we fall back to f32 FMA in the AVX2/serial paths:
|
|
98
98
|
*
|
|
99
|
-
* Intrinsic
|
|
100
|
-
* _mm256_fmadd_ps
|
|
101
|
-
* _mm256_fmadd_pd
|
|
102
|
-
* _mm256_madd_epi16
|
|
103
|
-
* _mm512_dpwssd_epi32
|
|
104
|
-
* _mm512_dpbf16_ps
|
|
105
|
-
* _mm_rsqrt_ps
|
|
106
|
-
* _mm_maskz_rsqrt14_pd
|
|
107
|
-
* _mm_sqrt_ps
|
|
99
|
+
* Intrinsic Instruction Icelake Genoa
|
|
100
|
+
* _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
|
|
101
|
+
* _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
|
|
102
|
+
* _mm256_madd_epi16 VPMADDWD (YMM, YMM, YMM) 5cy @ p01 3cy @ p01
|
|
103
|
+
* _mm512_dpwssd_epi32 VPDPWSSD (ZMM, K, ZMM, ZMM) 5cy @ p05 4cy @ p01
|
|
104
|
+
* _mm512_dpbf16_ps VDPBF16PS (ZMM, K, ZMM, ZMM) n/a 6cy @ p01
|
|
105
|
+
* _mm_rsqrt_ps VRSQRTPS (XMM, XMM) 5cy @ p0 4cy @ p01
|
|
106
|
+
* _mm_maskz_rsqrt14_pd VRSQRT14PD (XMM, K, XMM) 4cy @ p0 5cy @ p01
|
|
107
|
+
* _mm_sqrt_ps VSQRTPS (XMM, XMM) 12cy @ p0 15cy @ p01
|
|
108
108
|
*
|
|
109
109
|
* @section arm_instructions Relevant Arm Instructions
|
|
110
110
|
*
|
|
@@ -115,18 +115,18 @@
|
|
|
115
115
|
* instructions skipping `vbfmlal` and `vbfmlalt` alternatives to limit shuffle overhead
|
|
116
116
|
* and code complexity.
|
|
117
117
|
*
|
|
118
|
-
* Intrinsic
|
|
119
|
-
* vfmaq_f32
|
|
120
|
-
* vfmaq_f64
|
|
121
|
-
* vdotq_s32
|
|
122
|
-
* vbfdotq_f32
|
|
123
|
-
* vrsqrteq_f32
|
|
124
|
-
* vrsqrtsq_f32
|
|
125
|
-
* vsqrtq_f32
|
|
118
|
+
* Intrinsic Instruction M1 Firestorm
|
|
119
|
+
* vfmaq_f32 FMLA.S (vec) 4c / 4c
|
|
120
|
+
* vfmaq_f64 FMLA.D (vec) 4c / 4c
|
|
121
|
+
* vdotq_s32 SDOT.B (vec) 3c / 4c
|
|
122
|
+
* vbfdotq_f32 BFDOT (vec) n/a
|
|
123
|
+
* vrsqrteq_f32 FRSQRTE.S (vec) 3c / 1c
|
|
124
|
+
* vrsqrtsq_f32 FRSQRTS.S (vec) 4c / 4c
|
|
125
|
+
* vsqrtq_f32 FSQRT.S (vec) 10c / 0.5c
|
|
126
126
|
*
|
|
127
127
|
* @section references References
|
|
128
128
|
*
|
|
129
|
-
* - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
|
|
129
|
+
* - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
|
|
130
130
|
* - Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
|
|
131
131
|
*
|
|
132
132
|
*/
|
|
@@ -332,16 +332,13 @@ NK_PUBLIC void nk_euclidean_bf16_neon(nk_bf16_t const *a, nk_bf16_t const *b, nk
|
|
|
332
332
|
NK_PUBLIC void nk_sqeuclidean_bf16_neon(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
333
333
|
/** @copydoc nk_angular_f64 */
|
|
334
334
|
NK_PUBLIC void nk_angular_bf16_neon(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
335
|
-
#endif // NK_TARGET_NEON
|
|
336
|
-
|
|
337
|
-
#if NK_TARGET_NEONHALF
|
|
338
335
|
/** @copydoc nk_euclidean_f64 */
|
|
339
|
-
NK_PUBLIC void
|
|
336
|
+
NK_PUBLIC void nk_euclidean_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
340
337
|
/** @copydoc nk_sqeuclidean_f64 */
|
|
341
|
-
NK_PUBLIC void
|
|
338
|
+
NK_PUBLIC void nk_sqeuclidean_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
342
339
|
/** @copydoc nk_angular_f64 */
|
|
343
|
-
NK_PUBLIC void
|
|
344
|
-
#endif //
|
|
340
|
+
NK_PUBLIC void nk_angular_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
341
|
+
#endif // NK_TARGET_NEON
|
|
345
342
|
|
|
346
343
|
#if NK_TARGET_NEONBFDOT
|
|
347
344
|
/** @copydoc nk_euclidean_f64 */
|
|
@@ -365,8 +362,62 @@ NK_PUBLIC void nk_euclidean_u8_neonsdot(nk_u8_t const *a, nk_u8_t const *b, nk_s
|
|
|
365
362
|
NK_PUBLIC void nk_sqeuclidean_u8_neonsdot(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
|
|
366
363
|
/** @copydoc nk_angular_f64 */
|
|
367
364
|
NK_PUBLIC void nk_angular_u8_neonsdot(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result);
|
|
365
|
+
/** @copydoc nk_euclidean_f64 */
|
|
366
|
+
NK_PUBLIC void nk_euclidean_i4_neonsdot(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
367
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
368
|
+
NK_PUBLIC void nk_sqeuclidean_i4_neonsdot(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_u32_t *result);
|
|
369
|
+
/** @copydoc nk_angular_f64 */
|
|
370
|
+
NK_PUBLIC void nk_angular_i4_neonsdot(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
371
|
+
/** @copydoc nk_euclidean_f64 */
|
|
372
|
+
NK_PUBLIC void nk_euclidean_u4_neonsdot(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
373
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
374
|
+
NK_PUBLIC void nk_sqeuclidean_u4_neonsdot(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_u32_t *result);
|
|
375
|
+
/** @copydoc nk_angular_f64 */
|
|
376
|
+
NK_PUBLIC void nk_angular_u4_neonsdot(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
368
377
|
#endif // NK_TARGET_NEONSDOT
|
|
369
378
|
|
|
379
|
+
#if NK_TARGET_SVESDOT
|
|
380
|
+
/** @copydoc nk_euclidean_f64 */
|
|
381
|
+
NK_PUBLIC void nk_euclidean_i8_svesdot(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result);
|
|
382
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
383
|
+
NK_PUBLIC void nk_sqeuclidean_i8_svesdot(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_u32_t *result);
|
|
384
|
+
/** @copydoc nk_angular_f64 */
|
|
385
|
+
NK_PUBLIC void nk_angular_i8_svesdot(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result);
|
|
386
|
+
/** @copydoc nk_euclidean_f64 */
|
|
387
|
+
NK_PUBLIC void nk_euclidean_u8_svesdot(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result);
|
|
388
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
389
|
+
NK_PUBLIC void nk_sqeuclidean_u8_svesdot(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
|
|
390
|
+
/** @copydoc nk_angular_f64 */
|
|
391
|
+
NK_PUBLIC void nk_angular_u8_svesdot(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result);
|
|
392
|
+
#endif // NK_TARGET_SVESDOT
|
|
393
|
+
|
|
394
|
+
#if NK_TARGET_NEONFP8
|
|
395
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
396
|
+
NK_PUBLIC void nk_sqeuclidean_e4m3_neonfp8(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
397
|
+
/** @copydoc nk_euclidean_f64 */
|
|
398
|
+
NK_PUBLIC void nk_euclidean_e4m3_neonfp8(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
399
|
+
/** @copydoc nk_angular_f64 */
|
|
400
|
+
NK_PUBLIC void nk_angular_e4m3_neonfp8(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
401
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
402
|
+
NK_PUBLIC void nk_sqeuclidean_e5m2_neonfp8(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
403
|
+
/** @copydoc nk_euclidean_f64 */
|
|
404
|
+
NK_PUBLIC void nk_euclidean_e5m2_neonfp8(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
405
|
+
/** @copydoc nk_angular_f64 */
|
|
406
|
+
NK_PUBLIC void nk_angular_e5m2_neonfp8(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
407
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
408
|
+
NK_PUBLIC void nk_sqeuclidean_e2m3_neonfp8(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
409
|
+
/** @copydoc nk_euclidean_f64 */
|
|
410
|
+
NK_PUBLIC void nk_euclidean_e2m3_neonfp8(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
411
|
+
/** @copydoc nk_angular_f64 */
|
|
412
|
+
NK_PUBLIC void nk_angular_e2m3_neonfp8(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
413
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
414
|
+
NK_PUBLIC void nk_sqeuclidean_e3m2_neonfp8(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
415
|
+
/** @copydoc nk_euclidean_f64 */
|
|
416
|
+
NK_PUBLIC void nk_euclidean_e3m2_neonfp8(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
417
|
+
/** @copydoc nk_angular_f64 */
|
|
418
|
+
NK_PUBLIC void nk_angular_e3m2_neonfp8(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
419
|
+
#endif // NK_TARGET_NEONFP8
|
|
420
|
+
|
|
370
421
|
/* SIMD-powered backends for Arm SVE, mostly using 32-bit arithmetic over variable-length platform-defined word sizes.
|
|
371
422
|
* Designed for Arm Graviton 3, Microsoft Cobalt, as well as Nvidia Grace and newer Ampere Altra CPUs.
|
|
372
423
|
*/
|
|
@@ -526,6 +577,24 @@ NK_PUBLIC void nk_euclidean_u8_icelake(nk_u8_t const *a, nk_u8_t const *b, nk_si
|
|
|
526
577
|
NK_PUBLIC void nk_sqeuclidean_u8_icelake(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
|
|
527
578
|
/** @copydoc nk_angular_f64 */
|
|
528
579
|
NK_PUBLIC void nk_angular_u8_icelake(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result);
|
|
580
|
+
/** @copydoc nk_euclidean_f64 */
|
|
581
|
+
NK_PUBLIC void nk_euclidean_e4m3_icelake(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
582
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
583
|
+
NK_PUBLIC void nk_sqeuclidean_e4m3_icelake(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
584
|
+
/** @copydoc nk_angular_f64 */
|
|
585
|
+
NK_PUBLIC void nk_angular_e4m3_icelake(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
586
|
+
/** @copydoc nk_euclidean_f64 */
|
|
587
|
+
NK_PUBLIC void nk_euclidean_e2m3_icelake(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
588
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
589
|
+
NK_PUBLIC void nk_sqeuclidean_e2m3_icelake(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
590
|
+
/** @copydoc nk_angular_f64 */
|
|
591
|
+
NK_PUBLIC void nk_angular_e2m3_icelake(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
592
|
+
/** @copydoc nk_euclidean_f64 */
|
|
593
|
+
NK_PUBLIC void nk_euclidean_e3m2_icelake(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
594
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
595
|
+
NK_PUBLIC void nk_sqeuclidean_e3m2_icelake(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
596
|
+
/** @copydoc nk_angular_f64 */
|
|
597
|
+
NK_PUBLIC void nk_angular_e3m2_icelake(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
529
598
|
#endif // NK_TARGET_ICELAKE
|
|
530
599
|
|
|
531
600
|
#if NK_TARGET_GENOA
|
|
@@ -536,12 +605,6 @@ NK_PUBLIC void nk_sqeuclidean_bf16_genoa(nk_bf16_t const *a, nk_bf16_t const *b,
|
|
|
536
605
|
/** @copydoc nk_angular_f64 */
|
|
537
606
|
NK_PUBLIC void nk_angular_bf16_genoa(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
538
607
|
/** @copydoc nk_euclidean_f64 */
|
|
539
|
-
NK_PUBLIC void nk_euclidean_e4m3_genoa(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
540
|
-
/** @copydoc nk_sqeuclidean_f64 */
|
|
541
|
-
NK_PUBLIC void nk_sqeuclidean_e4m3_genoa(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
542
|
-
/** @copydoc nk_angular_f64 */
|
|
543
|
-
NK_PUBLIC void nk_angular_e4m3_genoa(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
544
|
-
/** @copydoc nk_euclidean_f64 */
|
|
545
608
|
NK_PUBLIC void nk_euclidean_e5m2_genoa(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
546
609
|
/** @copydoc nk_sqeuclidean_f64 */
|
|
547
610
|
NK_PUBLIC void nk_sqeuclidean_e5m2_genoa(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
@@ -549,24 +612,26 @@ NK_PUBLIC void nk_sqeuclidean_e5m2_genoa(nk_e5m2_t const *a, nk_e5m2_t const *b,
|
|
|
549
612
|
NK_PUBLIC void nk_angular_e5m2_genoa(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
550
613
|
#endif // NK_TARGET_GENOA
|
|
551
614
|
|
|
552
|
-
#if
|
|
615
|
+
#if NK_TARGET_DIAMOND
|
|
553
616
|
/** @copydoc nk_euclidean_f64 */
|
|
554
|
-
NK_PUBLIC void
|
|
555
|
-
/** @copydoc
|
|
556
|
-
NK_PUBLIC void
|
|
557
|
-
/** @copydoc
|
|
558
|
-
NK_PUBLIC void
|
|
559
|
-
/** @copydoc nk_euclidean_f64 */
|
|
560
|
-
NK_PUBLIC void nk_sqeuclidean_e3m2_sapphire(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
561
|
-
/** @copydoc nk_euclidean_f64 */
|
|
562
|
-
NK_PUBLIC void nk_euclidean_e2m3_sapphire(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
617
|
+
NK_PUBLIC void nk_euclidean_f16_diamond(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
618
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
619
|
+
NK_PUBLIC void nk_sqeuclidean_f16_diamond(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
620
|
+
/** @copydoc nk_angular_f64 */
|
|
621
|
+
NK_PUBLIC void nk_angular_f16_diamond(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
563
622
|
/** @copydoc nk_euclidean_f64 */
|
|
564
|
-
NK_PUBLIC void
|
|
623
|
+
NK_PUBLIC void nk_euclidean_e4m3_diamond(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
624
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
625
|
+
NK_PUBLIC void nk_sqeuclidean_e4m3_diamond(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
565
626
|
/** @copydoc nk_angular_f64 */
|
|
566
|
-
NK_PUBLIC void
|
|
627
|
+
NK_PUBLIC void nk_angular_e4m3_diamond(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
628
|
+
/** @copydoc nk_euclidean_f64 */
|
|
629
|
+
NK_PUBLIC void nk_euclidean_e5m2_diamond(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
630
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
631
|
+
NK_PUBLIC void nk_sqeuclidean_e5m2_diamond(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
567
632
|
/** @copydoc nk_angular_f64 */
|
|
568
|
-
NK_PUBLIC void
|
|
569
|
-
#endif //
|
|
633
|
+
NK_PUBLIC void nk_angular_e5m2_diamond(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
634
|
+
#endif // NK_TARGET_DIAMOND
|
|
570
635
|
|
|
571
636
|
/* SIMD-powered backends for AVX-INT8-VNNI extensions on Xeon 6 CPUs, including Sierra Forest and Granite Rapids.
|
|
572
637
|
* They pack many "efficiency" cores into a single socket, avoiding heavy 512-bit operations, and focusing on
|
|
@@ -591,6 +656,12 @@ NK_PUBLIC void nk_angular_e2m3_sierra(nk_e2m3_t const *a, nk_e2m3_t const *b, nk
|
|
|
591
656
|
NK_PUBLIC void nk_euclidean_e2m3_sierra(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
592
657
|
/** @copydoc nk_sqeuclidean_f64 */
|
|
593
658
|
NK_PUBLIC void nk_sqeuclidean_e2m3_sierra(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
659
|
+
/** @copydoc nk_euclidean_f64 */
|
|
660
|
+
NK_PUBLIC void nk_euclidean_e3m2_sierra(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
661
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
662
|
+
NK_PUBLIC void nk_sqeuclidean_e3m2_sierra(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
663
|
+
/** @copydoc nk_angular_f64 */
|
|
664
|
+
NK_PUBLIC void nk_angular_e3m2_sierra(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
594
665
|
#endif // NK_TARGET_SIERRA
|
|
595
666
|
|
|
596
667
|
#if NK_TARGET_ALDER
|
|
@@ -657,6 +728,30 @@ NK_PUBLIC void nk_sqeuclidean_i8_v128relaxed(nk_i8_t const *a, nk_i8_t const *b,
|
|
|
657
728
|
NK_PUBLIC void nk_euclidean_i8_v128relaxed(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result);
|
|
658
729
|
/** @copydoc nk_angular_f64 */
|
|
659
730
|
NK_PUBLIC void nk_angular_i8_v128relaxed(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result);
|
|
731
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
732
|
+
NK_PUBLIC void nk_sqeuclidean_e4m3_v128relaxed(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
733
|
+
/** @copydoc nk_euclidean_f64 */
|
|
734
|
+
NK_PUBLIC void nk_euclidean_e4m3_v128relaxed(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
735
|
+
/** @copydoc nk_angular_f64 */
|
|
736
|
+
NK_PUBLIC void nk_angular_e4m3_v128relaxed(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
737
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
738
|
+
NK_PUBLIC void nk_sqeuclidean_e5m2_v128relaxed(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
739
|
+
/** @copydoc nk_euclidean_f64 */
|
|
740
|
+
NK_PUBLIC void nk_euclidean_e5m2_v128relaxed(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
741
|
+
/** @copydoc nk_angular_f64 */
|
|
742
|
+
NK_PUBLIC void nk_angular_e5m2_v128relaxed(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
743
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
744
|
+
NK_PUBLIC void nk_sqeuclidean_e2m3_v128relaxed(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
745
|
+
/** @copydoc nk_euclidean_f64 */
|
|
746
|
+
NK_PUBLIC void nk_euclidean_e2m3_v128relaxed(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
747
|
+
/** @copydoc nk_angular_f64 */
|
|
748
|
+
NK_PUBLIC void nk_angular_e2m3_v128relaxed(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
749
|
+
/** @copydoc nk_sqeuclidean_f64 */
|
|
750
|
+
NK_PUBLIC void nk_sqeuclidean_e3m2_v128relaxed(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
751
|
+
/** @copydoc nk_euclidean_f64 */
|
|
752
|
+
NK_PUBLIC void nk_euclidean_e3m2_v128relaxed(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
753
|
+
/** @copydoc nk_angular_f64 */
|
|
754
|
+
NK_PUBLIC void nk_angular_e3m2_v128relaxed(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
660
755
|
#endif // NK_TARGET_V128RELAXED
|
|
661
756
|
|
|
662
757
|
/* SIMD-powered backends for RISC-V Vector extension, using scalable vector arithmetic.
|
|
@@ -806,16 +901,17 @@ NK_INTERNAL nk_dtype_t nk_angular_output_dtype(nk_dtype_t dtype) {
|
|
|
806
901
|
|
|
807
902
|
#include "numkong/spatial/serial.h"
|
|
808
903
|
#include "numkong/spatial/neon.h"
|
|
809
|
-
#include "numkong/spatial/neonhalf.h"
|
|
810
904
|
#include "numkong/spatial/neonbfdot.h"
|
|
811
905
|
#include "numkong/spatial/neonsdot.h"
|
|
812
906
|
#include "numkong/spatial/sve.h"
|
|
813
907
|
#include "numkong/spatial/svehalf.h"
|
|
814
908
|
#include "numkong/spatial/svebfdot.h"
|
|
909
|
+
#include "numkong/spatial/svesdot.h"
|
|
910
|
+
#include "numkong/spatial/neonfp8.h"
|
|
815
911
|
#include "numkong/spatial/haswell.h"
|
|
816
912
|
#include "numkong/spatial/skylake.h"
|
|
817
913
|
#include "numkong/spatial/genoa.h"
|
|
818
|
-
#include "numkong/spatial/
|
|
914
|
+
#include "numkong/spatial/diamond.h"
|
|
819
915
|
#include "numkong/spatial/icelake.h"
|
|
820
916
|
#include "numkong/spatial/alder.h"
|
|
821
917
|
#include "numkong/spatial/sierra.h"
|
|
@@ -823,6 +919,8 @@ NK_INTERNAL nk_dtype_t nk_angular_output_dtype(nk_dtype_t dtype) {
|
|
|
823
919
|
#include "numkong/spatial/rvvhalf.h"
|
|
824
920
|
#include "numkong/spatial/rvvbf16.h"
|
|
825
921
|
#include "numkong/spatial/v128relaxed.h"
|
|
922
|
+
#include "numkong/spatial/powervsx.h"
|
|
923
|
+
#include "numkong/spatial/loongsonasx.h"
|
|
826
924
|
|
|
827
925
|
#if defined(__cplusplus)
|
|
828
926
|
extern "C" {
|
|
@@ -833,6 +931,10 @@ extern "C" {
|
|
|
833
931
|
NK_PUBLIC void nk_euclidean_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
|
|
834
932
|
#if NK_TARGET_V128RELAXED
|
|
835
933
|
nk_euclidean_f64_v128relaxed(a, b, n, result);
|
|
934
|
+
#elif NK_TARGET_POWERVSX
|
|
935
|
+
nk_euclidean_f64_powervsx(a, b, n, result);
|
|
936
|
+
#elif NK_TARGET_LOONGSONASX
|
|
937
|
+
nk_euclidean_f64_loongsonasx(a, b, n, result);
|
|
836
938
|
#elif NK_TARGET_RVV
|
|
837
939
|
nk_euclidean_f64_rvv(a, b, n, result);
|
|
838
940
|
#elif NK_TARGET_SVE
|
|
@@ -851,6 +953,10 @@ NK_PUBLIC void nk_euclidean_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t
|
|
|
851
953
|
NK_PUBLIC void nk_sqeuclidean_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
|
|
852
954
|
#if NK_TARGET_V128RELAXED
|
|
853
955
|
nk_sqeuclidean_f64_v128relaxed(a, b, n, result);
|
|
956
|
+
#elif NK_TARGET_POWERVSX
|
|
957
|
+
nk_sqeuclidean_f64_powervsx(a, b, n, result);
|
|
958
|
+
#elif NK_TARGET_LOONGSONASX
|
|
959
|
+
nk_sqeuclidean_f64_loongsonasx(a, b, n, result);
|
|
854
960
|
#elif NK_TARGET_RVV
|
|
855
961
|
nk_sqeuclidean_f64_rvv(a, b, n, result);
|
|
856
962
|
#elif NK_TARGET_SVE
|
|
@@ -869,6 +975,10 @@ NK_PUBLIC void nk_sqeuclidean_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_
|
|
|
869
975
|
NK_PUBLIC void nk_angular_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
|
|
870
976
|
#if NK_TARGET_V128RELAXED
|
|
871
977
|
nk_angular_f64_v128relaxed(a, b, n, result);
|
|
978
|
+
#elif NK_TARGET_POWERVSX
|
|
979
|
+
nk_angular_f64_powervsx(a, b, n, result);
|
|
980
|
+
#elif NK_TARGET_LOONGSONASX
|
|
981
|
+
nk_angular_f64_loongsonasx(a, b, n, result);
|
|
872
982
|
#elif NK_TARGET_RVV
|
|
873
983
|
nk_angular_f64_rvv(a, b, n, result);
|
|
874
984
|
#elif NK_TARGET_SVE
|
|
@@ -887,6 +997,10 @@ NK_PUBLIC void nk_angular_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
|
|
|
887
997
|
NK_PUBLIC void nk_euclidean_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
|
|
888
998
|
#if NK_TARGET_V128RELAXED
|
|
889
999
|
nk_euclidean_f32_v128relaxed(a, b, n, result);
|
|
1000
|
+
#elif NK_TARGET_POWERVSX
|
|
1001
|
+
nk_euclidean_f32_powervsx(a, b, n, result);
|
|
1002
|
+
#elif NK_TARGET_LOONGSONASX
|
|
1003
|
+
nk_euclidean_f32_loongsonasx(a, b, n, result);
|
|
890
1004
|
#elif NK_TARGET_RVV
|
|
891
1005
|
nk_euclidean_f32_rvv(a, b, n, result);
|
|
892
1006
|
#elif NK_TARGET_SVE
|
|
@@ -905,6 +1019,10 @@ NK_PUBLIC void nk_euclidean_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t
|
|
|
905
1019
|
NK_PUBLIC void nk_sqeuclidean_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
|
|
906
1020
|
#if NK_TARGET_V128RELAXED
|
|
907
1021
|
nk_sqeuclidean_f32_v128relaxed(a, b, n, result);
|
|
1022
|
+
#elif NK_TARGET_POWERVSX
|
|
1023
|
+
nk_sqeuclidean_f32_powervsx(a, b, n, result);
|
|
1024
|
+
#elif NK_TARGET_LOONGSONASX
|
|
1025
|
+
nk_sqeuclidean_f32_loongsonasx(a, b, n, result);
|
|
908
1026
|
#elif NK_TARGET_RVV
|
|
909
1027
|
nk_sqeuclidean_f32_rvv(a, b, n, result);
|
|
910
1028
|
#elif NK_TARGET_SVE
|
|
@@ -923,6 +1041,10 @@ NK_PUBLIC void nk_sqeuclidean_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_
|
|
|
923
1041
|
NK_PUBLIC void nk_angular_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
|
|
924
1042
|
#if NK_TARGET_V128RELAXED
|
|
925
1043
|
nk_angular_f32_v128relaxed(a, b, n, result);
|
|
1044
|
+
#elif NK_TARGET_POWERVSX
|
|
1045
|
+
nk_angular_f32_powervsx(a, b, n, result);
|
|
1046
|
+
#elif NK_TARGET_LOONGSONASX
|
|
1047
|
+
nk_angular_f32_loongsonasx(a, b, n, result);
|
|
926
1048
|
#elif NK_TARGET_RVV
|
|
927
1049
|
nk_angular_f32_rvv(a, b, n, result);
|
|
928
1050
|
#elif NK_TARGET_SVE
|
|
@@ -941,14 +1063,18 @@ NK_PUBLIC void nk_angular_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
|
|
|
941
1063
|
NK_PUBLIC void nk_euclidean_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
942
1064
|
#if NK_TARGET_V128RELAXED
|
|
943
1065
|
nk_euclidean_f16_v128relaxed(a, b, n, result);
|
|
1066
|
+
#elif NK_TARGET_POWERVSX
|
|
1067
|
+
nk_euclidean_f16_powervsx(a, b, n, result);
|
|
944
1068
|
#elif NK_TARGET_RVVHALF
|
|
945
1069
|
nk_euclidean_f16_rvvhalf(a, b, n, result);
|
|
946
1070
|
#elif NK_TARGET_RVV
|
|
947
1071
|
nk_euclidean_f16_rvv(a, b, n, result);
|
|
948
1072
|
#elif NK_TARGET_SVEHALF
|
|
949
1073
|
nk_euclidean_f16_svehalf(a, b, n, result);
|
|
950
|
-
#elif
|
|
951
|
-
|
|
1074
|
+
#elif NK_TARGET_NEON
|
|
1075
|
+
nk_euclidean_f16_neon(a, b, n, result);
|
|
1076
|
+
#elif NK_TARGET_DIAMOND
|
|
1077
|
+
nk_euclidean_f16_diamond(a, b, n, result);
|
|
952
1078
|
#elif NK_TARGET_SKYLAKE
|
|
953
1079
|
nk_euclidean_f16_skylake(a, b, n, result);
|
|
954
1080
|
#elif NK_TARGET_HASWELL
|
|
@@ -961,14 +1087,18 @@ NK_PUBLIC void nk_euclidean_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t
|
|
|
961
1087
|
NK_PUBLIC void nk_sqeuclidean_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
962
1088
|
#if NK_TARGET_V128RELAXED
|
|
963
1089
|
nk_sqeuclidean_f16_v128relaxed(a, b, n, result);
|
|
1090
|
+
#elif NK_TARGET_POWERVSX
|
|
1091
|
+
nk_sqeuclidean_f16_powervsx(a, b, n, result);
|
|
964
1092
|
#elif NK_TARGET_RVVHALF
|
|
965
1093
|
nk_sqeuclidean_f16_rvvhalf(a, b, n, result);
|
|
966
1094
|
#elif NK_TARGET_RVV
|
|
967
1095
|
nk_sqeuclidean_f16_rvv(a, b, n, result);
|
|
968
1096
|
#elif NK_TARGET_SVEHALF
|
|
969
1097
|
nk_sqeuclidean_f16_svehalf(a, b, n, result);
|
|
970
|
-
#elif
|
|
971
|
-
|
|
1098
|
+
#elif NK_TARGET_NEON
|
|
1099
|
+
nk_sqeuclidean_f16_neon(a, b, n, result);
|
|
1100
|
+
#elif NK_TARGET_DIAMOND
|
|
1101
|
+
nk_sqeuclidean_f16_diamond(a, b, n, result);
|
|
972
1102
|
#elif NK_TARGET_SKYLAKE
|
|
973
1103
|
nk_sqeuclidean_f16_skylake(a, b, n, result);
|
|
974
1104
|
#elif NK_TARGET_HASWELL
|
|
@@ -981,14 +1111,18 @@ NK_PUBLIC void nk_sqeuclidean_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_
|
|
|
981
1111
|
NK_PUBLIC void nk_angular_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
982
1112
|
#if NK_TARGET_V128RELAXED
|
|
983
1113
|
nk_angular_f16_v128relaxed(a, b, n, result);
|
|
1114
|
+
#elif NK_TARGET_POWERVSX
|
|
1115
|
+
nk_angular_f16_powervsx(a, b, n, result);
|
|
984
1116
|
#elif NK_TARGET_RVVHALF
|
|
985
1117
|
nk_angular_f16_rvvhalf(a, b, n, result);
|
|
986
1118
|
#elif NK_TARGET_RVV
|
|
987
1119
|
nk_angular_f16_rvv(a, b, n, result);
|
|
988
1120
|
#elif NK_TARGET_SVEHALF
|
|
989
1121
|
nk_angular_f16_svehalf(a, b, n, result);
|
|
990
|
-
#elif
|
|
991
|
-
|
|
1122
|
+
#elif NK_TARGET_NEON
|
|
1123
|
+
nk_angular_f16_neon(a, b, n, result);
|
|
1124
|
+
#elif NK_TARGET_DIAMOND
|
|
1125
|
+
nk_angular_f16_diamond(a, b, n, result);
|
|
992
1126
|
#elif NK_TARGET_SKYLAKE
|
|
993
1127
|
nk_angular_f16_skylake(a, b, n, result);
|
|
994
1128
|
#elif NK_TARGET_HASWELL
|
|
@@ -1001,6 +1135,10 @@ NK_PUBLIC void nk_angular_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
|
|
|
1001
1135
|
NK_PUBLIC void nk_euclidean_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1002
1136
|
#if NK_TARGET_V128RELAXED
|
|
1003
1137
|
nk_euclidean_bf16_v128relaxed(a, b, n, result);
|
|
1138
|
+
#elif NK_TARGET_POWERVSX
|
|
1139
|
+
nk_euclidean_bf16_powervsx(a, b, n, result);
|
|
1140
|
+
#elif NK_TARGET_LOONGSONASX
|
|
1141
|
+
nk_euclidean_bf16_loongsonasx(a, b, n, result);
|
|
1004
1142
|
#elif NK_TARGET_RVVBF16
|
|
1005
1143
|
nk_euclidean_bf16_rvvbf16(a, b, n, result);
|
|
1006
1144
|
#elif NK_TARGET_RVV
|
|
@@ -1021,6 +1159,10 @@ NK_PUBLIC void nk_euclidean_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size
|
|
|
1021
1159
|
NK_PUBLIC void nk_sqeuclidean_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1022
1160
|
#if NK_TARGET_V128RELAXED
|
|
1023
1161
|
nk_sqeuclidean_bf16_v128relaxed(a, b, n, result);
|
|
1162
|
+
#elif NK_TARGET_POWERVSX
|
|
1163
|
+
nk_sqeuclidean_bf16_powervsx(a, b, n, result);
|
|
1164
|
+
#elif NK_TARGET_LOONGSONASX
|
|
1165
|
+
nk_sqeuclidean_bf16_loongsonasx(a, b, n, result);
|
|
1024
1166
|
#elif NK_TARGET_RVVBF16
|
|
1025
1167
|
nk_sqeuclidean_bf16_rvvbf16(a, b, n, result);
|
|
1026
1168
|
#elif NK_TARGET_RVV
|
|
@@ -1041,6 +1183,10 @@ NK_PUBLIC void nk_sqeuclidean_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_si
|
|
|
1041
1183
|
NK_PUBLIC void nk_angular_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1042
1184
|
#if NK_TARGET_V128RELAXED
|
|
1043
1185
|
nk_angular_bf16_v128relaxed(a, b, n, result);
|
|
1186
|
+
#elif NK_TARGET_POWERVSX
|
|
1187
|
+
nk_angular_bf16_powervsx(a, b, n, result);
|
|
1188
|
+
#elif NK_TARGET_LOONGSONASX
|
|
1189
|
+
nk_angular_bf16_loongsonasx(a, b, n, result);
|
|
1044
1190
|
#elif NK_TARGET_RVVBF16
|
|
1045
1191
|
nk_angular_bf16_rvvbf16(a, b, n, result);
|
|
1046
1192
|
#elif NK_TARGET_RVV
|
|
@@ -1059,84 +1205,118 @@ NK_PUBLIC void nk_angular_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t
|
|
|
1059
1205
|
}
|
|
1060
1206
|
|
|
1061
1207
|
NK_PUBLIC void nk_euclidean_e4m3(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1062
|
-
#if
|
|
1063
|
-
|
|
1064
|
-
#elif
|
|
1065
|
-
|
|
1208
|
+
#if NK_TARGET_NEONFP8
|
|
1209
|
+
nk_euclidean_e4m3_neonfp8(a, b, n, result);
|
|
1210
|
+
#elif NK_TARGET_DIAMOND
|
|
1211
|
+
nk_euclidean_e4m3_diamond(a, b, n, result);
|
|
1212
|
+
#elif NK_TARGET_ICELAKE
|
|
1213
|
+
nk_euclidean_e4m3_icelake(a, b, n, result);
|
|
1066
1214
|
#elif NK_TARGET_SKYLAKE
|
|
1067
1215
|
nk_euclidean_e4m3_skylake(a, b, n, result);
|
|
1068
1216
|
#elif NK_TARGET_RVV
|
|
1069
1217
|
nk_euclidean_e4m3_rvv(a, b, n, result);
|
|
1218
|
+
#elif NK_TARGET_V128RELAXED
|
|
1219
|
+
nk_euclidean_e4m3_v128relaxed(a, b, n, result);
|
|
1070
1220
|
#else
|
|
1071
1221
|
nk_euclidean_e4m3_serial(a, b, n, result);
|
|
1072
1222
|
#endif
|
|
1073
1223
|
}
|
|
1074
1224
|
|
|
1075
1225
|
NK_PUBLIC void nk_sqeuclidean_e4m3(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1076
|
-
#if
|
|
1077
|
-
|
|
1078
|
-
#elif
|
|
1079
|
-
|
|
1226
|
+
#if NK_TARGET_NEONFP8
|
|
1227
|
+
nk_sqeuclidean_e4m3_neonfp8(a, b, n, result);
|
|
1228
|
+
#elif NK_TARGET_DIAMOND
|
|
1229
|
+
nk_sqeuclidean_e4m3_diamond(a, b, n, result);
|
|
1230
|
+
#elif NK_TARGET_ICELAKE
|
|
1231
|
+
nk_sqeuclidean_e4m3_icelake(a, b, n, result);
|
|
1080
1232
|
#elif NK_TARGET_SKYLAKE
|
|
1081
1233
|
nk_sqeuclidean_e4m3_skylake(a, b, n, result);
|
|
1082
1234
|
#elif NK_TARGET_RVV
|
|
1083
1235
|
nk_sqeuclidean_e4m3_rvv(a, b, n, result);
|
|
1236
|
+
#elif NK_TARGET_V128RELAXED
|
|
1237
|
+
nk_sqeuclidean_e4m3_v128relaxed(a, b, n, result);
|
|
1084
1238
|
#else
|
|
1085
1239
|
nk_sqeuclidean_e4m3_serial(a, b, n, result);
|
|
1086
1240
|
#endif
|
|
1087
1241
|
}
|
|
1088
1242
|
|
|
1089
1243
|
NK_PUBLIC void nk_angular_e4m3(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1090
|
-
#if
|
|
1091
|
-
|
|
1244
|
+
#if NK_TARGET_NEONFP8
|
|
1245
|
+
nk_angular_e4m3_neonfp8(a, b, n, result);
|
|
1246
|
+
#elif NK_TARGET_DIAMOND
|
|
1247
|
+
nk_angular_e4m3_diamond(a, b, n, result);
|
|
1248
|
+
#elif NK_TARGET_ICELAKE
|
|
1249
|
+
nk_angular_e4m3_icelake(a, b, n, result);
|
|
1092
1250
|
#elif NK_TARGET_SKYLAKE
|
|
1093
1251
|
nk_angular_e4m3_skylake(a, b, n, result);
|
|
1094
1252
|
#elif NK_TARGET_RVV
|
|
1095
1253
|
nk_angular_e4m3_rvv(a, b, n, result);
|
|
1254
|
+
#elif NK_TARGET_V128RELAXED
|
|
1255
|
+
nk_angular_e4m3_v128relaxed(a, b, n, result);
|
|
1096
1256
|
#else
|
|
1097
1257
|
nk_angular_e4m3_serial(a, b, n, result);
|
|
1098
1258
|
#endif
|
|
1099
1259
|
}
|
|
1100
1260
|
|
|
1101
1261
|
NK_PUBLIC void nk_euclidean_e5m2(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1102
|
-
#if
|
|
1262
|
+
#if NK_TARGET_NEONFP8
|
|
1263
|
+
nk_euclidean_e5m2_neonfp8(a, b, n, result);
|
|
1264
|
+
#elif NK_TARGET_DIAMOND
|
|
1265
|
+
nk_euclidean_e5m2_diamond(a, b, n, result);
|
|
1266
|
+
#elif NK_TARGET_GENOA
|
|
1103
1267
|
nk_euclidean_e5m2_genoa(a, b, n, result);
|
|
1104
1268
|
#elif NK_TARGET_SKYLAKE
|
|
1105
1269
|
nk_euclidean_e5m2_skylake(a, b, n, result);
|
|
1106
1270
|
#elif NK_TARGET_RVV
|
|
1107
1271
|
nk_euclidean_e5m2_rvv(a, b, n, result);
|
|
1272
|
+
#elif NK_TARGET_V128RELAXED
|
|
1273
|
+
nk_euclidean_e5m2_v128relaxed(a, b, n, result);
|
|
1108
1274
|
#else
|
|
1109
1275
|
nk_euclidean_e5m2_serial(a, b, n, result);
|
|
1110
1276
|
#endif
|
|
1111
1277
|
}
|
|
1112
1278
|
|
|
1113
1279
|
NK_PUBLIC void nk_sqeuclidean_e5m2(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1114
|
-
#if
|
|
1280
|
+
#if NK_TARGET_NEONFP8
|
|
1281
|
+
nk_sqeuclidean_e5m2_neonfp8(a, b, n, result);
|
|
1282
|
+
#elif NK_TARGET_DIAMOND
|
|
1283
|
+
nk_sqeuclidean_e5m2_diamond(a, b, n, result);
|
|
1284
|
+
#elif NK_TARGET_GENOA
|
|
1115
1285
|
nk_sqeuclidean_e5m2_genoa(a, b, n, result);
|
|
1116
1286
|
#elif NK_TARGET_SKYLAKE
|
|
1117
1287
|
nk_sqeuclidean_e5m2_skylake(a, b, n, result);
|
|
1118
1288
|
#elif NK_TARGET_RVV
|
|
1119
1289
|
nk_sqeuclidean_e5m2_rvv(a, b, n, result);
|
|
1290
|
+
#elif NK_TARGET_V128RELAXED
|
|
1291
|
+
nk_sqeuclidean_e5m2_v128relaxed(a, b, n, result);
|
|
1120
1292
|
#else
|
|
1121
1293
|
nk_sqeuclidean_e5m2_serial(a, b, n, result);
|
|
1122
1294
|
#endif
|
|
1123
1295
|
}
|
|
1124
1296
|
|
|
1125
1297
|
NK_PUBLIC void nk_angular_e5m2(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1126
|
-
#if
|
|
1298
|
+
#if NK_TARGET_NEONFP8
|
|
1299
|
+
nk_angular_e5m2_neonfp8(a, b, n, result);
|
|
1300
|
+
#elif NK_TARGET_DIAMOND
|
|
1301
|
+
nk_angular_e5m2_diamond(a, b, n, result);
|
|
1302
|
+
#elif NK_TARGET_GENOA
|
|
1127
1303
|
nk_angular_e5m2_genoa(a, b, n, result);
|
|
1128
1304
|
#elif NK_TARGET_SKYLAKE
|
|
1129
1305
|
nk_angular_e5m2_skylake(a, b, n, result);
|
|
1130
1306
|
#elif NK_TARGET_RVV
|
|
1131
1307
|
nk_angular_e5m2_rvv(a, b, n, result);
|
|
1308
|
+
#elif NK_TARGET_V128RELAXED
|
|
1309
|
+
nk_angular_e5m2_v128relaxed(a, b, n, result);
|
|
1132
1310
|
#else
|
|
1133
1311
|
nk_angular_e5m2_serial(a, b, n, result);
|
|
1134
1312
|
#endif
|
|
1135
1313
|
}
|
|
1136
1314
|
|
|
1137
1315
|
NK_PUBLIC void nk_euclidean_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1138
|
-
#if
|
|
1139
|
-
|
|
1316
|
+
#if NK_TARGET_NEONFP8
|
|
1317
|
+
nk_euclidean_e2m3_neonfp8(a, b, n, result);
|
|
1318
|
+
#elif NK_TARGET_ICELAKE
|
|
1319
|
+
nk_euclidean_e2m3_icelake(a, b, n, result);
|
|
1140
1320
|
#elif NK_TARGET_SKYLAKE
|
|
1141
1321
|
nk_euclidean_e2m3_skylake(a, b, n, result);
|
|
1142
1322
|
#elif NK_TARGET_SIERRA
|
|
@@ -1147,14 +1327,18 @@ NK_PUBLIC void nk_euclidean_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size
|
|
|
1147
1327
|
nk_euclidean_e2m3_haswell(a, b, n, result);
|
|
1148
1328
|
#elif NK_TARGET_NEON
|
|
1149
1329
|
nk_euclidean_e2m3_neon(a, b, n, result);
|
|
1330
|
+
#elif NK_TARGET_V128RELAXED
|
|
1331
|
+
nk_euclidean_e2m3_v128relaxed(a, b, n, result);
|
|
1150
1332
|
#else
|
|
1151
1333
|
nk_euclidean_e2m3_serial(a, b, n, result);
|
|
1152
1334
|
#endif
|
|
1153
1335
|
}
|
|
1154
1336
|
|
|
1155
1337
|
NK_PUBLIC void nk_sqeuclidean_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1156
|
-
#if
|
|
1157
|
-
|
|
1338
|
+
#if NK_TARGET_NEONFP8
|
|
1339
|
+
nk_sqeuclidean_e2m3_neonfp8(a, b, n, result);
|
|
1340
|
+
#elif NK_TARGET_ICELAKE
|
|
1341
|
+
nk_sqeuclidean_e2m3_icelake(a, b, n, result);
|
|
1158
1342
|
#elif NK_TARGET_SKYLAKE
|
|
1159
1343
|
nk_sqeuclidean_e2m3_skylake(a, b, n, result);
|
|
1160
1344
|
#elif NK_TARGET_SIERRA
|
|
@@ -1165,14 +1349,18 @@ NK_PUBLIC void nk_sqeuclidean_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_si
|
|
|
1165
1349
|
nk_sqeuclidean_e2m3_haswell(a, b, n, result);
|
|
1166
1350
|
#elif NK_TARGET_NEON
|
|
1167
1351
|
nk_sqeuclidean_e2m3_neon(a, b, n, result);
|
|
1352
|
+
#elif NK_TARGET_V128RELAXED
|
|
1353
|
+
nk_sqeuclidean_e2m3_v128relaxed(a, b, n, result);
|
|
1168
1354
|
#else
|
|
1169
1355
|
nk_sqeuclidean_e2m3_serial(a, b, n, result);
|
|
1170
1356
|
#endif
|
|
1171
1357
|
}
|
|
1172
1358
|
|
|
1173
1359
|
NK_PUBLIC void nk_angular_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1174
|
-
#if
|
|
1175
|
-
|
|
1360
|
+
#if NK_TARGET_NEONFP8
|
|
1361
|
+
nk_angular_e2m3_neonfp8(a, b, n, result);
|
|
1362
|
+
#elif NK_TARGET_ICELAKE
|
|
1363
|
+
nk_angular_e2m3_icelake(a, b, n, result);
|
|
1176
1364
|
#elif NK_TARGET_SKYLAKE
|
|
1177
1365
|
nk_angular_e2m3_skylake(a, b, n, result);
|
|
1178
1366
|
#elif NK_TARGET_SIERRA
|
|
@@ -1183,54 +1371,74 @@ NK_PUBLIC void nk_angular_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t
|
|
|
1183
1371
|
nk_angular_e2m3_haswell(a, b, n, result);
|
|
1184
1372
|
#elif NK_TARGET_NEON
|
|
1185
1373
|
nk_angular_e2m3_neon(a, b, n, result);
|
|
1374
|
+
#elif NK_TARGET_V128RELAXED
|
|
1375
|
+
nk_angular_e2m3_v128relaxed(a, b, n, result);
|
|
1186
1376
|
#else
|
|
1187
1377
|
nk_angular_e2m3_serial(a, b, n, result);
|
|
1188
1378
|
#endif
|
|
1189
1379
|
}
|
|
1190
1380
|
|
|
1191
1381
|
NK_PUBLIC void nk_euclidean_e3m2(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1192
|
-
#if
|
|
1193
|
-
|
|
1382
|
+
#if NK_TARGET_NEONFP8
|
|
1383
|
+
nk_euclidean_e3m2_neonfp8(a, b, n, result);
|
|
1384
|
+
#elif NK_TARGET_ICELAKE
|
|
1385
|
+
nk_euclidean_e3m2_icelake(a, b, n, result);
|
|
1194
1386
|
#elif NK_TARGET_SKYLAKE
|
|
1195
1387
|
nk_euclidean_e3m2_skylake(a, b, n, result);
|
|
1388
|
+
#elif NK_TARGET_SIERRA
|
|
1389
|
+
nk_euclidean_e3m2_sierra(a, b, n, result);
|
|
1196
1390
|
#elif NK_TARGET_ALDER
|
|
1197
1391
|
nk_euclidean_e3m2_alder(a, b, n, result);
|
|
1198
1392
|
#elif NK_TARGET_HASWELL
|
|
1199
1393
|
nk_euclidean_e3m2_haswell(a, b, n, result);
|
|
1200
1394
|
#elif NK_TARGET_NEON
|
|
1201
1395
|
nk_euclidean_e3m2_neon(a, b, n, result);
|
|
1396
|
+
#elif NK_TARGET_V128RELAXED
|
|
1397
|
+
nk_euclidean_e3m2_v128relaxed(a, b, n, result);
|
|
1202
1398
|
#else
|
|
1203
1399
|
nk_euclidean_e3m2_serial(a, b, n, result);
|
|
1204
1400
|
#endif
|
|
1205
1401
|
}
|
|
1206
1402
|
|
|
1207
1403
|
NK_PUBLIC void nk_sqeuclidean_e3m2(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1208
|
-
#if
|
|
1209
|
-
|
|
1404
|
+
#if NK_TARGET_NEONFP8
|
|
1405
|
+
nk_sqeuclidean_e3m2_neonfp8(a, b, n, result);
|
|
1406
|
+
#elif NK_TARGET_ICELAKE
|
|
1407
|
+
nk_sqeuclidean_e3m2_icelake(a, b, n, result);
|
|
1210
1408
|
#elif NK_TARGET_SKYLAKE
|
|
1211
1409
|
nk_sqeuclidean_e3m2_skylake(a, b, n, result);
|
|
1410
|
+
#elif NK_TARGET_SIERRA
|
|
1411
|
+
nk_sqeuclidean_e3m2_sierra(a, b, n, result);
|
|
1212
1412
|
#elif NK_TARGET_ALDER
|
|
1213
1413
|
nk_sqeuclidean_e3m2_alder(a, b, n, result);
|
|
1214
1414
|
#elif NK_TARGET_HASWELL
|
|
1215
1415
|
nk_sqeuclidean_e3m2_haswell(a, b, n, result);
|
|
1216
1416
|
#elif NK_TARGET_NEON
|
|
1217
1417
|
nk_sqeuclidean_e3m2_neon(a, b, n, result);
|
|
1418
|
+
#elif NK_TARGET_V128RELAXED
|
|
1419
|
+
nk_sqeuclidean_e3m2_v128relaxed(a, b, n, result);
|
|
1218
1420
|
#else
|
|
1219
1421
|
nk_sqeuclidean_e3m2_serial(a, b, n, result);
|
|
1220
1422
|
#endif
|
|
1221
1423
|
}
|
|
1222
1424
|
|
|
1223
1425
|
NK_PUBLIC void nk_angular_e3m2(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1224
|
-
#if
|
|
1225
|
-
|
|
1426
|
+
#if NK_TARGET_NEONFP8
|
|
1427
|
+
nk_angular_e3m2_neonfp8(a, b, n, result);
|
|
1428
|
+
#elif NK_TARGET_ICELAKE
|
|
1429
|
+
nk_angular_e3m2_icelake(a, b, n, result);
|
|
1226
1430
|
#elif NK_TARGET_SKYLAKE
|
|
1227
1431
|
nk_angular_e3m2_skylake(a, b, n, result);
|
|
1432
|
+
#elif NK_TARGET_SIERRA
|
|
1433
|
+
nk_angular_e3m2_sierra(a, b, n, result);
|
|
1228
1434
|
#elif NK_TARGET_ALDER
|
|
1229
1435
|
nk_angular_e3m2_alder(a, b, n, result);
|
|
1230
1436
|
#elif NK_TARGET_HASWELL
|
|
1231
1437
|
nk_angular_e3m2_haswell(a, b, n, result);
|
|
1232
1438
|
#elif NK_TARGET_NEON
|
|
1233
1439
|
nk_angular_e3m2_neon(a, b, n, result);
|
|
1440
|
+
#elif NK_TARGET_V128RELAXED
|
|
1441
|
+
nk_angular_e3m2_v128relaxed(a, b, n, result);
|
|
1234
1442
|
#else
|
|
1235
1443
|
nk_angular_e3m2_serial(a, b, n, result);
|
|
1236
1444
|
#endif
|
|
@@ -1239,6 +1447,12 @@ NK_PUBLIC void nk_angular_e3m2(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t
|
|
|
1239
1447
|
NK_PUBLIC void nk_euclidean_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1240
1448
|
#if NK_TARGET_RVV
|
|
1241
1449
|
nk_euclidean_i8_rvv(a, b, n, result);
|
|
1450
|
+
#elif NK_TARGET_POWERVSX
|
|
1451
|
+
nk_euclidean_i8_powervsx(a, b, n, result);
|
|
1452
|
+
#elif NK_TARGET_LOONGSONASX
|
|
1453
|
+
nk_euclidean_i8_loongsonasx(a, b, n, result);
|
|
1454
|
+
#elif NK_TARGET_SVESDOT
|
|
1455
|
+
nk_euclidean_i8_svesdot(a, b, n, result);
|
|
1242
1456
|
#elif NK_TARGET_NEONSDOT
|
|
1243
1457
|
nk_euclidean_i8_neonsdot(a, b, n, result);
|
|
1244
1458
|
#elif NK_TARGET_ICELAKE
|
|
@@ -1259,6 +1473,12 @@ NK_PUBLIC void nk_euclidean_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n,
|
|
|
1259
1473
|
NK_PUBLIC void nk_sqeuclidean_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_u32_t *result) {
|
|
1260
1474
|
#if NK_TARGET_RVV
|
|
1261
1475
|
nk_sqeuclidean_i8_rvv(a, b, n, result);
|
|
1476
|
+
#elif NK_TARGET_POWERVSX
|
|
1477
|
+
nk_sqeuclidean_i8_powervsx(a, b, n, result);
|
|
1478
|
+
#elif NK_TARGET_LOONGSONASX
|
|
1479
|
+
nk_sqeuclidean_i8_loongsonasx(a, b, n, result);
|
|
1480
|
+
#elif NK_TARGET_SVESDOT
|
|
1481
|
+
nk_sqeuclidean_i8_svesdot(a, b, n, result);
|
|
1262
1482
|
#elif NK_TARGET_NEONSDOT
|
|
1263
1483
|
nk_sqeuclidean_i8_neonsdot(a, b, n, result);
|
|
1264
1484
|
#elif NK_TARGET_ICELAKE
|
|
@@ -1279,6 +1499,12 @@ NK_PUBLIC void nk_sqeuclidean_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n
|
|
|
1279
1499
|
NK_PUBLIC void nk_angular_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1280
1500
|
#if NK_TARGET_RVV
|
|
1281
1501
|
nk_angular_i8_rvv(a, b, n, result);
|
|
1502
|
+
#elif NK_TARGET_POWERVSX
|
|
1503
|
+
nk_angular_i8_powervsx(a, b, n, result);
|
|
1504
|
+
#elif NK_TARGET_LOONGSONASX
|
|
1505
|
+
nk_angular_i8_loongsonasx(a, b, n, result);
|
|
1506
|
+
#elif NK_TARGET_SVESDOT
|
|
1507
|
+
nk_angular_i8_svesdot(a, b, n, result);
|
|
1282
1508
|
#elif NK_TARGET_NEONSDOT
|
|
1283
1509
|
nk_angular_i8_neonsdot(a, b, n, result);
|
|
1284
1510
|
#elif NK_TARGET_ICELAKE
|
|
@@ -1299,6 +1525,12 @@ NK_PUBLIC void nk_angular_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk
|
|
|
1299
1525
|
NK_PUBLIC void nk_euclidean_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1300
1526
|
#if NK_TARGET_RVV
|
|
1301
1527
|
nk_euclidean_u8_rvv(a, b, n, result);
|
|
1528
|
+
#elif NK_TARGET_POWERVSX
|
|
1529
|
+
nk_euclidean_u8_powervsx(a, b, n, result);
|
|
1530
|
+
#elif NK_TARGET_LOONGSONASX
|
|
1531
|
+
nk_euclidean_u8_loongsonasx(a, b, n, result);
|
|
1532
|
+
#elif NK_TARGET_SVESDOT
|
|
1533
|
+
nk_euclidean_u8_svesdot(a, b, n, result);
|
|
1302
1534
|
#elif NK_TARGET_NEONSDOT
|
|
1303
1535
|
nk_euclidean_u8_neonsdot(a, b, n, result);
|
|
1304
1536
|
#elif NK_TARGET_ICELAKE
|
|
@@ -1319,6 +1551,12 @@ NK_PUBLIC void nk_euclidean_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n,
|
|
|
1319
1551
|
NK_PUBLIC void nk_sqeuclidean_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
|
|
1320
1552
|
#if NK_TARGET_RVV
|
|
1321
1553
|
nk_sqeuclidean_u8_rvv(a, b, n, result);
|
|
1554
|
+
#elif NK_TARGET_POWERVSX
|
|
1555
|
+
nk_sqeuclidean_u8_powervsx(a, b, n, result);
|
|
1556
|
+
#elif NK_TARGET_LOONGSONASX
|
|
1557
|
+
nk_sqeuclidean_u8_loongsonasx(a, b, n, result);
|
|
1558
|
+
#elif NK_TARGET_SVESDOT
|
|
1559
|
+
nk_sqeuclidean_u8_svesdot(a, b, n, result);
|
|
1322
1560
|
#elif NK_TARGET_NEONSDOT
|
|
1323
1561
|
nk_sqeuclidean_u8_neonsdot(a, b, n, result);
|
|
1324
1562
|
#elif NK_TARGET_ICELAKE
|
|
@@ -1339,6 +1577,12 @@ NK_PUBLIC void nk_sqeuclidean_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n
|
|
|
1339
1577
|
NK_PUBLIC void nk_angular_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1340
1578
|
#if NK_TARGET_RVV
|
|
1341
1579
|
nk_angular_u8_rvv(a, b, n, result);
|
|
1580
|
+
#elif NK_TARGET_POWERVSX
|
|
1581
|
+
nk_angular_u8_powervsx(a, b, n, result);
|
|
1582
|
+
#elif NK_TARGET_LOONGSONASX
|
|
1583
|
+
nk_angular_u8_loongsonasx(a, b, n, result);
|
|
1584
|
+
#elif NK_TARGET_SVESDOT
|
|
1585
|
+
nk_angular_u8_svesdot(a, b, n, result);
|
|
1342
1586
|
#elif NK_TARGET_NEONSDOT
|
|
1343
1587
|
nk_angular_u8_neonsdot(a, b, n, result);
|
|
1344
1588
|
#elif NK_TARGET_ICELAKE
|
|
@@ -1359,6 +1603,8 @@ NK_PUBLIC void nk_angular_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk
|
|
|
1359
1603
|
NK_PUBLIC void nk_euclidean_i4(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1360
1604
|
#if NK_TARGET_ICELAKE
|
|
1361
1605
|
nk_euclidean_i4_icelake(a, b, n, result);
|
|
1606
|
+
#elif NK_TARGET_NEONSDOT
|
|
1607
|
+
nk_euclidean_i4_neonsdot(a, b, n, result);
|
|
1362
1608
|
#elif NK_TARGET_RVV
|
|
1363
1609
|
nk_euclidean_i4_rvv(a, b, n, result);
|
|
1364
1610
|
#else
|
|
@@ -1369,6 +1615,8 @@ NK_PUBLIC void nk_euclidean_i4(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t
|
|
|
1369
1615
|
NK_PUBLIC void nk_sqeuclidean_i4(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_u32_t *result) {
|
|
1370
1616
|
#if NK_TARGET_ICELAKE
|
|
1371
1617
|
nk_sqeuclidean_i4_icelake(a, b, n, result);
|
|
1618
|
+
#elif NK_TARGET_NEONSDOT
|
|
1619
|
+
nk_sqeuclidean_i4_neonsdot(a, b, n, result);
|
|
1372
1620
|
#elif NK_TARGET_RVV
|
|
1373
1621
|
nk_sqeuclidean_i4_rvv(a, b, n, result);
|
|
1374
1622
|
#else
|
|
@@ -1379,6 +1627,8 @@ NK_PUBLIC void nk_sqeuclidean_i4(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size
|
|
|
1379
1627
|
NK_PUBLIC void nk_angular_i4(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1380
1628
|
#if NK_TARGET_ICELAKE
|
|
1381
1629
|
nk_angular_i4_icelake(a, b, n, result);
|
|
1630
|
+
#elif NK_TARGET_NEONSDOT
|
|
1631
|
+
nk_angular_i4_neonsdot(a, b, n, result);
|
|
1382
1632
|
#elif NK_TARGET_RVV
|
|
1383
1633
|
nk_angular_i4_rvv(a, b, n, result);
|
|
1384
1634
|
#else
|
|
@@ -1389,6 +1639,8 @@ NK_PUBLIC void nk_angular_i4(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n
|
|
|
1389
1639
|
NK_PUBLIC void nk_euclidean_u4(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1390
1640
|
#if NK_TARGET_ICELAKE
|
|
1391
1641
|
nk_euclidean_u4_icelake(a, b, n, result);
|
|
1642
|
+
#elif NK_TARGET_NEONSDOT
|
|
1643
|
+
nk_euclidean_u4_neonsdot(a, b, n, result);
|
|
1392
1644
|
#elif NK_TARGET_RVV
|
|
1393
1645
|
nk_euclidean_u4_rvv(a, b, n, result);
|
|
1394
1646
|
#else
|
|
@@ -1399,6 +1651,8 @@ NK_PUBLIC void nk_euclidean_u4(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t
|
|
|
1399
1651
|
NK_PUBLIC void nk_sqeuclidean_u4(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_u32_t *result) {
|
|
1400
1652
|
#if NK_TARGET_ICELAKE
|
|
1401
1653
|
nk_sqeuclidean_u4_icelake(a, b, n, result);
|
|
1654
|
+
#elif NK_TARGET_NEONSDOT
|
|
1655
|
+
nk_sqeuclidean_u4_neonsdot(a, b, n, result);
|
|
1402
1656
|
#elif NK_TARGET_RVV
|
|
1403
1657
|
nk_sqeuclidean_u4_rvv(a, b, n, result);
|
|
1404
1658
|
#else
|
|
@@ -1409,6 +1663,8 @@ NK_PUBLIC void nk_sqeuclidean_u4(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size
|
|
|
1409
1663
|
NK_PUBLIC void nk_angular_u4(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
1410
1664
|
#if NK_TARGET_ICELAKE
|
|
1411
1665
|
nk_angular_u4_icelake(a, b, n, result);
|
|
1666
|
+
#elif NK_TARGET_NEONSDOT
|
|
1667
|
+
nk_angular_u4_neonsdot(a, b, n, result);
|
|
1412
1668
|
#elif NK_TARGET_RVV
|
|
1413
1669
|
nk_angular_u4_rvv(a, b, n, result);
|
|
1414
1670
|
#else
|