numkong 7.0.0 → 7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
package/include/numkong/each.h
CHANGED
|
@@ -6,32 +6,34 @@
|
|
|
6
6
|
*
|
|
7
7
|
* Contains the following element-wise operations:
|
|
8
8
|
*
|
|
9
|
-
* - Scale (Multiply) with shift: result[i] =
|
|
9
|
+
* - Scale (Multiply) with shift: result[i] = α·a[i] + β
|
|
10
10
|
* - Sum (Add): result[i] = a[i] + b[i]
|
|
11
|
-
* - Blend: result[i] =
|
|
12
|
-
* - FMA (Fused Multiply-Add): result[i] =
|
|
11
|
+
* - Blend: result[i] = α·a[i] + β·b[i]
|
|
12
|
+
* - FMA (Fused Multiply-Add): result[i] = α·a[i]·b[i] + β·c[i]
|
|
13
13
|
*
|
|
14
14
|
* Beyond their obvious use cases, these can be reused for vector-scalar math and other operations:
|
|
15
15
|
*
|
|
16
|
-
* - Scale with
|
|
17
|
-
* - Sum is equivalent to WSum with
|
|
18
|
-
* - Average is WSum with
|
|
19
|
-
* - Elementwise multiply is FMA with
|
|
16
|
+
* - Scale with β = 0 for a pure multiply.
|
|
17
|
+
* - Sum is equivalent to Blend with α = β = 1.
|
|
18
|
+
* - Average is Blend with α = β = 0.5.
|
|
19
|
+
* - Elementwise multiply is FMA with β = 0.
|
|
20
20
|
*
|
|
21
21
|
* For dtypes:
|
|
22
22
|
*
|
|
23
|
-
* -
|
|
24
|
-
* -
|
|
25
|
-
* -
|
|
26
|
-
* -
|
|
27
|
-
* -
|
|
28
|
-
* -
|
|
29
|
-
* -
|
|
30
|
-
* -
|
|
31
|
-
* -
|
|
32
|
-
* -
|
|
33
|
-
* -
|
|
34
|
-
* -
|
|
23
|
+
* - f64c: 64-bit complex × 64-bit complex scales
|
|
24
|
+
* - f32c: 32-bit complex × 32-bit complex scales
|
|
25
|
+
* - f64: 64-bit IEEE floating point × 64-bit scales
|
|
26
|
+
* - f32: 32-bit IEEE floating point × 32-bit scales
|
|
27
|
+
* - f16: 16-bit IEEE floating point × 32-bit scales
|
|
28
|
+
* - bf16: 16-bit brain floating point × 32-bit scales
|
|
29
|
+
* - e4m3: 8-bit e4m3 floating point × 32-bit scales
|
|
30
|
+
* - e5m2: 8-bit e5m2 floating point × 32-bit scales
|
|
31
|
+
* - e2m3: 8-bit e2m3 floating point (MX) × 32-bit scales
|
|
32
|
+
* - e3m2: 8-bit e3m2 floating point (MX) × 32-bit scales
|
|
33
|
+
* - i8/u8: 8-bit integers × 32-bit scales
|
|
34
|
+
* - i16/u16: 16-bit integers × 32-bit scales
|
|
35
|
+
* - i32/u32: 32-bit integers × 64-bit scales
|
|
36
|
+
* - i64/u64: 64-bit integers × 64-bit scales
|
|
35
37
|
*
|
|
36
38
|
* For hardware architectures:
|
|
37
39
|
*
|
|
@@ -55,13 +57,13 @@
|
|
|
55
57
|
* Saturating integer adds (VPADDSW/VPADDUSW) provide overflow protection for i16/u16 sums without
|
|
56
58
|
* branching. FMA (VFMADD231PS) is the workhorse for scale (alpha*x+beta) and blend (alpha*a+beta*b).
|
|
57
59
|
*
|
|
58
|
-
* Intrinsic Instruction
|
|
59
|
-
* _mm512_cvtph_ps VCVTPH2PS (ZMM, YMM)
|
|
60
|
-
* _mm512_cvtps_ph VCVTPS2PH (YMM, ZMM, I8)
|
|
61
|
-
* _mm256_adds_epi16 VPADDSW (YMM, YMM, YMM)
|
|
62
|
-
* _mm256_adds_epu16 VPADDUSW (YMM, YMM, YMM)
|
|
63
|
-
* _mm512_fpclass_ps_mask VFPCLASSPS (K, ZMM, I8)
|
|
64
|
-
* _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM)
|
|
60
|
+
* Intrinsic Instruction Icelake Genoa
|
|
61
|
+
* _mm512_cvtph_ps VCVTPH2PS (ZMM, YMM) 7cy @ p0+p5 6cy @ p12+p23
|
|
62
|
+
* _mm512_cvtps_ph VCVTPS2PH (YMM, ZMM, I8) 7cy @ p0+p5 7cy @ p12+p23
|
|
63
|
+
* _mm256_adds_epi16 VPADDSW (YMM, YMM, YMM) 1cy @ p01 n/a
|
|
64
|
+
* _mm256_adds_epu16 VPADDUSW (YMM, YMM, YMM) 1cy @ p01 n/a
|
|
65
|
+
* _mm512_fpclass_ps_mask VFPCLASSPS (K, ZMM, I8) 3cy @ p5 5cy @ p01
|
|
66
|
+
* _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
|
|
65
67
|
*
|
|
66
68
|
* @section arm_instructions Relevant ARM NEON/SVE Instructions
|
|
67
69
|
*
|
|
@@ -69,16 +71,16 @@
|
|
|
69
71
|
* vector throughput (8 elements per 128-bit register vs 4 for f32). Saturating adds (SQADD/UQADD)
|
|
70
72
|
* handle integer overflow. FMLA provides fused multiply-add for floating-point scale/blend/fma.
|
|
71
73
|
*
|
|
72
|
-
* Intrinsic
|
|
73
|
-
* vfmaq_f32
|
|
74
|
-
* vqaddq_s16
|
|
75
|
-
* vqaddq_u16
|
|
76
|
-
* vcvtq_f32_s32
|
|
77
|
-
* vcvtnq_s32_f32
|
|
74
|
+
* Intrinsic Instruction M1 Firestorm Graviton 3 Graviton 4
|
|
75
|
+
* vfmaq_f32 FMLA.S (vec) 4cy @ V0123 4cy @ V0123 4cy @ V0123
|
|
76
|
+
* vqaddq_s16 SQADD (vec) 3cy @ V0123 2cy @ V0123 2cy @ V0123
|
|
77
|
+
* vqaddq_u16 UQADD (vec) 3cy @ V0123 2cy @ V0123 2cy @ V0123
|
|
78
|
+
* vcvtq_f32_s32 SCVTF (vec) 3cy @ V0123 3cy @ V01 3cy @ V01
|
|
79
|
+
* vcvtnq_s32_f32 FCVTNS (vec) 3cy @ V0123 3cy @ V01 3cy @ V01
|
|
78
80
|
*
|
|
79
81
|
* @section references References
|
|
80
82
|
*
|
|
81
|
-
* - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
|
|
83
|
+
* - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
|
|
82
84
|
* - Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
|
|
83
85
|
*
|
|
84
86
|
*/
|
|
@@ -651,6 +653,11 @@ NK_PUBLIC void nk_each_fma_f32c_neon(nk_f32c_t const *a, nk_f32c_t const *b, nk_
|
|
|
651
653
|
/** @copydoc nk_each_fma_f64 */
|
|
652
654
|
NK_PUBLIC void nk_each_fma_f64c_neon(nk_f64c_t const *a, nk_f64c_t const *b, nk_f64c_t const *c, nk_size_t n,
|
|
653
655
|
nk_f64c_t const *alpha, nk_f64c_t const *beta, nk_f64c_t *result);
|
|
656
|
+
|
|
657
|
+
/** @copydoc nk_each_sum_i8 */
|
|
658
|
+
NK_PUBLIC void nk_each_sum_i8_neon(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i8_t *result);
|
|
659
|
+
/** @copydoc nk_each_sum_u8 */
|
|
660
|
+
NK_PUBLIC void nk_each_sum_u8_neon(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u8_t *result);
|
|
654
661
|
#endif // NK_TARGET_NEON
|
|
655
662
|
|
|
656
663
|
#if NK_TARGET_NEONBFDOT
|
|
@@ -680,10 +687,6 @@ NK_PUBLIC void nk_each_blend_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b,
|
|
|
680
687
|
NK_PUBLIC void nk_each_fma_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n,
|
|
681
688
|
nk_f32_t const *alpha, nk_f32_t const *beta, nk_f16_t *result);
|
|
682
689
|
|
|
683
|
-
/** @copydoc nk_each_sum_i8 */
|
|
684
|
-
NK_PUBLIC void nk_each_sum_i8_neonhalf(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i8_t *result);
|
|
685
|
-
/** @copydoc nk_each_sum_u8 */
|
|
686
|
-
NK_PUBLIC void nk_each_sum_u8_neonhalf(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u8_t *result);
|
|
687
690
|
/** @copydoc nk_each_scale_i8 */
|
|
688
691
|
NK_PUBLIC void nk_each_scale_i8_neonhalf(nk_i8_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
|
|
689
692
|
nk_i8_t *result);
|
|
@@ -696,13 +699,65 @@ NK_PUBLIC void nk_each_blend_i8_neonhalf(nk_i8_t const *a, nk_i8_t const *b, nk_
|
|
|
696
699
|
/** @copydoc nk_each_blend_u8 */
|
|
697
700
|
NK_PUBLIC void nk_each_blend_u8_neonhalf(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t const *alpha,
|
|
698
701
|
nk_f32_t const *beta, nk_u8_t *result);
|
|
702
|
+
#endif // NK_TARGET_NEONHALF
|
|
703
|
+
|
|
704
|
+
#if NK_TARGET_V128RELAXED
|
|
705
|
+
/** @copydoc nk_each_sum_f32 */
|
|
706
|
+
NK_PUBLIC void nk_each_sum_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *result);
|
|
707
|
+
/** @copydoc nk_each_scale_f32 */
|
|
708
|
+
NK_PUBLIC void nk_each_scale_f32_v128relaxed(nk_f32_t const *a, nk_size_t n, nk_f32_t const *alpha,
|
|
709
|
+
nk_f32_t const *beta, nk_f32_t *result);
|
|
710
|
+
/** @copydoc nk_each_blend_f32 */
|
|
711
|
+
NK_PUBLIC void nk_each_blend_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t const *alpha,
|
|
712
|
+
nk_f32_t const *beta, nk_f32_t *result);
|
|
713
|
+
/** @copydoc nk_each_fma_f32 */
|
|
714
|
+
NK_PUBLIC void nk_each_fma_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t n,
|
|
715
|
+
nk_f32_t const *alpha, nk_f32_t const *beta, nk_f32_t *result);
|
|
716
|
+
/** @copydoc nk_each_sum_f16 */
|
|
717
|
+
NK_PUBLIC void nk_each_sum_f16_v128relaxed(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f16_t *result);
|
|
718
|
+
/** @copydoc nk_each_scale_f16 */
|
|
719
|
+
NK_PUBLIC void nk_each_scale_f16_v128relaxed(nk_f16_t const *a, nk_size_t n, nk_f32_t const *alpha,
|
|
720
|
+
nk_f32_t const *beta, nk_f16_t *result);
|
|
721
|
+
/** @copydoc nk_each_blend_f16 */
|
|
722
|
+
NK_PUBLIC void nk_each_blend_f16_v128relaxed(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t const *alpha,
|
|
723
|
+
nk_f32_t const *beta, nk_f16_t *result);
|
|
724
|
+
/** @copydoc nk_each_fma_f16 */
|
|
725
|
+
NK_PUBLIC void nk_each_fma_f16_v128relaxed(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n,
|
|
726
|
+
nk_f32_t const *alpha, nk_f32_t const *beta, nk_f16_t *result);
|
|
727
|
+
/** @copydoc nk_each_sum_bf16 */
|
|
728
|
+
NK_PUBLIC void nk_each_sum_bf16_v128relaxed(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_bf16_t *result);
|
|
729
|
+
/** @copydoc nk_each_scale_bf16 */
|
|
730
|
+
NK_PUBLIC void nk_each_scale_bf16_v128relaxed(nk_bf16_t const *a, nk_size_t n, nk_f32_t const *alpha,
|
|
731
|
+
nk_f32_t const *beta, nk_bf16_t *result);
|
|
732
|
+
/** @copydoc nk_each_blend_bf16 */
|
|
733
|
+
NK_PUBLIC void nk_each_blend_bf16_v128relaxed(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n,
|
|
734
|
+
nk_f32_t const *alpha, nk_f32_t const *beta, nk_bf16_t *result);
|
|
735
|
+
/** @copydoc nk_each_fma_bf16 */
|
|
736
|
+
NK_PUBLIC void nk_each_fma_bf16_v128relaxed(nk_bf16_t const *a, nk_bf16_t const *b, nk_bf16_t const *c, nk_size_t n,
|
|
737
|
+
nk_f32_t const *alpha, nk_f32_t const *beta, nk_bf16_t *result);
|
|
738
|
+
/** @copydoc nk_each_sum_i8 */
|
|
739
|
+
NK_PUBLIC void nk_each_sum_i8_v128relaxed(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i8_t *result);
|
|
740
|
+
/** @copydoc nk_each_scale_i8 */
|
|
741
|
+
NK_PUBLIC void nk_each_scale_i8_v128relaxed(nk_i8_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
|
|
742
|
+
nk_i8_t *result);
|
|
743
|
+
/** @copydoc nk_each_blend_i8 */
|
|
744
|
+
NK_PUBLIC void nk_each_blend_i8_v128relaxed(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t const *alpha,
|
|
745
|
+
nk_f32_t const *beta, nk_i8_t *result);
|
|
699
746
|
/** @copydoc nk_each_fma_i8 */
|
|
700
|
-
NK_PUBLIC void
|
|
701
|
-
|
|
747
|
+
NK_PUBLIC void nk_each_fma_i8_v128relaxed(nk_i8_t const *a, nk_i8_t const *b, nk_i8_t const *c, nk_size_t n,
|
|
748
|
+
nk_f32_t const *alpha, nk_f32_t const *beta, nk_i8_t *result);
|
|
749
|
+
/** @copydoc nk_each_sum_u8 */
|
|
750
|
+
NK_PUBLIC void nk_each_sum_u8_v128relaxed(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u8_t *result);
|
|
751
|
+
/** @copydoc nk_each_scale_u8 */
|
|
752
|
+
NK_PUBLIC void nk_each_scale_u8_v128relaxed(nk_u8_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
|
|
753
|
+
nk_u8_t *result);
|
|
754
|
+
/** @copydoc nk_each_blend_u8 */
|
|
755
|
+
NK_PUBLIC void nk_each_blend_u8_v128relaxed(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t const *alpha,
|
|
756
|
+
nk_f32_t const *beta, nk_u8_t *result);
|
|
702
757
|
/** @copydoc nk_each_fma_u8 */
|
|
703
|
-
NK_PUBLIC void
|
|
704
|
-
|
|
705
|
-
#endif //
|
|
758
|
+
NK_PUBLIC void nk_each_fma_u8_v128relaxed(nk_u8_t const *a, nk_u8_t const *b, nk_u8_t const *c, nk_size_t n,
|
|
759
|
+
nk_f32_t const *alpha, nk_f32_t const *beta, nk_u8_t *result);
|
|
760
|
+
#endif // NK_TARGET_V128RELAXED
|
|
706
761
|
|
|
707
762
|
#if NK_TARGET_HASWELL
|
|
708
763
|
/** @copydoc nk_each_scale_f64 */
|
|
@@ -1026,12 +1081,6 @@ NK_PUBLIC void nk_each_blend_i8_sapphire(nk_i8_t const *a, nk_i8_t const *b, nk_
|
|
|
1026
1081
|
NK_PUBLIC void nk_each_blend_u8_sapphire(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t const *alpha,
|
|
1027
1082
|
nk_f32_t const *beta, nk_u8_t *result);
|
|
1028
1083
|
|
|
1029
|
-
/** @copydoc nk_each_fma_i8 */
|
|
1030
|
-
NK_PUBLIC void nk_each_fma_i8_sapphire(nk_i8_t const *a, nk_i8_t const *b, nk_i8_t const *c, nk_size_t n,
|
|
1031
|
-
nk_f32_t const *alpha, nk_f32_t const *beta, nk_i8_t *result);
|
|
1032
|
-
/** @copydoc nk_each_fma_u8 */
|
|
1033
|
-
NK_PUBLIC void nk_each_fma_u8_sapphire(nk_u8_t const *a, nk_u8_t const *b, nk_u8_t const *c, nk_size_t n,
|
|
1034
|
-
nk_f32_t const *alpha, nk_f32_t const *beta, nk_u8_t *result);
|
|
1035
1084
|
#endif // NK_TARGET_SAPPHIRE
|
|
1036
1085
|
|
|
1037
1086
|
#if NK_TARGET_RVV
|
|
@@ -1213,6 +1262,10 @@ NK_INTERNAL nk_dtype_t nk_each_scale_input_dtype(nk_dtype_t dtype) {
|
|
|
1213
1262
|
case nk_u16_k: return nk_f32_k;
|
|
1214
1263
|
case nk_i8_k: return nk_f32_k;
|
|
1215
1264
|
case nk_u8_k: return nk_f32_k;
|
|
1265
|
+
case nk_e4m3_k: return nk_f32_k;
|
|
1266
|
+
case nk_e5m2_k: return nk_f32_k;
|
|
1267
|
+
case nk_e2m3_k: return nk_f32_k;
|
|
1268
|
+
case nk_e3m2_k: return nk_f32_k;
|
|
1216
1269
|
default: return nk_dtype_unknown_k;
|
|
1217
1270
|
}
|
|
1218
1271
|
}
|
|
@@ -1230,6 +1283,7 @@ NK_INTERNAL nk_dtype_t nk_each_scale_input_dtype(nk_dtype_t dtype) {
|
|
|
1230
1283
|
#include "numkong/each/icelake.h"
|
|
1231
1284
|
#include "numkong/each/sapphire.h"
|
|
1232
1285
|
#include "numkong/each/rvv.h"
|
|
1286
|
+
#include "numkong/each/v128relaxed.h"
|
|
1233
1287
|
|
|
1234
1288
|
#if defined(__cplusplus)
|
|
1235
1289
|
extern "C" {
|
|
@@ -1260,6 +1314,8 @@ NK_PUBLIC void nk_each_sum_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n
|
|
|
1260
1314
|
nk_each_sum_f32_neon(a, b, n, r);
|
|
1261
1315
|
#elif NK_TARGET_RVV
|
|
1262
1316
|
nk_each_sum_f32_rvv(a, b, n, r);
|
|
1317
|
+
#elif NK_TARGET_V128RELAXED
|
|
1318
|
+
nk_each_sum_f32_v128relaxed(a, b, n, r);
|
|
1263
1319
|
#else
|
|
1264
1320
|
nk_each_sum_f32_serial(a, b, n, r);
|
|
1265
1321
|
#endif
|
|
@@ -1274,6 +1330,8 @@ NK_PUBLIC void nk_each_sum_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_
|
|
|
1274
1330
|
nk_each_sum_bf16_neonbfdot(a, b, n, r);
|
|
1275
1331
|
#elif NK_TARGET_RVV
|
|
1276
1332
|
nk_each_sum_bf16_rvv(a, b, n, r);
|
|
1333
|
+
#elif NK_TARGET_V128RELAXED
|
|
1334
|
+
nk_each_sum_bf16_v128relaxed(a, b, n, r);
|
|
1277
1335
|
#else
|
|
1278
1336
|
nk_each_sum_bf16_serial(a, b, n, r);
|
|
1279
1337
|
#endif
|
|
@@ -1288,6 +1346,8 @@ NK_PUBLIC void nk_each_sum_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n
|
|
|
1288
1346
|
nk_each_sum_f16_neonhalf(a, b, n, r);
|
|
1289
1347
|
#elif NK_TARGET_RVV
|
|
1290
1348
|
nk_each_sum_f16_rvv(a, b, n, r);
|
|
1349
|
+
#elif NK_TARGET_V128RELAXED
|
|
1350
|
+
nk_each_sum_f16_v128relaxed(a, b, n, r);
|
|
1291
1351
|
#else
|
|
1292
1352
|
nk_each_sum_f16_serial(a, b, n, r);
|
|
1293
1353
|
#endif
|
|
@@ -1298,10 +1358,12 @@ NK_PUBLIC void nk_each_sum_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, n
|
|
|
1298
1358
|
nk_each_sum_i8_icelake(a, b, n, r);
|
|
1299
1359
|
#elif NK_TARGET_HASWELL
|
|
1300
1360
|
nk_each_sum_i8_haswell(a, b, n, r);
|
|
1301
|
-
#elif
|
|
1302
|
-
|
|
1361
|
+
#elif NK_TARGET_NEON
|
|
1362
|
+
nk_each_sum_i8_neon(a, b, n, r);
|
|
1303
1363
|
#elif NK_TARGET_RVV
|
|
1304
1364
|
nk_each_sum_i8_rvv(a, b, n, r);
|
|
1365
|
+
#elif NK_TARGET_V128RELAXED
|
|
1366
|
+
nk_each_sum_i8_v128relaxed(a, b, n, r);
|
|
1305
1367
|
#else
|
|
1306
1368
|
nk_each_sum_i8_serial(a, b, n, r);
|
|
1307
1369
|
#endif
|
|
@@ -1312,10 +1374,12 @@ NK_PUBLIC void nk_each_sum_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, n
|
|
|
1312
1374
|
nk_each_sum_u8_icelake(a, b, n, r);
|
|
1313
1375
|
#elif NK_TARGET_HASWELL
|
|
1314
1376
|
nk_each_sum_u8_haswell(a, b, n, r);
|
|
1315
|
-
#elif
|
|
1316
|
-
|
|
1377
|
+
#elif NK_TARGET_NEON
|
|
1378
|
+
nk_each_sum_u8_neon(a, b, n, r);
|
|
1317
1379
|
#elif NK_TARGET_RVV
|
|
1318
1380
|
nk_each_sum_u8_rvv(a, b, n, r);
|
|
1381
|
+
#elif NK_TARGET_V128RELAXED
|
|
1382
|
+
nk_each_sum_u8_v128relaxed(a, b, n, r);
|
|
1319
1383
|
#else
|
|
1320
1384
|
nk_each_sum_u8_serial(a, b, n, r);
|
|
1321
1385
|
#endif
|
|
@@ -1426,6 +1490,8 @@ NK_PUBLIC void nk_each_scale_f32(nk_f32_t const *a, nk_size_t n, nk_f32_t const
|
|
|
1426
1490
|
nk_each_scale_f32_neon(a, n, alpha, beta, r);
|
|
1427
1491
|
#elif NK_TARGET_RVV
|
|
1428
1492
|
nk_each_scale_f32_rvv(a, n, alpha, beta, r);
|
|
1493
|
+
#elif NK_TARGET_V128RELAXED
|
|
1494
|
+
nk_each_scale_f32_v128relaxed(a, n, alpha, beta, r);
|
|
1429
1495
|
#else
|
|
1430
1496
|
nk_each_scale_f32_serial(a, n, alpha, beta, r);
|
|
1431
1497
|
#endif
|
|
@@ -1441,6 +1507,8 @@ NK_PUBLIC void nk_each_scale_bf16(nk_bf16_t const *a, nk_size_t n, nk_f32_t cons
|
|
|
1441
1507
|
nk_each_scale_bf16_neonbfdot(a, n, alpha, beta, r);
|
|
1442
1508
|
#elif NK_TARGET_RVV
|
|
1443
1509
|
nk_each_scale_bf16_rvv(a, n, alpha, beta, r);
|
|
1510
|
+
#elif NK_TARGET_V128RELAXED
|
|
1511
|
+
nk_each_scale_bf16_v128relaxed(a, n, alpha, beta, r);
|
|
1444
1512
|
#else
|
|
1445
1513
|
nk_each_scale_bf16_serial(a, n, alpha, beta, r);
|
|
1446
1514
|
#endif
|
|
@@ -1456,6 +1524,8 @@ NK_PUBLIC void nk_each_scale_f16(nk_f16_t const *a, nk_size_t n, nk_f32_t const
|
|
|
1456
1524
|
nk_each_scale_f16_neonhalf(a, n, alpha, beta, r);
|
|
1457
1525
|
#elif NK_TARGET_RVV
|
|
1458
1526
|
nk_each_scale_f16_rvv(a, n, alpha, beta, r);
|
|
1527
|
+
#elif NK_TARGET_V128RELAXED
|
|
1528
|
+
nk_each_scale_f16_v128relaxed(a, n, alpha, beta, r);
|
|
1459
1529
|
#else
|
|
1460
1530
|
nk_each_scale_f16_serial(a, n, alpha, beta, r);
|
|
1461
1531
|
#endif
|
|
@@ -1473,6 +1543,8 @@ NK_PUBLIC void nk_each_scale_i8(nk_i8_t const *a, nk_size_t n, nk_f32_t const *a
|
|
|
1473
1543
|
nk_each_scale_i8_neonhalf(a, n, alpha, beta, r);
|
|
1474
1544
|
#elif NK_TARGET_RVV
|
|
1475
1545
|
nk_each_scale_i8_rvv(a, n, alpha, beta, r);
|
|
1546
|
+
#elif NK_TARGET_V128RELAXED
|
|
1547
|
+
nk_each_scale_i8_v128relaxed(a, n, alpha, beta, r);
|
|
1476
1548
|
#else
|
|
1477
1549
|
nk_each_scale_i8_serial(a, n, alpha, beta, r);
|
|
1478
1550
|
#endif
|
|
@@ -1490,6 +1562,8 @@ NK_PUBLIC void nk_each_scale_u8(nk_u8_t const *a, nk_size_t n, nk_f32_t const *a
|
|
|
1490
1562
|
nk_each_scale_u8_neonhalf(a, n, alpha, beta, r);
|
|
1491
1563
|
#elif NK_TARGET_RVV
|
|
1492
1564
|
nk_each_scale_u8_rvv(a, n, alpha, beta, r);
|
|
1565
|
+
#elif NK_TARGET_V128RELAXED
|
|
1566
|
+
nk_each_scale_u8_v128relaxed(a, n, alpha, beta, r);
|
|
1493
1567
|
#else
|
|
1494
1568
|
nk_each_scale_u8_serial(a, n, alpha, beta, r);
|
|
1495
1569
|
#endif
|
|
@@ -1606,6 +1680,8 @@ NK_PUBLIC void nk_each_blend_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t
|
|
|
1606
1680
|
nk_each_blend_f32_neon(a, b, n, alpha, beta, r);
|
|
1607
1681
|
#elif NK_TARGET_RVV
|
|
1608
1682
|
nk_each_blend_f32_rvv(a, b, n, alpha, beta, r);
|
|
1683
|
+
#elif NK_TARGET_V128RELAXED
|
|
1684
|
+
nk_each_blend_f32_v128relaxed(a, b, n, alpha, beta, r);
|
|
1609
1685
|
#else
|
|
1610
1686
|
nk_each_blend_f32_serial(a, b, n, alpha, beta, r);
|
|
1611
1687
|
#endif
|
|
@@ -1621,6 +1697,8 @@ NK_PUBLIC void nk_each_blend_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_siz
|
|
|
1621
1697
|
nk_each_blend_bf16_neonbfdot(a, b, n, alpha, beta, r);
|
|
1622
1698
|
#elif NK_TARGET_RVV
|
|
1623
1699
|
nk_each_blend_bf16_rvv(a, b, n, alpha, beta, r);
|
|
1700
|
+
#elif NK_TARGET_V128RELAXED
|
|
1701
|
+
nk_each_blend_bf16_v128relaxed(a, b, n, alpha, beta, r);
|
|
1624
1702
|
#else
|
|
1625
1703
|
nk_each_blend_bf16_serial(a, b, n, alpha, beta, r);
|
|
1626
1704
|
#endif
|
|
@@ -1636,6 +1714,8 @@ NK_PUBLIC void nk_each_blend_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t
|
|
|
1636
1714
|
nk_each_blend_f16_neonhalf(a, b, n, alpha, beta, r);
|
|
1637
1715
|
#elif NK_TARGET_RVV
|
|
1638
1716
|
nk_each_blend_f16_rvv(a, b, n, alpha, beta, r);
|
|
1717
|
+
#elif NK_TARGET_V128RELAXED
|
|
1718
|
+
nk_each_blend_f16_v128relaxed(a, b, n, alpha, beta, r);
|
|
1639
1719
|
#else
|
|
1640
1720
|
nk_each_blend_f16_serial(a, b, n, alpha, beta, r);
|
|
1641
1721
|
#endif
|
|
@@ -1651,6 +1731,8 @@ NK_PUBLIC void nk_each_blend_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n,
|
|
|
1651
1731
|
nk_each_blend_i8_neonhalf(a, b, n, alpha, beta, r);
|
|
1652
1732
|
#elif NK_TARGET_RVV
|
|
1653
1733
|
nk_each_blend_i8_rvv(a, b, n, alpha, beta, r);
|
|
1734
|
+
#elif NK_TARGET_V128RELAXED
|
|
1735
|
+
nk_each_blend_i8_v128relaxed(a, b, n, alpha, beta, r);
|
|
1654
1736
|
#else
|
|
1655
1737
|
nk_each_blend_i8_serial(a, b, n, alpha, beta, r);
|
|
1656
1738
|
#endif
|
|
@@ -1666,6 +1748,8 @@ NK_PUBLIC void nk_each_blend_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n,
|
|
|
1666
1748
|
nk_each_blend_u8_neonhalf(a, b, n, alpha, beta, r);
|
|
1667
1749
|
#elif NK_TARGET_RVV
|
|
1668
1750
|
nk_each_blend_u8_rvv(a, b, n, alpha, beta, r);
|
|
1751
|
+
#elif NK_TARGET_V128RELAXED
|
|
1752
|
+
nk_each_blend_u8_v128relaxed(a, b, n, alpha, beta, r);
|
|
1669
1753
|
#else
|
|
1670
1754
|
nk_each_blend_u8_serial(a, b, n, alpha, beta, r);
|
|
1671
1755
|
#endif
|
|
@@ -1726,6 +1810,8 @@ NK_PUBLIC void nk_each_fma_f32(nk_f32_t const *a, nk_f32_t const *b, nk_f32_t co
|
|
|
1726
1810
|
nk_each_fma_f32_neon(a, b, c, n, alpha, beta, r);
|
|
1727
1811
|
#elif NK_TARGET_RVV
|
|
1728
1812
|
nk_each_fma_f32_rvv(a, b, c, n, alpha, beta, r);
|
|
1813
|
+
#elif NK_TARGET_V128RELAXED
|
|
1814
|
+
nk_each_fma_f32_v128relaxed(a, b, c, n, alpha, beta, r);
|
|
1729
1815
|
#else
|
|
1730
1816
|
nk_each_fma_f32_serial(a, b, c, n, alpha, beta, r);
|
|
1731
1817
|
#endif
|
|
@@ -1741,6 +1827,8 @@ NK_PUBLIC void nk_each_fma_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_bf16_
|
|
|
1741
1827
|
nk_each_fma_bf16_neonbfdot(a, b, c, n, alpha, beta, r);
|
|
1742
1828
|
#elif NK_TARGET_RVV
|
|
1743
1829
|
nk_each_fma_bf16_rvv(a, b, c, n, alpha, beta, r);
|
|
1830
|
+
#elif NK_TARGET_V128RELAXED
|
|
1831
|
+
nk_each_fma_bf16_v128relaxed(a, b, c, n, alpha, beta, r);
|
|
1744
1832
|
#else
|
|
1745
1833
|
nk_each_fma_bf16_serial(a, b, c, n, alpha, beta, r);
|
|
1746
1834
|
#endif
|
|
@@ -1756,6 +1844,8 @@ NK_PUBLIC void nk_each_fma_f16(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t co
|
|
|
1756
1844
|
nk_each_fma_f16_neonhalf(a, b, c, n, alpha, beta, r);
|
|
1757
1845
|
#elif NK_TARGET_RVV
|
|
1758
1846
|
nk_each_fma_f16_rvv(a, b, c, n, alpha, beta, r);
|
|
1847
|
+
#elif NK_TARGET_V128RELAXED
|
|
1848
|
+
nk_each_fma_f16_v128relaxed(a, b, c, n, alpha, beta, r);
|
|
1759
1849
|
#else
|
|
1760
1850
|
nk_each_fma_f16_serial(a, b, c, n, alpha, beta, r);
|
|
1761
1851
|
#endif
|
|
@@ -1763,16 +1853,14 @@ NK_PUBLIC void nk_each_fma_f16(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t co
|
|
|
1763
1853
|
|
|
1764
1854
|
NK_PUBLIC void nk_each_fma_i8(nk_i8_t const *a, nk_i8_t const *b, nk_i8_t const *c, nk_size_t n, nk_f32_t const *alpha,
|
|
1765
1855
|
nk_f32_t const *beta, nk_i8_t *r) {
|
|
1766
|
-
#if
|
|
1767
|
-
nk_each_fma_i8_sapphire(a, b, c, n, alpha, beta, r);
|
|
1768
|
-
#elif NK_TARGET_SKYLAKE
|
|
1856
|
+
#if NK_TARGET_SKYLAKE
|
|
1769
1857
|
nk_each_fma_i8_skylake(a, b, c, n, alpha, beta, r);
|
|
1770
1858
|
#elif NK_TARGET_HASWELL
|
|
1771
1859
|
nk_each_fma_i8_haswell(a, b, c, n, alpha, beta, r);
|
|
1772
|
-
#elif NK_TARGET_NEONHALF
|
|
1773
|
-
nk_each_fma_i8_neonhalf(a, b, c, n, alpha, beta, r);
|
|
1774
1860
|
#elif NK_TARGET_RVV
|
|
1775
1861
|
nk_each_fma_i8_rvv(a, b, c, n, alpha, beta, r);
|
|
1862
|
+
#elif NK_TARGET_V128RELAXED
|
|
1863
|
+
nk_each_fma_i8_v128relaxed(a, b, c, n, alpha, beta, r);
|
|
1776
1864
|
#else
|
|
1777
1865
|
nk_each_fma_i8_serial(a, b, c, n, alpha, beta, r);
|
|
1778
1866
|
#endif
|
|
@@ -1780,16 +1868,14 @@ NK_PUBLIC void nk_each_fma_i8(nk_i8_t const *a, nk_i8_t const *b, nk_i8_t const
|
|
|
1780
1868
|
|
|
1781
1869
|
NK_PUBLIC void nk_each_fma_u8(nk_u8_t const *a, nk_u8_t const *b, nk_u8_t const *c, nk_size_t n, nk_f32_t const *alpha,
|
|
1782
1870
|
nk_f32_t const *beta, nk_u8_t *r) {
|
|
1783
|
-
#if
|
|
1784
|
-
nk_each_fma_u8_sapphire(a, b, c, n, alpha, beta, r);
|
|
1785
|
-
#elif NK_TARGET_SKYLAKE
|
|
1871
|
+
#if NK_TARGET_SKYLAKE
|
|
1786
1872
|
nk_each_fma_u8_skylake(a, b, c, n, alpha, beta, r);
|
|
1787
1873
|
#elif NK_TARGET_HASWELL
|
|
1788
1874
|
nk_each_fma_u8_haswell(a, b, c, n, alpha, beta, r);
|
|
1789
|
-
#elif NK_TARGET_NEONHALF
|
|
1790
|
-
nk_each_fma_u8_neonhalf(a, b, c, n, alpha, beta, r);
|
|
1791
1875
|
#elif NK_TARGET_RVV
|
|
1792
1876
|
nk_each_fma_u8_rvv(a, b, c, n, alpha, beta, r);
|
|
1877
|
+
#elif NK_TARGET_V128RELAXED
|
|
1878
|
+
nk_each_fma_u8_v128relaxed(a, b, c, n, alpha, beta, r);
|
|
1793
1879
|
#else
|
|
1794
1880
|
nk_each_fma_u8_serial(a, b, c, n, alpha, beta, r);
|
|
1795
1881
|
#endif
|
package/include/numkong/each.hpp
CHANGED
|
@@ -199,7 +199,7 @@ void fma(in_type_ const *a, in_type_ const *b, std::size_t d, in_type_ const *c,
|
|
|
199
199
|
|
|
200
200
|
namespace ashvardanian::numkong {
|
|
201
201
|
|
|
202
|
-
#pragma region
|
|
202
|
+
#pragma region Tensor Elementwise
|
|
203
203
|
|
|
204
204
|
/** @brief Scale: output[i] = α × input[i] + β. */
|
|
205
205
|
template <numeric_dtype value_type_, std::size_t max_rank_ = 8>
|
|
@@ -427,7 +427,7 @@ tensor<value_type_, allocator_type_, max_rank_> try_mul(tensor_view<value_type_,
|
|
|
427
427
|
return result;
|
|
428
428
|
}
|
|
429
429
|
|
|
430
|
-
#pragma endregion
|
|
430
|
+
#pragma endregion Tensor Elementwise
|
|
431
431
|
|
|
432
432
|
} // namespace ashvardanian::numkong
|
|
433
433
|
|
|
@@ -5,17 +5,17 @@ Both operate on arrays of latitude/longitude pairs in radians and produce distan
|
|
|
5
5
|
|
|
6
6
|
The Haversine formula computes the great-circle distance between two points:
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
$$
|
|
9
9
|
\text{haversine}(\phi_1, \lambda_1, \phi_2, \lambda_2) = 2R \arcsin\sqrt{\sin^2\frac{\phi_2 - \phi_1}{2} + \cos\phi_1 \cos\phi_2 \sin^2\frac{\lambda_2 - \lambda_1}{2}}
|
|
10
|
-
|
|
10
|
+
$$
|
|
11
11
|
|
|
12
12
|
where $R$ is Earth's mean radius and $(\phi, \lambda)$ are latitude and longitude in radians.
|
|
13
13
|
|
|
14
14
|
Vincenty's formula solves the inverse geodesic problem on an oblate spheroid, iteratively refining the reduced latitude difference until convergence:
|
|
15
15
|
|
|
16
|
-
|
|
16
|
+
$$
|
|
17
17
|
\text{vincenty}(\phi_1, \lambda_1, \phi_2, \lambda_2) = b \cdot A \cdot (\sigma - \Delta\sigma)
|
|
18
|
-
|
|
18
|
+
$$
|
|
19
19
|
|
|
20
20
|
where $a$ and $b$ are the equatorial and polar semi-axes of the WGS-84 ellipsoid, $\sigma$ is the angular separation, and $\Delta\sigma$ is the correction term computed through iterative convergence.
|
|
21
21
|
|
|
@@ -56,12 +56,12 @@ Each SIMD lane may converge at a different iteration count, so the kernel accumu
|
|
|
56
56
|
Early exit uses `_mm256_movemask_pd` for `f64` (or its `_mm256_movemask_ps` counterpart for `f32`) — when all 4 bits (for `f64`) or 8 bits (for `f32`) are set, the loop breaks.
|
|
57
57
|
Coincident points and equatorial edge cases are handled by blending safe values (ones) into the intermediate terms to avoid division by zero, without requiring branches that would diverge across SIMD lanes.
|
|
58
58
|
|
|
59
|
-
### Haversine Without Final Arc Conversion
|
|
59
|
+
### Potential Optimization: Haversine Without Final Arc Conversion
|
|
60
60
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
61
|
+
The haversine formula computes $d = 2R \cdot \text{asin}(\sqrt{h})$ where $h = \sin^2(\Delta\phi/2) + \cos\phi_1 \cos\phi_2 \cdot \sin^2(\Delta\lambda/2)$.
|
|
62
|
+
Since both `asin` and `sqrt` are monotonically increasing, comparing $h$ values directly produces the same ordering as comparing full haversine distances.
|
|
63
|
+
For ranking-only use cases, a future "similarity mode" could skip the final `sqrt`/`asin` conversion and return $h$ directly, eliminating the two most expensive operations in the pipeline.
|
|
64
|
+
Currently, all kernels compute the full distance.
|
|
65
65
|
|
|
66
66
|
## Performance
|
|
67
67
|
|
|
@@ -131,17 +131,17 @@ Measured with Wasmtime v42 (Cranelift backend).
|
|
|
131
131
|
|
|
132
132
|
#### WASM
|
|
133
133
|
|
|
134
|
-
Measured with Wasmtime
|
|
134
|
+
Measured with Wasmtime v43 (Cranelift backend).
|
|
135
135
|
|
|
136
136
|
| Kernel | ≤1° | ≤30° | ≤180° |
|
|
137
137
|
| :----------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
138
138
|
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
139
|
-
| `nk_haversine_f64_serial` |
|
|
140
|
-
| `nk_vincenty_f64_serial` |
|
|
141
|
-
| `nk_haversine_f64_v128relaxed` |
|
|
142
|
-
| `nk_vincenty_f64_v128relaxed` |
|
|
139
|
+
| `nk_haversine_f64_serial` | 5.12 mp/s, 1.12 km | 5.12 mp/s, 32.8 km | 5.07 mp/s, 148 km |
|
|
140
|
+
| `nk_vincenty_f64_serial` | 1.27 mp/s, 1.86 nm | 1.11 mp/s, 2.33 nm | 1.02 mp/s, 594 nm |
|
|
141
|
+
| `nk_haversine_f64_v128relaxed` | 109 mp/s, 1.12 km | 109 mp/s, 32.8 km | 109 mp/s, 148 km |
|
|
142
|
+
| `nk_vincenty_f64_v128relaxed` | 12.8 mp/s, 1.89 nm | 10.4 mp/s, 2.33 nm | 8.28 mp/s, 594 nm |
|
|
143
143
|
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
144
|
-
| `nk_haversine_f32_serial` |
|
|
145
|
-
| `nk_vincenty_f32_serial` |
|
|
146
|
-
| `nk_haversine_f32_v128relaxed` |
|
|
147
|
-
| `nk_vincenty_f32_v128relaxed` |
|
|
144
|
+
| `nk_haversine_f32_serial` | 20.9 mp/s, 20,000 km | 20.9 mp/s, 32.7 km | 21.0 mp/s, 136 km |
|
|
145
|
+
| `nk_vincenty_f32_serial` | 6.11 mp/s, 20,000 km | 4.30 mp/s, 12.0 m | 3.27 mp/s, 22.0 m |
|
|
146
|
+
| `nk_haversine_f32_v128relaxed` | 523 mp/s, 20,000 km | 524 mp/s, 32.7 km | 524 mp/s, 153 km |
|
|
147
|
+
| `nk_vincenty_f32_v128relaxed` | 76.9 mp/s, 12.0 m | 68.2 mp/s, 16.2 m | 26.8 mp/s, 18.0 m |
|