numkong 7.0.0 → 7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -8,23 +8,23 @@
|
|
|
8
8
|
*
|
|
9
9
|
* @section set_sve_instructions ARM SVE Instructions
|
|
10
10
|
*
|
|
11
|
-
* Intrinsic
|
|
12
|
-
* svld1_u8
|
|
13
|
-
* svld1_u32
|
|
14
|
-
* sveor_u8_m
|
|
15
|
-
* svand_u8_m
|
|
16
|
-
* svorr_u8_m
|
|
17
|
-
* svcnt_u8_x
|
|
18
|
-
* svadd_u8_z
|
|
19
|
-
* svaddv_u8
|
|
20
|
-
* svcmpeq_u32
|
|
21
|
-
* svcntp_b32
|
|
22
|
-
* svdup_n_u8
|
|
23
|
-
* svwhilelt_b8
|
|
24
|
-
* svwhilelt_b32
|
|
25
|
-
* svptrue_b8
|
|
26
|
-
* svcntb
|
|
27
|
-
* svcntw
|
|
11
|
+
* Intrinsic Instruction V1
|
|
12
|
+
* svld1_u8 LD1B (Z.B, P/Z, [Xn]) 4-6cy @ 2p
|
|
13
|
+
* svld1_u32 LD1W (Z.S, P/Z, [Xn]) 4-6cy @ 2p
|
|
14
|
+
* sveor_u8_m EOR (Z.B, P/M, Z.B, Z.B) 1cy @ 2p
|
|
15
|
+
* svand_u8_m AND (Z.B, P/M, Z.B, Z.B) 1cy @ 2p
|
|
16
|
+
* svorr_u8_m ORR (Z.B, P/M, Z.B, Z.B) 1cy @ 2p
|
|
17
|
+
* svcnt_u8_x CNT (Z.B, P/M, Z.B) 2cy @ 2p
|
|
18
|
+
* svadd_u8_z ADD (Z.B, P/M, Z.B, Z.B) 1cy @ 2p
|
|
19
|
+
* svaddv_u8 UADDV (D, P, Z.B) 6cy @ 1p
|
|
20
|
+
* svcmpeq_u32 CMPEQ (P.S, P/Z, Z.S, Z.S) 2cy @ 1p
|
|
21
|
+
* svcntp_b32 CNTP (Xd, P, P.S) 2cy @ 1p
|
|
22
|
+
* svdup_n_u8 DUP (Z.B, #imm) 1cy @ 2p
|
|
23
|
+
* svwhilelt_b8 WHILELT (P.B, Xn, Xm) 2cy @ 1p
|
|
24
|
+
* svwhilelt_b32 WHILELT (P.S, Xn, Xm) 2cy @ 1p
|
|
25
|
+
* svptrue_b8 PTRUE (P.B, pattern) 1cy @ 2p
|
|
26
|
+
* svcntb CNTB (Xd) 1cy @ 2p
|
|
27
|
+
* svcntw CNTW (Xd) 1cy @ 2p
|
|
28
28
|
*/
|
|
29
29
|
#ifndef NK_SET_SVE_H
|
|
30
30
|
#define NK_SET_SVE_H
|
|
@@ -46,7 +46,7 @@ extern "C" {
|
|
|
46
46
|
#pragma GCC target("arch=armv8.2-a+sve")
|
|
47
47
|
#endif
|
|
48
48
|
|
|
49
|
-
#pragma region
|
|
49
|
+
#pragma region Binary Sets
|
|
50
50
|
|
|
51
51
|
NK_PUBLIC void nk_hamming_u1_sve(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
|
|
52
52
|
nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
|
|
@@ -61,20 +61,20 @@ NK_PUBLIC void nk_hamming_u1_sve(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size
|
|
|
61
61
|
// On larger register sizes, SVE is faster.
|
|
62
62
|
nk_size_t i = 0, cycle = 0;
|
|
63
63
|
nk_u32_t differences = 0;
|
|
64
|
-
svuint8_t
|
|
65
|
-
svbool_t const
|
|
64
|
+
svuint8_t popcount_u8x = svdup_n_u8(0);
|
|
65
|
+
svbool_t const all_predicate_b8x = svptrue_b8();
|
|
66
66
|
while (i < n_bytes) {
|
|
67
67
|
do {
|
|
68
|
-
svbool_t
|
|
69
|
-
svuint8_t
|
|
70
|
-
svuint8_t
|
|
71
|
-
|
|
72
|
-
|
|
68
|
+
svbool_t active_predicate_b8x = svwhilelt_b8_u64(i, n_bytes);
|
|
69
|
+
svuint8_t a_u8x = svld1_u8(active_predicate_b8x, a + i);
|
|
70
|
+
svuint8_t b_u8x = svld1_u8(active_predicate_b8x, b + i);
|
|
71
|
+
popcount_u8x = svadd_u8_z(all_predicate_b8x, popcount_u8x,
|
|
72
|
+
svcnt_u8_x(all_predicate_b8x, sveor_u8_m(all_predicate_b8x, a_u8x, b_u8x)));
|
|
73
73
|
i += words_per_register;
|
|
74
74
|
++cycle;
|
|
75
75
|
} while (i < n_bytes && cycle < 31);
|
|
76
|
-
differences += svaddv_u8(
|
|
77
|
-
|
|
76
|
+
differences += svaddv_u8(all_predicate_b8x, popcount_u8x);
|
|
77
|
+
popcount_u8x = svdup_n_u8(0);
|
|
78
78
|
cycle = 0; // Reset the cycle counter.
|
|
79
79
|
}
|
|
80
80
|
|
|
@@ -94,45 +94,46 @@ NK_PUBLIC void nk_jaccard_u1_sve(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size
|
|
|
94
94
|
// On larger register sizes, SVE is faster.
|
|
95
95
|
nk_size_t i = 0, cycle = 0;
|
|
96
96
|
nk_u32_t intersection_count = 0, union_count = 0;
|
|
97
|
-
svuint8_t
|
|
98
|
-
svuint8_t
|
|
99
|
-
svbool_t const
|
|
97
|
+
svuint8_t intersection_popcount_u8x = svdup_n_u8(0);
|
|
98
|
+
svuint8_t union_popcount_u8x = svdup_n_u8(0);
|
|
99
|
+
svbool_t const all_predicate_b8x = svptrue_b8();
|
|
100
100
|
while (i < n_bytes) {
|
|
101
101
|
do {
|
|
102
|
-
svbool_t
|
|
103
|
-
svuint8_t
|
|
104
|
-
svuint8_t
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
102
|
+
svbool_t active_predicate_b8x = svwhilelt_b8_u64(i, n_bytes);
|
|
103
|
+
svuint8_t a_u8x = svld1_u8(active_predicate_b8x, a + i);
|
|
104
|
+
svuint8_t b_u8x = svld1_u8(active_predicate_b8x, b + i);
|
|
105
|
+
intersection_popcount_u8x = svadd_u8_z(
|
|
106
|
+
all_predicate_b8x, intersection_popcount_u8x,
|
|
107
|
+
svcnt_u8_x(all_predicate_b8x, svand_u8_m(all_predicate_b8x, a_u8x, b_u8x)));
|
|
108
|
+
union_popcount_u8x = svadd_u8_z(all_predicate_b8x, union_popcount_u8x,
|
|
109
|
+
svcnt_u8_x(all_predicate_b8x, svorr_u8_m(all_predicate_b8x, a_u8x, b_u8x)));
|
|
109
110
|
i += words_per_register;
|
|
110
111
|
++cycle;
|
|
111
112
|
} while (i < n_bytes && cycle < 31);
|
|
112
|
-
intersection_count += svaddv_u8(
|
|
113
|
-
|
|
114
|
-
union_count += svaddv_u8(
|
|
115
|
-
|
|
113
|
+
intersection_count += svaddv_u8(all_predicate_b8x, intersection_popcount_u8x);
|
|
114
|
+
intersection_popcount_u8x = svdup_n_u8(0);
|
|
115
|
+
union_count += svaddv_u8(all_predicate_b8x, union_popcount_u8x);
|
|
116
|
+
union_popcount_u8x = svdup_n_u8(0);
|
|
116
117
|
cycle = 0; // Reset the cycle counter.
|
|
117
118
|
}
|
|
118
119
|
|
|
119
120
|
*result = (union_count != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)union_count : 0.0f;
|
|
120
121
|
}
|
|
121
122
|
|
|
122
|
-
#pragma endregion
|
|
123
|
+
#pragma endregion Binary Sets
|
|
123
124
|
|
|
124
|
-
#pragma region
|
|
125
|
+
#pragma region Integer Sets
|
|
125
126
|
|
|
126
127
|
NK_PUBLIC void nk_jaccard_u32_sve(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
127
128
|
nk_size_t const words_per_register = svcntw();
|
|
128
129
|
nk_size_t i = 0;
|
|
129
130
|
nk_u32_t intersection_count = 0;
|
|
130
131
|
while (i < n) {
|
|
131
|
-
svbool_t
|
|
132
|
-
svuint32_t
|
|
133
|
-
svuint32_t
|
|
134
|
-
svbool_t
|
|
135
|
-
intersection_count += svcntp_b32(
|
|
132
|
+
svbool_t active_predicate_b32x = svwhilelt_b32_u64(i, n);
|
|
133
|
+
svuint32_t a_u32x = svld1_u32(active_predicate_b32x, a + i);
|
|
134
|
+
svuint32_t b_u32x = svld1_u32(active_predicate_b32x, b + i);
|
|
135
|
+
svbool_t equality_predicate_b32x = svcmpeq_u32(active_predicate_b32x, a_u32x, b_u32x);
|
|
136
|
+
intersection_count += svcntp_b32(active_predicate_b32x, equality_predicate_b32x);
|
|
136
137
|
i += words_per_register;
|
|
137
138
|
}
|
|
138
139
|
*result = (n != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)n : 0.0f;
|
|
@@ -143,11 +144,11 @@ NK_PUBLIC void nk_hamming_u8_sve(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n
|
|
|
143
144
|
nk_size_t i = 0;
|
|
144
145
|
nk_u32_t differences = 0;
|
|
145
146
|
while (i < n) {
|
|
146
|
-
svbool_t
|
|
147
|
-
svuint8_t
|
|
148
|
-
svuint8_t
|
|
149
|
-
svbool_t
|
|
150
|
-
differences += svcntp_b8(
|
|
147
|
+
svbool_t active_predicate_b8x = svwhilelt_b8_u64(i, n);
|
|
148
|
+
svuint8_t a_u8x = svld1_u8(active_predicate_b8x, a + i);
|
|
149
|
+
svuint8_t b_u8x = svld1_u8(active_predicate_b8x, b + i);
|
|
150
|
+
svbool_t neq_predicate_b8x = svcmpne_u8(active_predicate_b8x, a_u8x, b_u8x);
|
|
151
|
+
differences += svcntp_b8(active_predicate_b8x, neq_predicate_b8x);
|
|
151
152
|
i += bytes_per_register;
|
|
152
153
|
}
|
|
153
154
|
*result = differences;
|
|
@@ -158,17 +159,17 @@ NK_PUBLIC void nk_jaccard_u16_sve(nk_u16_t const *a, nk_u16_t const *b, nk_size_
|
|
|
158
159
|
nk_size_t i = 0;
|
|
159
160
|
nk_u32_t intersection_count = 0;
|
|
160
161
|
while (i < n) {
|
|
161
|
-
svbool_t
|
|
162
|
-
svuint16_t
|
|
163
|
-
svuint16_t
|
|
164
|
-
svbool_t
|
|
165
|
-
intersection_count += svcntp_b16(
|
|
162
|
+
svbool_t active_predicate_b16x = svwhilelt_b16_u64(i, n);
|
|
163
|
+
svuint16_t a_u16x = svld1_u16(active_predicate_b16x, a + i);
|
|
164
|
+
svuint16_t b_u16x = svld1_u16(active_predicate_b16x, b + i);
|
|
165
|
+
svbool_t equality_predicate_b16x = svcmpeq_u16(active_predicate_b16x, a_u16x, b_u16x);
|
|
166
|
+
intersection_count += svcntp_b16(active_predicate_b16x, equality_predicate_b16x);
|
|
166
167
|
i += halfwords_per_register;
|
|
167
168
|
}
|
|
168
169
|
*result = (n != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)n : 0.0f;
|
|
169
170
|
}
|
|
170
171
|
|
|
171
|
-
#pragma endregion
|
|
172
|
+
#pragma endregion Integer Sets
|
|
172
173
|
|
|
173
174
|
#if defined(__clang__)
|
|
174
175
|
#pragma clang attribute pop
|
|
@@ -30,7 +30,7 @@ extern "C" {
|
|
|
30
30
|
#pragma clang attribute push(__attribute__((target("relaxed-simd"))), apply_to = function)
|
|
31
31
|
#endif
|
|
32
32
|
|
|
33
|
-
#pragma region
|
|
33
|
+
#pragma region Binary Sets
|
|
34
34
|
|
|
35
35
|
NK_PUBLIC void nk_hamming_u1_v128relaxed(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
|
|
36
36
|
nk_u8_t const *a_bytes = (nk_u8_t const *)a;
|
|
@@ -121,9 +121,9 @@ NK_PUBLIC void nk_jaccard_u1_v128relaxed(nk_u1x8_t const *a, nk_u1x8_t const *b,
|
|
|
121
121
|
*result = union_count > 0 ? 1.0f - ((nk_f32_t)intersection / (nk_f32_t)union_count) : 0.0f;
|
|
122
122
|
}
|
|
123
123
|
|
|
124
|
-
#pragma endregion
|
|
124
|
+
#pragma endregion Binary Sets
|
|
125
125
|
|
|
126
|
-
#pragma region
|
|
126
|
+
#pragma region Integer Sets
|
|
127
127
|
|
|
128
128
|
NK_PUBLIC void nk_hamming_u8_v128relaxed(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
|
|
129
129
|
nk_u32_t sum_total = 0;
|
|
@@ -197,9 +197,9 @@ NK_PUBLIC void nk_jaccard_u16_v128relaxed(nk_u16_t const *a, nk_u16_t const *b,
|
|
|
197
197
|
*result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
|
|
198
198
|
}
|
|
199
199
|
|
|
200
|
-
#pragma endregion
|
|
200
|
+
#pragma endregion Integer Sets
|
|
201
201
|
|
|
202
|
-
#pragma region
|
|
202
|
+
#pragma region Binary Sets from Dot
|
|
203
203
|
|
|
204
204
|
NK_INTERNAL void nk_hamming_u32x4_from_dot_v128relaxed_( //
|
|
205
205
|
nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops, nk_b128_vec_t *results) {
|
|
@@ -226,7 +226,7 @@ NK_INTERNAL void nk_jaccard_f32x4_from_dot_v128relaxed_( //
|
|
|
226
226
|
results->v128 = wasm_i32x4_relaxed_laneselect(zero_f32x4, jaccard_f32x4, zero_mask_u32x4);
|
|
227
227
|
}
|
|
228
228
|
|
|
229
|
-
#pragma endregion
|
|
229
|
+
#pragma endregion Binary Sets from Dot
|
|
230
230
|
|
|
231
231
|
#if defined(__clang__)
|
|
232
232
|
#pragma clang attribute pop
|
package/include/numkong/set.h
CHANGED
|
@@ -41,12 +41,12 @@
|
|
|
41
41
|
* On binary vectors, when computing Jaccard distance, the CPU often struggles to compute the
|
|
42
42
|
* large number of required population counts. There are several instructions we should keep in mind:
|
|
43
43
|
*
|
|
44
|
-
* Intrinsic
|
|
45
|
-
* _mm512_popcnt_epi64
|
|
46
|
-
* _mm512_shuffle_epi8
|
|
47
|
-
* _mm512_sad_epu8
|
|
48
|
-
* _mm512_ternarylogic_epi64
|
|
49
|
-
* _mm512_gf2p8mul_epi8
|
|
44
|
+
* Intrinsic Instruction Icelake Genoa
|
|
45
|
+
* _mm512_popcnt_epi64 VPOPCNTQ (ZMM, K, ZMM) 3cy @ p5 2cy @ p01
|
|
46
|
+
* _mm512_shuffle_epi8 VPSHUFB (ZMM, ZMM, ZMM) 1cy @ p5 2cy @ p12
|
|
47
|
+
* _mm512_sad_epu8 VPSADBW (ZMM, ZMM, ZMM) 3cy @ p5 3cy @ p01
|
|
48
|
+
* _mm512_ternarylogic_epi64 VPTERNLOGQ (ZMM, ZMM, ZMM, I8) 1cy @ p05 1cy @ p0123
|
|
49
|
+
* _mm512_gf2p8mul_epi8 VGF2P8MULB (ZMM, ZMM, ZMM) 5cy @ p0 3cy @ p01
|
|
50
50
|
*
|
|
51
51
|
* On Ice Lake, VPOPCNTQ bottlenecks on port 5. On AMD Genoa/Turin, it dual-issues
|
|
52
52
|
* on ports 0-1, making native popcount significantly faster without CSA tricks.
|
|
@@ -123,7 +123,7 @@
|
|
|
123
123
|
*
|
|
124
124
|
* @section references References
|
|
125
125
|
*
|
|
126
|
-
* - Intel Intrinsics Guide: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
|
|
126
|
+
* - Intel Intrinsics Guide: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
|
|
127
127
|
* - Arm Intrinsics Reference: https://developer.arm.com/architectures/instruction-sets/intrinsics/
|
|
128
128
|
* - Muła et al. "Faster Population Counts": https://arxiv.org/pdf/1611.07612
|
|
129
129
|
* - Muła SSE POPCOUNT experiments: https://github.com/WojciechMula/sse-popcount
|
|
@@ -344,9 +344,11 @@ NK_INTERNAL nk_dtype_t nk_jaccard_output_dtype(nk_dtype_t dtype) {
|
|
|
344
344
|
#include "numkong/set/sve.h"
|
|
345
345
|
#include "numkong/set/icelake.h"
|
|
346
346
|
#include "numkong/set/haswell.h"
|
|
347
|
+
#include "numkong/set/powervsx.h"
|
|
347
348
|
#include "numkong/set/v128relaxed.h"
|
|
348
349
|
#include "numkong/set/rvv.h"
|
|
349
350
|
#include "numkong/set/rvvbb.h"
|
|
351
|
+
#include "numkong/set/loongsonasx.h"
|
|
350
352
|
|
|
351
353
|
#if defined(__cplusplus)
|
|
352
354
|
extern "C" {
|
|
@@ -365,6 +367,10 @@ NK_PUBLIC void nk_hamming_u1(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n
|
|
|
365
367
|
nk_hamming_u1_icelake(a, b, n, result);
|
|
366
368
|
#elif NK_TARGET_HASWELL
|
|
367
369
|
nk_hamming_u1_haswell(a, b, n, result);
|
|
370
|
+
#elif NK_TARGET_POWERVSX
|
|
371
|
+
nk_hamming_u1_powervsx(a, b, n, result);
|
|
372
|
+
#elif NK_TARGET_LOONGSONASX
|
|
373
|
+
nk_hamming_u1_loongsonasx(a, b, n, result);
|
|
368
374
|
#elif NK_TARGET_RVVBB
|
|
369
375
|
nk_hamming_u1_rvvbb(a, b, n, result);
|
|
370
376
|
#elif NK_TARGET_RVV
|
|
@@ -385,6 +391,10 @@ NK_PUBLIC void nk_jaccard_u1(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n
|
|
|
385
391
|
nk_jaccard_u1_icelake(a, b, n, result);
|
|
386
392
|
#elif NK_TARGET_HASWELL
|
|
387
393
|
nk_jaccard_u1_haswell(a, b, n, result);
|
|
394
|
+
#elif NK_TARGET_POWERVSX
|
|
395
|
+
nk_jaccard_u1_powervsx(a, b, n, result);
|
|
396
|
+
#elif NK_TARGET_LOONGSONASX
|
|
397
|
+
nk_jaccard_u1_loongsonasx(a, b, n, result);
|
|
388
398
|
#elif NK_TARGET_RVVBB
|
|
389
399
|
nk_jaccard_u1_rvvbb(a, b, n, result);
|
|
390
400
|
#elif NK_TARGET_RVV
|
|
@@ -423,6 +433,10 @@ NK_PUBLIC void nk_hamming_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk
|
|
|
423
433
|
nk_hamming_u8_icelake(a, b, n, result);
|
|
424
434
|
#elif NK_TARGET_HASWELL
|
|
425
435
|
nk_hamming_u8_haswell(a, b, n, result);
|
|
436
|
+
#elif NK_TARGET_POWERVSX
|
|
437
|
+
nk_hamming_u8_powervsx(a, b, n, result);
|
|
438
|
+
#elif NK_TARGET_LOONGSONASX
|
|
439
|
+
nk_hamming_u8_loongsonasx(a, b, n, result);
|
|
426
440
|
#elif NK_TARGET_RVV
|
|
427
441
|
nk_hamming_u8_rvv(a, b, n, result);
|
|
428
442
|
#else
|
|
@@ -4,17 +4,17 @@ NumKong implements batched M×N Hamming and Jaccard distance matrices for binary
|
|
|
4
4
|
|
|
5
5
|
Hamming distance from batched dot products:
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
$$
|
|
8
8
|
D_{ij} = \|A_i\|_1 + \|B_j\|_1 - 2 \cdot \text{dot}(A_i, B_j)
|
|
9
|
-
|
|
9
|
+
$$
|
|
10
10
|
|
|
11
11
|
Where dot = popcount(AND), measuring intersection size.
|
|
12
12
|
|
|
13
13
|
Jaccard distance from batched dot products:
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
$$
|
|
16
16
|
D_{ij} = 1 - \frac{\text{dot}(A_i, B_j)}{\|A_i\|_1 + \|B_j\|_1 - \text{dot}(A_i, B_j)}
|
|
17
|
-
|
|
17
|
+
$$
|
|
18
18
|
|
|
19
19
|
Reformulating as Python pseudocode:
|
|
20
20
|
|
|
@@ -112,38 +112,38 @@ Measured with Wasmtime v42 (Cranelift backend).
|
|
|
112
112
|
| `nk_jaccards_symmetric_u1_serial` | 3.57 gso/s, 0 ulp | 13.3 gso/s, 0 ulp | 40.6 gso/s, 0 ulp |
|
|
113
113
|
| `nk_jaccards_symmetric_u1_v128relaxed` | 3.65 gso/s, 0 ulp | 13.9 gso/s, 0 ulp | 42.2 gso/s, 0 ulp |
|
|
114
114
|
|
|
115
|
-
### Apple
|
|
115
|
+
### Apple M5
|
|
116
116
|
|
|
117
117
|
#### Native
|
|
118
118
|
|
|
119
119
|
| Kernel | 256³ | 1024³ | 4096³ |
|
|
120
120
|
| :--------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
121
121
|
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
122
|
-
| `nk_hammings_packed_u1_serial` |
|
|
123
|
-
| `nk_hammings_symmetric_u1_serial` |
|
|
124
|
-
| `nk_jaccards_packed_u1_serial` |
|
|
125
|
-
| `nk_jaccards_symmetric_u1_serial` |
|
|
126
|
-
| `nk_hammings_packed_u1_neon` |
|
|
127
|
-
| `nk_hammings_symmetric_u1_neon` |
|
|
128
|
-
| `nk_jaccards_packed_u1_neon` |
|
|
129
|
-
| `nk_jaccards_symmetric_u1_neon` |
|
|
130
|
-
| `nk_hammings_packed_u1_smebi32` |
|
|
131
|
-
| `nk_hammings_symmetric_u1_smebi32` |
|
|
132
|
-
| `nk_jaccards_packed_u1_smebi32` |
|
|
133
|
-
| `nk_jaccards_symmetric_u1_smebi32` |
|
|
122
|
+
| `nk_hammings_packed_u1_serial` | 156 gso/s | 231 gso/s | 262 gso/s |
|
|
123
|
+
| `nk_hammings_symmetric_u1_serial` | 106 gso/s | 196 gso/s | 246 gso/s |
|
|
124
|
+
| `nk_jaccards_packed_u1_serial` | 136 gso/s, 0 ulp | 221 gso/s, 0 ulp | 262 gso/s, 0 ulp |
|
|
125
|
+
| `nk_jaccards_symmetric_u1_serial` | 96.5 gso/s, 0 ulp | 183 gso/s, 0 ulp | 244 gso/s, 0 ulp |
|
|
126
|
+
| `nk_hammings_packed_u1_neon` | 321 gso/s | 436 gso/s | 508 gso/s |
|
|
127
|
+
| `nk_hammings_symmetric_u1_neon` | 126 gso/s | 239 gso/s | 318 gso/s |
|
|
128
|
+
| `nk_jaccards_packed_u1_neon` | 271 gso/s, 0 ulp | 423 gso/s, 0 ulp | 503 gso/s, 0 ulp |
|
|
129
|
+
| `nk_jaccards_symmetric_u1_neon` | 120 gso/s, 0 ulp | 233 gso/s, 0 ulp | 316 gso/s, 0 ulp |
|
|
130
|
+
| `nk_hammings_packed_u1_smebi32` | 3,286 gso/s | 7,303 gso/s | 11,269 gso/s |
|
|
131
|
+
| `nk_hammings_symmetric_u1_smebi32` | 1,872 gso/s | 5,332 gso/s | 4,079 gso/s |
|
|
132
|
+
| `nk_jaccards_packed_u1_smebi32` | 371 gso/s, 0 ulp | 1,735 gso/s, 0 ulp | 4,348 gso/s, 0 ulp |
|
|
133
|
+
| `nk_jaccards_symmetric_u1_smebi32` | 83.1 gso/s, 0 ulp | 358 gso/s, 0 ulp | 1,005 gso/s, 0 ulp |
|
|
134
134
|
|
|
135
135
|
#### WASM
|
|
136
136
|
|
|
137
|
-
Measured with Wasmtime
|
|
137
|
+
Measured with Wasmtime v43 (Cranelift backend).
|
|
138
138
|
|
|
139
139
|
| Kernel | 256³ | 1024³ | 4096³ |
|
|
140
140
|
| :------------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
141
141
|
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
142
|
-
| `nk_hammings_packed_u1_serial` |
|
|
143
|
-
| `nk_hammings_symmetric_u1_serial` |
|
|
144
|
-
| `nk_jaccards_packed_u1_serial` |
|
|
145
|
-
| `nk_jaccards_symmetric_u1_serial` |
|
|
146
|
-
| `nk_hammings_packed_u1_v128relaxed` |
|
|
147
|
-
| `nk_hammings_symmetric_u1_v128relaxed` |
|
|
148
|
-
| `nk_jaccards_packed_u1_v128relaxed` |
|
|
149
|
-
| `nk_jaccards_symmetric_u1_v128relaxed` |
|
|
142
|
+
| `nk_hammings_packed_u1_serial` | 99.3 gso/s | 127 gso/s | 154 gso/s |
|
|
143
|
+
| `nk_hammings_symmetric_u1_serial` | 63.7 gso/s | 142 gso/s | 210 gso/s |
|
|
144
|
+
| `nk_jaccards_packed_u1_serial` | 92.2 gso/s, 0 ulp | 123 gso/s, 0 ulp | 153 gso/s, 0 ulp |
|
|
145
|
+
| `nk_jaccards_symmetric_u1_serial` | 59.3 gso/s, 0 ulp | 142 gso/s, 0 ulp | 207 gso/s, 0 ulp |
|
|
146
|
+
| `nk_hammings_packed_u1_v128relaxed` | 266 gso/s | 378 gso/s | 426 gso/s |
|
|
147
|
+
| `nk_hammings_symmetric_u1_v128relaxed` | 72.2 gso/s | 185 gso/s | 259 gso/s |
|
|
148
|
+
| `nk_jaccards_packed_u1_v128relaxed` | 243 gso/s, 0 ulp | 370 gso/s, 0 ulp | 424 gso/s, 0 ulp |
|
|
149
|
+
| `nk_jaccards_symmetric_u1_v128relaxed` | 72.9 gso/s, 0 ulp | 183 gso/s, 0 ulp | 257 gso/s, 0 ulp |
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief Batched Set Operations for LoongArch LASX (256-bit).
|
|
3
|
+
* @file include/numkong/sets/loongsonasx.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date March 25, 2026
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/sets.h
|
|
8
|
+
*/
|
|
9
|
+
#ifndef NK_SETS_LOONGSONASX_H
|
|
10
|
+
#define NK_SETS_LOONGSONASX_H
|
|
11
|
+
|
|
12
|
+
#if NK_TARGET_LOONGARCH_
|
|
13
|
+
#if NK_TARGET_LOONGSONASX
|
|
14
|
+
|
|
15
|
+
#include "numkong/set/loongsonasx.h"
|
|
16
|
+
#include "numkong/dots/loongsonasx.h"
|
|
17
|
+
|
|
18
|
+
#if defined(__cplusplus)
|
|
19
|
+
extern "C" {
|
|
20
|
+
#endif
|
|
21
|
+
|
|
22
|
+
// Instantiates the packed cross-distance template for `hamming` over `u1` on the `loongsonasx` backend.
// It pairs the `nk_dots_packed_u1_loongsonasx` kernel with the `nk_hamming_u32x4_from_dot_loongsonasx_`
// finalizer; per the inline labels the norm value type is `u32` and each stored value covers 8 dimensions.
// NOTE(review): presumably expands to `nk_hammings_packed_u1_loongsonasx` - confirm against the macro definition.
// NOTE(review): partial load/store use the `_serial_` helpers, unlike the full-width `_loongsonasx_` ones - confirm intended.
nk_define_cross_normalized_packed_(hamming, u1, loongsonasx, u1x8, u1x8, u32, /*norm_value_type=*/u32, u32,
|
|
23
|
+
nk_b128_vec_t, nk_dots_packed_u1_loongsonasx, nk_hamming_u32x4_from_dot_loongsonasx_,
|
|
24
|
+
nk_dots_reduce_sum_u1_, nk_load_b128_loongsonasx_, nk_partial_load_b32x4_serial_,
|
|
25
|
+
nk_store_b128_loongsonasx_, nk_partial_store_b32x4_serial_,
|
|
26
|
+
/*dimensions_per_value=*/8)
|
|
27
|
+
|
|
28
|
+
// Instantiates the packed cross-distance template for `jaccard` over `u1` on the `loongsonasx` backend.
// It reuses the `nk_dots_packed_u1_loongsonasx` kernel and finalizes with `nk_jaccard_f32x4_from_dot_loongsonasx_`;
// the eighth argument is `f32` here (it is `u32` in the Hamming instantiation), presumably the result type - TODO confirm.
// NOTE(review): partial load/store use the `_serial_` helpers, unlike the full-width `_loongsonasx_` ones - confirm intended.
nk_define_cross_normalized_packed_(jaccard, u1, loongsonasx, u1x8, u1x8, u32, /*norm_value_type=*/u32, f32,
|
|
29
|
+
nk_b128_vec_t, nk_dots_packed_u1_loongsonasx, nk_jaccard_f32x4_from_dot_loongsonasx_,
|
|
30
|
+
nk_dots_reduce_sum_u1_, nk_load_b128_loongsonasx_, nk_partial_load_b32x4_serial_,
|
|
31
|
+
nk_store_b128_loongsonasx_, nk_partial_store_b32x4_serial_,
|
|
32
|
+
/*dimensions_per_value=*/8)
|
|
33
|
+
|
|
34
|
+
// Instantiates the symmetric cross-distance template for `hamming` over `u1` on the `loongsonasx` backend.
// It pairs the `nk_dots_symmetric_u1_loongsonasx` kernel with the `nk_hamming_u32x4_from_dot_loongsonasx_`
// finalizer; per the inline labels the norm value type is `u32` and each stored value covers 8 dimensions.
// NOTE(review): presumably expands to `nk_hammings_symmetric_u1_loongsonasx` - confirm against the macro definition.
nk_define_cross_normalized_symmetric_(hamming, u1, loongsonasx, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
|
|
35
|
+
nk_dots_symmetric_u1_loongsonasx, nk_hamming_u32x4_from_dot_loongsonasx_,
|
|
36
|
+
nk_dots_reduce_sum_u1_, nk_load_b128_loongsonasx_, nk_partial_load_b32x4_serial_,
|
|
37
|
+
nk_store_b128_loongsonasx_, nk_partial_store_b32x4_serial_,
|
|
38
|
+
/*dimensions_per_value=*/8)
|
|
39
|
+
|
|
40
|
+
// Instantiates the symmetric cross-distance template for `jaccard` over `u1` on the `loongsonasx` backend.
// It reuses the `nk_dots_symmetric_u1_loongsonasx` kernel and finalizes with `nk_jaccard_f32x4_from_dot_loongsonasx_`;
// the seventh argument is `f32` here (it is `u32` in the Hamming instantiation), presumably the result type - TODO confirm.
nk_define_cross_normalized_symmetric_(jaccard, u1, loongsonasx, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
|
|
41
|
+
nk_dots_symmetric_u1_loongsonasx, nk_jaccard_f32x4_from_dot_loongsonasx_,
|
|
42
|
+
nk_dots_reduce_sum_u1_, nk_load_b128_loongsonasx_, nk_partial_load_b32x4_serial_,
|
|
43
|
+
nk_store_b128_loongsonasx_, nk_partial_store_b32x4_serial_,
|
|
44
|
+
/*dimensions_per_value=*/8)
|
|
45
|
+
|
|
46
|
+
#if defined(__cplusplus)
|
|
47
|
+
} // extern "C"
|
|
48
|
+
#endif
|
|
49
|
+
|
|
50
|
+
#endif // NK_TARGET_LOONGSONASX
|
|
51
|
+
#endif // NK_TARGET_LOONGARCH_
|
|
52
|
+
#endif // NK_SETS_LOONGSONASX_H
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief Batched Set Operations for Power VSX.
|
|
3
|
+
* @file include/numkong/sets/powervsx.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date March 23, 2026
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/sets.h
|
|
8
|
+
*/
|
|
9
|
+
#ifndef NK_SETS_POWERVSX_H
|
|
10
|
+
#define NK_SETS_POWERVSX_H
|
|
11
|
+
|
|
12
|
+
#if NK_TARGET_POWER_
|
|
13
|
+
#if NK_TARGET_POWERVSX
|
|
14
|
+
|
|
15
|
+
#include "numkong/set/powervsx.h"
|
|
16
|
+
#include "numkong/dots/powervsx.h"
|
|
17
|
+
|
|
18
|
+
// Request the `power9-vector` target for the functions defined below, until the matching pop near the end
// of this header: Clang uses its attribute stack (applied to functions), GCC uses push_options + target.
#if defined(__clang__)
|
|
19
|
+
#pragma clang attribute push(__attribute__((target("power9-vector"))), apply_to = function)
|
|
20
|
+
#elif defined(__GNUC__)
|
|
21
|
+
#pragma GCC push_options
|
|
22
|
+
#pragma GCC target("power9-vector")
|
|
23
|
+
#endif
|
|
24
|
+
|
|
25
|
+
#if defined(__cplusplus)
|
|
26
|
+
extern "C" {
|
|
27
|
+
#endif
|
|
28
|
+
|
|
29
|
+
// Instantiates the packed cross-distance template for `hamming` over `u1` on the `powervsx` backend.
// It pairs the `nk_dots_packed_u1_powervsx` kernel with the `nk_hamming_u32x4_from_dot_powervsx_` finalizer;
// all four load/store helpers passed here are the `_powervsx_` variants.
// NOTE(review): presumably expands to `nk_hammings_packed_u1_powervsx` - confirm against the macro definition.
nk_define_cross_normalized_packed_(hamming, u1, powervsx, u1x8, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
|
|
30
|
+
nk_dots_packed_u1_powervsx, nk_hamming_u32x4_from_dot_powervsx_,
|
|
31
|
+
nk_dots_reduce_sum_u1_, nk_load_b128_powervsx_, nk_partial_load_b32x4_powervsx_,
|
|
32
|
+
nk_store_b128_powervsx_, nk_partial_store_b32x4_powervsx_,
|
|
33
|
+
/*dimensions_per_value=*/8)
|
|
34
|
+
|
|
35
|
+
// Instantiates the packed cross-distance template for `jaccard` over `u1` on the `powervsx` backend.
// It reuses the `nk_dots_packed_u1_powervsx` kernel and finalizes with `nk_jaccard_f32x4_from_dot_powervsx_`;
// the eighth argument is `f32` here (it is `u32` in the Hamming instantiation), presumably the result type - TODO confirm.
nk_define_cross_normalized_packed_(jaccard, u1, powervsx, u1x8, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
|
|
36
|
+
nk_dots_packed_u1_powervsx, nk_jaccard_f32x4_from_dot_powervsx_,
|
|
37
|
+
nk_dots_reduce_sum_u1_, nk_load_b128_powervsx_, nk_partial_load_b32x4_powervsx_,
|
|
38
|
+
nk_store_b128_powervsx_, nk_partial_store_b32x4_powervsx_,
|
|
39
|
+
/*dimensions_per_value=*/8)
|
|
40
|
+
|
|
41
|
+
// Instantiates the symmetric cross-distance template for `hamming` over `u1` on the `powervsx` backend.
// It pairs the `nk_dots_symmetric_u1_powervsx` kernel with the `nk_hamming_u32x4_from_dot_powervsx_` finalizer;
// per the inline labels the norm value type is `u32` and each stored value covers 8 dimensions.
// NOTE(review): presumably expands to `nk_hammings_symmetric_u1_powervsx` - confirm against the macro definition.
nk_define_cross_normalized_symmetric_(hamming, u1, powervsx, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
|
|
42
|
+
nk_dots_symmetric_u1_powervsx, nk_hamming_u32x4_from_dot_powervsx_,
|
|
43
|
+
nk_dots_reduce_sum_u1_, nk_load_b128_powervsx_, nk_partial_load_b32x4_powervsx_,
|
|
44
|
+
nk_store_b128_powervsx_, nk_partial_store_b32x4_powervsx_,
|
|
45
|
+
/*dimensions_per_value=*/8)
|
|
46
|
+
|
|
47
|
+
// Instantiates the symmetric cross-distance template for `jaccard` over `u1` on the `powervsx` backend.
// It reuses the `nk_dots_symmetric_u1_powervsx` kernel and finalizes with `nk_jaccard_f32x4_from_dot_powervsx_`;
// the seventh argument is `f32` here (it is `u32` in the Hamming instantiation), presumably the result type - TODO confirm.
nk_define_cross_normalized_symmetric_(jaccard, u1, powervsx, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
|
|
48
|
+
nk_dots_symmetric_u1_powervsx, nk_jaccard_f32x4_from_dot_powervsx_,
|
|
49
|
+
nk_dots_reduce_sum_u1_, nk_load_b128_powervsx_, nk_partial_load_b32x4_powervsx_,
|
|
50
|
+
nk_store_b128_powervsx_, nk_partial_store_b32x4_powervsx_,
|
|
51
|
+
/*dimensions_per_value=*/8)
|
|
52
|
+
|
|
53
|
+
#if defined(__cplusplus)
|
|
54
|
+
} // extern "C"
|
|
55
|
+
#endif
|
|
56
|
+
|
|
57
|
+
#if defined(__clang__)
|
|
58
|
+
#pragma clang attribute pop
|
|
59
|
+
#elif defined(__GNUC__)
|
|
60
|
+
#pragma GCC pop_options
|
|
61
|
+
#endif
|
|
62
|
+
|
|
63
|
+
#endif // NK_TARGET_POWERVSX
|
|
64
|
+
#endif // NK_TARGET_POWER_
|
|
65
|
+
#endif // NK_SETS_POWERVSX_H
|