numkong 7.0.0 → 7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -8,13 +8,13 @@
|
|
|
8
8
|
*
|
|
9
9
|
* @section scalars_neon_instructions Key NEON Scalar Instructions
|
|
10
10
|
*
|
|
11
|
-
* Intrinsic
|
|
12
|
-
* vsqrt_f32
|
|
13
|
-
* vsqrt_f64
|
|
14
|
-
* vfmas_f32
|
|
15
|
-
* vfmad_f64
|
|
16
|
-
* vqaddb_u8
|
|
17
|
-
* vqaddb_s8
|
|
11
|
+
* Intrinsic Instruction A76 M5
|
|
12
|
+
* vsqrt_f32 FSQRT (S) 12cy @ 1p 9cy @ 1p
|
|
13
|
+
* vsqrt_f64 FSQRT (D) 12cy @ 1p 9cy @ 1p
|
|
14
|
+
* vfmas_f32 FMADD (S, S, S, S) 4cy @ 2p 3cy @ 4p
|
|
15
|
+
* vfmad_f64 FMADD (D, D, D, D) 4cy @ 2p 3cy @ 4p
|
|
16
|
+
* vqaddb_u8 UQADD (B) 2cy @ 2p 3cy @ 2p
|
|
17
|
+
* vqaddb_s8 SQADD (B) 2cy @ 2p 3cy @ 2p
|
|
18
18
|
*/
|
|
19
19
|
#ifndef NK_SCALAR_NEON_H
|
|
20
20
|
#define NK_SCALAR_NEON_H
|
|
@@ -98,8 +98,8 @@ NK_PUBLIC nk_u64_t nk_u64_saturating_mul_neon(nk_u64_t a, nk_u64_t b) {
|
|
|
98
98
|
}
|
|
99
99
|
NK_PUBLIC nk_i64_t nk_i64_saturating_mul_neon(nk_i64_t a, nk_i64_t b) {
|
|
100
100
|
int sign = (a < 0) ^ (b < 0);
|
|
101
|
-
nk_u64_t abs_a = a < 0 ? -(nk_u64_t)a : (nk_u64_t)a;
|
|
102
|
-
nk_u64_t abs_b = b < 0 ? -(nk_u64_t)b : (nk_u64_t)b;
|
|
101
|
+
nk_u64_t abs_a = a < 0 ? (0u - (nk_u64_t)a) : (nk_u64_t)a;
|
|
102
|
+
nk_u64_t abs_b = b < 0 ? (0u - (nk_u64_t)b) : (nk_u64_t)b;
|
|
103
103
|
nk_u64_t high = nk_u64_mulhigh_neon_(abs_a, abs_b);
|
|
104
104
|
nk_u64_t low = abs_a * abs_b;
|
|
105
105
|
if (high || (sign && low > 9223372036854775808ull) || (!sign && low > 9223372036854775807ull))
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Scalar Math Helpers for Power VSX.
|
|
3
|
+
* @file include/numkong/scalar/powervsx.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date March 24, 2026
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/scalar.h
|
|
8
|
+
*
|
|
9
|
+
* @section scalars_powervsx_instructions Key Power VSX Scalar Instructions
|
|
10
|
+
*
|
|
11
|
+
* Instruction Description Latency
|
|
12
|
+
* xssqrtsp Scalar √ (f32) 26cy
|
|
13
|
+
* xssqrtdp Scalar √ (f64) 33cy
|
|
14
|
+
* xsrsqrtesp Scalar 1/√ estimate (f32) 6cy
|
|
15
|
+
* xsrsqrtedp Scalar 1/√ estimate (f64) 6cy
|
|
16
|
+
* xsmaddadp Scalar FMA (f64) 5cy
|
|
17
|
+
* xsmaddasp Scalar FMA (f32) 5cy
|
|
18
|
+
*/
|
|
19
|
+
#ifndef NK_SCALAR_POWERVSX_H
|
|
20
|
+
#define NK_SCALAR_POWERVSX_H
|
|
21
|
+
|
|
22
|
+
#if NK_TARGET_POWER_
|
|
23
|
+
#if NK_TARGET_POWERVSX
|
|
24
|
+
|
|
25
|
+
#include "numkong/types.h"
|
|
26
|
+
|
|
27
|
+
#if defined(__cplusplus)
|
|
28
|
+
extern "C" {
|
|
29
|
+
#endif
|
|
30
|
+
|
|
31
|
+
#if defined(__clang__)
|
|
32
|
+
#pragma clang attribute push(__attribute__((target("power9-vector"))), apply_to = function)
|
|
33
|
+
#elif defined(__GNUC__)
|
|
34
|
+
#pragma GCC push_options
|
|
35
|
+
#pragma GCC target("power9-vector")
|
|
36
|
+
#endif
|
|
37
|
+
|
|
38
|
+
NK_PUBLIC nk_f32_t nk_f32_sqrt_powervsx(nk_f32_t x) {
|
|
39
|
+
nk_f32_t result;
|
|
40
|
+
__asm__("xssqrtsp %0, %1" : "=f"(result) : "f"(x));
|
|
41
|
+
return result;
|
|
42
|
+
}
|
|
43
|
+
NK_PUBLIC nk_f64_t nk_f64_sqrt_powervsx(nk_f64_t x) {
|
|
44
|
+
nk_f64_t result;
|
|
45
|
+
__asm__("xssqrtdp %0, %1" : "=d"(result) : "d"(x));
|
|
46
|
+
return result;
|
|
47
|
+
}
|
|
48
|
+
NK_PUBLIC nk_f32_t nk_f32_rsqrt_powervsx(nk_f32_t x) {
|
|
49
|
+
// xsrsqrtesp → ~12-bit estimate, then 2 Newton-Raphson iterations → ~24-bit precision
|
|
50
|
+
nk_f32_t r;
|
|
51
|
+
__asm__("xsrsqrtesp %0, %1" : "=f"(r) : "f"(x));
|
|
52
|
+
// Newton-Raphson: r = r * (3 - x * r * r) / 2
|
|
53
|
+
nk_f32_t half_x = x * 0.5f;
|
|
54
|
+
nk_f32_t three_half = 1.5f;
|
|
55
|
+
r = r * (three_half - half_x * r * r);
|
|
56
|
+
r = r * (three_half - half_x * r * r);
|
|
57
|
+
return r;
|
|
58
|
+
}
|
|
59
|
+
NK_PUBLIC nk_f64_t nk_f64_rsqrt_powervsx(nk_f64_t x) {
|
|
60
|
+
// xsrsqrtedp → ~14-bit estimate, then 3 Newton-Raphson iterations → ~48-bit precision
|
|
61
|
+
nk_f64_t r;
|
|
62
|
+
__asm__("xsrsqrtedp %0, %1" : "=d"(r) : "d"(x));
|
|
63
|
+
// Newton-Raphson: r = r * (3 - x * r * r) / 2
|
|
64
|
+
nk_f64_t half_x = x * 0.5;
|
|
65
|
+
nk_f64_t three_half = 1.5;
|
|
66
|
+
r = r * (three_half - half_x * r * r);
|
|
67
|
+
r = r * (three_half - half_x * r * r);
|
|
68
|
+
r = r * (three_half - half_x * r * r);
|
|
69
|
+
return r;
|
|
70
|
+
}
|
|
71
|
+
NK_PUBLIC nk_f32_t nk_f32_fma_powervsx(nk_f32_t a, nk_f32_t b, nk_f32_t c) {
|
|
72
|
+
// xsmaddasp: result = a * b + c (scalar f32 FMA)
|
|
73
|
+
nk_f32_t r = c;
|
|
74
|
+
__asm__("xsmaddasp %0, %1, %2" : "+f"(r) : "f"(a), "f"(b));
|
|
75
|
+
return r;
|
|
76
|
+
}
|
|
77
|
+
NK_PUBLIC nk_f64_t nk_f64_fma_powervsx(nk_f64_t a, nk_f64_t b, nk_f64_t c) {
|
|
78
|
+
// xsmaddadp: result = a * b + c (scalar f64 FMA)
|
|
79
|
+
nk_f64_t r = c;
|
|
80
|
+
__asm__("xsmaddadp %0, %1, %2" : "+d"(r) : "d"(a), "d"(b));
|
|
81
|
+
return r;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
#if defined(__clang__)
|
|
85
|
+
#pragma clang attribute pop
|
|
86
|
+
#elif defined(__GNUC__)
|
|
87
|
+
#pragma GCC pop_options
|
|
88
|
+
#endif
|
|
89
|
+
|
|
90
|
+
#if defined(__cplusplus)
|
|
91
|
+
} // extern "C"
|
|
92
|
+
#endif
|
|
93
|
+
|
|
94
|
+
#endif // NK_TARGET_POWERVSX
|
|
95
|
+
#endif // NK_TARGET_POWER_
|
|
96
|
+
#endif // NK_SCALAR_POWERVSX_H
|
|
@@ -185,8 +185,8 @@ NK_PUBLIC nk_u64_t nk_u64_saturating_mul_rvv(nk_u64_t a, nk_u64_t b) {
|
|
|
185
185
|
}
|
|
186
186
|
NK_PUBLIC nk_i64_t nk_i64_saturating_mul_rvv(nk_i64_t a, nk_i64_t b) {
|
|
187
187
|
int sign = (a < 0) ^ (b < 0);
|
|
188
|
-
nk_u64_t abs_a = a < 0 ? -(nk_u64_t)a : (nk_u64_t)a;
|
|
189
|
-
nk_u64_t abs_b = b < 0 ? -(nk_u64_t)b : (nk_u64_t)b;
|
|
188
|
+
nk_u64_t abs_a = a < 0 ? (0u - (nk_u64_t)a) : (nk_u64_t)a;
|
|
189
|
+
nk_u64_t abs_b = b < 0 ? (0u - (nk_u64_t)b) : (nk_u64_t)b;
|
|
190
190
|
vuint64m1_t a_u64m1 = __riscv_vmv_v_x_u64m1(abs_a, 1);
|
|
191
191
|
vuint64m1_t b_u64m1 = __riscv_vmv_v_x_u64m1(abs_b, 1);
|
|
192
192
|
nk_u64_t high = __riscv_vmv_x_s_u64m1_u64(__riscv_vmulhu_vv_u64m1(a_u64m1, b_u64m1, 1));
|
|
@@ -29,23 +29,34 @@ extern "C" {
|
|
|
29
29
|
#endif
|
|
30
30
|
|
|
31
31
|
NK_PUBLIC int nk_f16_order_sapphire(nk_f16_t a, nk_f16_t b) {
|
|
32
|
-
|
|
33
|
-
|
|
32
|
+
nk_fui16_t a_fui, b_fui;
|
|
33
|
+
a_fui.f = a, b_fui.f = b;
|
|
34
|
+
__m128h a_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(a_fui.u));
|
|
35
|
+
__m128h b_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(b_fui.u));
|
|
34
36
|
return _mm_comigt_sh(a_f16x8, b_f16x8) - _mm_comilt_sh(a_f16x8, b_f16x8);
|
|
35
37
|
}
|
|
36
38
|
NK_PUBLIC nk_f16_t nk_f16_sqrt_sapphire(nk_f16_t x) {
|
|
37
|
-
|
|
38
|
-
|
|
39
|
+
nk_fui16_t x_fui, out_fui;
|
|
40
|
+
x_fui.f = x;
|
|
41
|
+
__m128h x_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(x_fui.u));
|
|
42
|
+
out_fui.u = (nk_u16_t)_mm_cvtsi128_si32(_mm_castph_si128(_mm_sqrt_sh(x_f16x8, x_f16x8)));
|
|
43
|
+
return out_fui.f;
|
|
39
44
|
}
|
|
40
45
|
NK_PUBLIC nk_f16_t nk_f16_rsqrt_sapphire(nk_f16_t x) {
|
|
41
|
-
|
|
42
|
-
|
|
46
|
+
nk_fui16_t x_fui, out_fui;
|
|
47
|
+
x_fui.f = x;
|
|
48
|
+
__m128h x_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(x_fui.u));
|
|
49
|
+
out_fui.u = (nk_u16_t)_mm_cvtsi128_si32(_mm_castph_si128(_mm_rsqrt_sh(x_f16x8, x_f16x8)));
|
|
50
|
+
return out_fui.f;
|
|
43
51
|
}
|
|
44
52
|
NK_PUBLIC nk_f16_t nk_f16_fma_sapphire(nk_f16_t a, nk_f16_t b, nk_f16_t c) {
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
__m128h
|
|
48
|
-
|
|
53
|
+
nk_fui16_t a_fui, b_fui, c_fui, out_fui;
|
|
54
|
+
a_fui.f = a, b_fui.f = b, c_fui.f = c;
|
|
55
|
+
__m128h a_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(a_fui.u));
|
|
56
|
+
__m128h b_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(b_fui.u));
|
|
57
|
+
__m128h c_f16x8 = _mm_castsi128_ph(_mm_cvtsi32_si128(c_fui.u));
|
|
58
|
+
out_fui.u = (nk_u16_t)_mm_cvtsi128_si32(_mm_castph_si128(_mm_fmadd_sh(a_f16x8, b_f16x8, c_f16x8)));
|
|
59
|
+
return out_fui.f;
|
|
49
60
|
}
|
|
50
61
|
|
|
51
62
|
#if defined(__clang__)
|
|
@@ -74,16 +74,16 @@ NK_PUBLIC nk_f64_t nk_f64_fma_serial(nk_f64_t multiplicand, nk_f64_t multiplier,
|
|
|
74
74
|
nk_f64_t product = multiplicand * multiplier;
|
|
75
75
|
// Dekker splitting: break each operand into non-overlapping high and low halves
|
|
76
76
|
nk_f64_t const dekker_split = 134217729.0; // 2^27 + 1 for double precision
|
|
77
|
-
nk_f64_t
|
|
78
|
-
nk_f64_t
|
|
79
|
-
|
|
80
|
-
nk_f64_t
|
|
81
|
-
nk_f64_t
|
|
82
|
-
|
|
77
|
+
nk_f64_t multiplicand_high = dekker_split * multiplicand;
|
|
78
|
+
nk_f64_t multiplicand_low = multiplicand - (multiplicand_high - (multiplicand_high - multiplicand));
|
|
79
|
+
multiplicand_high = multiplicand_high - (multiplicand_high - multiplicand);
|
|
80
|
+
nk_f64_t multiplier_high = dekker_split * multiplier;
|
|
81
|
+
nk_f64_t multiplier_low = multiplier - (multiplier_high - (multiplier_high - multiplier));
|
|
82
|
+
multiplier_high = multiplier_high - (multiplier_high - multiplier);
|
|
83
83
|
// Exact multiplication error from the four cross-products
|
|
84
|
-
nk_f64_t product_error = ((
|
|
85
|
-
|
|
86
|
-
|
|
84
|
+
nk_f64_t product_error = ((multiplicand_high * multiplier_high - product) + multiplicand_high * multiplier_low +
|
|
85
|
+
multiplicand_low * multiplier_high) +
|
|
86
|
+
multiplicand_low * multiplier_low;
|
|
87
87
|
// Knuth TwoSum: add the addend with error tracking
|
|
88
88
|
nk_f64_t result = product + addend;
|
|
89
89
|
nk_f64_t addend_recovered = result - product;
|
|
@@ -102,16 +102,16 @@ NK_PUBLIC nk_f32_t nk_f32_fma_serial(nk_f32_t multiplicand, nk_f32_t multiplier,
|
|
|
102
102
|
nk_f32_t product = multiplicand * multiplier;
|
|
103
103
|
// Dekker splitting: break each operand into non-overlapping high and low halves
|
|
104
104
|
nk_f32_t const dekker_split = 4097.0f; // 2^12 + 1 for single precision
|
|
105
|
-
nk_f32_t
|
|
106
|
-
nk_f32_t
|
|
107
|
-
|
|
108
|
-
nk_f32_t
|
|
109
|
-
nk_f32_t
|
|
110
|
-
|
|
105
|
+
nk_f32_t multiplicand_high = dekker_split * multiplicand;
|
|
106
|
+
nk_f32_t multiplicand_low = multiplicand - (multiplicand_high - (multiplicand_high - multiplicand));
|
|
107
|
+
multiplicand_high = multiplicand_high - (multiplicand_high - multiplicand);
|
|
108
|
+
nk_f32_t multiplier_high = dekker_split * multiplier;
|
|
109
|
+
nk_f32_t multiplier_low = multiplier - (multiplier_high - (multiplier_high - multiplier));
|
|
110
|
+
multiplier_high = multiplier_high - (multiplier_high - multiplier);
|
|
111
111
|
// Exact multiplication error from the four cross-products
|
|
112
|
-
nk_f32_t product_error = ((
|
|
113
|
-
|
|
114
|
-
|
|
112
|
+
nk_f32_t product_error = ((multiplicand_high * multiplier_high - product) + multiplicand_high * multiplier_low +
|
|
113
|
+
multiplicand_low * multiplier_high) +
|
|
114
|
+
multiplicand_low * multiplier_low;
|
|
115
115
|
// Knuth TwoSum: add the addend with error tracking
|
|
116
116
|
nk_f32_t result = product + addend;
|
|
117
117
|
nk_f32_t addend_recovered = result - product;
|
|
@@ -125,7 +125,7 @@ NK_PUBLIC nk_f32_t nk_f32_fma_serial(nk_f32_t multiplicand, nk_f32_t multiplier,
|
|
|
125
125
|
* Uses TwoProd (via FMA) and TwoSum error-free transformations.
|
|
126
126
|
* @see Ogita, T., Rump, S.M., Oishi, S. (2005). "Accurate Sum and Dot Product"
|
|
127
127
|
*/
|
|
128
|
-
NK_INTERNAL void nk_f64_dot2_(nk_f64_t *sum, nk_f64_t *compensation, nk_f64_t a, nk_f64_t b) {
|
|
128
|
+
NK_INTERNAL void nk_f64_dot2_(nk_f64_t *sum, nk_f64_t *compensation, nk_f64_t a, nk_f64_t b) NK_STREAMING_COMPATIBLE_ {
|
|
129
129
|
nk_f64_t product = a * b;
|
|
130
130
|
nk_f64_t product_error = nk_f64_fma_serial(a, b, -product);
|
|
131
131
|
nk_f64_t running_sum = *sum + product;
|
|
@@ -238,8 +238,8 @@ NK_PUBLIC nk_i64_t nk_i64_saturating_mul_serial(nk_i64_t a, nk_i64_t b) {
|
|
|
238
238
|
int sign = ((a < 0) ^ (b < 0)) ? -1 : 1; // Track the sign of the result
|
|
239
239
|
|
|
240
240
|
// Take absolute values for easy multiplication and overflow detection
|
|
241
|
-
nk_u64_t abs_a = (a < 0) ? -(nk_u64_t)a : (nk_u64_t)a;
|
|
242
|
-
nk_u64_t abs_b = (b < 0) ? -(nk_u64_t)b : (nk_u64_t)b;
|
|
241
|
+
nk_u64_t abs_a = (a < 0) ? (0u - (nk_u64_t)a) : (nk_u64_t)a;
|
|
242
|
+
nk_u64_t abs_b = (b < 0) ? (0u - (nk_u64_t)b) : (nk_u64_t)b;
|
|
243
243
|
|
|
244
244
|
// Split the absolute values into high and low 32-bit parts
|
|
245
245
|
nk_u64_t a_high = abs_a >> 32;
|
package/include/numkong/scalar.h
CHANGED
|
@@ -383,6 +383,7 @@ NK_PUBLIC nk_f64_t nk_f64_fma_v128relaxed(nk_f64_t a, nk_f64_t b, nk_f64_t c);
|
|
|
383
383
|
#include "numkong/scalar/haswell.h" // `nk_f32_sqrt_haswell`
|
|
384
384
|
#include "numkong/scalar/sapphire.h" // `nk_f16_order_sapphire`
|
|
385
385
|
#include "numkong/scalar/rvv.h" // `nk_f32_rsqrt_rvv`
|
|
386
|
+
#include "numkong/scalar/powervsx.h" // `nk_f32_sqrt_powervsx`
|
|
386
387
|
#include "numkong/scalar/v128relaxed.h" // `nk_f32_sqrt_v128relaxed`
|
|
387
388
|
|
|
388
389
|
#if defined(__cplusplus)
|
|
@@ -396,6 +397,8 @@ NK_PUBLIC nk_f32_t nk_f32_sqrt(nk_f32_t x) {
|
|
|
396
397
|
return nk_f32_sqrt_haswell(x);
|
|
397
398
|
#elif NK_TARGET_NEON
|
|
398
399
|
return nk_f32_sqrt_neon(x);
|
|
400
|
+
#elif NK_TARGET_POWERVSX
|
|
401
|
+
return nk_f32_sqrt_powervsx(x);
|
|
399
402
|
#elif NK_TARGET_RVV
|
|
400
403
|
return nk_f32_sqrt_rvv(x);
|
|
401
404
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -410,6 +413,8 @@ NK_PUBLIC nk_f64_t nk_f64_sqrt(nk_f64_t x) {
|
|
|
410
413
|
return nk_f64_sqrt_haswell(x);
|
|
411
414
|
#elif NK_TARGET_NEON
|
|
412
415
|
return nk_f64_sqrt_neon(x);
|
|
416
|
+
#elif NK_TARGET_POWERVSX
|
|
417
|
+
return nk_f64_sqrt_powervsx(x);
|
|
413
418
|
#elif NK_TARGET_RVV
|
|
414
419
|
return nk_f64_sqrt_rvv(x);
|
|
415
420
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -424,6 +429,8 @@ NK_PUBLIC nk_f32_t nk_f32_rsqrt(nk_f32_t x) {
|
|
|
424
429
|
return nk_f32_rsqrt_haswell(x);
|
|
425
430
|
#elif NK_TARGET_NEON
|
|
426
431
|
return nk_f32_rsqrt_neon(x);
|
|
432
|
+
#elif NK_TARGET_POWERVSX
|
|
433
|
+
return nk_f32_rsqrt_powervsx(x);
|
|
427
434
|
#elif NK_TARGET_RVV
|
|
428
435
|
return nk_f32_rsqrt_rvv(x);
|
|
429
436
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -438,6 +445,8 @@ NK_PUBLIC nk_f64_t nk_f64_rsqrt(nk_f64_t x) {
|
|
|
438
445
|
return nk_f64_rsqrt_haswell(x);
|
|
439
446
|
#elif NK_TARGET_NEON
|
|
440
447
|
return nk_f64_rsqrt_neon(x);
|
|
448
|
+
#elif NK_TARGET_POWERVSX
|
|
449
|
+
return nk_f64_rsqrt_powervsx(x);
|
|
441
450
|
#elif NK_TARGET_RVV
|
|
442
451
|
return nk_f64_rsqrt_rvv(x);
|
|
443
452
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -452,6 +461,8 @@ NK_PUBLIC nk_f32_t nk_f32_fma(nk_f32_t a, nk_f32_t b, nk_f32_t c) {
|
|
|
452
461
|
return nk_f32_fma_haswell(a, b, c);
|
|
453
462
|
#elif NK_TARGET_NEON
|
|
454
463
|
return nk_f32_fma_neon(a, b, c);
|
|
464
|
+
#elif NK_TARGET_POWERVSX
|
|
465
|
+
return nk_f32_fma_powervsx(a, b, c);
|
|
455
466
|
#elif NK_TARGET_RVV
|
|
456
467
|
return nk_f32_fma_rvv(a, b, c);
|
|
457
468
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -466,6 +477,8 @@ NK_PUBLIC nk_f64_t nk_f64_fma(nk_f64_t a, nk_f64_t b, nk_f64_t c) {
|
|
|
466
477
|
return nk_f64_fma_haswell(a, b, c);
|
|
467
478
|
#elif NK_TARGET_NEON
|
|
468
479
|
return nk_f64_fma_neon(a, b, c);
|
|
480
|
+
#elif NK_TARGET_POWERVSX
|
|
481
|
+
return nk_f64_fma_powervsx(a, b, c);
|
|
469
482
|
#elif NK_TARGET_RVV
|
|
470
483
|
return nk_f64_fma_rvv(a, b, c);
|
|
471
484
|
#elif NK_TARGET_V128RELAXED
|
|
@@ -7,22 +7,22 @@ Hamming distance counts the number of positions where elements differ.
|
|
|
7
7
|
For binary vectors packed as octets, this is the popcount of the XOR.
|
|
8
8
|
For byte-level vectors, it counts the number of mismatched bytes:
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
$$
|
|
11
11
|
\text{hamming}(a, b) = \sum_{i=0}^{n-1} [a_i \neq b_i]
|
|
12
|
-
|
|
12
|
+
$$
|
|
13
13
|
|
|
14
14
|
Jaccard distance measures the dissimilarity of two sets.
|
|
15
15
|
For binary vectors, the intersection and union are computed via bitwise AND and OR with popcount:
|
|
16
16
|
|
|
17
|
-
|
|
17
|
+
$$
|
|
18
18
|
\text{jaccard}(a, b) = 1 - \frac{|A \cap B|}{|A \cup B|} = 1 - \frac{\text{popcount}(a \mathbin{\&} b)}{\text{popcount}(a \mathbin{|} b)}
|
|
19
|
-
|
|
19
|
+
$$
|
|
20
20
|
|
|
21
21
|
For word-level vectors (MinHash signatures), Jaccard similarity is the fraction of matching elements:
|
|
22
22
|
|
|
23
|
-
|
|
23
|
+
$$
|
|
24
24
|
\text{jaccard}(a, b) = 1 - \frac{\sum_{i=0}^{n-1} [a_i = b_i]}{n}
|
|
25
|
-
|
|
25
|
+
$$
|
|
26
26
|
|
|
27
27
|
Reformulating as Python pseudocode:
|
|
28
28
|
|
|
@@ -136,44 +136,44 @@ Measured with Wasmtime v42 (Cranelift backend).
|
|
|
136
136
|
| __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
137
137
|
| `nk_jaccard_u32_v128relaxed` | 0.430 gb/s, 0 ulp | 2.46 gb/s, 0 ulp | 1.08 gb/s, 0 ulp |
|
|
138
138
|
|
|
139
|
-
### Apple
|
|
139
|
+
### Apple M5
|
|
140
140
|
|
|
141
141
|
#### Native
|
|
142
142
|
|
|
143
143
|
| Kernel | 256 | 1024 | 4096 |
|
|
144
144
|
| :---------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
145
145
|
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
146
|
-
| `nk_hamming_u1_serial` |
|
|
147
|
-
| `nk_jaccard_u1_serial` |
|
|
148
|
-
| `nk_hamming_u1_neon` |
|
|
149
|
-
| `nk_jaccard_u1_neon` |
|
|
146
|
+
| `nk_hamming_u1_serial` | 6.79 gb/s | 7.48 gb/s | 6.92 gb/s |
|
|
147
|
+
| `nk_jaccard_u1_serial` | 4.36 gb/s, 0 ulp | 5.38 gb/s, 0 ulp | 5.45 gb/s, 0 ulp |
|
|
148
|
+
| `nk_hamming_u1_neon` | 31.6 gb/s | 65.6 gb/s | 90.9 gb/s |
|
|
149
|
+
| `nk_jaccard_u1_neon` | 28.4 gb/s, 0 ulp | 48.1 gb/s, 0 ulp | 51.0 gb/s, 0 ulp |
|
|
150
150
|
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
151
|
-
| `nk_hamming_u8_serial` |
|
|
152
|
-
| `nk_hamming_u8_neon` |
|
|
151
|
+
| `nk_hamming_u8_serial` | 27.8 gb/s | 30.1 gb/s | 31.2 gb/s |
|
|
152
|
+
| `nk_hamming_u8_neon` | 96.9 gb/s | 79.5 gb/s | 56.3 gb/s |
|
|
153
153
|
| __u16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
154
|
-
| `nk_jaccard_u16_serial` |
|
|
155
|
-
| `nk_jaccard_u16_neon` |
|
|
154
|
+
| `nk_jaccard_u16_serial` | 59.3 gb/s, 0 ulp | 69.4 gb/s, 0 ulp | 66.8 gb/s, 0 ulp |
|
|
155
|
+
| `nk_jaccard_u16_neon` | 67.8 gb/s, 0 ulp | 61.6 gb/s, 0 ulp | 50.8 gb/s, 0 ulp |
|
|
156
156
|
| __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
157
|
-
| `nk_jaccard_u32_serial` |
|
|
158
|
-
| `nk_jaccard_u32_neon` |
|
|
157
|
+
| `nk_jaccard_u32_serial` | 105 gb/s, 0 ulp | 101 gb/s, 0 ulp | 89.1 gb/s, 0 ulp |
|
|
158
|
+
| `nk_jaccard_u32_neon` | 89.3 gb/s, 0 ulp | 72.8 gb/s, 0 ulp | 68.2 gb/s, 0 ulp |
|
|
159
159
|
|
|
160
160
|
#### WASM
|
|
161
161
|
|
|
162
|
-
Measured with Wasmtime
|
|
162
|
+
Measured with Wasmtime v43 (Cranelift backend).
|
|
163
163
|
|
|
164
164
|
| Kernel | 256 | 1024 | 4096 |
|
|
165
165
|
| :--------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
166
166
|
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
167
|
-
| `nk_hamming_u1_serial` |
|
|
168
|
-
| `nk_jaccard_u1_serial` |
|
|
169
|
-
| `nk_hamming_u1_v128relaxed` |
|
|
170
|
-
| `nk_jaccard_u1_v128relaxed` |
|
|
167
|
+
| `nk_hamming_u1_serial` | 5.18 gb/s | 5.66 gb/s | 6.52 gb/s |
|
|
168
|
+
| `nk_jaccard_u1_serial` | 1.74 gb/s, 0 ulp | 3.32 gb/s, 0 ulp | 3.61 gb/s, 0 ulp |
|
|
169
|
+
| `nk_hamming_u1_v128relaxed` | 22.6 gb/s | 46.5 gb/s | 67.9 gb/s |
|
|
170
|
+
| `nk_jaccard_u1_v128relaxed` | 16.1 gb/s, 0 ulp | 34.5 gb/s, 0 ulp | 50.8 gb/s, 0 ulp |
|
|
171
171
|
| __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
172
|
-
| `nk_hamming_u8_serial` |
|
|
173
|
-
| `nk_hamming_u8_v128relaxed` |
|
|
172
|
+
| `nk_hamming_u8_serial` | 8.32 gb/s | 6.09 gb/s | 5.84 gb/s |
|
|
173
|
+
| `nk_hamming_u8_v128relaxed` | 47.7 gb/s | 68.5 gb/s | 72.1 gb/s |
|
|
174
174
|
| __u16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
175
|
-
| `nk_jaccard_u16_serial` |
|
|
176
|
-
| `nk_jaccard_u16_v128relaxed` |
|
|
175
|
+
| `nk_jaccard_u16_serial` | 19.2 gb/s, 0 ulp | 12.4 gb/s, 0 ulp | 11.9 gb/s, 0 ulp |
|
|
176
|
+
| `nk_jaccard_u16_v128relaxed` | 89.8 gb/s, 0 ulp | 74.0 gb/s, 0 ulp | 71.3 gb/s, 0 ulp |
|
|
177
177
|
| __u32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
178
|
-
| `nk_jaccard_u32_serial` |
|
|
179
|
-
| `nk_jaccard_u32_v128relaxed` |
|
|
178
|
+
| `nk_jaccard_u32_serial` | 91.6 gb/s, 0 ulp | 69.4 gb/s, 0 ulp | 68.4 gb/s, 0 ulp |
|
|
179
|
+
| `nk_jaccard_u32_v128relaxed` | 94.8 gb/s, 0 ulp | 76.2 gb/s, 0 ulp | 68.8 gb/s, 0 ulp |
|
|
@@ -8,12 +8,12 @@
|
|
|
8
8
|
*
|
|
9
9
|
* @section set_haswell_instructions Key POPCNT/AVX2 Set Instructions
|
|
10
10
|
*
|
|
11
|
-
* Intrinsic
|
|
12
|
-
* _mm_popcnt_u64
|
|
13
|
-
* _mm256_and_si256
|
|
14
|
-
* _mm256_or_si256
|
|
15
|
-
* _mm256_xor_si256
|
|
16
|
-
* _mm256_extracti128_si256
|
|
11
|
+
* Intrinsic Instruction Haswell Genoa
|
|
12
|
+
* _mm_popcnt_u64 POPCNT (R64, R64) 3cy @ p1 1cy @ p0123
|
|
13
|
+
* _mm256_and_si256 VPAND (YMM, YMM, YMM) 1cy @ p015 1cy @ p0123
|
|
14
|
+
* _mm256_or_si256 VPOR (YMM, YMM, YMM) 1cy @ p015 1cy @ p0123
|
|
15
|
+
* _mm256_xor_si256 VPXOR (YMM, YMM, YMM) 1cy @ p015 1cy @ p0123
|
|
16
|
+
* _mm256_extracti128_si256 VEXTRACTI128 (XMM, YMM, I8) 3cy @ p5 1cy @ p0123
|
|
17
17
|
*
|
|
18
18
|
* Haswell lacks SIMD popcount; we extract 64-bit words and use scalar POPCNT. The p1 port
|
|
19
19
|
* bottleneck limits throughput to 1 popcount/cycle. For Hamming distance, XOR + POPCNT;
|
|
@@ -55,7 +55,7 @@ extern "C" {
|
|
|
55
55
|
#pragma GCC target("avx2", "sse4.1", "popcnt")
|
|
56
56
|
#endif
|
|
57
57
|
|
|
58
|
-
#pragma region
|
|
58
|
+
#pragma region Binary Sets
|
|
59
59
|
|
|
60
60
|
NK_PUBLIC void nk_hamming_u1_haswell(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
|
|
61
61
|
nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
|
|
@@ -79,9 +79,9 @@ NK_PUBLIC void nk_jaccard_u1_haswell(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_
|
|
|
79
79
|
*result = (union_count != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)union_count : 0.0f;
|
|
80
80
|
}
|
|
81
81
|
|
|
82
|
-
#pragma endregion
|
|
82
|
+
#pragma endregion Binary Sets
|
|
83
83
|
|
|
84
|
-
#pragma region
|
|
84
|
+
#pragma region Integer Sets
|
|
85
85
|
|
|
86
86
|
NK_PUBLIC void nk_jaccard_u32_haswell(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
87
87
|
nk_u32_t intersection_count = 0;
|
|
@@ -192,9 +192,9 @@ NK_PUBLIC void nk_jaccard_u16_haswell(nk_u16_t const *a, nk_u16_t const *b, nk_s
|
|
|
192
192
|
*result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
|
|
193
193
|
}
|
|
194
194
|
|
|
195
|
-
#pragma endregion
|
|
195
|
+
#pragma endregion Integer Sets
|
|
196
196
|
|
|
197
|
-
#pragma region
|
|
197
|
+
#pragma region Stateful Streaming
|
|
198
198
|
|
|
199
199
|
typedef struct nk_hamming_u1x64_state_haswell_t {
|
|
200
200
|
nk_u32_t intersection_count;
|
|
@@ -317,7 +317,7 @@ NK_INTERNAL void nk_jaccard_f32x4_from_dot_haswell_(nk_b128_vec_t dots, nk_u32_t
|
|
|
317
317
|
results->xmm_ps = _mm_blendv_ps(jaccard_f32x4, _mm_setzero_ps(), zero_union_mask);
|
|
318
318
|
}
|
|
319
319
|
|
|
320
|
-
#pragma endregion
|
|
320
|
+
#pragma endregion Stateful Streaming
|
|
321
321
|
|
|
322
322
|
#if defined(__clang__)
|
|
323
323
|
#pragma clang attribute pop
|
|
@@ -8,12 +8,12 @@
|
|
|
8
8
|
*
|
|
9
9
|
* @section set_icelake_instructions Key AVX-512 Set Instructions
|
|
10
10
|
*
|
|
11
|
-
* Intrinsic
|
|
12
|
-
* _mm512_popcnt_epi64
|
|
13
|
-
* _mm512_and_si512
|
|
14
|
-
* _mm512_or_si512
|
|
15
|
-
* _mm512_xor_si512
|
|
16
|
-
* _mm512_maskz_loadu_epi8
|
|
11
|
+
* Intrinsic Instruction Ice Lake
|
|
12
|
+
* _mm512_popcnt_epi64 VPOPCNTQ (ZMM, ZMM) 3cy @ p5
|
|
13
|
+
* _mm512_and_si512 VPANDQ (ZMM, ZMM, ZMM) 1cy @ p05
|
|
14
|
+
* _mm512_or_si512 VPORQ (ZMM, ZMM, ZMM) 1cy @ p05
|
|
15
|
+
* _mm512_xor_si512 VPXORQ (ZMM, ZMM, ZMM) 1cy @ p05
|
|
16
|
+
* _mm512_maskz_loadu_epi8 VMOVDQU8 (ZMM, mem, k1) 7cy @ p23
|
|
17
17
|
*
|
|
18
18
|
* Ice Lake has native VPOPCNTQ instruction via AVX-512 VPOPCNTDQ extension, enabling
|
|
19
19
|
* efficient 64-bit element-wise popcount. We process 512 bits per iteration.
|
|
@@ -54,7 +54,7 @@ extern "C" {
|
|
|
54
54
|
#pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512vpopcntdq", "f16c", "fma", "bmi", "bmi2")
|
|
55
55
|
#endif
|
|
56
56
|
|
|
57
|
-
#pragma region
|
|
57
|
+
#pragma region Binary Sets
|
|
58
58
|
|
|
59
59
|
NK_PUBLIC void nk_hamming_u1_icelake(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
|
|
60
60
|
nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
|
|
@@ -239,9 +239,9 @@ NK_PUBLIC void nk_jaccard_u1_icelake(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_
|
|
|
239
239
|
*result = (union_count != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)union_count : 0.0f;
|
|
240
240
|
}
|
|
241
241
|
|
|
242
|
-
#pragma endregion
|
|
242
|
+
#pragma endregion Binary Sets
|
|
243
243
|
|
|
244
|
-
#pragma region
|
|
244
|
+
#pragma region Integer Sets
|
|
245
245
|
|
|
246
246
|
NK_PUBLIC void nk_jaccard_u32_icelake(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
247
247
|
nk_u32_t intersection_count = 0;
|
|
@@ -300,9 +300,9 @@ NK_PUBLIC void nk_jaccard_u16_icelake(nk_u16_t const *a, nk_u16_t const *b, nk_s
|
|
|
300
300
|
*result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
|
|
301
301
|
}
|
|
302
302
|
|
|
303
|
-
#pragma endregion
|
|
303
|
+
#pragma endregion Integer Sets
|
|
304
304
|
|
|
305
|
-
#pragma region
|
|
305
|
+
#pragma region Stateful Streaming
|
|
306
306
|
|
|
307
307
|
typedef struct nk_hamming_u1x512_state_icelake_t {
|
|
308
308
|
__m512i intersection_count_i64x8;
|
|
@@ -438,7 +438,7 @@ NK_INTERNAL void nk_jaccard_u1x512_finalize_icelake( //
|
|
|
438
438
|
result->xmm_ps = _mm_blendv_ps(jaccard_f32x4, _mm_setzero_ps(), zero_union_mask);
|
|
439
439
|
}
|
|
440
440
|
|
|
441
|
-
/** @brief Hamming from_dot: computes pop_a + pop_b - 2*dot for 4 pairs (
|
|
441
|
+
/** @brief Hamming from_dot: computes pop_a + pop_b - 2*dot for 4 pairs (Icelake). */
|
|
442
442
|
NK_INTERNAL void nk_hamming_u32x4_from_dot_icelake_(nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops,
|
|
443
443
|
nk_b128_vec_t *results) {
|
|
444
444
|
__m128i dots_i32x4 = dots.xmm;
|
|
@@ -447,7 +447,7 @@ NK_INTERNAL void nk_hamming_u32x4_from_dot_icelake_(nk_b128_vec_t dots, nk_u32_t
|
|
|
447
447
|
results->xmm = _mm_sub_epi32(_mm_add_epi32(query_i32x4, target_i32x4), _mm_slli_epi32(dots_i32x4, 1));
|
|
448
448
|
}
|
|
449
449
|
|
|
450
|
-
/** @brief Jaccard from_dot: computes 1 - dot / (pop_a + pop_b - dot) for 4 pairs (
|
|
450
|
+
/** @brief Jaccard from_dot: computes 1 - dot / (pop_a + pop_b - dot) for 4 pairs (Icelake). */
|
|
451
451
|
NK_INTERNAL void nk_jaccard_f32x4_from_dot_icelake_(nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops,
|
|
452
452
|
nk_b128_vec_t *results) {
|
|
453
453
|
__m128 dot_f32x4 = _mm_cvtepi32_ps(dots.xmm);
|
|
@@ -468,7 +468,7 @@ NK_INTERNAL void nk_jaccard_f32x4_from_dot_icelake_(nk_b128_vec_t dots, nk_u32_t
|
|
|
468
468
|
results->xmm_ps = _mm_blendv_ps(jaccard_f32x4, _mm_setzero_ps(), zero_union_mask);
|
|
469
469
|
}
|
|
470
470
|
|
|
471
|
-
#pragma endregion
|
|
471
|
+
#pragma endregion Stateful Streaming
|
|
472
472
|
|
|
473
473
|
#if defined(__clang__)
|
|
474
474
|
#pragma clang attribute pop
|