numkong 7.0.0 → 7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Set Similarity Measures for LoongArch LASX (256-bit).
|
|
3
|
+
* @file include/numkong/set/loongsonasx.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date March 23, 2026
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/set.h
|
|
8
|
+
*
|
|
9
|
+
* @section set_loongsonasx_instructions Key LASX Set Instructions
|
|
10
|
+
*
|
|
11
|
+
* Intrinsic Instruction
|
|
12
|
+
* __lasx_xvld XVLD (256-bit unaligned load)
|
|
13
|
+
* __lasx_xvxor_v XVXOR.V (bitwise XOR)
|
|
14
|
+
* __lasx_xvor_v XVOR.V (bitwise OR)
|
|
15
|
+
* __lasx_xvand_v XVAND.V (bitwise AND)
|
|
16
|
+
* __lasx_xvpcnt_d XVPCNT.D (popcount per u64 element)
|
|
17
|
+
* __lasx_xvseq_b XVSEQ.B (byte-wise equality, 0xFF/0x00)
|
|
18
|
+
* __lasx_xvmin_bu XVMIN.BU (unsigned byte minimum)
|
|
19
|
+
* __lasx_xvhaddw_hu_bu XVHADDW.HU.BU (horizontal pairwise add u8->u16)
|
|
20
|
+
* __lasx_xvhaddw_wu_hu XVHADDW.WU.HU (horizontal pairwise add u16->u32)
|
|
21
|
+
* __lasx_xvhaddw_du_wu XVHADDW.DU.WU (horizontal pairwise add u32->u64)
|
|
22
|
+
* __lasx_xvadd_d XVADD.D (i64 addition)
|
|
23
|
+
* __lasx_xvpermi_q XVPERMI.Q (extract/permute 128-bit lanes)
|
|
24
|
+
*
|
|
25
|
+
* LASX provides per-element popcount at multiple widths (`xvpcnt_b/h/w/d`).
|
|
26
|
+
* For binary set operations we use `xvpcnt_d` which gives 4 x u64 popcount values
|
|
27
|
+
* directly, eliminating the need for horizontal byte-sum reduction chains.
|
|
28
|
+
*
|
|
29
|
+
* For sorted integer set operations (jaccard_u16, jaccard_u32), SIMD provides limited
|
|
30
|
+
* benefit due to the inherently serial merge-based algorithm, so we delegate to the
|
|
31
|
+
* serial implementations.
|
|
32
|
+
*/
|
|
33
|
+
#ifndef NK_SET_LOONGSONASX_H
|
|
34
|
+
#define NK_SET_LOONGSONASX_H
|
|
35
|
+
|
|
36
|
+
#if NK_TARGET_LOONGARCH_
|
|
37
|
+
#if NK_TARGET_LOONGSONASX
|
|
38
|
+
|
|
39
|
+
#include "numkong/types.h"
|
|
40
|
+
#include "numkong/set/serial.h" // `nk_u1x8_popcount_`, serial fallbacks
|
|
41
|
+
#include "numkong/dot/loongsonasx.h" // `nk_reduce_add_i32x8_loongsonasx_`
|
|
42
|
+
|
|
43
|
+
#if defined(__cplusplus)
|
|
44
|
+
extern "C" {
|
|
45
|
+
#endif
|
|
46
|
+
|
|
47
|
+
#pragma region Reduction Helpers
|
|
48
|
+
|
|
49
|
+
/** @brief Horizontal sum of 4 u64 lanes in a 256-bit LASX register. */
|
|
50
|
+
NK_INTERNAL nk_u64_t nk_reduce_add_u64x4_loongsonasx_(__m256i sum_u64x4) {
|
|
51
|
+
__m256i high_u64x4 = __lasx_xvpermi_q(sum_u64x4, sum_u64x4, 0x11);
|
|
52
|
+
__m256i sum_u64x2 = __lasx_xvadd_d(sum_u64x4, high_u64x4);
|
|
53
|
+
__m256i swapped_u64x2 = __lasx_xvshuf4i_d(sum_u64x2, sum_u64x2, 0b0001);
|
|
54
|
+
__m256i reduced_u64x2 = __lasx_xvadd_d(sum_u64x2, swapped_u64x2);
|
|
55
|
+
return (nk_u64_t)__lasx_xvpickve2gr_du(reduced_u64x2, 0);
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
/** @brief Horizontally sum all bytes in a 256-bit register as unsigned values.
|
|
59
|
+
*
|
|
60
|
+
* Chains pairwise widening additions: u8→u16→u32→u64, then reduces 4 u64 lanes.
|
|
61
|
+
*/
|
|
62
|
+
NK_INTERNAL nk_u64_t nk_reduce_add_u8x32_loongsonasx_(__m256i v_u8x32) {
|
|
63
|
+
__m256i sum_u16x16 = __lasx_xvhaddw_hu_bu(v_u8x32, v_u8x32);
|
|
64
|
+
__m256i sum_u32x8 = __lasx_xvhaddw_wu_hu(sum_u16x16, sum_u16x16);
|
|
65
|
+
__m256i sum_u64x4 = __lasx_xvhaddw_du_wu(sum_u32x8, sum_u32x8);
|
|
66
|
+
return nk_reduce_add_u64x4_loongsonasx_(sum_u64x4);
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
#pragma endregion Reduction Helpers
|
|
70
|
+
|
|
71
|
+
#pragma region Binary Sets
|
|
72
|
+
|
|
73
|
+
/**
 *  @brief Hamming distance between two bit-packed vectors: the popcount of their XOR.
 *  @param[in] a First packed operand.
 *  @param[in] b Second packed operand.
 *  @param[in] n Length in bits; converted to a byte count with `nk_size_divide_round_up_`.
 *  @param[out] result Receives the number of differing bits, narrowed to `nk_u32_t`.
 */
NK_PUBLIC void nk_hamming_u1_loongsonasx(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
    nk_size_t const byte_count = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
    __m256i lane_totals = __lasx_xvreplgr2vr_d(0);
    nk_size_t offset = 0;

    // Main loop: 32 bytes per step, with a per-u64 popcount accumulated into four 64-bit lanes.
    while (byte_count - offset >= 32) {
        __m256i difference = __lasx_xvxor_v(__lasx_xvld(a + offset, 0), __lasx_xvld(b + offset, 0));
        lane_totals = __lasx_xvadd_d(lane_totals, __lasx_xvpcnt_d(difference));
        offset += 32;
    }

    nk_u64_t differing_bits = nk_reduce_add_u64x4_loongsonasx_(lane_totals);

    // Scalar tail for the bytes that do not fill a whole vector.
    while (offset < byte_count) {
        differing_bits += nk_u1x8_popcount_(a[offset] ^ b[offset]);
        ++offset;
    }
    *result = (nk_u32_t)differing_bits;
}
|
|
90
|
+
|
|
91
|
+
/**
 *  @brief Jaccard distance between two bit-packed vectors.
 *
 *  Computed as popcount(a XOR b) / popcount(a OR b), which equals 1 − |A ∩ B| / |A ∪ B|.
 *  When no bit is set in either input (the OR popcount is zero) the result is 0.
 *
 *  @param[in] a First packed operand.
 *  @param[in] b Second packed operand.
 *  @param[in] n Length in bits; converted to a byte count with `nk_size_divide_round_up_`.
 *  @param[out] result Receives the distance as a 32-bit float.
 */
NK_PUBLIC void nk_jaccard_u1_loongsonasx(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result) {
    nk_size_t const byte_count = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
    __m256i difference_totals = __lasx_xvreplgr2vr_d(0);
    __m256i union_totals = __lasx_xvreplgr2vr_d(0);
    nk_size_t offset = 0;

    // Main loop: 32 bytes per step, tracking XOR and OR popcounts in separate accumulators.
    while (byte_count - offset >= 32) {
        __m256i a_chunk = __lasx_xvld(a + offset, 0);
        __m256i b_chunk = __lasx_xvld(b + offset, 0);
        __m256i difference_bits = __lasx_xvpcnt_d(__lasx_xvxor_v(a_chunk, b_chunk));
        __m256i union_bits = __lasx_xvpcnt_d(__lasx_xvor_v(a_chunk, b_chunk));
        difference_totals = __lasx_xvadd_d(difference_totals, difference_bits);
        union_totals = __lasx_xvadd_d(union_totals, union_bits);
        offset += 32;
    }

    nk_u64_t difference_size = nk_reduce_add_u64x4_loongsonasx_(difference_totals);
    nk_u64_t union_size = nk_reduce_add_u64x4_loongsonasx_(union_totals);

    // Scalar tail for the bytes that do not fill a whole vector.
    while (offset < byte_count) {
        difference_size += nk_u1x8_popcount_(a[offset] ^ b[offset]);
        union_size += nk_u1x8_popcount_(a[offset] | b[offset]);
        ++offset;
    }

    // An empty union would divide by zero; report zero distance instead.
    if (union_size != 0) { *result = (nk_f32_t)difference_size / (nk_f32_t)union_size; }
    else { *result = 0.0f; }
}
|
|
115
|
+
|
|
116
|
+
#pragma endregion Binary Sets
|
|
117
|
+
|
|
118
|
+
#pragma region Integer Sets
|
|
119
|
+
|
|
120
|
+
/**
 *  @brief Counts the positions at which two byte arrays hold different values.
 *  @param[in] a First array of bytes.
 *  @param[in] b Second array of bytes.
 *  @param[in] n Number of bytes compared in each array.
 *  @param[out] result Receives the number of mismatching positions, narrowed to `nk_u32_t`.
 */
NK_PUBLIC void nk_hamming_u8_loongsonasx(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
    __m256i const byte_ones = __lasx_xvreplgr2vr_b(1);
    __m256i mismatch_totals = __lasx_xvreplgr2vr_d(0);
    nk_size_t offset = 0;

    while (n - offset >= 32) {
        __m256i delta = __lasx_xvxor_v(__lasx_xvld(a + offset, 0), __lasx_xvld(b + offset, 0));
        // Clamp every non-zero XOR byte to 1 so each mismatch contributes exactly one to the sum.
        __m256i flags = __lasx_xvmin_bu(delta, byte_ones);
        // Pairwise widening adds: u8 → u16 → u32 → u64.
        __m256i widened = __lasx_xvhaddw_hu_bu(flags, flags);
        widened = __lasx_xvhaddw_wu_hu(widened, widened);
        widened = __lasx_xvhaddw_du_wu(widened, widened);
        mismatch_totals = __lasx_xvadd_d(mismatch_totals, widened);
        offset += 32;
    }

    nk_u64_t mismatches = nk_reduce_add_u64x4_loongsonasx_(mismatch_totals);

    // Scalar tail for the bytes that do not fill a whole vector.
    while (offset < n) {
        mismatches += (a[offset] != b[offset]);
        ++offset;
    }
    *result = (nk_u32_t)mismatches;
}
|
|
141
|
+
|
|
142
|
+
#pragma endregion Integer Sets
|
|
143
|
+
|
|
144
|
+
#pragma region Batched Finalizers
|
|
145
|
+
|
|
146
|
+
/** @brief Hamming from_dot: computes pop_a + pop_b − 2 × dot for 4 pairs (LSX). */
|
|
147
|
+
NK_INTERNAL void nk_hamming_u32x4_from_dot_loongsonasx_(nk_b128_vec_t dots, nk_u32_t query_pop,
|
|
148
|
+
nk_b128_vec_t target_pops, nk_b128_vec_t *results) {
|
|
149
|
+
__m128i dots_u32x4 = dots.xmm;
|
|
150
|
+
__m128i query_u32x4 = __lsx_vreplgr2vr_w((int)query_pop);
|
|
151
|
+
__m128i target_u32x4 = target_pops.xmm;
|
|
152
|
+
results->xmm = __lsx_vsub_w(__lsx_vadd_w(query_u32x4, target_u32x4), __lsx_vslli_w(dots_u32x4, 1));
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
/** @brief Jaccard from_dot: computes 1 − dot / (pop_a + pop_b − dot) for 4 pairs (LSX). */
|
|
156
|
+
NK_INTERNAL void nk_jaccard_f32x4_from_dot_loongsonasx_(nk_b128_vec_t dots, nk_u32_t query_pop,
|
|
157
|
+
nk_b128_vec_t target_pops, nk_b128_vec_t *results) {
|
|
158
|
+
__m128 dot_f32x4 = __lsx_vffint_s_wu(dots.xmm);
|
|
159
|
+
__m128 query_f32x4 = nk_xvreplgr2vr_s_128_((nk_f32_t)query_pop);
|
|
160
|
+
__m128 target_f32x4 = __lsx_vffint_s_wu(target_pops.xmm);
|
|
161
|
+
__m128 union_f32x4 = __lsx_vfsub_s(__lsx_vfadd_s(query_f32x4, target_f32x4), dot_f32x4);
|
|
162
|
+
|
|
163
|
+
__m128 zero_f32x4 = (__m128)__lsx_vreplgr2vr_w(0);
|
|
164
|
+
__m128 one_f32x4 = nk_xvreplgr2vr_s_128_(1.0f);
|
|
165
|
+
__m128i zero_union_mask_u32x4 = __lsx_vfcmp_ceq_s(union_f32x4, zero_f32x4);
|
|
166
|
+
__m128 safe_union_f32x4 = (__m128)__lsx_vbitsel_v((__m128i)union_f32x4, (__m128i)one_f32x4, zero_union_mask_u32x4);
|
|
167
|
+
|
|
168
|
+
__m128 ratio_f32x4 = __lsx_vfdiv_s(dot_f32x4, safe_union_f32x4);
|
|
169
|
+
__m128 jaccard_f32x4 = __lsx_vfsub_s(one_f32x4, ratio_f32x4);
|
|
170
|
+
results->xmm_ps = (__m128)__lsx_vbitsel_v((__m128i)jaccard_f32x4, (__m128i)zero_f32x4, zero_union_mask_u32x4);
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
#pragma endregion Batched Finalizers
|
|
174
|
+
|
|
175
|
+
#if defined(__cplusplus)
|
|
176
|
+
} // extern "C"
|
|
177
|
+
#endif
|
|
178
|
+
|
|
179
|
+
#endif // NK_TARGET_LOONGSONASX
|
|
180
|
+
#endif // NK_TARGET_LOONGARCH_
|
|
181
|
+
#endif // NK_SET_LOONGSONASX_H
|
|
@@ -10,13 +10,13 @@
|
|
|
10
10
|
*
|
|
11
11
|
* Key NEON instructions for binary/bitwise operations (Cortex-A76 class):
|
|
12
12
|
*
|
|
13
|
-
* Intrinsic
|
|
14
|
-
* vcntq_u8
|
|
15
|
-
* veorq_u8
|
|
16
|
-
* vandq_u8
|
|
17
|
-
* vorrq_u8
|
|
18
|
-
* vpaddlq_u8
|
|
19
|
-
* vaddvq_u32
|
|
13
|
+
* Intrinsic Instruction A76 M5
|
|
14
|
+
* vcntq_u8 CNT (V.16B, V.16B) 2cy @ 2p 2cy @ 4p
|
|
15
|
+
* veorq_u8 EOR (V.16B, V.16B, V.16B) 1cy @ 2p 2cy @ 4p
|
|
16
|
+
* vandq_u8 AND (V.16B, V.16B, V.16B) 1cy @ 2p 2cy @ 4p
|
|
17
|
+
* vorrq_u8 ORR (V.16B, V.16B, V.16B) 1cy @ 2p 2cy @ 4p
|
|
18
|
+
* vpaddlq_u8 UADDLP (V.8H, V.16B) 2cy @ 2p 2cy @ 4p
|
|
19
|
+
* vaddvq_u32 ADDV (S, V.4S) 4cy @ 1p 5cy @ 1p
|
|
20
20
|
*
|
|
21
21
|
* According to the available literature, the throughput for those basic integer ops is
|
|
22
22
|
* identical across most Apple, Qualcomm, and AWS Graviton chips. As long as we avoid widening
|
|
@@ -58,7 +58,7 @@ extern "C" {
|
|
|
58
58
|
#pragma GCC target("arch=armv8-a+simd")
|
|
59
59
|
#endif
|
|
60
60
|
|
|
61
|
-
#pragma region
|
|
61
|
+
#pragma region Binary Sets
|
|
62
62
|
|
|
63
63
|
NK_PUBLIC void nk_hamming_u1_neon(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
|
|
64
64
|
nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
|
|
@@ -109,9 +109,9 @@ NK_PUBLIC void nk_jaccard_u1_neon(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_siz
|
|
|
109
109
|
*result = (union_count != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)union_count : 0.0f;
|
|
110
110
|
}
|
|
111
111
|
|
|
112
|
-
#pragma endregion
|
|
112
|
+
#pragma endregion Binary Sets
|
|
113
113
|
|
|
114
|
-
#pragma region
|
|
114
|
+
#pragma region Integer Sets
|
|
115
115
|
|
|
116
116
|
NK_PUBLIC void nk_jaccard_u32_neon(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
117
117
|
nk_u32_t intersection_count = 0;
|
|
@@ -174,9 +174,9 @@ NK_PUBLIC void nk_jaccard_u16_neon(nk_u16_t const *a, nk_u16_t const *b, nk_size
|
|
|
174
174
|
*result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
|
|
175
175
|
}
|
|
176
176
|
|
|
177
|
-
#pragma endregion
|
|
177
|
+
#pragma endregion Integer Sets
|
|
178
178
|
|
|
179
|
-
#pragma region
|
|
179
|
+
#pragma region Stateful Streaming
|
|
180
180
|
|
|
181
181
|
typedef struct nk_hamming_u1x128_state_neon_t {
|
|
182
182
|
uint32x4_t intersection_count_u32x4;
|
|
@@ -290,12 +290,11 @@ NK_INTERNAL void nk_jaccard_u1x128_finalize_neon( //
|
|
|
290
290
|
float32x4_t intersection_f32x4 = vcvtq_f32_u32(intersection_u32x4);
|
|
291
291
|
|
|
292
292
|
// Compute union using |A ∪ B| = |A| + |B| - |A ∩ B|
|
|
293
|
-
// Build target popcounts vector
|
|
293
|
+
// Build target popcounts vector from two independent halves (avoids serial lane insertion chain).
|
|
294
294
|
float32x4_t query_f32x4 = vdupq_n_f32(query_popcount);
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
targets_f32x4 =
|
|
298
|
-
targets_f32x4 = vsetq_lane_f32(target_popcount_d, targets_f32x4, 3);
|
|
295
|
+
float32x2_t targets_ab_f32x2 = vset_lane_f32(target_popcount_b, vdup_n_f32(target_popcount_a), 1);
|
|
296
|
+
float32x2_t targets_cd_f32x2 = vset_lane_f32(target_popcount_d, vdup_n_f32(target_popcount_c), 1);
|
|
297
|
+
float32x4_t targets_f32x4 = vcombine_f32(targets_ab_f32x2, targets_cd_f32x2);
|
|
299
298
|
float32x4_t union_f32x4 = vsubq_f32(vaddq_f32(query_f32x4, targets_f32x4), intersection_f32x4);
|
|
300
299
|
|
|
301
300
|
// Handle zero-union edge case (empty vectors → distance = 0.0, matching scipy convention)
|
|
@@ -347,7 +346,7 @@ NK_INTERNAL void nk_jaccard_f32x4_from_dot_neon_(nk_b128_vec_t dots, nk_u32_t qu
|
|
|
347
346
|
results->f32x4 = vbslq_f32(zero_union_mask, vdupq_n_f32(0.0f), jaccard_f32x4);
|
|
348
347
|
}
|
|
349
348
|
|
|
350
|
-
#pragma endregion
|
|
349
|
+
#pragma endregion Stateful Streaming
|
|
351
350
|
|
|
352
351
|
#if defined(__clang__)
|
|
353
352
|
#pragma clang attribute pop
|
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Set Similarity Measures for Power ISA VSX.
|
|
3
|
+
* @file include/numkong/set/powervsx.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date March 23, 2026
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/set.h
|
|
8
|
+
*
|
|
9
|
+
* @section set_powervsx_instructions Power9 VSX Set Instructions
|
|
10
|
+
*
|
|
11
|
+
* Key Power9 VSX instructions for binary/bitwise operations:
|
|
12
|
+
*
|
|
13
|
+
* Intrinsic Instruction P9
|
|
14
|
+
* vec_popcnt vpopcntb/h/w/d 2cy @ 2p element-wise popcount
|
|
15
|
+
* vec_xor xxlxor 1cy @ 4p
|
|
16
|
+
* vec_and xxland 1cy @ 4p
|
|
17
|
+
* vec_or xxlor 1cy @ 4p
|
|
18
|
+
* vec_cmpne vcmpneb/h/w 2cy @ 2p byte/half/word not-equal
|
|
19
|
+
* vec_xl_len lxvl 6cy @ 1p partial vector load
|
|
20
|
+
*
|
|
21
|
+
* Power9 has a native doubleword `vpopcntd` instruction, providing efficient SIMD popcount
|
|
22
|
+
* with minimal data flow complexity. `vec_xl_len` enables branchless tail handling.
|
|
23
|
+
*
|
|
24
|
+
* @section set_powervsx_stateful Stateful Streaming Logic
|
|
25
|
+
*
|
|
26
|
+
* To build memory-optimal tiled algorithms, this file defines:
|
|
27
|
+
*
|
|
28
|
+
* - nk_hamming_u1x128_state_powervsx_t for streaming Hamming distance
|
|
29
|
+
* - nk_jaccard_u1x128_state_powervsx_t for streaming Jaccard similarity
|
|
30
|
+
*
|
|
31
|
+
* @code{c}
|
|
32
|
+
* nk_jaccard_u1x128_state_powervsx_t state_first, state_second, state_third, state_fourth;
|
|
33
|
+
* nk_jaccard_u1x128_init_powervsx(&state_first);
|
|
34
|
+
* // ... stream through packed binary vectors ...
|
|
35
|
+
* nk_jaccard_u1x128_finalize_powervsx(&state_first, &state_second, &state_third, &state_fourth,
|
|
36
|
+
* query_popcount, target_popcount_a, target_popcount_b, target_popcount_c, target_popcount_d,
|
|
37
|
+
* total_dimensions, &results);
|
|
38
|
+
* @endcode
|
|
39
|
+
*/
|
|
40
|
+
#ifndef NK_SET_POWERVSX_H
|
|
41
|
+
#define NK_SET_POWERVSX_H
|
|
42
|
+
|
|
43
|
+
#if NK_TARGET_POWER_
|
|
44
|
+
#if NK_TARGET_POWERVSX
|
|
45
|
+
|
|
46
|
+
#include "numkong/types.h"
|
|
47
|
+
#include "numkong/set/serial.h" // `nk_u1x8_popcount_`
|
|
48
|
+
#include "numkong/dot/powervsx.h" // `nk_hsum_u32x4_powervsx_`, `nk_hsum_u64x2_powervsx_`
|
|
49
|
+
|
|
50
|
+
#if defined(__cplusplus)
|
|
51
|
+
extern "C" {
|
|
52
|
+
#endif
|
|
53
|
+
|
|
54
|
+
#if defined(__clang__)
|
|
55
|
+
#pragma clang attribute push(__attribute__((target("power9-vector"))), apply_to = function)
|
|
56
|
+
#elif defined(__GNUC__)
|
|
57
|
+
#pragma GCC push_options
|
|
58
|
+
#pragma GCC target("power9-vector")
|
|
59
|
+
#endif
|
|
60
|
+
|
|
61
|
+
/**
 *  @brief Hamming distance between two bit-packed vectors: the popcount of their XOR.
 *  @param[in] a First packed operand.
 *  @param[in] b Second packed operand.
 *  @param[in] n Length in bits; converted to a byte count with `nk_size_divide_round_up_`.
 *  @param[out] result Receives the number of differing bits, narrowed to `nk_u32_t`.
 */
NK_PUBLIC void nk_hamming_u1_powervsx(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
    nk_size_t const byte_count = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
    nk_vu64x2_t bit_totals = vec_splats((nk_u64_t)0);
    nk_size_t offset = 0;

    // Full 16-byte chunks: XOR, then a doubleword popcount accumulated into two u64 lanes.
    while (byte_count - offset >= 16) {
        nk_vu8x16_t a_chunk = vec_xl(0, (nk_u8_t const *)(a + offset));
        nk_vu8x16_t b_chunk = vec_xl(0, (nk_u8_t const *)(b + offset));
        bit_totals = vec_add(bit_totals, vec_popcnt((nk_vu64x2_t)vec_xor(a_chunk, b_chunk)));
        offset += 16;
    }

    // Tail without a branch: a length-limited load brings in only the leftover bytes (possibly none),
    // and the zero-filled remainder of the register adds nothing to the popcount.
    nk_size_t const leftover_bytes = byte_count - offset;
    nk_vu8x16_t a_tail = vec_xl_len((nk_u8_t *)(a + offset), leftover_bytes);
    nk_vu8x16_t b_tail = vec_xl_len((nk_u8_t *)(b + offset), leftover_bytes);
    bit_totals = vec_add(bit_totals, vec_popcnt((nk_vu64x2_t)vec_xor(a_tail, b_tail)));

    *result = (nk_u32_t)nk_hsum_u64x2_powervsx_(bit_totals);
}
|
|
82
|
+
|
|
83
|
+
/**
 *  @brief Jaccard distance between two bit-packed vectors (Power VSX).
 *
 *  Accumulates doubleword popcounts of `a AND b` (intersection) and `a OR b` (union) in two
 *  u64x2 vectors, reduces both at the end, and writes `1 - intersection / union`.
 *
 *  @param[in] a First bit-packed operand.
 *  @param[in] b Second bit-packed operand.
 *  @param[in] n Number of bits; converted to a byte count by dividing by `NK_BITS_PER_BYTE`, rounding up.
 *  @param[out] result Receives the distance, or `0.0f` when the union count is zero.
 */
NK_PUBLIC void nk_jaccard_u1_powervsx(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_f32_t *result) {
    nk_size_t const total_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
    nk_vu64x2_t shared_counts_u64x2 = vec_splats((nk_u64_t)0);
    nk_vu64x2_t either_counts_u64x2 = vec_splats((nk_u64_t)0);
    nk_size_t offset = 0;

    // Full 16-byte blocks.
    while (offset + 16 <= total_bytes) {
        nk_vu8x16_t lhs_u8x16 = vec_xl(0, (nk_u8_t const *)(a + offset));
        nk_vu8x16_t rhs_u8x16 = vec_xl(0, (nk_u8_t const *)(b + offset));
        nk_vu64x2_t block_shared_u64x2 = vec_popcnt((nk_vu64x2_t)vec_and(lhs_u8x16, rhs_u8x16));
        nk_vu64x2_t block_either_u64x2 = vec_popcnt((nk_vu64x2_t)vec_or(lhs_u8x16, rhs_u8x16));
        shared_counts_u64x2 = vec_add(shared_counts_u64x2, block_shared_u64x2);
        either_counts_u64x2 = vec_add(either_counts_u64x2, block_either_u64x2);
        offset += 16;
    }

    // Tail is handled unconditionally with length-limited loads.
    nk_size_t const tail_bytes = total_bytes - offset;
    nk_vu8x16_t tail_lhs_u8x16 = vec_xl_len((nk_u8_t *)(a + offset), tail_bytes);
    nk_vu8x16_t tail_rhs_u8x16 = vec_xl_len((nk_u8_t *)(b + offset), tail_bytes);
    nk_vu64x2_t tail_shared_u64x2 = vec_popcnt((nk_vu64x2_t)vec_and(tail_lhs_u8x16, tail_rhs_u8x16));
    nk_vu64x2_t tail_either_u64x2 = vec_popcnt((nk_vu64x2_t)vec_or(tail_lhs_u8x16, tail_rhs_u8x16));
    shared_counts_u64x2 = vec_add(shared_counts_u64x2, tail_shared_u64x2);
    either_counts_u64x2 = vec_add(either_counts_u64x2, tail_either_u64x2);

    nk_u32_t shared_total = (nk_u32_t)nk_hsum_u64x2_powervsx_(shared_counts_u64x2);
    nk_u32_t either_total = (nk_u32_t)nk_hsum_u64x2_powervsx_(either_counts_u64x2);

    // An empty union means neither input had a set bit: report zero distance.
    if (either_total != 0) { *result = 1.0f - (nk_f32_t)shared_total / (nk_f32_t)either_total; }
    else { *result = 0.0f; }
}
|
|
108
|
+
|
|
109
|
+
/**
 *  @brief Counts the positions at which two byte arrays differ (Power VSX).
 *
 *  Each 16-byte block is compared with `vec_cmpne`, the resulting mask is reduced to 0/1 per byte,
 *  and `vec_sum4s` folds every group of four bytes into the matching u32 accumulator lane.
 *
 *  @param[in] a First byte array.
 *  @param[in] b Second byte array.
 *  @param[in] n Number of bytes to compare.
 *  @param[out] result Receives the horizontal sum of the four accumulator lanes.
 */
NK_PUBLIC void nk_hamming_u8_powervsx(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
    nk_vu32x4_t mismatch_counts_u32x4 = vec_splats((nk_u32_t)0);
    nk_vu8x16_t unit_u8x16 = vec_splats((nk_u8_t)1);
    nk_size_t offset = 0;

    // Full 16-byte blocks.
    while (offset + 16 <= n) {
        nk_vu8x16_t lhs_u8x16 = vec_xl(0, (nk_u8_t const *)(a + offset));
        nk_vu8x16_t rhs_u8x16 = vec_xl(0, (nk_u8_t const *)(b + offset));
        // Comparison mask is 0xFF where bytes differ; masking with 1 turns it into a 0/1 flag per byte.
        nk_vu8x16_t flags_u8x16 = vec_and((nk_vu8x16_t)vec_cmpne(lhs_u8x16, rhs_u8x16), unit_u8x16);
        mismatch_counts_u32x4 = vec_sum4s(flags_u8x16, mismatch_counts_u32x4);
        offset += 16;
    }

    // Tail is handled unconditionally with length-limited loads of the leftover bytes.
    nk_size_t const tail_bytes = n - offset;
    nk_vu8x16_t tail_lhs_u8x16 = vec_xl_len((nk_u8_t *)(a + offset), tail_bytes);
    nk_vu8x16_t tail_rhs_u8x16 = vec_xl_len((nk_u8_t *)(b + offset), tail_bytes);
    nk_vu8x16_t tail_flags_u8x16 = vec_and((nk_vu8x16_t)vec_cmpne(tail_lhs_u8x16, tail_rhs_u8x16), unit_u8x16);
    mismatch_counts_u32x4 = vec_sum4s(tail_flags_u8x16, mismatch_counts_u32x4);

    *result = nk_hsum_u32x4_powervsx_(mismatch_counts_u32x4);
}
|
|
130
|
+
|
|
131
|
+
typedef struct nk_hamming_u1x128_state_powervsx_t {
|
|
132
|
+
nk_vu32x4_t intersection_count_u32x4;
|
|
133
|
+
} nk_hamming_u1x128_state_powervsx_t;
|
|
134
|
+
|
|
135
|
+
/** @brief Resets a streaming Hamming state by zeroing all four u32 accumulator lanes. */
NK_INTERNAL void nk_hamming_u1x128_init_powervsx(nk_hamming_u1x128_state_powervsx_t *state) {
    nk_vu32x4_t cleared_u32x4 = vec_splats((nk_u32_t)0);
    state->intersection_count_u32x4 = cleared_u32x4;
}
|
|
138
|
+
|
|
139
|
+
/**
 *  @brief Folds one 128-bit chunk pair into a streaming Hamming state (Power VSX).
 *
 *  Adds the per-word popcount of `a XOR b` to the state's u32x4 accumulator.
 *  No horizontal reduction happens here; that is left to the finalize step.
 *
 *  @param[in,out] state Accumulator to update.
 *  @param[in] a First 128-bit chunk.
 *  @param[in] b Second 128-bit chunk.
 *  @param[in] depth_offset Ignored by this implementation.
 *  @param[in] active_dimensions Ignored by this implementation.
 */
NK_INTERNAL void nk_hamming_u1x128_update_powervsx(nk_hamming_u1x128_state_powervsx_t *state, nk_b128_vec_t a,
                                                   nk_b128_vec_t b, nk_size_t depth_offset,
                                                   nk_size_t active_dimensions) {
    nk_unused_(depth_offset);
    nk_unused_(active_dimensions);

    // Reinterpret both by-value chunks as byte vectors and isolate the differing bits.
    nk_vu8x16_t lhs_u8x16 = *(nk_vu8x16_t *)&a;
    nk_vu8x16_t rhs_u8x16 = *(nk_vu8x16_t *)&b;
    nk_vu8x16_t differing_bits_u8x16 = vec_xor(lhs_u8x16, rhs_u8x16);

    // Word popcount: every u32 lane receives the number of set bits in its four bytes.
    nk_vu32x4_t lane_counts_u32x4 = vec_popcnt((nk_vu32x4_t)differing_bits_u8x16);
    state->intersection_count_u32x4 = vec_add(state->intersection_count_u32x4, lane_counts_u32x4);
}
|
|
165
|
+
|
|
166
|
+
NK_INTERNAL void nk_hamming_u1x128_finalize_powervsx( //
|
|
167
|
+
nk_hamming_u1x128_state_powervsx_t const *state_a, nk_hamming_u1x128_state_powervsx_t const *state_b,
|
|
168
|
+
nk_hamming_u1x128_state_powervsx_t const *state_c, nk_hamming_u1x128_state_powervsx_t const *state_d,
|
|
169
|
+
nk_size_t total_dimensions, nk_b128_vec_t *result) {
|
|
170
|
+
nk_unused_(total_dimensions);
|
|
171
|
+
|
|
172
|
+
nk_vu32x4_t a_u32x4 = state_a->intersection_count_u32x4, b_u32x4 = state_b->intersection_count_u32x4,
|
|
173
|
+
c_u32x4 = state_c->intersection_count_u32x4, d_u32x4 = state_d->intersection_count_u32x4;
|
|
174
|
+
nk_vu32x4_t transpose_ab_low_u32x4 = vec_mergeh(a_u32x4, b_u32x4);
|
|
175
|
+
nk_vu32x4_t transpose_cd_low_u32x4 = vec_mergeh(c_u32x4, d_u32x4);
|
|
176
|
+
nk_vu32x4_t transpose_ab_high_u32x4 = vec_mergel(a_u32x4, b_u32x4);
|
|
177
|
+
nk_vu32x4_t transpose_cd_high_u32x4 = vec_mergel(c_u32x4, d_u32x4);
|
|
178
|
+
nk_vu32x4_t sum_lane0_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_u32x4,
|
|
179
|
+
(nk_vu64x2_t)transpose_cd_low_u32x4, 0);
|
|
180
|
+
nk_vu32x4_t sum_lane1_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_u32x4,
|
|
181
|
+
(nk_vu64x2_t)transpose_cd_low_u32x4, 3);
|
|
182
|
+
nk_vu32x4_t sum_lane2_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_u32x4,
|
|
183
|
+
(nk_vu64x2_t)transpose_cd_high_u32x4, 0);
|
|
184
|
+
nk_vu32x4_t sum_lane3_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_u32x4,
|
|
185
|
+
(nk_vu64x2_t)transpose_cd_high_u32x4, 3);
|
|
186
|
+
result->vu32x4 = vec_add(vec_add(sum_lane0_u32x4, sum_lane1_u32x4), vec_add(sum_lane2_u32x4, sum_lane3_u32x4));
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
typedef struct nk_jaccard_u1x128_state_powervsx_t {
|
|
190
|
+
nk_vu32x4_t intersection_count_u32x4;
|
|
191
|
+
} nk_jaccard_u1x128_state_powervsx_t;
|
|
192
|
+
|
|
193
|
+
/** @brief Resets a streaming Jaccard state by zeroing all four u32 intersection lanes. */
NK_INTERNAL void nk_jaccard_u1x128_init_powervsx(nk_jaccard_u1x128_state_powervsx_t *state) {
    nk_vu32x4_t cleared_u32x4 = vec_splats((nk_u32_t)0);
    state->intersection_count_u32x4 = cleared_u32x4;
}
|
|
196
|
+
|
|
197
|
+
/**
 *  @brief Folds one 128-bit chunk pair into a streaming Jaccard state (Power VSX).
 *
 *  Adds the per-word popcount of `a AND b` to the state's u32x4 accumulator.
 *  No horizontal reduction happens here; that is left to the finalize step.
 *
 *  @param[in,out] state Accumulator to update.
 *  @param[in] a First 128-bit chunk.
 *  @param[in] b Second 128-bit chunk.
 *  @param[in] depth_offset Ignored by this implementation.
 *  @param[in] active_dimensions Ignored by this implementation.
 */
NK_INTERNAL void nk_jaccard_u1x128_update_powervsx(nk_jaccard_u1x128_state_powervsx_t *state, nk_b128_vec_t a,
                                                   nk_b128_vec_t b, nk_size_t depth_offset,
                                                   nk_size_t active_dimensions) {
    nk_unused_(depth_offset);
    nk_unused_(active_dimensions);

    // Reinterpret both by-value chunks as byte vectors and keep only the bits set in both.
    nk_vu8x16_t lhs_u8x16 = *(nk_vu8x16_t *)&a;
    nk_vu8x16_t rhs_u8x16 = *(nk_vu8x16_t *)&b;
    nk_vu8x16_t shared_bits_u8x16 = vec_and(lhs_u8x16, rhs_u8x16);

    // Word popcount: every u32 lane receives the number of set bits in its four bytes.
    nk_vu32x4_t lane_counts_u32x4 = vec_popcnt((nk_vu32x4_t)shared_bits_u8x16);
    state->intersection_count_u32x4 = vec_add(state->intersection_count_u32x4, lane_counts_u32x4);
}
|
|
223
|
+
|
|
224
|
+
/**
 *  @brief Turns four streaming Jaccard states into four Jaccard distances (Power VSX).
 *
 *  The intersection counts are reduced with a transpose-based 4-way horizontal sum and converted
 *  to f32. Each union is derived as `query_popcount + target_popcount - intersection`, and the
 *  distance `1 - intersection / union` uses `vec_re` refined by one Newton-Raphson step.
 *  Lanes whose union is zero are written as `0.0f`.
 *
 *  @param[in] state_a Accumulator paired with `target_popcount_a`.
 *  @param[in] state_b Accumulator paired with `target_popcount_b`.
 *  @param[in] state_c Accumulator paired with `target_popcount_c`.
 *  @param[in] state_d Accumulator paired with `target_popcount_d`.
 *  @param[in] query_popcount Population count of the query, shared by all four lanes.
 *  @param[in] target_popcount_a Population count of the first target.
 *  @param[in] target_popcount_b Population count of the second target.
 *  @param[in] target_popcount_c Population count of the third target.
 *  @param[in] target_popcount_d Population count of the fourth target.
 *  @param[in] total_dimensions Ignored by this implementation.
 *  @param[out] result Receives the distances through its `vf32x4` member.
 */
NK_INTERNAL void nk_jaccard_u1x128_finalize_powervsx( //
    nk_jaccard_u1x128_state_powervsx_t const *state_a, nk_jaccard_u1x128_state_powervsx_t const *state_b,
    nk_jaccard_u1x128_state_powervsx_t const *state_c, nk_jaccard_u1x128_state_powervsx_t const *state_d,
    nk_f32_t query_popcount, nk_f32_t target_popcount_a, nk_f32_t target_popcount_b, nk_f32_t target_popcount_c,
    nk_f32_t target_popcount_d, nk_size_t total_dimensions, nk_b128_vec_t *result) {
    nk_unused_(total_dimensions);

    // Reduce the four accumulators: interleave, regroup by doubleword, then add.
    nk_vu32x4_t counts_a_u32x4 = state_a->intersection_count_u32x4;
    nk_vu32x4_t counts_b_u32x4 = state_b->intersection_count_u32x4;
    nk_vu32x4_t counts_c_u32x4 = state_c->intersection_count_u32x4;
    nk_vu32x4_t counts_d_u32x4 = state_d->intersection_count_u32x4;
    nk_vu64x2_t ab_first_u64x2 = (nk_vu64x2_t)vec_mergeh(counts_a_u32x4, counts_b_u32x4);
    nk_vu64x2_t cd_first_u64x2 = (nk_vu64x2_t)vec_mergeh(counts_c_u32x4, counts_d_u32x4);
    nk_vu64x2_t ab_second_u64x2 = (nk_vu64x2_t)vec_mergel(counts_a_u32x4, counts_b_u32x4);
    nk_vu64x2_t cd_second_u64x2 = (nk_vu64x2_t)vec_mergel(counts_c_u32x4, counts_d_u32x4);
    nk_vu32x4_t row0_u32x4 = (nk_vu32x4_t)vec_xxpermdi(ab_first_u64x2, cd_first_u64x2, 0);
    nk_vu32x4_t row1_u32x4 = (nk_vu32x4_t)vec_xxpermdi(ab_first_u64x2, cd_first_u64x2, 3);
    nk_vu32x4_t row2_u32x4 = (nk_vu32x4_t)vec_xxpermdi(ab_second_u64x2, cd_second_u64x2, 0);
    nk_vu32x4_t row3_u32x4 = (nk_vu32x4_t)vec_xxpermdi(ab_second_u64x2, cd_second_u64x2, 3);
    nk_vu32x4_t shared_counts_u32x4 = vec_add(vec_add(row0_u32x4, row1_u32x4), vec_add(row2_u32x4, row3_u32x4));
    nk_vf32x4_t shared_counts_f32x4 = vec_ctf(shared_counts_u32x4, 0);

    // Gather the four target popcounts into one vector and broadcast the query popcount.
    nk_vf32x4_t target_pops_f32x4 = vec_splats(0.0f);
    target_pops_f32x4 = vec_insert(target_popcount_a, target_pops_f32x4, 0);
    target_pops_f32x4 = vec_insert(target_popcount_b, target_pops_f32x4, 1);
    target_pops_f32x4 = vec_insert(target_popcount_c, target_pops_f32x4, 2);
    target_pops_f32x4 = vec_insert(target_popcount_d, target_pops_f32x4, 3);
    nk_vf32x4_t query_pops_f32x4 = vec_splats(query_popcount);

    // Inclusion-exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|.
    nk_vf32x4_t pop_sums_f32x4 = vec_add(query_pops_f32x4, target_pops_f32x4);
    nk_vf32x4_t union_sizes_f32x4 = vec_sub(pop_sums_f32x4, shared_counts_f32x4);

    // Substitute 1.0 for zero unions so the reciprocal stays finite; those lanes are zeroed at the end.
    nk_vf32x4_t ones_f32x4 = vec_splats(1.0f);
    nk_vf32x4_t zeros_f32x4 = vec_splats(0.0f);
    nk_vu32x4_t empty_union_mask_u32x4 = (nk_vu32x4_t)vec_cmpeq(union_sizes_f32x4, zeros_f32x4);
    nk_vf32x4_t divisors_f32x4 = vec_sel(union_sizes_f32x4, ones_f32x4, empty_union_mask_u32x4);

    // Reciprocal estimate plus one Newton-Raphson step: r' = r × (2 - d × r).
    nk_vf32x4_t estimate_f32x4 = vec_re(divisors_f32x4);
    nk_vf32x4_t twos_f32x4 = vec_splats(2.0f);
    nk_vf32x4_t correction_f32x4 = vec_sub(twos_f32x4, vec_mul(divisors_f32x4, estimate_f32x4));
    nk_vf32x4_t reciprocals_f32x4 = vec_mul(estimate_f32x4, correction_f32x4);

    // Distance = 1 - intersection / union, forced to zero where the union was empty.
    nk_vf32x4_t similarity_f32x4 = vec_mul(shared_counts_f32x4, reciprocals_f32x4);
    nk_vf32x4_t distance_f32x4 = vec_sub(ones_f32x4, similarity_f32x4);
    result->vf32x4 = vec_sel(distance_f32x4, zeros_f32x4, empty_union_mask_u32x4);
}
|
|
279
|
+
|
|
280
|
+
/** @brief Hamming from_dot: computes pop_a + pop_b - 2 × dot for 4 pairs (Power VSX). */
|
|
281
|
+
NK_INTERNAL void nk_hamming_u32x4_from_dot_powervsx_(nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops,
|
|
282
|
+
nk_b128_vec_t *results) {
|
|
283
|
+
nk_vu32x4_t dots_u32x4 = dots.vu32x4;
|
|
284
|
+
nk_vu32x4_t query_u32x4 = vec_splats(query_pop);
|
|
285
|
+
nk_vu32x4_t target_u32x4 = target_pops.vu32x4;
|
|
286
|
+
nk_vu32x4_t two_dots_u32x4 = vec_add(dots_u32x4, dots_u32x4);
|
|
287
|
+
results->vu32x4 = vec_sub(vec_add(query_u32x4, target_u32x4), two_dots_u32x4);
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
/** @brief Jaccard from_dot: computes 1 - dot / (pop_a + pop_b - dot) for 4 pairs (Power VSX). */
|
|
291
|
+
NK_INTERNAL void nk_jaccard_f32x4_from_dot_powervsx_(nk_b128_vec_t dots, nk_u32_t query_pop, nk_b128_vec_t target_pops,
|
|
292
|
+
nk_b128_vec_t *results) {
|
|
293
|
+
nk_vf32x4_t dot_f32x4 = vec_ctf(dots.vu32x4, 0);
|
|
294
|
+
nk_vf32x4_t query_f32x4 = vec_splats((nk_f32_t)query_pop);
|
|
295
|
+
nk_vf32x4_t target_f32x4 = vec_ctf(target_pops.vu32x4, 0);
|
|
296
|
+
nk_vf32x4_t union_f32x4 = vec_sub(vec_add(query_f32x4, target_f32x4), dot_f32x4);
|
|
297
|
+
|
|
298
|
+
nk_vf32x4_t one_f32x4 = vec_splats(1.0f);
|
|
299
|
+
nk_vf32x4_t zero_f32x4 = vec_splats(0.0f);
|
|
300
|
+
nk_vu32x4_t zero_union_mask_u32x4 = (nk_vu32x4_t)vec_cmpeq(union_f32x4, zero_f32x4);
|
|
301
|
+
nk_vf32x4_t safe_union_f32x4 = vec_sel(union_f32x4, one_f32x4, zero_union_mask_u32x4);
|
|
302
|
+
|
|
303
|
+
// Fast reciprocal with Newton-Raphson
|
|
304
|
+
nk_vf32x4_t union_reciprocal_f32x4 = vec_re(safe_union_f32x4);
|
|
305
|
+
nk_vf32x4_t two_f32x4 = vec_splats(2.0f);
|
|
306
|
+
union_reciprocal_f32x4 = vec_mul(union_reciprocal_f32x4,
|
|
307
|
+
vec_sub(two_f32x4, vec_mul(safe_union_f32x4, union_reciprocal_f32x4)));
|
|
308
|
+
|
|
309
|
+
nk_vf32x4_t ratio_f32x4 = vec_mul(dot_f32x4, union_reciprocal_f32x4);
|
|
310
|
+
nk_vf32x4_t jaccard_f32x4 = vec_sub(one_f32x4, ratio_f32x4);
|
|
311
|
+
results->vf32x4 = vec_sel(jaccard_f32x4, zero_f32x4, zero_union_mask_u32x4);
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
#if defined(__clang__)
|
|
315
|
+
#pragma clang attribute pop
|
|
316
|
+
#elif defined(__GNUC__)
|
|
317
|
+
#pragma GCC pop_options
|
|
318
|
+
#endif
|
|
319
|
+
|
|
320
|
+
#if defined(__cplusplus)
|
|
321
|
+
} // extern "C"
|
|
322
|
+
#endif
|
|
323
|
+
|
|
324
|
+
#endif // NK_TARGET_POWERVSX
|
|
325
|
+
#endif // NK_TARGET_POWER_
|
|
326
|
+
#endif // NK_SET_POWERVSX_H
|
|
@@ -50,7 +50,7 @@
|
|
|
50
50
|
extern "C" {
|
|
51
51
|
#endif
|
|
52
52
|
|
|
53
|
-
#pragma region
|
|
53
|
+
#pragma region Binary Sets
|
|
54
54
|
|
|
55
55
|
/**
|
|
56
56
|
* @brief Compute byte-level popcount using arithmetic SWAR.
|
|
@@ -142,9 +142,9 @@ NK_PUBLIC void nk_jaccard_u1_rvv(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size
|
|
|
142
142
|
*result = (union_count_u32 != 0) ? 1.0f - (nk_f32_t)intersection_count_u32 / (nk_f32_t)union_count_u32 : 0.0f;
|
|
143
143
|
}
|
|
144
144
|
|
|
145
|
-
#pragma endregion
|
|
145
|
+
#pragma endregion Binary Sets
|
|
146
146
|
|
|
147
|
-
#pragma region
|
|
147
|
+
#pragma region Integer Sets
|
|
148
148
|
|
|
149
149
|
NK_PUBLIC void nk_hamming_u8_rvv(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
|
|
150
150
|
vuint32m1_t difference_count_u32m1 = __riscv_vmv_v_x_u32m1(0, 1);
|
|
@@ -209,7 +209,7 @@ NK_PUBLIC void nk_jaccard_u16_rvv(nk_u16_t const *a, nk_u16_t const *b, nk_size_
|
|
|
209
209
|
*result = (n != 0) ? 1.0f - (nk_f32_t)match_count_u32 / (nk_f32_t)n : 0.0f;
|
|
210
210
|
}
|
|
211
211
|
|
|
212
|
-
#pragma endregion
|
|
212
|
+
#pragma endregion Integer Sets
|
|
213
213
|
|
|
214
214
|
#if defined(__cplusplus)
|
|
215
215
|
} // extern "C"
|
|
@@ -35,7 +35,7 @@
|
|
|
35
35
|
extern "C" {
|
|
36
36
|
#endif
|
|
37
37
|
|
|
38
|
-
#pragma region
|
|
38
|
+
#pragma region Binary Sets
|
|
39
39
|
|
|
40
40
|
NK_PUBLIC void nk_hamming_u1_serial(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n, nk_u32_t *result) {
|
|
41
41
|
nk_size_t n_bytes = nk_size_divide_round_up_(n, NK_BITS_PER_BYTE);
|
|
@@ -52,9 +52,9 @@ NK_PUBLIC void nk_jaccard_u1_serial(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_s
|
|
|
52
52
|
*result = (union_count != 0) ? 1.0f - (nk_f32_t)intersection_count / (nk_f32_t)union_count : 0.0f;
|
|
53
53
|
}
|
|
54
54
|
|
|
55
|
-
#pragma endregion
|
|
55
|
+
#pragma endregion Binary Sets
|
|
56
56
|
|
|
57
|
-
#pragma region
|
|
57
|
+
#pragma region Integer Sets
|
|
58
58
|
|
|
59
59
|
NK_PUBLIC void nk_jaccard_u32_serial(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
60
60
|
nk_u32_t intersection_count = 0;
|
|
@@ -74,9 +74,9 @@ NK_PUBLIC void nk_jaccard_u16_serial(nk_u16_t const *a, nk_u16_t const *b, nk_si
|
|
|
74
74
|
*result = (n != 0) ? 1.0f - (nk_f32_t)matches / (nk_f32_t)n : 0.0f;
|
|
75
75
|
}
|
|
76
76
|
|
|
77
|
-
#pragma endregion
|
|
77
|
+
#pragma endregion Integer Sets
|
|
78
78
|
|
|
79
|
-
#pragma region
|
|
79
|
+
#pragma region Stateful Streaming
|
|
80
80
|
|
|
81
81
|
typedef struct nk_jaccard_u1x128_state_serial_t {
|
|
82
82
|
nk_u64_t intersection_count;
|
|
@@ -165,7 +165,7 @@ NK_INTERNAL void nk_jaccard_f32x4_from_dot_serial_(nk_b128_vec_t dots, nk_u32_t
|
|
|
165
165
|
}
|
|
166
166
|
}
|
|
167
167
|
|
|
168
|
-
#pragma endregion
|
|
168
|
+
#pragma endregion Stateful Streaming
|
|
169
169
|
|
|
170
170
|
#if defined(__cplusplus)
|
|
171
171
|
} // extern "C"
|