numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief C++ bindings for set-intersection kernels.
|
|
3
|
+
* @file include/numkong/set.hpp
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date February 5, 2026
|
|
6
|
+
*/
|
|
7
|
+
#ifndef NK_SET_HPP
|
|
8
|
+
#define NK_SET_HPP
|
|
9
|
+
|
|
10
|
+
#include <cstdint>
|
|
11
|
+
#include <type_traits>
|
|
12
|
+
|
|
13
|
+
#include "numkong/set.h"
|
|
14
|
+
#include "numkong/sets.h"
|
|
15
|
+
|
|
16
|
+
#include "numkong/types.hpp"
|
|
17
|
+
|
|
18
|
+
namespace ashvardanian::numkong {
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* @brief Hamming distance: Σ(aᵢ ⊕ bᵢ)
|
|
22
|
+
* @param[in] a,b Input vectors
|
|
23
|
+
* @param[in] d Number of dimensions
|
|
24
|
+
* @param[out] r Pointer to output count
|
|
25
|
+
*
|
|
26
|
+
* @tparam in_type_ Input vector element type (u1x8_t or u8_t)
|
|
27
|
+
* @tparam result_type_ Accumulator type, defaults to `in_type_::hamming_result_t`
|
|
28
|
+
* @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
|
|
29
|
+
*/
|
|
30
|
+
template <numeric_dtype in_type_, numeric_dtype result_type_ = typename in_type_::hamming_result_t,
|
|
31
|
+
allow_simd_t allow_simd_ = prefer_simd_k>
|
|
32
|
+
void hamming(in_type_ const *a, in_type_ const *b, std::size_t d, result_type_ *r) noexcept {
|
|
33
|
+
constexpr bool simd = allow_simd_ == prefer_simd_k &&
|
|
34
|
+
std::is_same_v<result_type_, typename in_type_::hamming_result_t>;
|
|
35
|
+
|
|
36
|
+
if constexpr (std::is_same_v<in_type_, u1x8_t> && simd) nk_hamming_u1(&a->raw_, &b->raw_, d, &r->raw_);
|
|
37
|
+
else if constexpr (std::is_same_v<in_type_, u8_t> && simd) nk_hamming_u8(&a->raw_, &b->raw_, d, &r->raw_);
|
|
38
|
+
else {
|
|
39
|
+
constexpr std::size_t dims_per_value = dimensions_per_value<in_type_>();
|
|
40
|
+
std::size_t n = divide_round_up(d, dims_per_value);
|
|
41
|
+
typename result_type_::raw_t count = 0;
|
|
42
|
+
for (std::size_t i = 0; i < n; i++) count += count_differences(a[i], b[i]);
|
|
43
|
+
*r = result_type_::from_raw(count);
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* @brief Jaccard distance: 1 − |A ∩ B| / |A ∪ B|
|
|
49
|
+
* @param[in] a,b Input vectors
|
|
50
|
+
* @param[in] d Number of dimensions
|
|
51
|
+
* @param[out] r Pointer to output distance
|
|
52
|
+
*
|
|
53
|
+
* For u1x8_t (bit vectors): uses popcount(AND) / popcount(OR)
|
|
54
|
+
* For u16_t/u32_t (element vectors): uses count of matching elements / total
|
|
55
|
+
*
|
|
56
|
+
* @tparam in_type_ Input vector element type (u1x8_t, u16_t, or u32_t)
|
|
57
|
+
* @tparam result_type_ Accumulator type, defaults to `in_type_::jaccard_result_t`
|
|
58
|
+
* @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
|
|
59
|
+
*/
|
|
60
|
+
template <numeric_dtype in_type_, numeric_dtype result_type_ = typename in_type_::jaccard_result_t,
|
|
61
|
+
allow_simd_t allow_simd_ = prefer_simd_k>
|
|
62
|
+
void jaccard(in_type_ const *a, in_type_ const *b, std::size_t d, result_type_ *r) noexcept {
|
|
63
|
+
constexpr bool simd = allow_simd_ == prefer_simd_k &&
|
|
64
|
+
std::is_same_v<result_type_, typename in_type_::jaccard_result_t>;
|
|
65
|
+
|
|
66
|
+
if constexpr (std::is_same_v<in_type_, u1x8_t> && simd) nk_jaccard_u1(&a->raw_, &b->raw_, d, &r->raw_);
|
|
67
|
+
else if constexpr (std::is_same_v<in_type_, u16_t> && simd) nk_jaccard_u16(&a->raw_, &b->raw_, d, &r->raw_);
|
|
68
|
+
else if constexpr (std::is_same_v<in_type_, u32_t> && simd) nk_jaccard_u32(&a->raw_, &b->raw_, d, &r->raw_);
|
|
69
|
+
else {
|
|
70
|
+
constexpr std::size_t dims_per_value = dimensions_per_value<in_type_>();
|
|
71
|
+
std::size_t n = divide_round_up(d, dims_per_value);
|
|
72
|
+
std::uint32_t intersection_count = 0, union_count = 0;
|
|
73
|
+
for (std::size_t i = 0; i < n; i++)
|
|
74
|
+
intersection_count += count_intersection(a[i], b[i]), union_count += count_union(a[i], b[i]);
|
|
75
|
+
if (union_count == 0) *r = result_type_();
|
|
76
|
+
else *r = result_type_(1) - result_type_(intersection_count) / result_type_(union_count);
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
} // namespace ashvardanian::numkong
|
|
81
|
+
|
|
82
|
+
#include "numkong/tensor.hpp"
|
|
83
|
+
|
|
84
|
+
namespace ashvardanian::numkong {
|
|
85
|
+
|
|
86
|
+
template <numeric_dtype in_type_, numeric_dtype result_type_ = typename in_type_::hamming_result_t,
|
|
87
|
+
allow_simd_t allow_simd_ = prefer_simd_k, std::size_t max_rank_a_, std::size_t max_rank_b_>
|
|
88
|
+
void hamming(tensor_view<in_type_, max_rank_a_> a, tensor_view<in_type_, max_rank_b_> b, std::size_t d,
|
|
89
|
+
result_type_ *r) noexcept {
|
|
90
|
+
hamming<in_type_, result_type_, allow_simd_>(a.data(), b.data(), d, r);
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
template <numeric_dtype in_type_, numeric_dtype result_type_ = typename in_type_::hamming_result_t,
|
|
94
|
+
allow_simd_t allow_simd_ = prefer_simd_k>
|
|
95
|
+
void hamming(vector_view<in_type_> a, vector_view<in_type_> b, std::size_t d, result_type_ *r) noexcept {
|
|
96
|
+
hamming<in_type_, result_type_, allow_simd_>(a.data(), b.data(), d, r);
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
template <numeric_dtype in_type_, numeric_dtype result_type_ = typename in_type_::jaccard_result_t,
|
|
100
|
+
allow_simd_t allow_simd_ = prefer_simd_k, std::size_t max_rank_a_, std::size_t max_rank_b_>
|
|
101
|
+
void jaccard(tensor_view<in_type_, max_rank_a_> a, tensor_view<in_type_, max_rank_b_> b, std::size_t d,
|
|
102
|
+
result_type_ *r) noexcept {
|
|
103
|
+
jaccard<in_type_, result_type_, allow_simd_>(a.data(), b.data(), d, r);
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
template <numeric_dtype in_type_, numeric_dtype result_type_ = typename in_type_::jaccard_result_t,
|
|
107
|
+
allow_simd_t allow_simd_ = prefer_simd_k>
|
|
108
|
+
void jaccard(vector_view<in_type_> a, vector_view<in_type_> b, std::size_t d, result_type_ *r) noexcept {
|
|
109
|
+
jaccard<in_type_, result_type_, allow_simd_>(a.data(), b.data(), d, r);
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
} // namespace ashvardanian::numkong
|
|
113
|
+
|
|
114
|
+
#endif // NK_SET_HPP
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# Batched Set Distances in NumKong
|
|
2
|
+
|
|
3
|
+
NumKong implements batched M×N Hamming and Jaccard distance matrices for binary vectors. The module reuses the dots u1 packing and GEMM infrastructure, converting popcount-of-AND dot products to set distances via precomputed norms.
|
|
4
|
+
|
|
5
|
+
Hamming distance from batched dot products:
|
|
6
|
+
|
|
7
|
+
```math
|
|
8
|
+
D_{ij} = \|A_i\|_1 + \|B_j\|_1 - 2 \cdot \text{dot}(A_i, B_j)
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Where dot = popcount(AND), measuring intersection size.
|
|
12
|
+
|
|
13
|
+
Jaccard distance from batched dot products:
|
|
14
|
+
|
|
15
|
+
```math
|
|
16
|
+
D_{ij} = 1 - \frac{\text{dot}(A_i, B_j)}{\|A_i\|_1 + \|B_j\|_1 - \text{dot}(A_i, B_j)}
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Reformulating as Python pseudocode:
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
import numpy as np
|
|
23
|
+
|
|
24
|
+
def hammings_packed(a: np.ndarray, b: np.ndarray) -> np.ndarray:
|
|
25
|
+
dots = np.array([[np.unpackbits(np.bitwise_and(ai, bj)).sum()
|
|
26
|
+
for bj in b] for ai in a])
|
|
27
|
+
a_pop = np.array([np.unpackbits(ai).sum() for ai in a])[:, None]
|
|
28
|
+
b_pop = np.array([np.unpackbits(bj).sum() for bj in b])[None, :]
|
|
29
|
+
return a_pop + b_pop - 2 * dots
|
|
30
|
+
|
|
31
|
+
def jaccards_packed(a: np.ndarray, b: np.ndarray) -> np.ndarray:
|
|
32
|
+
dots = np.array([[np.unpackbits(np.bitwise_and(ai, bj)).sum()
|
|
33
|
+
for bj in b] for ai in a])
|
|
34
|
+
a_pop = np.array([np.unpackbits(ai).sum() for ai in a])[:, None]
|
|
35
|
+
b_pop = np.array([np.unpackbits(bj).sum() for bj in b])[None, :]
|
|
36
|
+
union = a_pop + b_pop - dots
|
|
37
|
+
return np.where(union > 0, 1.0 - dots / union, 0.0)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Input & Output Types
|
|
41
|
+
|
|
42
|
+
| Input Type | Output Type | Description |
|
|
43
|
+
| ---------- | ----------- | -------------------------------------- |
|
|
44
|
+
| `u1` | `u32` | Binary Hamming distance, packed octets |
|
|
45
|
+
| `u1` | `f32` | Binary Jaccard distance, packed octets |
|
|
46
|
+
|
|
47
|
+
## Optimizations
|
|
48
|
+
|
|
49
|
+
### Hamming and Jaccard from Intersection Counts
|
|
50
|
+
|
|
51
|
+
`nk_hammings_packed_u1_serial`, `nk_hammings_packed_u1_haswell`, `nk_jaccards_packed_u1_serial`, `nk_jaccards_packed_u1_haswell` reuse the dots u1 GEMM output where each dot product $\text{dot}(a, b) = \text{popcount}(a \mathbin{\&} b) = |A \cap B|$ counts intersection bits.
|
|
52
|
+
The L1 norm of a binary vector is its popcount: $|A| = \text{popcount}(a) = \|a\|_1$.
|
|
53
|
+
By inclusion-exclusion, $|A \cup B| = |A| + |B| - |A \cap B|$.
|
|
54
|
+
Hamming distance counts positions where exactly one of the two vectors has the bit set: $D_H = |A| + |B| - 2|A \cap B| = \text{popcount}(a \oplus b)$.
|
|
55
|
+
Finalizer `nk_hamming_u32x4_from_dot_serial_` computes `pop_a + pop_b - 2 * dot` in pure UInt32 arithmetic — no division, no float conversion, no sqrt.
|
|
56
|
+
Jaccard distance: $D_J = 1 - \frac{|A \cap B|}{|A \cup B|} = 1 - \frac{\text{dot}}{\text{pop}_a + \text{pop}_b - \text{dot}}$.
|
|
57
|
+
Finalizer `nk_jaccard_f32x4_from_dot_serial_` requires UInt32 → Float32 cast plus Float32 division (~11cy latency on Haswell), making it ~3× more expensive per element than Hamming's integer subtraction chain.
|
|
58
|
+
Per-column popcount norms ($\|a\|_1$, $\|b\|_1$) are precomputed during packing and stored in packed buffer metadata, avoiding per-pair recomputation.
|
|
59
|
+
|
|
60
|
+
### SME Binary Outer-Product Accumulation
|
|
61
|
+
|
|
62
|
+
`nk_hammings_packed_u1_smebi32`, `nk_jaccards_packed_u1_smebi32` use the `BMOPA` instruction which computes $\text{popcount}(\text{XNOR}(a, b))$ — counting _matching_ bits in a single outer-product operation over 16×16 output tiles with 512-bit depth chunks.
|
|
63
|
+
This is fundamentally different from the AND+POPCNT used by scalar/NEON/x86 kernels, which count _intersection_ bits.
|
|
64
|
+
Hamming from `BMOPA`: $D_H = \text{depth\_bits} - \text{popcount}(\text{XNOR})$, since XNOR counts matching bits and the remaining bits are exactly the differing ones, whose count (the XOR popcount) is the Hamming distance directly — no per-vector norm correction needed.
|
|
65
|
+
Jaccard from `BMOPA`: must convert matching-bit counts to intersection via $|A \cap B| = (\text{popcount}(\text{XNOR}) - (\text{depth\_bits} - |A| - |B|)) / 2$, then apply the Jaccard formula — more arithmetic than the AND-based path.
|
|
66
|
+
Streaming mode overhead (~50–100 cycles for `SMSTART`/`SMSTOP`) is amortized across the full M×N output.
|
|
67
|
+
|
|
68
|
+
## Performance
|
|
69
|
+
|
|
70
|
+
The following performance tables are produced by manually re-running the included internal tools `nk_test` and `nk_bench`, which measure both accuracy and throughput at different input shapes.
|
|
71
|
+
The input size is controlled by `NK_MATRIX_HEIGHT`, `NK_MATRIX_WIDTH`, and `NK_MATRIX_DEPTH` environment variables, all set to the same value for batched set operations over square matrices.
|
|
72
|
+
Columns show throughput for 256³, 1024³, and 4096³ configurations.
|
|
73
|
+
The throughput is measured in GSO/s as Giga Scalar Operations per Second.
|
|
74
|
+
Accuracy is reported where applicable as exact distance in the result representation; floating Jaccard rows are shown as mean ULP (units in last place).
|
|
75
|
+
Each kernel runs for at least 20 seconds per configuration.
|
|
76
|
+
Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
|
|
77
|
+
Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
|
|
78
|
+
|
|
79
|
+
### Intel Sapphire Rapids
|
|
80
|
+
|
|
81
|
+
#### Native
|
|
82
|
+
|
|
83
|
+
| Kernel | 256³ | 1024³ | 4096³ |
|
|
84
|
+
| :--------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
85
|
+
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
86
|
+
| `nk_hammings_packed_u1_serial` | 109 gso/s | 162 gso/s | 284 gso/s |
|
|
87
|
+
| `nk_hammings_symmetric_u1_serial` | 39.7 gso/s | 133 gso/s | 325 gso/s |
|
|
88
|
+
| `nk_jaccards_packed_u1_serial` | 54.8 gso/s, 0 ulp | 128 gso/s, 0 ulp | 259 gso/s, 0 ulp |
|
|
89
|
+
| `nk_jaccards_symmetric_u1_serial` | 29.8 gso/s, 0 ulp | 110 gso/s, 0 ulp | 292 gso/s, 0 ulp |
|
|
90
|
+
| `nk_hammings_packed_u1_haswell` | 100 gso/s | 126 gso/s | 168 gso/s |
|
|
91
|
+
| `nk_hammings_symmetric_u1_haswell` | 58.5 gso/s | 132 gso/s | 328 gso/s |
|
|
92
|
+
| `nk_jaccards_packed_u1_haswell` | 84.2 gso/s, 0.3 ulp | 124 gso/s, 0.3 ulp | 165 gso/s, 0.3 ulp |
|
|
93
|
+
| `nk_jaccards_symmetric_u1_haswell` | 57.6 gso/s, 0.3 ulp | 131 gso/s, 0.3 ulp | 324 gso/s, 0.3 ulp |
|
|
94
|
+
| `nk_hammings_packed_u1_icelake` | 110 gso/s | 340 gso/s | 604 gso/s |
|
|
95
|
+
| `nk_hammings_symmetric_u1_icelake` | 76.2 gso/s | 258 gso/s | 1,040 gso/s |
|
|
96
|
+
| `nk_jaccards_packed_u1_icelake` | 89.2 gso/s, 0.3 ulp | 312 gso/s, 0.3 ulp | 601 gso/s, 0.3 ulp |
|
|
97
|
+
| `nk_jaccards_symmetric_u1_icelake` | 66.9 gso/s, 0.3 ulp | 260 gso/s, 0.3 ulp | 965 gso/s, 0.3 ulp |
|
|
98
|
+
|
|
99
|
+
#### WASM
|
|
100
|
+
|
|
101
|
+
Measured with Wasmtime v42 (Cranelift backend).
|
|
102
|
+
|
|
103
|
+
| Kernel | 256³ | 1024³ | 4096³ |
|
|
104
|
+
| :------------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
105
|
+
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
106
|
+
| `nk_hammings_packed_u1_serial` | 43.7 gso/s | 68.0 gso/s | 74.7 gso/s |
|
|
107
|
+
| `nk_hammings_packed_u1_v128relaxed` | 75.3 gso/s | 134 gso/s | 144 gso/s |
|
|
108
|
+
| `nk_hammings_symmetric_u1_serial` | 3.72 gso/s | 13.5 gso/s | 41.0 gso/s |
|
|
109
|
+
| `nk_hammings_symmetric_u1_v128relaxed` | 3.64 gso/s | 13.9 gso/s | 42.2 gso/s |
|
|
110
|
+
| `nk_jaccards_packed_u1_serial` | 33.7 gso/s, 0 ulp | 61.3 gso/s, 0 ulp | 73.2 gso/s, 0 ulp |
|
|
111
|
+
| `nk_jaccards_packed_u1_v128relaxed` | 66.4 gso/s, 0 ulp | 129 gso/s, 0 ulp | 143 gso/s, 0 ulp |
|
|
112
|
+
| `nk_jaccards_symmetric_u1_serial` | 3.57 gso/s, 0 ulp | 13.3 gso/s, 0 ulp | 40.6 gso/s, 0 ulp |
|
|
113
|
+
| `nk_jaccards_symmetric_u1_v128relaxed` | 3.65 gso/s, 0 ulp | 13.9 gso/s, 0 ulp | 42.2 gso/s, 0 ulp |
|
|
114
|
+
|
|
115
|
+
### Apple M4
|
|
116
|
+
|
|
117
|
+
#### Native
|
|
118
|
+
|
|
119
|
+
| Kernel | 256³ | 1024³ | 4096³ |
|
|
120
|
+
| :--------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
121
|
+
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
122
|
+
| `nk_hammings_packed_u1_serial` | 154 gso/s | 204 gso/s | 221 gso/s |
|
|
123
|
+
| `nk_hammings_symmetric_u1_serial` | 101 gso/s | 159 gso/s | 172 gso/s |
|
|
124
|
+
| `nk_jaccards_packed_u1_serial` | 116 gso/s, 0 ulp | 203 gso/s, 0 ulp | 232 gso/s, 0 ulp |
|
|
125
|
+
| `nk_jaccards_symmetric_u1_serial` | 86.3 gso/s, 0 ulp | 157 gso/s, 0 ulp | 176 gso/s, 0 ulp |
|
|
126
|
+
| `nk_hammings_packed_u1_neon` | 315 gso/s | 428 gso/s | 481 gso/s |
|
|
127
|
+
| `nk_hammings_symmetric_u1_neon` | 132 gso/s | 240 gso/s | 294 gso/s |
|
|
128
|
+
| `nk_jaccards_packed_u1_neon` | 266 gso/s, 8.6 ulp | 416 gso/s, 8.6 ulp | 488 gso/s, 8.6 ulp |
|
|
129
|
+
| `nk_jaccards_symmetric_u1_neon` | 129 gso/s, 8.5 ulp | 242 gso/s, 8.5 ulp | 294 gso/s, 8.5 ulp |
|
|
130
|
+
| `nk_hammings_packed_u1_smebi32` | 1,420 gso/s | 2,928 gso/s | 4,027 gso/s |
|
|
131
|
+
| `nk_hammings_symmetric_u1_smebi32` | 629 gso/s | 1,438 gso/s | 1,111 gso/s |
|
|
132
|
+
| `nk_jaccards_packed_u1_smebi32` | 273 gso/s, 0 ulp | 1,381 gso/s, 0 ulp | 3,280 gso/s, 0 ulp |
|
|
133
|
+
| `nk_jaccards_symmetric_u1_smebi32` | 45.1 gso/s, 0 ulp | 267 gso/s, 0 ulp | 618 gso/s, 0 ulp |
|
|
134
|
+
|
|
135
|
+
#### WASM
|
|
136
|
+
|
|
137
|
+
Measured with Wasmtime v42 (Cranelift backend).
|
|
138
|
+
|
|
139
|
+
| Kernel | 256³ | 1024³ | 4096³ |
|
|
140
|
+
| :------------------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
141
|
+
| __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
142
|
+
| `nk_hammings_packed_u1_serial` | 35.2 gso/s | 47.6 gso/s | 52.8 gso/s |
|
|
143
|
+
| `nk_hammings_symmetric_u1_serial` | 25.4 gso/s | 51.5 gso/s | 129 gso/s |
|
|
144
|
+
| `nk_jaccards_packed_u1_serial` | 30.9 gso/s, 0 ulp | 46.0 gso/s, 0 ulp | 52.7 gso/s, 0 ulp |
|
|
145
|
+
| `nk_jaccards_symmetric_u1_serial` | 22.8 gso/s, 0 ulp | 48.9 gso/s, 0 ulp | 123 gso/s, 0 ulp |
|
|
146
|
+
| `nk_hammings_packed_u1_v128relaxed` | 102 gso/s | 144 gso/s | 160 gso/s |
|
|
147
|
+
| `nk_hammings_symmetric_u1_v128relaxed` | 28.2 gso/s | 61.7 gso/s | 175 gso/s |
|
|
148
|
+
| `nk_jaccards_packed_u1_v128relaxed` | 91.2 gso/s, 0 ulp | 140 gso/s, 0 ulp | 172 gso/s, 0 ulp |
|
|
149
|
+
| `nk_jaccards_symmetric_u1_v128relaxed` | 26.9 gso/s, 0 ulp | 60.3 gso/s, 0 ulp | 177 gso/s, 0 ulp |
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief Batched Set Operations for Haswell (AVX2).
|
|
3
|
+
* @file include/numkong/sets/haswell.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date February 23, 2026
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/sets.h
|
|
8
|
+
*/
|
|
9
|
+
#ifndef NK_SETS_HASWELL_H
|
|
10
|
+
#define NK_SETS_HASWELL_H
|
|
11
|
+
|
|
12
|
+
#if NK_TARGET_X86_
|
|
13
|
+
#if NK_TARGET_HASWELL
|
|
14
|
+
|
|
15
|
+
#include "numkong/set/haswell.h"
|
|
16
|
+
#include "numkong/dots/haswell.h"
|
|
17
|
+
|
|
18
|
+
#if defined(__cplusplus)
|
|
19
|
+
extern "C" {
|
|
20
|
+
#endif
|
|
21
|
+
|
|
22
|
+
#if defined(__clang__)
|
|
23
|
+
#pragma clang attribute push(__attribute__((target("avx2,f16c,fma,bmi,bmi2,popcnt"))), apply_to = function)
|
|
24
|
+
#elif defined(__GNUC__)
|
|
25
|
+
#pragma GCC push_options
|
|
26
|
+
#pragma GCC target("avx2", "f16c", "fma", "bmi", "bmi2", "popcnt")
|
|
27
|
+
#endif
|
|
28
|
+
|
|
29
|
+
/*  Packed Hamming kernel for `u1` inputs, Haswell flavor.
 *  The generator macro `nk_define_cross_normalized_packed_` is defined outside this header;
 *  presumably it emits `nk_hammings_packed_u1_haswell`, built on `nk_dots_packed_u1_haswell`
 *  with `nk_hamming_u32x4_from_dot_haswell_` as the dot-to-distance step - TODO confirm against the macro.
 *  `dimensions_per_value` is 8, which suggests each `u1x8` value packs eight 1-bit dimensions.
 */
nk_define_cross_normalized_packed_(hamming, u1, haswell, u1x8, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
|
|
30
|
+
                                   nk_dots_packed_u1_haswell, nk_hamming_u32x4_from_dot_haswell_,
|
|
31
|
+
                                   nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_haswell_,
|
|
32
|
+
                                   nk_store_b128_haswell_, nk_partial_store_b32x4_haswell_, /*dimensions_per_value=*/8)
|
|
33
|
+
|
|
34
|
+
/*  Packed Jaccard kernel for `u1` inputs, Haswell flavor.
 *  Passes the same `nk_dots_packed_u1_haswell` kernel and Haswell load/store helpers as the Hamming
 *  instantiation, but uses `nk_jaccard_f32x4_from_dot_haswell_` as the finalizer, and the argument
 *  after `norm_value_type` is `f32` instead of `u32` - presumably the result type; confirm against the macro.
 */
nk_define_cross_normalized_packed_(jaccard, u1, haswell, u1x8, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
|
|
35
|
+
                                   nk_dots_packed_u1_haswell, nk_jaccard_f32x4_from_dot_haswell_,
|
|
36
|
+
                                   nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_haswell_,
|
|
37
|
+
                                   nk_store_b128_haswell_, nk_partial_store_b32x4_haswell_, /*dimensions_per_value=*/8)
|
|
38
|
+
|
|
39
|
+
/*  Symmetric Hamming kernel for `u1` inputs, Haswell flavor.
 *  Uses `nk_define_cross_normalized_symmetric_` (defined outside this header) with
 *  `nk_dots_symmetric_u1_haswell` as the underlying kernel. Unlike the packed variant, `u1x8`
 *  is passed once - presumably because both operands share one input type; TODO confirm.
 */
nk_define_cross_normalized_symmetric_(hamming, u1, haswell, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
|
|
40
|
+
                                      nk_dots_symmetric_u1_haswell, nk_hamming_u32x4_from_dot_haswell_,
|
|
41
|
+
                                      nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_haswell_,
|
|
42
|
+
                                      nk_store_b128_haswell_, nk_partial_store_b32x4_haswell_,
|
|
43
|
+
                                      /*dimensions_per_value=*/8)
|
|
44
|
+
|
|
45
|
+
/*  Symmetric Jaccard kernel for `u1` inputs, Haswell flavor.
 *  Same `nk_dots_symmetric_u1_haswell` kernel and Haswell load/store helpers as the symmetric
 *  Hamming instantiation; the finalizer is `nk_jaccard_f32x4_from_dot_haswell_` and the argument
 *  after `norm_value_type` is `f32` - presumably the result type; confirm against the macro.
 */
nk_define_cross_normalized_symmetric_(jaccard, u1, haswell, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
|
|
46
|
+
                                      nk_dots_symmetric_u1_haswell, nk_jaccard_f32x4_from_dot_haswell_,
|
|
47
|
+
                                      nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_haswell_,
|
|
48
|
+
                                      nk_store_b128_haswell_, nk_partial_store_b32x4_haswell_,
|
|
49
|
+
                                      /*dimensions_per_value=*/8)
|
|
50
|
+
|
|
51
|
+
#if defined(__clang__)
|
|
52
|
+
#pragma clang attribute pop
|
|
53
|
+
#elif defined(__GNUC__)
|
|
54
|
+
#pragma GCC pop_options
|
|
55
|
+
#endif
|
|
56
|
+
|
|
57
|
+
#if defined(__cplusplus)
|
|
58
|
+
} // extern "C"
|
|
59
|
+
#endif
|
|
60
|
+
|
|
61
|
+
#endif // NK_TARGET_HASWELL
|
|
62
|
+
#endif // NK_TARGET_X86_
|
|
63
|
+
#endif // NK_SETS_HASWELL_H
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
 * @brief Batched Set Operations for Ice Lake (AVX-512 VNNI/VPOPCNTDQ).
|
|
3
|
+
* @file include/numkong/sets/icelake.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date February 23, 2026
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/sets.h
|
|
8
|
+
*/
|
|
9
|
+
#ifndef NK_SETS_ICELAKE_H
|
|
10
|
+
#define NK_SETS_ICELAKE_H
|
|
11
|
+
|
|
12
|
+
#if NK_TARGET_X86_
|
|
13
|
+
#if NK_TARGET_ICELAKE
|
|
14
|
+
|
|
15
|
+
#include "numkong/set/icelake.h"
|
|
16
|
+
#include "numkong/dots/icelake.h"
|
|
17
|
+
|
|
18
|
+
#if defined(__cplusplus)
|
|
19
|
+
extern "C" {
|
|
20
|
+
#endif
|
|
21
|
+
|
|
22
|
+
#if defined(__clang__)
|
|
23
|
+
#pragma clang attribute push( \
|
|
24
|
+
__attribute__((target("avx2,avx512f,avx512vl,avx512bw,avx512dq,avx512vnni,avx512vpopcntdq,f16c,fma,bmi,bmi2"))), \
|
|
25
|
+
apply_to = function)
|
|
26
|
+
#elif defined(__GNUC__)
|
|
27
|
+
#pragma GCC push_options
|
|
28
|
+
#pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vnni", "avx512vpopcntdq", "f16c", \
|
|
29
|
+
"fma", "bmi", "bmi2")
|
|
30
|
+
#endif
|
|
31
|
+
|
|
32
|
+
/*  Packed Hamming kernel for `u1` inputs, Ice Lake flavor.
 *  Built on `nk_dots_packed_u1_icelake` with `nk_hamming_u32x4_from_dot_icelake_` as the finalizer.
 *  Note the helper mix: full 128-bit load/store use the `_haswell_` helpers, while the partial
 *  load/store use the `_skylake_` ones; no Ice Lake-specific load/store helpers are passed.
 *  NOTE(review): where those helpers are declared is not visible from this header - confirm they are
 *  reachable through the two includes above.
 */
nk_define_cross_normalized_packed_(hamming, u1, icelake, u1x8, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
|
|
33
|
+
                                   nk_dots_packed_u1_icelake, nk_hamming_u32x4_from_dot_icelake_,
|
|
34
|
+
                                   nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_skylake_,
|
|
35
|
+
                                   nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_, /*dimensions_per_value=*/8)
|
|
36
|
+
|
|
37
|
+
/*  Packed Jaccard kernel for `u1` inputs, Ice Lake flavor.
 *  Built on `nk_dots_packed_u1_icelake` with `nk_jaccard_f32x4_from_dot_icelake_` as the finalizer;
 *  the argument after `norm_value_type` is `f32` - presumably the result type; confirm against the macro.
 *  Full 128-bit load/store use the `_haswell_` helpers, partial load/store the `_skylake_` ones.
 */
nk_define_cross_normalized_packed_(jaccard, u1, icelake, u1x8, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
|
|
38
|
+
                                   nk_dots_packed_u1_icelake, nk_jaccard_f32x4_from_dot_icelake_,
|
|
39
|
+
                                   nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_skylake_,
|
|
40
|
+
                                   nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_, /*dimensions_per_value=*/8)
|
|
41
|
+
|
|
42
|
+
/*  Symmetric Hamming kernel for `u1` inputs, Ice Lake flavor.
 *  Built on `nk_dots_symmetric_u1_icelake` with `nk_hamming_u32x4_from_dot_icelake_` as the finalizer.
 *  `u1x8` is passed once here (the packed variant passes it twice) - presumably both operands share
 *  one input type; TODO confirm against `nk_define_cross_normalized_symmetric_`.
 *  Full 128-bit load/store use the `_haswell_` helpers, partial load/store the `_skylake_` ones.
 */
nk_define_cross_normalized_symmetric_(hamming, u1, icelake, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
|
|
43
|
+
                                      nk_dots_symmetric_u1_icelake, nk_hamming_u32x4_from_dot_icelake_,
|
|
44
|
+
                                      nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_skylake_,
|
|
45
|
+
                                      nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_,
|
|
46
|
+
                                      /*dimensions_per_value=*/8)
|
|
47
|
+
|
|
48
|
+
/*  Symmetric Jaccard kernel for `u1` inputs, Ice Lake flavor.
 *  Built on `nk_dots_symmetric_u1_icelake` with `nk_jaccard_f32x4_from_dot_icelake_` as the finalizer;
 *  the argument after `norm_value_type` is `f32` - presumably the result type; confirm against the macro.
 *  Full 128-bit load/store use the `_haswell_` helpers, partial load/store the `_skylake_` ones.
 */
nk_define_cross_normalized_symmetric_(jaccard, u1, icelake, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
|
|
49
|
+
                                      nk_dots_symmetric_u1_icelake, nk_jaccard_f32x4_from_dot_icelake_,
|
|
50
|
+
                                      nk_dots_reduce_sum_u1_, nk_load_b128_haswell_, nk_partial_load_b32x4_skylake_,
|
|
51
|
+
                                      nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_,
|
|
52
|
+
                                      /*dimensions_per_value=*/8)
|
|
53
|
+
|
|
54
|
+
#if defined(__clang__)
|
|
55
|
+
#pragma clang attribute pop
|
|
56
|
+
#elif defined(__GNUC__)
|
|
57
|
+
#pragma GCC pop_options
|
|
58
|
+
#endif
|
|
59
|
+
|
|
60
|
+
#if defined(__cplusplus)
|
|
61
|
+
} // extern "C"
|
|
62
|
+
#endif
|
|
63
|
+
|
|
64
|
+
#endif // NK_TARGET_ICELAKE
|
|
65
|
+
#endif // NK_TARGET_X86_
|
|
66
|
+
#endif // NK_SETS_ICELAKE_H
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief Batched Set Operations for NEON.
|
|
3
|
+
* @file include/numkong/sets/neon.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date February 23, 2026
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/sets.h
|
|
8
|
+
*/
|
|
9
|
+
#ifndef NK_SETS_NEON_H
|
|
10
|
+
#define NK_SETS_NEON_H
|
|
11
|
+
|
|
12
|
+
#if NK_TARGET_ARM_
|
|
13
|
+
#if NK_TARGET_NEON
|
|
14
|
+
|
|
15
|
+
#include "numkong/set/neon.h"
|
|
16
|
+
#include "numkong/dots/neon.h"
|
|
17
|
+
|
|
18
|
+
#if defined(__cplusplus)
|
|
19
|
+
extern "C" {
|
|
20
|
+
#endif
|
|
21
|
+
|
|
22
|
+
#if defined(__clang__)
|
|
23
|
+
#pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
|
|
24
|
+
#elif defined(__GNUC__)
|
|
25
|
+
#pragma GCC push_options
|
|
26
|
+
#pragma GCC target("arch=armv8-a+simd")
|
|
27
|
+
#endif
|
|
28
|
+
|
|
29
|
+
/*  Packed Hamming kernel for `u1` inputs, NEON flavor.
 *  Built on `nk_dots_packed_u1_neon` with `nk_hamming_u32x4_from_dot_neon_` as the finalizer.
 *  All four load/store helpers passed here are the `_serial_` ones, not NEON-specific variants.
 *  `dimensions_per_value` is 8, which suggests each `u1x8` value packs eight 1-bit dimensions.
 */
nk_define_cross_normalized_packed_(hamming, u1, neon, u1x8, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
|
|
30
|
+
                                   nk_dots_packed_u1_neon, nk_hamming_u32x4_from_dot_neon_, nk_dots_reduce_sum_u1_,
|
|
31
|
+
                                   nk_load_b128_serial_, nk_partial_load_b32x4_serial_, nk_store_b128_serial_,
|
|
32
|
+
                                   nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8)
|
|
33
|
+
|
|
34
|
+
/*  Packed Jaccard kernel for `u1` inputs, NEON flavor.
 *  Built on `nk_dots_packed_u1_neon` with `nk_jaccard_f32x4_from_dot_neon_` as the finalizer;
 *  the argument after `norm_value_type` is `f32` - presumably the result type; confirm against the macro.
 *  Load/store helpers are the `_serial_` ones.
 */
nk_define_cross_normalized_packed_(jaccard, u1, neon, u1x8, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
|
|
35
|
+
                                   nk_dots_packed_u1_neon, nk_jaccard_f32x4_from_dot_neon_, nk_dots_reduce_sum_u1_,
|
|
36
|
+
                                   nk_load_b128_serial_, nk_partial_load_b32x4_serial_, nk_store_b128_serial_,
|
|
37
|
+
                                   nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8)
|
|
38
|
+
|
|
39
|
+
/*  Symmetric Hamming kernel for `u1` inputs, NEON flavor.
 *  Built on `nk_dots_symmetric_u1_neon` with `nk_hamming_u32x4_from_dot_neon_` as the finalizer.
 *  `u1x8` is passed once here (the packed variant passes it twice) - presumably both operands share
 *  one input type; TODO confirm against `nk_define_cross_normalized_symmetric_`.
 *  Load/store helpers are the `_serial_` ones.
 */
nk_define_cross_normalized_symmetric_(hamming, u1, neon, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
|
|
40
|
+
                                      nk_dots_symmetric_u1_neon, nk_hamming_u32x4_from_dot_neon_,
|
|
41
|
+
                                      nk_dots_reduce_sum_u1_, nk_load_b128_serial_, nk_partial_load_b32x4_serial_,
|
|
42
|
+
                                      nk_store_b128_serial_, nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8)
|
|
43
|
+
|
|
44
|
+
/*  Symmetric Jaccard kernel for `u1` inputs, NEON flavor.
 *  Built on `nk_dots_symmetric_u1_neon` with `nk_jaccard_f32x4_from_dot_neon_` as the finalizer;
 *  the argument after `norm_value_type` is `f32` - presumably the result type; confirm against the macro.
 *  Load/store helpers are the `_serial_` ones.
 */
nk_define_cross_normalized_symmetric_(jaccard, u1, neon, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
|
|
45
|
+
                                      nk_dots_symmetric_u1_neon, nk_jaccard_f32x4_from_dot_neon_,
|
|
46
|
+
                                      nk_dots_reduce_sum_u1_, nk_load_b128_serial_, nk_partial_load_b32x4_serial_,
|
|
47
|
+
                                      nk_store_b128_serial_, nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8)
|
|
48
|
+
|
|
49
|
+
#if defined(__clang__)
|
|
50
|
+
#pragma clang attribute pop
|
|
51
|
+
#elif defined(__GNUC__)
|
|
52
|
+
#pragma GCC pop_options
|
|
53
|
+
#endif
|
|
54
|
+
|
|
55
|
+
#if defined(__cplusplus)
|
|
56
|
+
} // extern "C"
|
|
57
|
+
#endif
|
|
58
|
+
|
|
59
|
+
#endif // NK_TARGET_NEON
|
|
60
|
+
#endif // NK_TARGET_ARM_
|
|
61
|
+
#endif // NK_SETS_NEON_H
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief Batched Set Operations for Serial (non-SIMD) Backends.
|
|
3
|
+
* @file include/numkong/sets/serial.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date February 23, 2026
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/sets.h
|
|
8
|
+
*/
|
|
9
|
+
#ifndef NK_SETS_SERIAL_H
|
|
10
|
+
#define NK_SETS_SERIAL_H
|
|
11
|
+
|
|
12
|
+
#include "numkong/set/serial.h"
|
|
13
|
+
#include "numkong/dots/serial.h"
|
|
14
|
+
|
|
15
|
+
#if defined(__cplusplus)
|
|
16
|
+
extern "C" {
|
|
17
|
+
#endif
|
|
18
|
+
|
|
19
|
+
/*  Packed Hamming kernel for `u1` inputs, serial (non-SIMD) flavor.
 *  The generator macro `nk_define_cross_normalized_packed_` is defined outside this header;
 *  presumably it emits `nk_hammings_packed_u1_serial`, built on `nk_dots_packed_u1_serial`
 *  with `nk_hamming_u32x4_from_dot_serial_` as the dot-to-distance step - TODO confirm against the macro.
 *  `dimensions_per_value` is 8, which suggests each `u1x8` value packs eight 1-bit dimensions.
 */
nk_define_cross_normalized_packed_(hamming, u1, serial, u1x8, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
|
|
20
|
+
                                   nk_dots_packed_u1_serial, nk_hamming_u32x4_from_dot_serial_, nk_dots_reduce_sum_u1_,
|
|
21
|
+
                                   nk_load_b128_serial_, nk_partial_load_b32x4_serial_, nk_store_b128_serial_,
|
|
22
|
+
                                   nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8)
|
|
23
|
+
|
|
24
|
+
/*  Packed Jaccard kernel for `u1` inputs, serial (non-SIMD) flavor.
 *  Built on `nk_dots_packed_u1_serial` with `nk_jaccard_f32x4_from_dot_serial_` as the finalizer;
 *  the argument after `norm_value_type` is `f32` - presumably the result type; confirm against the macro.
 */
nk_define_cross_normalized_packed_(jaccard, u1, serial, u1x8, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
|
|
25
|
+
                                   nk_dots_packed_u1_serial, nk_jaccard_f32x4_from_dot_serial_, nk_dots_reduce_sum_u1_,
|
|
26
|
+
                                   nk_load_b128_serial_, nk_partial_load_b32x4_serial_, nk_store_b128_serial_,
|
|
27
|
+
                                   nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8)
|
|
28
|
+
|
|
29
|
+
/*  Symmetric Hamming kernel for `u1` inputs, serial (non-SIMD) flavor.
 *  Built on `nk_dots_symmetric_u1_serial` with `nk_hamming_u32x4_from_dot_serial_` as the finalizer.
 *  `u1x8` is passed once here (the packed variant passes it twice) - presumably both operands share
 *  one input type; TODO confirm against `nk_define_cross_normalized_symmetric_`.
 */
nk_define_cross_normalized_symmetric_(hamming, u1, serial, u1x8, u32, /*norm_value_type=*/u32, u32, nk_b128_vec_t,
|
|
30
|
+
                                      nk_dots_symmetric_u1_serial, nk_hamming_u32x4_from_dot_serial_,
|
|
31
|
+
                                      nk_dots_reduce_sum_u1_, nk_load_b128_serial_, nk_partial_load_b32x4_serial_,
|
|
32
|
+
                                      nk_store_b128_serial_, nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8)
|
|
33
|
+
|
|
34
|
+
/*  Symmetric Jaccard kernel for `u1` inputs, serial (non-SIMD) flavor.
 *  Built on `nk_dots_symmetric_u1_serial` with `nk_jaccard_f32x4_from_dot_serial_` as the finalizer;
 *  the argument after `norm_value_type` is `f32` - presumably the result type; confirm against the macro.
 */
nk_define_cross_normalized_symmetric_(jaccard, u1, serial, u1x8, u32, /*norm_value_type=*/u32, f32, nk_b128_vec_t,
|
|
35
|
+
                                      nk_dots_symmetric_u1_serial, nk_jaccard_f32x4_from_dot_serial_,
|
|
36
|
+
                                      nk_dots_reduce_sum_u1_, nk_load_b128_serial_, nk_partial_load_b32x4_serial_,
|
|
37
|
+
                                      nk_store_b128_serial_, nk_partial_store_b32x4_serial_, /*dimensions_per_value=*/8)
|
|
38
|
+
|
|
39
|
+
#if defined(__cplusplus)
|
|
40
|
+
} // extern "C"
|
|
41
|
+
#endif
|
|
42
|
+
|
|
43
|
+
#endif // NK_SETS_SERIAL_H
|