numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,113 @@
1
+ /**
2
+ * @brief C++ bindings for sparse-vector kernels.
3
+ * @file include/numkong/sparse.hpp
4
+ * @author Ash Vardanian
5
+ * @date February 5, 2026
6
+ */
7
+ #ifndef NK_SPARSE_HPP
8
+ #define NK_SPARSE_HPP
9
+
10
+ #include <cstdint>
11
+ #include <type_traits>
12
+
13
+ #include "numkong/sparse.h"
14
+
15
+ #include "numkong/types.hpp"
16
+
17
+ namespace ashvardanian::numkong {
18
+
19
+ /**
20
+ * @brief Count intersection of two sorted index arrays
21
+ * @param[in] a,b Sorted index arrays (ascending, unique elements)
22
+ * @param[in] a_length,b_length Number of elements in each array
23
+ * @param[out] count Output intersection count
24
+ *
25
+ * @tparam index_type_ Index type (u16_t, u32_t, u64_t)
26
+ * @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
27
+ */
28
+ template <numeric_dtype index_type_, allow_simd_t allow_simd_ = prefer_simd_k>
29
+ void sparse_intersect(index_type_ const *a, index_type_ const *b, std::size_t a_length, std::size_t b_length,
30
+ nk_size_t *count) noexcept {
31
+ constexpr bool simd = allow_simd_ == prefer_simd_k;
32
+
33
+ if constexpr (std::is_same_v<index_type_, u16_t> && simd)
34
+ nk_sparse_intersect_u16(&a->raw_, &b->raw_, a_length, b_length, nullptr, count);
35
+ else if constexpr (std::is_same_v<index_type_, u32_t> && simd)
36
+ nk_sparse_intersect_u32(&a->raw_, &b->raw_, a_length, b_length, nullptr, count);
37
+ else if constexpr (std::is_same_v<index_type_, u64_t> && simd)
38
+ nk_sparse_intersect_u64(&a->raw_, &b->raw_, a_length, b_length, nullptr, count);
39
+ // Scalar fallback
40
+ else {
41
+ nk_size_t c = 0;
42
+ std::size_t i = 0, j = 0;
43
+ while (i < a_length && j < b_length) {
44
+ if (a[i] < b[j]) i++;
45
+ else if (b[j] < a[i]) j++;
46
+ else c++, i++, j++;
47
+ }
48
+ *count = c;
49
+ }
50
+ }
51
+
52
+ /**
53
+ * @brief Sparse weighted dot product: Σ aₖ × bₖ over shared indices
54
+ * @param[in] a,b Sorted index arrays (ascending, unique elements)
55
+ * @param[in] a_weights,b_weights Weights corresponding to indices
56
+ * @param[in] a_length,b_length Number of elements in each array
57
+ * @param[out] product Output dot product
58
+ *
59
+ * @tparam index_type_ Index type (u16_t, u32_t, u64_t)
60
+ * @tparam weight_t Weight type (bf16_t for u16 indices, f32_t for u32 indices)
61
+ * @tparam result_type_ Result type, defaults to `weight_t::dot_result_t`
62
+ * @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
63
+ *
64
+ * @note Computes sum of a_weights[i] * b_weights[j] for all i,j where a[i] == b[j]
65
+ */
66
+ template <numeric_dtype index_type_, numeric_dtype weight_t,
67
+ numeric_dtype result_type_ = typename weight_t::dot_result_t, allow_simd_t allow_simd_ = prefer_simd_k>
68
+ void sparse_dot(index_type_ const *a, index_type_ const *b, weight_t const *a_weights, weight_t const *b_weights,
69
+ std::size_t a_length, std::size_t b_length, result_type_ *product) noexcept {
70
+ constexpr bool simd = allow_simd_ == prefer_simd_k && std::is_same_v<result_type_, typename weight_t::dot_result_t>;
71
+
72
+ // u16 indices + bf16 weights -> f32 product
73
+ if constexpr (std::is_same_v<index_type_, u16_t> && std::is_same_v<weight_t, bf16_t> && simd)
74
+ nk_sparse_dot_u16bf16(&a->raw_, &b->raw_, &a_weights->raw_, &b_weights->raw_, a_length, b_length,
75
+ &product->raw_);
76
+ else if constexpr (std::is_same_v<index_type_, u32_t> && std::is_same_v<weight_t, f32_t> && simd)
77
+ nk_sparse_dot_u32f32(&a->raw_, &b->raw_, &a_weights->raw_, &b_weights->raw_, a_length, b_length,
78
+ &product->raw_);
79
+ // Scalar fallback
80
+ else {
81
+ result_type_ sum {};
82
+ std::size_t i = 0, j = 0;
83
+ while (i < a_length && j < b_length) {
84
+ if (a[i] < b[j]) i++;
85
+ else if (b[j] < a[i]) j++;
86
+ else sum = fma<weight_t, result_type_>(a_weights[i], b_weights[j], sum), i++, j++;
87
+ }
88
+ *product = sum;
89
+ }
90
+ }
91
+
92
+ } // namespace ashvardanian::numkong
93
+
94
+ #include "numkong/tensor.hpp"
95
+
96
+ namespace ashvardanian::numkong {
97
+
98
+ template <numeric_dtype index_type_, allow_simd_t allow_simd_ = prefer_simd_k>
99
+ void sparse_intersect(vector_view<index_type_> a, vector_view<index_type_> b, nk_size_t *count) noexcept {
100
+ sparse_intersect<index_type_, allow_simd_>(a.data(), b.data(), a.size(), b.size(), count);
101
+ }
102
+
103
+ template <numeric_dtype index_type_, numeric_dtype weight_t,
104
+ numeric_dtype result_type_ = typename weight_t::dot_result_t, allow_simd_t allow_simd_ = prefer_simd_k>
105
+ void sparse_dot(vector_view<index_type_> a, vector_view<index_type_> b, vector_view<weight_t> a_weights,
106
+ vector_view<weight_t> b_weights, result_type_ *product) noexcept {
107
+ sparse_dot<index_type_, weight_t, result_type_, allow_simd_>(a.data(), b.data(), a_weights.data(), b_weights.data(),
108
+ a.size(), b.size(), product);
109
+ }
110
+
111
+ } // namespace ashvardanian::numkong
112
+
113
+ #endif // NK_SPARSE_HPP
@@ -0,0 +1,435 @@
1
+ # Spatial Distances in NumKong
2
+
3
+ NumKong implements spatial distance functions for dense vectors: squared Euclidean distance, Euclidean distance, and angular (cosine) distance.
4
+ These are the most widely used metrics in nearest-neighbor search, clustering, and dimensionality reduction, covering every numeric type supported by the library.
5
+
6
+ Squared Euclidean distance measures the sum of squared element-wise differences:
7
+
8
+ ```math
9
+ \text{sqeuclidean}(a, b) = \sum_{i=0}^{n-1} (a_i - b_i)^2
10
+ ```
11
+
12
+ Euclidean distance is the square root of the squared Euclidean distance:
13
+
14
+ ```math
15
+ \text{euclidean}(a, b) = \sqrt{\sum_{i=0}^{n-1} (a_i - b_i)^2}
16
+ ```
17
+
18
+ Angular distance (cosine distance) measures the angle between two vectors:
19
+
20
+ ```math
21
+ \text{angular}(a, b) = 1 - \frac{\sum_{i=0}^{n-1} a_i \cdot b_i}{\sqrt{\sum_{i=0}^{n-1} a_i^2} \cdot \sqrt{\sum_{i=0}^{n-1} b_i^2}}
22
+ ```
23
+
24
+ Reformulating as Python pseudocode:
25
+
26
+ ```python
27
+ import numpy as np
28
+
29
+ def sqeuclidean(a: np.ndarray, b: np.ndarray) -> float:
30
+ return np.sum((a - b) ** 2)
31
+
32
+ def euclidean(a: np.ndarray, b: np.ndarray) -> float:
33
+ return np.sqrt(np.sum((a - b) ** 2))
34
+
35
+ def angular(a: np.ndarray, b: np.ndarray) -> float:
36
+ ab = np.dot(a, b)
37
+ a2 = np.dot(a, a)
38
+ b2 = np.dot(b, b)
39
+ if a2 == 0 and b2 == 0: return 0
40
+ if ab == 0: return 1
41
+ return 1 - ab / (np.sqrt(a2) * np.sqrt(b2))
42
+ ```
43
+
44
+ ## Input & Output Types
45
+
46
+ | Input Type | Output Type | Description |
47
+ | ---------- | ----------- | ---------------------------------------------- |
48
+ | `f64` | `f64` | 64-bit IEEE 754 double precision |
49
+ | `f32` | `f32` | 32-bit IEEE 754 single precision |
50
+ | `f16` | `f32` | 16-bit IEEE 754 half precision, widened output |
51
+ | `bf16` | `f32` | 16-bit brain float, widened output |
52
+ | `e5m2` | `f32` | 8-bit Float8: 5 exponent, 2 mantissa bits |
53
+ | `e4m3` | `f32` | 8-bit Float8: 4 exponent, 3 mantissa bits |
54
+ | `e3m2` | `f32` | 8-bit MX format: 3 exponent, 2 mantissa bits |
55
+ | `e2m3` | `f32` | 8-bit MX format: 2 exponent, 3 mantissa bits |
56
+ | `i8` | `f32` | 8-bit signed integers |
57
+ | `u8` | `f32` | 8-bit unsigned integers |
58
+ | `i4` | `f32` | 4-bit signed integers, packed nibble pairs |
59
+ | `u4` | `f32` | 4-bit unsigned integers, packed nibble pairs |
60
+
61
+ ## Optimizations
62
+
63
+ ### Three-Accumulator Angular Pattern
64
+
65
+ `nk_angular_f32_haswell`, `nk_angular_f32_skylake`, `nk_angular_f32_neon` compute cosine distance as $1 - ab / (\sqrt{a^2} \cdot \sqrt{b^2})$, requiring three concurrent dot products in a single pass: $\sum a_i b_i$, $\sum a_i^2$, and $\sum b_i^2$.
66
+ All spatial angular kernels interleave these three FMA streams so that each vector element is loaded once and immediately contributes to all three accumulators.
67
+ This triples register pressure compared to a plain dot product — on Haswell with 16 YMM registers, three independent 4-register accumulator chains leave only 4 registers for temporaries.
68
+ The single-pass design is essential because reading two vectors of length $n$ once costs $2n$ cache line fetches, while a three-pass approach would cost $6n$.
69
+
70
+ ### Reciprocal Square Root with Newton-Raphson Refinement
71
+
72
+ `nk_angular_f32_haswell`, `nk_angular_f64_haswell`, `nk_angular_f32_neon`, `nk_angular_f64_neon` compute the final normalization via in-hardware reciprocal square root estimates refined by Newton-Raphson iteration.
73
+ The iteration formula is $x_{n+1} = x_n \cdot (3 - d \cdot x_n^2) / 2$, where $d$ is the value whose reciprocal square root is needed.
74
+ NEON `vrsqrte` + `vrsqrts` performs one refinement step, reaching roughly 22 bits of precision.
75
+ Haswell `VRSQRT14` provides $2^{-14}$ relative error and one Newton-Raphson step doubles the precision to approximately 28 bits.
76
+ Skylake `VRSQRT28` achieves $2^{-28}$ accuracy directly, eliminating the need for a refinement step entirely.
77
+ This reciprocal square root is needed for both euclidean distance ($\sqrt{d}$ via $d \cdot \text{rsqrt}(d)$) and angular distance ($1/\sqrt{a^2} \cdot 1/\sqrt{b^2}$).
78
+
79
+ ### Absolute Differences for Integer Types
80
+
81
+ `nk_sqeuclidean_i8_haswell`, `nk_sqeuclidean_u8_haswell`, `nk_sqeuclidean_i8_icelake`, `nk_sqeuclidean_u8_icelake` compute squared Euclidean distance by first obtaining element-wise absolute differences, then squaring and accumulating.
82
+ For signed `i8`, XOR with `0x80` converts the range from [-128, 127] to unsigned [0, 255], then saturating subtract in both directions followed by OR gives $|a - b|$:
83
+
84
+ ```
85
+ bias_a = _mm256_xor_si256(a, 0x80)
86
+ bias_b = _mm256_xor_si256(b, 0x80)
87
+ abs_diff = _mm256_or_si256(_mm256_subs_epu8(bias_a, bias_b), _mm256_subs_epu8(bias_b, bias_a))
88
+ ```
89
+
90
+ For unsigned `u8`, the same saturating subtract trick works without the XOR bias.
91
+ The absolute differences are then zero-extended via `VPUNPCKLBW`/`VPUNPCKHBW` (1 cycle, cheaper than `VPMOVZXBW`) and squared+accumulated via `VPMADDWD`, which computes $d_i^2 + d_{i+1}^2$ in one instruction.
92
+
93
+ ### Masked Neumaier Compensation on Skylake
94
+
95
+ `nk_sqeuclidean_f64_skylake` uses `VGETEXP`-based Neumaier TwoSum inside AVX-512 masked loops.
96
+ The mask register tracks which lanes are active, handling tail elements when the vector length is not a multiple of the SIMD width.
97
+ The compensation term accumulates the low-order rounding errors from each addition, and because the mask propagates through both the main sum and the compensation update, even the final partial iteration maintains full Neumaier accuracy.
98
+ This avoids the need for a separate scalar tail loop that would otherwise lose the compensated error tracking.
99
+
100
+ ## Performance
101
+
102
+ The following performance tables are produced by manually re-running the bundled `nk_test` and `nk_bench` tools to measure both accuracy and throughput at different input shapes.
103
+ The input size is controlled by the `NK_DENSE_DIMENSIONS` environment variable and set to 256, 1024, and 4096 elements.
104
+ The throughput is measured in GB/s as the number of input bytes per second.
105
+ Accuracy is reported as mean ULP (units in last place) unless noted otherwise — the average number of representable floating-point values between the result and the exact answer.
106
+ Each kernel runs for at least 20 seconds per configuration.
107
+ Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
108
+ Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
109
+
110
+ ### Intel Sapphire Rapids
111
+
112
+ #### Native
113
+
114
+ | Kernel | 256 | 1024 | 4096 |
115
+ | :----------------------------- | -----------------------: | -----------------------: | -----------------------: |
116
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
117
+ | `nk_sqeuclidean_f64_serial` | 8.00 gb/s, 0.1 ulp | 8.32 gb/s, 0 ulp | 8.13 gb/s, 0 ulp |
118
+ | `nk_euclidean_f64_serial` | 7.81 gb/s, 0.6 ulp | 7.95 gb/s, 0.5 ulp | 8.34 gb/s, 0.5 ulp |
119
+ | `nk_angular_f64_serial` | 2.80 gb/s, 0 ulp | 3.03 gb/s, 0 ulp | 3.18 gb/s, 0 ulp |
120
+ | `nk_sqeuclidean_f64_skylake` | 32.4 gb/s, 0.4 ulp | 30.6 gb/s, 0.7 ulp | 22.2 gb/s, 1.3 ulp |
121
+ | `nk_euclidean_f64_skylake` | 31.7 gb/s, 0.3 ulp | 29.4 gb/s, 0.4 ulp | 22.9 gb/s, 0.7 ulp |
122
+ | `nk_angular_f64_skylake` | 26.5 gb/s, 0 ulp | 26.8 gb/s, 0 ulp | 17.8 gb/s, 0 ulp |
123
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
124
+ | `nk_sqeuclidean_f32_serial` | 4.01 gb/s, 0 ulp | 4.06 gb/s, 0 ulp | 4.19 gb/s, 0 ulp |
125
+ | `nk_euclidean_f32_serial` | 3.99 gb/s, 0.1 ulp | 4.07 gb/s, 0.1 ulp | 4.11 gb/s, 0.1 ulp |
126
+ | `nk_angular_f32_serial` | 1.29 gb/s, 0 ulp | 1.41 gb/s, 0 ulp | 1.53 gb/s, 0 ulp |
127
+ | `nk_sqeuclidean_f32_skylake` | 36.5 gb/s, 0 ulp | 27.0 gb/s, 0 ulp | 23.2 gb/s, 0 ulp |
128
+ | `nk_euclidean_f32_skylake` | 36.4 gb/s, 0.1 ulp | 28.1 gb/s, 0.1 ulp | 26.7 gb/s, 0.1 ulp |
129
+ | `nk_angular_f32_skylake` | 24.3 gb/s, 0 ulp | 23.2 gb/s, 0 ulp | 22.5 gb/s, 0 ulp |
130
+ | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
131
+ | `nk_sqeuclidean_bf16_serial` | 0.582 gb/s, 0 ulp | 0.358 gb/s, 0 ulp | 0.390 gb/s, 0 ulp |
132
+ | `nk_euclidean_bf16_serial` | 0.569 gb/s, 0.5 ulp | 0.373 gb/s, 0.5 ulp | 0.372 gb/s, 0.4 ulp |
133
+ | `nk_angular_bf16_serial` | 0.455 gb/s, 0 ulp | 0.241 gb/s, 0 ulp | 0.259 gb/s, 0 ulp |
134
+ | `nk_sqeuclidean_bf16_haswell` | 27.7 gb/s, 0.5 ulp | 14.0 gb/s, 7.5 ulp | 11.8 gb/s, 27 ulp |
135
+ | `nk_euclidean_bf16_haswell` | 23.3 gb/s, 0.3 ulp | 13.4 gb/s, 4.1 ulp | 12.0 gb/s, 15 ulp |
136
+ | `nk_angular_bf16_haswell` | 20.1 gb/s, 0 ulp | 13.4 gb/s, 0 ulp | 10.6 gb/s, 0.2 ulp |
137
+ | `nk_sqeuclidean_bf16_genoa` | 50.1 gb/s, 0.3 ulp | 21.0 gb/s, 0.5 ulp | 20.5 gb/s, 10 ulp |
138
+ | `nk_euclidean_bf16_genoa` | 48.3 gb/s, 0.2 ulp | 23.1 gb/s, 0.3 ulp | 20.4 gb/s, 5.8 ulp |
139
+ | `nk_angular_bf16_genoa` | 36.4 gb/s, 0 ulp | 22.4 gb/s, 0 ulp | 21.0 gb/s, 0.1 ulp |
140
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
141
+ | `nk_sqeuclidean_f16_serial` | 0.950 gb/s, 0.1 ulp | 0.872 gb/s, 0.1 ulp | 0.864 gb/s, 0.1 ulp |
142
+ | `nk_euclidean_f16_serial` | 0.934 gb/s, 0.5 ulp | 0.913 gb/s, 0.5 ulp | 0.906 gb/s, 0.5 ulp |
143
+ | `nk_angular_f16_serial` | 0.881 gb/s, 0 ulp | 0.531 gb/s, 0 ulp | 0.543 gb/s, 0 ulp |
144
+ | `nk_sqeuclidean_f16_haswell` | 29.8 gb/s, 0.4 ulp | 14.8 gb/s, 1.4 ulp | 11.8 gb/s, 5.2 ulp |
145
+ | `nk_euclidean_f16_haswell` | 22.9 gb/s, 0.3 ulp | 12.9 gb/s, 0.8 ulp | 10.6 gb/s, 2.8 ulp |
146
+ | `nk_angular_f16_haswell` | 19.9 gb/s, 0.1 ulp | 17.5 gb/s, 0.1 ulp | 16.1 gb/s, 0.1 ulp |
147
+ | __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
148
+ | `nk_sqeuclidean_e5m2_serial` | 0.955 gb/s, 0 ulp | 1.01 gb/s, 0 ulp | 1.02 gb/s, 0 ulp |
149
+ | `nk_euclidean_e5m2_serial` | 0.954 gb/s, 0.5 ulp | 0.985 gb/s, 0.5 ulp | 1.03 gb/s, 0.5 ulp |
150
+ | `nk_angular_e5m2_serial` | 0.336 gb/s, 0 ulp | 0.385 gb/s, 0 ulp | 0.407 gb/s, 0 ulp |
151
+ | `nk_sqeuclidean_e5m2_skylake` | 4.44 gb/s, 0 ulp | 4.65 gb/s, 0 ulp | 5.80 gb/s, 0 ulp |
152
+ | `nk_euclidean_e5m2_skylake` | 4.34 gb/s, 0 ulp | 4.65 gb/s, 0 ulp | 5.88 gb/s, 0 ulp |
153
+ | `nk_angular_e5m2_skylake` | 3.83 gb/s, 0 ulp | 4.39 gb/s, 0 ulp | 6.10 gb/s, 0 ulp |
154
+ | `nk_sqeuclidean_e5m2_genoa` | 7.12 gb/s, 0 ulp | 8.07 gb/s, 0 ulp | 8.05 gb/s, 0 ulp |
155
+ | `nk_euclidean_e5m2_genoa` | 7.01 gb/s, 0 ulp | 6.97 gb/s, 0 ulp | 8.16 gb/s, 0 ulp |
156
+ | `nk_angular_e5m2_genoa` | 6.33 gb/s, 0 ulp | 6.79 gb/s, 0 ulp | 7.99 gb/s, 0 ulp |
157
+ | __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
158
+ | `nk_sqeuclidean_e4m3_serial` | 0.569 gb/s, 0 ulp | 0.606 gb/s, 0 ulp | 0.609 gb/s, 0 ulp |
159
+ | `nk_euclidean_e4m3_serial` | 0.587 gb/s, 0.5 ulp | 0.602 gb/s, 0.5 ulp | 0.578 gb/s, 0.5 ulp |
160
+ | `nk_angular_e4m3_serial` | 0.326 gb/s, 0 ulp | 0.196 gb/s, 0 ulp | 0.366 gb/s, 0 ulp |
161
+ | `nk_sqeuclidean_e4m3_skylake` | 3.84 gb/s, 0 ulp | 3.62 gb/s, 0 ulp | 3.95 gb/s, 0.2 ulp |
162
+ | `nk_euclidean_e4m3_skylake` | 3.48 gb/s, 0 ulp | 3.69 gb/s, 0 ulp | 3.33 gb/s, 0.2 ulp |
163
+ | `nk_angular_e4m3_skylake` | 4.22 gb/s, 0 ulp | 3.38 gb/s, 0 ulp | 4.54 gb/s, 0 ulp |
164
+ | `nk_sqeuclidean_e4m3_genoa` | 7.05 gb/s, 0 ulp | 9.87 gb/s, 0 ulp | 6.69 gb/s, 0.2 ulp |
165
+ | `nk_euclidean_e4m3_genoa` | 6.86 gb/s, 0 ulp | 9.64 gb/s, 0 ulp | 7.98 gb/s, 0.2 ulp |
166
+ | `nk_angular_e4m3_genoa` | 6.27 gb/s, 0 ulp | 7.10 gb/s, 0 ulp | 7.31 gb/s, 0 ulp |
167
+ | `nk_sqeuclidean_e4m3_sapphire` | 3.68 gb/s, 0 ulp | 5.09 gb/s, 0 ulp | 4.83 gb/s, 0.2 ulp |
168
+ | `nk_euclidean_e4m3_sapphire` | 3.47 gb/s, 0 ulp | 4.71 gb/s, 0 ulp | 4.62 gb/s, 0.2 ulp |
169
+ | __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
170
+ | `nk_sqeuclidean_e3m2_serial` | 1.01 gb/s, 0 ulp | 0.971 gb/s, 0 ulp | 1.03 gb/s, 0 ulp |
171
+ | `nk_euclidean_e3m2_serial` | 0.997 gb/s, 0.5 ulp | 0.990 gb/s, 0.5 ulp | 0.999 gb/s, 0.4 ulp |
172
+ | `nk_angular_e3m2_serial` | 0.332 gb/s, 0 ulp | 0.361 gb/s, 0 ulp | 0.437 gb/s, 0 ulp |
173
+ | `nk_sqeuclidean_e3m2_skylake` | 4.47 gb/s, 0 ulp | 5.46 gb/s, 0 ulp | 5.04 gb/s, 0 ulp |
174
+ | `nk_euclidean_e3m2_skylake` | 4.34 gb/s, 0 ulp | 6.20 gb/s, 0 ulp | 5.10 gb/s, 0 ulp |
175
+ | `nk_angular_e3m2_skylake` | 3.79 gb/s, 0 ulp | 4.41 gb/s, 0 ulp | 4.82 gb/s, 0 ulp |
176
+ | `nk_sqeuclidean_e3m2_genoa` | 8.79 gb/s, 0 ulp | 9.52 gb/s, 0 ulp | 10.6 gb/s, 0 ulp |
177
+ | `nk_euclidean_e3m2_genoa` | 8.68 gb/s, 0 ulp | 9.01 gb/s, 0 ulp | 12.8 gb/s, 0 ulp |
178
+ | `nk_angular_e3m2_genoa` | 6.89 gb/s, 0 ulp | 9.30 gb/s, 0 ulp | 10.3 gb/s, 0 ulp |
179
+ | `nk_sqeuclidean_e3m2_sapphire` | 23.2 gb/s, 1.02K ulp | 18.9 gb/s, 1.04K ulp | 22.8 gb/s, 1.03K ulp |
180
+ | `nk_euclidean_e3m2_sapphire` | 21.0 gb/s, 564 ulp | 16.6 gb/s, 571 ulp | 21.7 gb/s, 569 ulp |
181
+ | `nk_angular_e3m2_sapphire` | 13.1 gb/s, 11 ulp | 16.8 gb/s, 5.8 ulp | 19.2 gb/s, 3.0 ulp |
182
+ | __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
183
+ | `nk_sqeuclidean_e2m3_serial` | 0.964 gb/s, 0 ulp | 0.981 gb/s, 0 ulp | 1.03 gb/s, 0 ulp |
184
+ | `nk_euclidean_e2m3_serial` | 0.979 gb/s, 0.5 ulp | 0.966 gb/s, 0.5 ulp | 1.02 gb/s, 0.5 ulp |
185
+ | `nk_angular_e2m3_serial` | 0.347 gb/s, 0 ulp | 0.389 gb/s, 0 ulp | 0.418 gb/s, 0 ulp |
186
+ | `nk_sqeuclidean_e2m3_skylake` | 4.58 gb/s, 0 ulp | 4.65 gb/s, 0 ulp | 5.08 gb/s, 0 ulp |
187
+ | `nk_euclidean_e2m3_skylake` | 4.48 gb/s, 0 ulp | 4.39 gb/s, 0 ulp | 4.96 gb/s, 0 ulp |
188
+ | `nk_angular_e2m3_skylake` | 3.94 gb/s, 0 ulp | 4.25 gb/s, 0 ulp | 4.90 gb/s, 0 ulp |
189
+ | `nk_sqeuclidean_e2m3_genoa` | 9.62 gb/s, 0 ulp | 10.9 gb/s, 0 ulp | 10.8 gb/s, 0 ulp |
190
+ | `nk_euclidean_e2m3_genoa` | 8.45 gb/s, 0 ulp | 9.80 gb/s, 0 ulp | 10.3 gb/s, 0 ulp |
191
+ | `nk_angular_e2m3_genoa` | 7.21 gb/s, 0 ulp | 10.1 gb/s, 0 ulp | 10.4 gb/s, 0 ulp |
192
+ | `nk_sqeuclidean_e2m3_sapphire` | 21.8 gb/s, 354 ulp | 23.3 gb/s, 269 ulp | 22.5 gb/s, 253 ulp |
193
+ | `nk_euclidean_e2m3_sapphire` | 20.6 gb/s, 192 ulp | 21.9 gb/s, 148 ulp | 22.5 gb/s, 140 ulp |
194
+ | `nk_angular_e2m3_sapphire` | 13.1 gb/s, 2.8 ulp | 17.6 gb/s, 1.4 ulp | 18.5 gb/s, 0.7 ulp |
195
+ | __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
196
+ | `nk_sqeuclidean_i8_serial` | 34.0 gb/s | 18.4 gb/s | 16.5 gb/s |
197
+ | `nk_euclidean_i8_serial` | 29.0 gb/s, 0.4 ulp | 18.0 gb/s, 0.4 ulp | 15.6 gb/s, 0.4 ulp |
198
+ | `nk_angular_i8_serial` | 7.88 gb/s, 0 ulp | 6.31 gb/s, 0 ulp | 6.12 gb/s, 0 ulp |
199
+ | `nk_sqeuclidean_i8_haswell` | 38.4 gb/s | 17.9 gb/s | 18.4 gb/s |
200
+ | `nk_euclidean_i8_haswell` | 35.6 gb/s, 0 ulp | 17.0 gb/s, 0 ulp | 15.5 gb/s, 0 ulp |
201
+ | `nk_angular_i8_haswell` | 20.3 gb/s, 0.1 ulp | 12.9 gb/s, 0 ulp | 11.9 gb/s, 0 ulp |
202
+ | `nk_sqeuclidean_i8_icelake` | 60.2 gb/s | 24.5 gb/s | 23.5 gb/s |
203
+ | `nk_euclidean_i8_icelake` | 59.0 gb/s, 0 ulp | 23.0 gb/s, 0 ulp | 22.3 gb/s, 0 ulp |
204
+ | `nk_angular_i8_icelake` | 25.2 gb/s, 0.1 ulp | 18.4 gb/s, 0 ulp | 20.5 gb/s, 0 ulp |
205
+ | `nk_sqeuclidean_i8_alder` | 33.4 gb/s | 17.4 gb/s | 17.6 gb/s |
206
+ | `nk_euclidean_i8_alder` | 31.9 gb/s, 0 ulp | 19.1 gb/s, 0 ulp | 17.8 gb/s, 0 ulp |
207
+ | `nk_angular_i8_alder` | 26.2 gb/s, 0.1 ulp | 17.1 gb/s, 0 ulp | 17.8 gb/s, 0 ulp |
208
+ | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
209
+ | `nk_sqeuclidean_u8_serial` | 11.7 gb/s | 8.77 gb/s | 7.07 gb/s |
210
+ | `nk_euclidean_u8_serial` | 11.6 gb/s, 0.5 ulp | 8.31 gb/s, 0.5 ulp | 8.36 gb/s, 0.6 ulp |
211
+ | `nk_angular_u8_serial` | 7.95 gb/s, 0.4 ulp | 6.68 gb/s, 0.4 ulp | 5.88 gb/s, 0.4 ulp |
212
+ | `nk_sqeuclidean_u8_haswell` | 45.4 gb/s | 17.7 gb/s | 18.5 gb/s |
213
+ | `nk_euclidean_u8_haswell` | 38.9 gb/s, 0 ulp | 18.8 gb/s, 0 ulp | 19.3 gb/s, 0 ulp |
214
+ | `nk_angular_u8_haswell` | 21.9 gb/s, 0.7 ulp | 11.7 gb/s, 0.6 ulp | 13.4 gb/s, 0.5 ulp |
215
+ | `nk_sqeuclidean_u8_icelake` | 70.1 gb/s | 28.8 gb/s | 21.0 gb/s |
216
+ | `nk_euclidean_u8_icelake` | 66.4 gb/s, 0 ulp | 27.6 gb/s, 0 ulp | 23.5 gb/s, 0 ulp |
217
+ | `nk_angular_u8_icelake` | 28.9 gb/s, 0.7 ulp | 21.2 gb/s, 0.6 ulp | 21.5 gb/s, 0.5 ulp |
218
+ | `nk_sqeuclidean_u8_alder` | 32.2 gb/s | 17.5 gb/s | 19.0 gb/s |
219
+ | `nk_euclidean_u8_alder` | 31.3 gb/s, 0 ulp | 17.0 gb/s, 0 ulp | 19.6 gb/s, 0 ulp |
220
+ | `nk_angular_u8_alder` | 26.5 gb/s, 0.7 ulp | 17.1 gb/s, 0.6 ulp | 17.5 gb/s, 0.5 ulp |
221
+ | __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
222
+ | `nk_sqeuclidean_i4_serial` | 15.4 gb/s | 16.5 gb/s | 15.6 gb/s |
223
+ | `nk_euclidean_i4_serial` | 12.2 gb/s, 0.5 ulp | 15.6 gb/s, 0.5 ulp | 15.2 gb/s, 0.6 ulp |
224
+ | `nk_angular_i4_serial` | 5.60 gb/s, 0.4 ulp | 6.42 gb/s, 0.4 ulp | 6.69 gb/s, 0.4 ulp |
225
+ | `nk_sqeuclidean_i4_icelake` | 23.6 gb/s | 51.5 gb/s | 29.3 gb/s |
226
+ | `nk_euclidean_i4_icelake` | 20.6 gb/s, 0 ulp | 45.2 gb/s, 0 ulp | 28.9 gb/s, 0 ulp |
227
+ | `nk_angular_i4_icelake` | 5.14 gb/s, 0.7 ulp | 18.0 gb/s, 0.6 ulp | 17.6 gb/s, 0.5 ulp |
228
+ | __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
229
+ | `nk_sqeuclidean_u4_serial` | 15.6 gb/s | 17.3 gb/s | 15.8 gb/s |
230
+ | `nk_euclidean_u4_serial` | 12.0 gb/s, 0.5 ulp | 15.9 gb/s, 0.5 ulp | 15.3 gb/s, 0.6 ulp |
231
+ | `nk_angular_u4_serial` | 5.20 gb/s, 0.4 ulp | 6.63 gb/s, 0.4 ulp | 7.01 gb/s, 0.4 ulp |
232
+ | `nk_sqeuclidean_u4_icelake` | 22.7 gb/s | 23.7 gb/s | 24.5 gb/s |
233
+ | `nk_euclidean_u4_icelake` | 20.9 gb/s, 0 ulp | 18.8 gb/s, 0 ulp | 24.1 gb/s, 0 ulp |
234
+ | `nk_angular_u4_icelake` | 9.32 gb/s, 0.7 ulp | 27.4 gb/s, 0.6 ulp | 24.2 gb/s, 0.5 ulp |
235
+
236
+ #### WASM
237
+
238
+ Measured with Wasmtime v42 (Cranelift backend).
239
+
240
+ | Kernel | 256 | 1024 | 4096 |
241
+ | :-------------------------------- | -----------------------: | -----------------------: | -----------------------: |
242
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
243
+ | `nk_sqeuclidean_f64_serial` | 2.97 gb/s, 0.1 ulp | 3.16 gb/s, 0 ulp | 0.02 gb/s, 0 ulp |
244
+ | `nk_euclidean_f64_serial` | 0.104 gb/s, 0.6 ulp | 1.06 gb/s, 0.6 ulp | 0.33 gb/s, 0.5 ulp |
245
+ | `nk_angular_f64_serial` | 1.91 gb/s, 0.1 ulp | 1.93 gb/s, 0 ulp | 0.18 gb/s, 0 ulp |
246
+ | `nk_sqeuclidean_f64_v128relaxed` | 1.23 gb/s, 1.3 ulp | 1.87 gb/s, 2.5 ulp | 0.15 gb/s, 5.0 ulp |
247
+ | `nk_euclidean_f64_v128relaxed` | 0.315 gb/s, 0.7 ulp | 2.21 gb/s, 1.4 ulp | 0.03 gb/s, 2.8 ulp |
248
+ | `nk_angular_f64_v128relaxed` | 1.14 gb/s, 0.1 ulp | 0.928 gb/s, 0.1 ulp | 0.26 gb/s, 0.1 ulp |
249
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
250
+ | `nk_sqeuclidean_f32_serial` | 0.657 gb/s, 0 ulp | 0.928 gb/s, 0 ulp | 0.06 gb/s, 0 ulp |
251
+ | `nk_euclidean_f32_serial` | 0.757 gb/s, 0.1 ulp | 0.914 gb/s, 0.1 ulp | 0.05 gb/s, 0.1 ulp |
252
+ | `nk_angular_f32_serial` | 0.882 gb/s, 0 ulp | 0.902 gb/s, 0 ulp | 0.26 gb/s, 0 ulp |
253
+ | `nk_sqeuclidean_f32_v128relaxed` | 2.87 gb/s, 0.7 ulp | 3.03 gb/s, 1.3 ulp | 1.77 gb/s, 2.6 ulp |
254
+ | `nk_euclidean_f32_v128relaxed` | 1.83 gb/s, 0.4 ulp | 3.00 gb/s, 0.7 ulp | 0.22 gb/s, 1.4 ulp |
255
+ | `nk_angular_f32_v128relaxed` | 3.37 gb/s, 0 ulp | 0.991 gb/s, 0 ulp | 0.19 gb/s, 0 ulp |
256
+ | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
257
+ | `nk_sqeuclidean_bf16_serial` | 1.89 gb/s, 0 ulp | 1.09 gb/s, 0 ulp | 0.31 gb/s, 0 ulp |
258
+ | `nk_euclidean_bf16_serial` | 2.02 gb/s, 0.6 ulp | 2.13 gb/s, 0.5 ulp | 0.29 gb/s, 0.5 ulp |
259
+ | `nk_angular_bf16_serial` | 0.399 gb/s, 0 ulp | 0.308 gb/s, 0 ulp | 0.11 gb/s, 0 ulp |
260
+ | `nk_sqeuclidean_bf16_v128relaxed` | 2.10 gb/s, 0.9 ulp | 1.94 gb/s, 12.6 ulp | 0.17 gb/s, 20.8 ulp |
261
+ | `nk_euclidean_bf16_v128relaxed` | 2.08 gb/s, 0.5 ulp | 2.22 gb/s, 7.0 ulp | 0.13 gb/s, 11.4 ulp |
262
+ | `nk_angular_bf16_v128relaxed` | 1.08 gb/s, 0 ulp | 2.09 gb/s, 0.2 ulp | 0.20 gb/s, 0.6 ulp |
263
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
264
+ | `nk_sqeuclidean_f16_serial` | 1.10 gb/s, 0.1 ulp | 1.13 gb/s, 0.1 ulp | 0.20 gb/s, 0.1 ulp |
265
+ | `nk_euclidean_f16_serial` | 1.17 gb/s, 0.6 ulp | 1.16 gb/s, 0.6 ulp | 0.26 gb/s, 0.5 ulp |
266
+ | `nk_angular_f16_serial` | 0.363 gb/s, 0 ulp | 0.372 gb/s, 0 ulp | 0.06 gb/s, 0 ulp |
267
+ | `nk_sqeuclidean_f16_v128relaxed` | 1.12 gb/s, 0.9 ulp | 0.633 gb/s, 3.6 ulp | 0.03 gb/s, 9.7 ulp |
268
+ | `nk_euclidean_f16_v128relaxed` | 0.806 gb/s, 0.5 ulp | 0.991 gb/s, 2.0 ulp | 0.09 gb/s, 5.4 ulp |
269
+ | `nk_angular_f16_v128relaxed` | 1.79 gb/s, 0.1 ulp | 0.976 gb/s, 0.1 ulp | 0.00 gb/s, 0.1 ulp |
270
+ | __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
271
+ | `nk_sqeuclidean_e5m2_serial` | 0.713 gb/s, 0 ulp | 0.689 gb/s, 0 ulp | 0.16 gb/s, 0 ulp |
272
+ | `nk_euclidean_e5m2_serial` | 0.637 gb/s, 0.5 ulp | 0.736 gb/s, 0.5 ulp | 0.12 gb/s, 0.5 ulp |
273
+ | `nk_angular_e5m2_serial` | 0.169 gb/s, 0 ulp | 0.162 gb/s, 0 ulp | 0.17 gb/s, 0 ulp |
274
+ | __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
275
+ | `nk_sqeuclidean_e4m3_serial` | 0.374 gb/s, 0 ulp | 0.383 gb/s, 0 ulp | 0.09 gb/s, 0 ulp |
276
+ | `nk_euclidean_e4m3_serial` | 0.374 gb/s, 0.5 ulp | 0.360 gb/s, 0.5 ulp | 0.09 gb/s, 0.5 ulp |
277
+ | `nk_angular_e4m3_serial` | 0.162 gb/s, 0 ulp | 0.166 gb/s, 0 ulp | 0.17 gb/s, 0 ulp |
278
+ | __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
279
+ | `nk_sqeuclidean_e3m2_serial` | 0.712 gb/s, 0 ulp | 0.744 gb/s, 0 ulp | 0.17 gb/s, 0 ulp |
280
+ | `nk_euclidean_e3m2_serial` | 0.709 gb/s, 0.5 ulp | 0.759 gb/s, 0.5 ulp | 0.17 gb/s, 0.5 ulp |
281
+ | `nk_angular_e3m2_serial` | 0.152 gb/s, 0 ulp | 0.165 gb/s, 0 ulp | 0.17 gb/s, 0 ulp |
282
+ | __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
283
+ | `nk_sqeuclidean_e2m3_serial` | 0.702 gb/s, 0 ulp | 0.760 gb/s, 0 ulp | 0.13 gb/s, 0 ulp |
284
+ | `nk_euclidean_e2m3_serial` | 0.650 gb/s, 0.5 ulp | 0.753 gb/s, 0.5 ulp | 0.15 gb/s, 0.5 ulp |
285
+ | `nk_angular_e2m3_serial` | 0.158 gb/s, 0 ulp | 0.168 gb/s, 0 ulp | 0.17 gb/s, 0 ulp |
286
+ | __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
287
+ | `nk_sqeuclidean_i8_serial` | 0.327 gb/s | 0.328 gb/s | 0.09 gb/s |
288
+ | `nk_euclidean_i8_serial` | 2.93 gb/s, 0.5 ulp | 0.174 gb/s, 0.4 ulp | 0.14 gb/s, 0.4 ulp |
289
+ | `nk_angular_i8_serial` | 1.23 gb/s, 0 ulp | 0.946 gb/s, 0 ulp | 0.10 gb/s, 0 ulp |
290
+ | `nk_sqeuclidean_i8_v128relaxed` | 1.84 gb/s | 0.736 gb/s | 0.08 gb/s |
291
+ | `nk_euclidean_i8_v128relaxed` | 1.36 gb/s, 0 ulp | 0.805 gb/s, 0 ulp | 0.21 gb/s, 0 ulp |
292
+ | `nk_angular_i8_v128relaxed` | 1.80 gb/s, 0 ulp | 2.79 gb/s, 0 ulp | 0.14 gb/s, 0 ulp |
293
+ | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
294
+ | `nk_sqeuclidean_u8_serial` | 0.528 gb/s | 0.496 gb/s | 0.30 gb/s |
295
+ | `nk_euclidean_u8_serial` | 0.00982 gb/s, 0.5 ulp | 0.311 gb/s, 0.5 ulp | 0.04 gb/s, 0.6 ulp |
296
+ | `nk_angular_u8_serial` | 0.813 gb/s, 0.5 ulp | 1.46 gb/s, 0.4 ulp | 0.29 gb/s, 0.5 ulp |
297
+ | `nk_sqeuclidean_u8_v128relaxed` | 3.05 gb/s | 1.68 gb/s | 0.28 gb/s |
298
+ | `nk_euclidean_u8_v128relaxed` | 2.52 gb/s, 0 ulp | 1.70 gb/s, 0 ulp | 0.09 gb/s, 0 ulp |
299
+ | `nk_angular_u8_v128relaxed` | 2.47 gb/s, 526M ulp | 1.91 gb/s, 501M ulp | 0.09 gb/s, 443M ulp |
300
+ | __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
301
+ | `nk_sqeuclidean_i4_serial` | 1.91 gb/s | 1.94 gb/s | 0.30 gb/s |
302
+ | `nk_euclidean_i4_serial` | 1.76 gb/s, 0.5 ulp | 1.90 gb/s, 0.5 ulp | 0.02 gb/s, 0.0 ulp |
303
+ | `nk_angular_i4_serial` | 1.28 gb/s, 0.5 ulp | 1.34 gb/s, 0.5 ulp | 0.10 gb/s, 0.5 ulp |
304
+ | __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
305
+ | `nk_sqeuclidean_u4_serial` | 2.91 gb/s | 3.00 gb/s | 0.09 gb/s |
306
+ | `nk_euclidean_u4_serial` | 2.78 gb/s, 0.5 ulp | 3.01 gb/s, 0.5 ulp | 0.10 gb/s, 0.0 ulp |
307
+ | `nk_angular_u4_serial` | 1.84 gb/s, 0.5 ulp | 2.03 gb/s, 0.5 ulp | 0.21 gb/s, 0.5 ulp |
308
+
309
+ ### Apple M4
310
+
311
+ #### Native
312
+
313
+ | Kernel | 256 | 1024 | 4096 |
314
+ | :------------------------------ | -----------------------: | -----------------------: | -----------------------: |
315
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
316
+ | `nk_sqeuclidean_f64_serial` | 12.9 gb/s, 0.1 ulp | 9.79 gb/s, 0 ulp | 9.66 gb/s, 0 ulp |
317
+ | `nk_euclidean_f64_serial` | 12.8 gb/s, 0.6 ulp | 9.74 gb/s, 0.5 ulp | 9.72 gb/s, 0.5 ulp |
318
+ | `nk_angular_f64_serial` | 8.55 gb/s, 0 ulp | 6.28 gb/s, 0 ulp | 6.34 gb/s, 0 ulp |
319
+ | `nk_sqeuclidean_f64_neon` | 31.7 gb/s, 1.3 ulp | 28.3 gb/s, 2.6 ulp | 25.6 gb/s, 5.1 ulp |
320
+ | `nk_euclidean_f64_neon` | 33.4 gb/s, 0.7 ulp | 28.6 gb/s, 1.4 ulp | 26.3 gb/s, 2.8 ulp |
321
+ | `nk_angular_f64_neon` | 23.6 gb/s, 0.1 ulp | 24.0 gb/s, 0 ulp | 23.6 gb/s, 0 ulp |
322
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
323
+ | `nk_sqeuclidean_f32_serial` | 6.36 gb/s, 0 ulp | 4.67 gb/s, 0 ulp | 4.64 gb/s, 0 ulp |
324
+ | `nk_euclidean_f32_serial` | 6.32 gb/s, 0.1 ulp | 4.66 gb/s, 0.1 ulp | 4.63 gb/s, 0.1 ulp |
325
+ | `nk_angular_f32_serial` | 4.01 gb/s, 0 ulp | 2.84 gb/s, 0 ulp | 2.87 gb/s, 0 ulp |
326
+ | `nk_sqeuclidean_f32_neon` | 17.0 gb/s, 0.1 ulp | 12.8 gb/s, 0 ulp | 12.5 gb/s, 0 ulp |
327
+ | `nk_euclidean_f32_neon` | 18.8 gb/s, 0.1 ulp | 15.2 gb/s, 0.1 ulp | 13.3 gb/s, 0.1 ulp |
328
+ | `nk_angular_f32_neon` | 16.1 gb/s, 0 ulp | 13.2 gb/s, 0 ulp | 12.5 gb/s, 0 ulp |
329
+ | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
330
+ | `nk_sqeuclidean_bf16_serial` | 3.19 gb/s, 0 ulp | 2.37 gb/s, 0 ulp | 2.33 gb/s, 0 ulp |
331
+ | `nk_euclidean_bf16_serial` | 3.20 gb/s, 0.5 ulp | 2.37 gb/s, 0.5 ulp | 2.36 gb/s, 0.5 ulp |
332
+ | `nk_angular_bf16_serial` | 1.45 gb/s, 0 ulp | 1.34 gb/s, 0 ulp | 1.35 gb/s, 0 ulp |
333
+ | `nk_sqeuclidean_bf16_neonbfdot` | 23.4 gb/s, 0.9 ulp | 16.4 gb/s, 13 ulp | 14.8 gb/s, 21 ulp |
334
+ | `nk_euclidean_bf16_neonbfdot` | 23.4 gb/s, 0.5 ulp | 17.0 gb/s, 7.0 ulp | 14.7 gb/s, 12 ulp |
335
+ | `nk_angular_bf16_neonbfdot` | 19.8 gb/s, 0 ulp | 24.0 gb/s, 0.1 ulp | 25.8 gb/s, 0 ulp |
336
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
337
+ | `nk_sqeuclidean_f16_serial` | 3.16 gb/s, 0.1 ulp | 2.33 gb/s, 0.1 ulp | 2.34 gb/s, 0.1 ulp |
338
+ | `nk_euclidean_f16_serial` | 3.18 gb/s, 0.6 ulp | 2.34 gb/s, 0.5 ulp | 2.32 gb/s, 0.5 ulp |
339
+ | `nk_angular_f16_serial` | 1.90 gb/s, 0 ulp | 1.35 gb/s, 0 ulp | 1.36 gb/s, 0 ulp |
340
+ | `nk_sqeuclidean_f16_neonhalf` | 22.9 gb/s, 0.9 ulp | 15.4 gb/s, 3.6 ulp | 14.0 gb/s, 9.7 ulp |
341
+ | `nk_euclidean_f16_neonhalf` | 22.7 gb/s, 0.5 ulp | 15.8 gb/s, 2.0 ulp | 14.0 gb/s, 5.3 ulp |
342
+ | `nk_angular_f16_neonhalf` | 18.3 gb/s, 0.1 ulp | 14.9 gb/s, 0.1 ulp | 14.0 gb/s, 0.1 ulp |
343
+ | __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
344
+ | `nk_sqeuclidean_e5m2_serial` | 2.15 gb/s, 0 ulp | 1.44 gb/s, 0 ulp | 1.38 gb/s, 0 ulp |
345
+ | `nk_euclidean_e5m2_serial` | 2.11 gb/s, 0.5 ulp | 1.40 gb/s, 0.5 ulp | 1.40 gb/s, 0.5 ulp |
346
+ | `nk_angular_e5m2_serial` | 0.943 gb/s, 0 ulp | 0.657 gb/s, 0 ulp | 0.650 gb/s, 0 ulp |
347
+ | __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
348
+ | `nk_sqeuclidean_e4m3_serial` | 1.08 gb/s, 0 ulp | 0.686 gb/s, 0 ulp | 0.700 gb/s, 0 ulp |
349
+ | `nk_euclidean_e4m3_serial` | 1.06 gb/s, 0.5 ulp | 0.691 gb/s, 0.5 ulp | 0.699 gb/s, 0.5 ulp |
350
+ | `nk_angular_e4m3_serial` | 0.699 gb/s, 0 ulp | 0.463 gb/s, 0 ulp | 0.470 gb/s, 0 ulp |
351
+ | __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
352
+ | `nk_sqeuclidean_e3m2_serial` | 2.13 gb/s, 0 ulp | 1.40 gb/s, 0 ulp | 1.39 gb/s, 0 ulp |
353
+ | `nk_euclidean_e3m2_serial` | 2.12 gb/s, 0.5 ulp | 1.41 gb/s, 0.5 ulp | 1.42 gb/s, 0.5 ulp |
354
+ | `nk_angular_e3m2_serial` | 0.945 gb/s, 0 ulp | 0.657 gb/s, 0 ulp | 0.663 gb/s, 0 ulp |
355
+ | `nk_sqeuclidean_e3m2_neon` | 3.78 gb/s, 0 ulp | 3.63 gb/s, 0 ulp | 3.59 gb/s, 0 ulp |
356
+ | `nk_euclidean_e3m2_neon` | 3.74 gb/s, 0 ulp | 3.55 gb/s, 0 ulp | 3.55 gb/s, 0 ulp |
357
+ | `nk_angular_e3m2_neon` | 3.44 gb/s, 0 ulp | 3.37 gb/s, 0 ulp | 3.34 gb/s, 0 ulp |
358
+ | __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
359
+ | `nk_sqeuclidean_e2m3_serial` | 2.14 gb/s, 0 ulp | 1.41 gb/s, 0 ulp | 1.40 gb/s, 0 ulp |
360
+ | `nk_euclidean_e2m3_serial` | 2.12 gb/s, 0.5 ulp | 1.39 gb/s, 0.5 ulp | 1.40 gb/s, 0.4 ulp |
361
+ | `nk_angular_e2m3_serial` | 0.946 gb/s, 0 ulp | 0.664 gb/s, 0 ulp | 0.653 gb/s, 0 ulp |
362
+ | `nk_sqeuclidean_e2m3_neon` | 3.77 gb/s, 0 ulp | 3.62 gb/s, 0 ulp | 3.54 gb/s, 0 ulp |
363
+ | `nk_euclidean_e2m3_neon` | 3.73 gb/s, 0 ulp | 3.64 gb/s, 0 ulp | 3.58 gb/s, 0 ulp |
364
+ | `nk_angular_e2m3_neon` | 3.42 gb/s, 0 ulp | 3.37 gb/s, 0 ulp | 3.37 gb/s, 0 ulp |
365
+ | __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
366
+ | `nk_sqeuclidean_i8_serial` | 62.0 gb/s | 45.9 gb/s | 47.6 gb/s |
367
+ | `nk_euclidean_i8_serial` | 40.9 gb/s | 36.6 gb/s | 40.8 gb/s |
368
+ | `nk_angular_i8_serial` | 54.0 gb/s | 39.5 gb/s | 35.5 gb/s |
369
+ | `nk_sqeuclidean_i8_neonsdot` | 59.8 gb/s | 49.7 gb/s | 36.0 gb/s |
370
+ | `nk_euclidean_i8_neonsdot` | 56.7 gb/s | 48.6 gb/s | 33.0 gb/s |
371
+ | `nk_angular_i8_neonsdot` | 44.2 gb/s | 40.5 gb/s | 32.5 gb/s |
372
+ | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
373
+ | `nk_sqeuclidean_u8_serial` | 63.6 gb/s | 47.1 gb/s | 40.0 gb/s |
374
+ | `nk_euclidean_u8_serial` | 43.1 gb/s | 36.7 gb/s | 38.5 gb/s |
375
+ | `nk_angular_u8_serial` | 18.0 gb/s | 13.2 gb/s | 12.4 gb/s |
376
+ | `nk_sqeuclidean_u8_neonsdot` | 59.3 gb/s | 51.7 gb/s | 33.0 gb/s |
377
+ | `nk_euclidean_u8_neonsdot` | 54.7 gb/s | 47.9 gb/s | 32.7 gb/s |
378
+ | `nk_angular_u8_neonsdot` | 43.9 gb/s | 39.4 gb/s | 28.6 gb/s |
379
+ | __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
380
+ | `nk_sqeuclidean_i4_serial` | 24.0 gb/s | 17.8 gb/s | 18.2 gb/s |
381
+ | `nk_euclidean_i4_serial` | 20.6 gb/s | 16.2 gb/s | 16.0 gb/s |
382
+ | `nk_angular_i4_serial` | 9.44 gb/s | 7.38 gb/s | 7.36 gb/s |
383
+ | __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
384
+ | `nk_sqeuclidean_u4_serial` | 29.8 gb/s | 19.6 gb/s | 18.0 gb/s |
385
+ | `nk_euclidean_u4_serial` | 21.2 gb/s | 16.4 gb/s | 16.5 gb/s |
386
+ | `nk_angular_u4_serial` | 9.21 gb/s | 6.71 gb/s | 6.83 gb/s |
387
+
388
+ #### WASM
389
+
390
+ Measured with Wasmtime v42 (Cranelift backend).
391
+
392
+ | Kernel | 256 | 1024 | 4096 |
393
+ | :-------------------------------- | -----------------------: | -----------------------: | -----------------------: |
394
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
395
+ | `nk_sqeuclidean_f64_serial` | 22.0 gb/s, 0.1 ulp | 21.6 gb/s, 0 ulp | 19.5 gb/s, 0 ulp |
396
+ | `nk_euclidean_f64_serial` | 21.3 gb/s, 0.6 ulp | 20.9 gb/s, 0.6 ulp | 20.2 gb/s, 0.5 ulp |
397
+ | `nk_angular_f64_serial` | 10.9 gb/s, 0 ulp | 10.8 gb/s, 0 ulp | 10.4 gb/s, 0 ulp |
398
+ | `nk_sqeuclidean_f64_v128relaxed` | 44.7 gb/s, 1.3 ulp | 37.6 gb/s, 2.6 ulp | 31.1 gb/s, 5.0 ulp |
399
+ | `nk_euclidean_f64_v128relaxed` | 44.4 gb/s, 0.7 ulp | 35.5 gb/s, 1.4 ulp | 31.6 gb/s, 2.8 ulp |
400
+ | `nk_angular_f64_v128relaxed` | 28.1 gb/s, 0.1 ulp | 19.4 gb/s, 0.1 ulp | 17.3 gb/s, 0.1 ulp |
401
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
402
+ | `nk_sqeuclidean_f32_serial` | 9.75 gb/s, 0 ulp | 9.54 gb/s, 0 ulp | 9.47 gb/s, 0 ulp |
403
+ | `nk_euclidean_f32_serial` | 9.62 gb/s, 0.1 ulp | 9.48 gb/s, 0.1 ulp | 9.41 gb/s, 0.1 ulp |
404
+ | `nk_angular_f32_serial` | 5.07 gb/s, 0 ulp | 4.98 gb/s, 0 ulp | 4.94 gb/s, 0 ulp |
405
+ | `nk_sqeuclidean_f32_v128relaxed` | 37.0 gb/s, 0.7 ulp | 37.2 gb/s, 1.3 ulp | 31.2 gb/s, 2.6 ulp |
406
+ | `nk_euclidean_f32_v128relaxed` | 35.7 gb/s, 0.4 ulp | 36.2 gb/s, 0.7 ulp | 32.8 gb/s, 1.4 ulp |
407
+ | `nk_angular_f32_v128relaxed` | 12.5 gb/s, 0 ulp | 10.8 gb/s, 0 ulp | 10.3 gb/s, 0 ulp |
408
+ | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
409
+ | `nk_sqeuclidean_bf16_serial` | 5.48 gb/s, 0 ulp | 5.35 gb/s, 0 ulp | 5.35 gb/s, 0 ulp |
410
+ | `nk_euclidean_bf16_serial` | 5.42 gb/s, 0.6 ulp | 5.34 gb/s, 0.5 ulp | 5.35 gb/s, 0.5 ulp |
411
+ | `nk_angular_bf16_serial` | 2.48 gb/s, 0 ulp | 2.43 gb/s, 0 ulp | 2.43 gb/s, 0 ulp |
412
+ | `nk_sqeuclidean_bf16_v128relaxed` | 7.37 gb/s, 0.9 ulp | 6.78 gb/s, 13 ulp | 6.28 gb/s, 21 ulp |
413
+ | `nk_euclidean_bf16_v128relaxed` | 7.26 gb/s, 0.5 ulp | 6.33 gb/s, 7.0 ulp | 6.18 gb/s, 12 ulp |
414
+ | `nk_angular_bf16_v128relaxed` | 10.0 gb/s, 0 ulp | 10.1 gb/s, 0.2 ulp | 10.2 gb/s, 0.6 ulp |
415
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
416
+ | `nk_sqeuclidean_f16_serial` | 3.13 gb/s, 0.1 ulp | 3.13 gb/s, 0.1 ulp | 3.08 gb/s, 0.1 ulp |
417
+ | `nk_euclidean_f16_serial` | 2.65 gb/s, 0.6 ulp | 2.61 gb/s, 0.5 ulp | 2.60 gb/s, 0.5 ulp |
418
+ | `nk_angular_f16_serial` | 2.48 gb/s, 0 ulp | 2.47 gb/s, 0 ulp | 2.47 gb/s, 0 ulp |
419
+ | `nk_sqeuclidean_f16_v128relaxed` | 4.70 gb/s, 0.9 ulp | 4.82 gb/s, 3.6 ulp | 4.59 gb/s, 9.6 ulp |
420
+ | `nk_euclidean_f16_v128relaxed` | 4.72 gb/s, 0.5 ulp | 4.60 gb/s, 2.0 ulp | 4.60 gb/s, 5.3 ulp |
421
+ | `nk_angular_f16_v128relaxed` | 4.45 gb/s, 0.1 ulp | 4.37 gb/s, 0.1 ulp | 4.36 gb/s, 0.1 ulp |
422
+ | __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
423
+ | `nk_sqeuclidean_i8_serial` | 15.0 gb/s | 14.7 gb/s | 15.5 gb/s |
424
+ | `nk_euclidean_i8_serial` | 14.7 gb/s, 0.5 ulp | 15.6 gb/s, 0.4 ulp | 15.6 gb/s, 0.4 ulp |
425
+ | `nk_angular_i8_serial` | 8.15 gb/s, 0 ulp | 8.50 gb/s, 0 ulp | 8.59 gb/s, 0 ulp |
426
+ | `nk_sqeuclidean_i8_v128relaxed` | 28.0 gb/s | 20.6 gb/s | 16.6 gb/s |
427
+ | `nk_euclidean_i8_v128relaxed` | 25.0 gb/s, 0 ulp | 20.5 gb/s, 0 ulp | 16.5 gb/s, 0 ulp |
428
+ | `nk_angular_i8_v128relaxed` | 16.0 gb/s, 0 ulp | 17.8 gb/s, 0 ulp | 18.2 gb/s, 0 ulp |
429
+ | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
430
+ | `nk_sqeuclidean_u8_serial` | 14.9 gb/s | 14.7 gb/s | 15.5 gb/s |
431
+ | `nk_euclidean_u8_serial` | 14.7 gb/s, 0.5 ulp | 15.6 gb/s, 0.5 ulp | 15.6 gb/s, 0.6 ulp |
432
+ | `nk_angular_u8_serial` | 8.14 gb/s, 0.5 ulp | 8.46 gb/s, 0.5 ulp | 8.63 gb/s, 0.4 ulp |
433
+ | `nk_sqeuclidean_u8_v128relaxed` | 29.8 gb/s | 21.4 gb/s | 16.9 gb/s |
434
+ | `nk_euclidean_u8_v128relaxed` | 26.5 gb/s, 0 ulp | 21.8 gb/s, 0 ulp | 16.8 gb/s, 0 ulp |
435
+ | `nk_angular_u8_v128relaxed` | 20.2 gb/s, 0 ulp | 22.8 gb/s, 0 ulp | 23.6 gb/s, 0 ulp |