numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294)
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,434 @@
1
+ /**
2
+ * @brief C++ wrappers for SIMD-accelerated Elementwise Arithmetic.
3
+ * @file include/numkong/each.hpp
4
+ * @author Ash Vardanian
5
+ * @date February 5, 2026
6
+ */
7
+ #ifndef NK_EACH_HPP
8
+ #define NK_EACH_HPP
9
+
10
+ #include <cstdint>
11
+ #include <type_traits>
12
+
13
+ #include "numkong/each.h"
14
+
15
+ #include "numkong/types.hpp"
16
+
17
+ namespace ashvardanian::numkong {
18
+
/**
 * @brief Elementwise sum: cᵢ = aᵢ + bᵢ
 * @param[in] a,b Input vectors
 * @param[in] d Number of dimensions in input vectors
 * @param[out] c Output vector
 *
 * @tparam in_type_ Element type
 * @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
 *
 * Dispatches at compile time to the dtype-specific `nk_each_sum_*` C kernel
 * for each of the 14 supported numeric dtypes; any other dtype (or an explicit
 * SIMD opt-out) takes the scalar loop below.
 */
template <numeric_dtype in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
void sum(in_type_ const *a, in_type_ const *b, std::size_t d, in_type_ *c) noexcept {
    // SIMD kernels are only eligible when the caller did not opt out.
    constexpr bool simd = allow_simd_ == prefer_simd_k;

    // `raw_` is the underlying C representation of the wrapper dtype,
    // matching the pointer types the `nk_each_*` kernels expect.
    if constexpr (std::is_same_v<in_type_, f64_t> && simd) nk_each_sum_f64(&a->raw_, &b->raw_, d, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, f32_t> && simd) nk_each_sum_f32(&a->raw_, &b->raw_, d, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, f16_t> && simd) nk_each_sum_f16(&a->raw_, &b->raw_, d, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, bf16_t> && simd) nk_each_sum_bf16(&a->raw_, &b->raw_, d, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, i8_t> && simd) nk_each_sum_i8(&a->raw_, &b->raw_, d, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, u8_t> && simd) nk_each_sum_u8(&a->raw_, &b->raw_, d, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, i16_t> && simd) nk_each_sum_i16(&a->raw_, &b->raw_, d, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, u16_t> && simd) nk_each_sum_u16(&a->raw_, &b->raw_, d, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, i32_t> && simd) nk_each_sum_i32(&a->raw_, &b->raw_, d, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, u32_t> && simd) nk_each_sum_u32(&a->raw_, &b->raw_, d, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, i64_t> && simd) nk_each_sum_i64(&a->raw_, &b->raw_, d, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, u64_t> && simd) nk_each_sum_u64(&a->raw_, &b->raw_, d, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, f32c_t> && simd) nk_each_sum_f32c(&a->raw_, &b->raw_, d, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, f64c_t> && simd) nk_each_sum_f64c(&a->raw_, &b->raw_, d, &c->raw_);
    // Scalar fallback
    else {
        // NOTE(review): the name `saturating_add` suggests integer overflow
        // clamps rather than wraps — confirm its semantics for floating-point
        // and complex dtypes against its definition in types.hpp.
        for (std::size_t i = 0; i < d; i++) c[i] = saturating_add(a[i], b[i]);
    }
}
51
+
/**
 * @brief Elementwise scale: cᵢ = α × aᵢ + β
 * @param[in] a Input vector
 * @param[in] d Number of dimensions in input vector
 * @param[in] alpha,beta Scale and shift coefficients
 * @param[out] c Output vector
 *
 * @tparam in_type_ Element type
 * @tparam precision_type_ Precision type for scalar fallback computations, defaults to `in_type_`
 * @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
 *
 * Coefficients are taken via `in_type_::scale_t`, the coefficient type each
 * dtype declares for its `nk_each_scale_*` kernel.
 */
template <numeric_dtype in_type_, numeric_dtype precision_type_ = in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
void scale(in_type_ const *a, std::size_t d, typename in_type_::scale_t const *alpha,
           typename in_type_::scale_t const *beta, in_type_ *c) noexcept {
    // SIMD kernels compute in their own native precision, so they are only used
    // when the caller did not request a distinct `precision_type_`.
    constexpr bool simd = allow_simd_ == prefer_simd_k && std::is_same_v<precision_type_, in_type_>;

    if constexpr (std::is_same_v<in_type_, f64_t> && simd) nk_each_scale_f64(&a->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, f32_t> && simd) nk_each_scale_f32(&a->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, f16_t> && simd) nk_each_scale_f16(&a->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, bf16_t> && simd) nk_each_scale_bf16(&a->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, i8_t> && simd) nk_each_scale_i8(&a->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, u8_t> && simd) nk_each_scale_u8(&a->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, i16_t> && simd) nk_each_scale_i16(&a->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, u16_t> && simd) nk_each_scale_u16(&a->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, i32_t> && simd) nk_each_scale_i32(&a->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, u32_t> && simd) nk_each_scale_u32(&a->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, i64_t> && simd) nk_each_scale_i64(&a->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, u64_t> && simd) nk_each_scale_u64(&a->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, f32c_t> && simd) nk_each_scale_f32c(&a->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, f64c_t> && simd) nk_each_scale_f64c(&a->raw_, d, alpha, beta, &c->raw_);
    // Scalar fallback with high-precision intermediates
    else {
        // Each element is widened to `precision_type_`, combined, then
        // narrowed back to `in_type_` via the wrapper's `to<>()` conversion.
        for (std::size_t i = 0; i < d; i++)
            c[i] = (precision_type_(a[i]) * precision_type_(*alpha) + precision_type_(*beta)).template to<in_type_>();
    }
}
88
+
/**
 * @brief Blend: cᵢ = α × aᵢ + β × bᵢ
 * @param[in] a,b Input vectors
 * @param[in] d Number of dimensions in input vectors
 * @param[in] alpha,beta Weight coefficients
 * @param[out] c Output vector
 *
 * @tparam in_type_ Element type
 * @tparam precision_type_ Precision type for scalar fallback computations, defaults to `in_type_`
 * @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
 *
 * Same dispatch scheme as `scale`, but weighting two vectors instead of
 * scale-and-shift on one.
 */
template <numeric_dtype in_type_, numeric_dtype precision_type_ = in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
void blend(in_type_ const *a, in_type_ const *b, std::size_t d, typename in_type_::scale_t const *alpha,
           typename in_type_::scale_t const *beta, in_type_ *c) noexcept {
    // SIMD is bypassed when a distinct fallback precision was requested.
    constexpr bool simd = allow_simd_ == prefer_simd_k && std::is_same_v<precision_type_, in_type_>;

    if constexpr (std::is_same_v<in_type_, f64_t> && simd)
        nk_each_blend_f64(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, f32_t> && simd)
        nk_each_blend_f32(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, f16_t> && simd)
        nk_each_blend_f16(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, bf16_t> && simd)
        nk_each_blend_bf16(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, i8_t> && simd)
        nk_each_blend_i8(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, u8_t> && simd)
        nk_each_blend_u8(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, i16_t> && simd)
        nk_each_blend_i16(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, u16_t> && simd)
        nk_each_blend_u16(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, i32_t> && simd)
        nk_each_blend_i32(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, u32_t> && simd)
        nk_each_blend_u32(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, i64_t> && simd)
        nk_each_blend_i64(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, u64_t> && simd)
        nk_each_blend_u64(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, f32c_t> && simd)
        nk_each_blend_f32c(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    else if constexpr (std::is_same_v<in_type_, f64c_t> && simd)
        nk_each_blend_f64c(&a->raw_, &b->raw_, d, alpha, beta, &c->raw_);
    // Scalar fallback with high-precision intermediates
    else {
        for (std::size_t i = 0; i < d; i++) {
            c[i] = (precision_type_(a[i]) * precision_type_(*alpha) + precision_type_(b[i]) * precision_type_(*beta))
                       .template to<in_type_>();
        }
    }
}
141
+
/**
 * @brief Elementwise FMA: outᵢ = α × aᵢ × bᵢ + β × cᵢ
 * @param[in] a,b,c Input vectors
 * @param[in] d Number of dimensions in input vectors
 * @param[in] alpha,beta Coefficients
 * @param[out] out Output vector
 *
 * @tparam in_type_ Element type
 * @tparam precision_type_ Precision type for scalar fallback computations, defaults to `in_type_`
 * @tparam allow_simd_ Enable SIMD kernel dispatch when `prefer_simd_k`
 *
 * Note the argument order: the dimension count `d` sits between the two
 * multiplicand vectors and the addend vector `c`.
 */
template <numeric_dtype in_type_, numeric_dtype precision_type_ = in_type_, allow_simd_t allow_simd_ = prefer_simd_k>
void fma(in_type_ const *a, in_type_ const *b, std::size_t d, in_type_ const *c,
         typename in_type_::scale_t const *alpha, typename in_type_::scale_t const *beta, in_type_ *out) noexcept {
    // SIMD is bypassed when a distinct fallback precision was requested.
    constexpr bool simd = allow_simd_ == prefer_simd_k && std::is_same_v<precision_type_, in_type_>;

    if constexpr (std::is_same_v<in_type_, f64_t> && simd)
        nk_each_fma_f64(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    else if constexpr (std::is_same_v<in_type_, f32_t> && simd)
        nk_each_fma_f32(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    else if constexpr (std::is_same_v<in_type_, f16_t> && simd)
        nk_each_fma_f16(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    else if constexpr (std::is_same_v<in_type_, bf16_t> && simd)
        nk_each_fma_bf16(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    else if constexpr (std::is_same_v<in_type_, i8_t> && simd)
        nk_each_fma_i8(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    else if constexpr (std::is_same_v<in_type_, u8_t> && simd)
        nk_each_fma_u8(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    else if constexpr (std::is_same_v<in_type_, i16_t> && simd)
        nk_each_fma_i16(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    else if constexpr (std::is_same_v<in_type_, u16_t> && simd)
        nk_each_fma_u16(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    else if constexpr (std::is_same_v<in_type_, i32_t> && simd)
        nk_each_fma_i32(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    else if constexpr (std::is_same_v<in_type_, u32_t> && simd)
        nk_each_fma_u32(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    else if constexpr (std::is_same_v<in_type_, i64_t> && simd)
        nk_each_fma_i64(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    else if constexpr (std::is_same_v<in_type_, u64_t> && simd)
        nk_each_fma_u64(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    else if constexpr (std::is_same_v<in_type_, f32c_t> && simd)
        nk_each_fma_f32c(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    else if constexpr (std::is_same_v<in_type_, f64c_t> && simd)
        nk_each_fma_f64c(&a->raw_, &b->raw_, &c->raw_, d, alpha, beta, &out->raw_);
    // Scalar fallback with high-precision intermediates
    else {
        for (std::size_t i = 0; i < d; i++) {
            out[i] = (precision_type_(a[i]) * precision_type_(b[i]) * precision_type_(*alpha) +
                      precision_type_(c[i]) * precision_type_(*beta))
                         .template to<in_type_>();
        }
    }
}
195
+
196
+ } // namespace ashvardanian::numkong
197
+
198
+ #include "numkong/tensor.hpp"
199
+
200
+ namespace ashvardanian::numkong {
201
+
202
+ #pragma region - Tensor Elementwise
203
+
204
+ /** @brief Scale: output[i] = α × input[i] + β. */
205
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8>
206
+ bool scale(tensor_view<value_type_, max_rank_> input, typename value_type_::scale_t alpha,
207
+ typename value_type_::scale_t beta, tensor_span<value_type_, max_rank_> output) noexcept {
208
+ return elementwise_into_<value_type_, max_rank_>(
209
+ input, output, [&](tensor_view<value_type_, max_rank_> in, tensor_span<value_type_, max_rank_> out) {
210
+ numkong::scale<value_type_>(in.data(), in.extent(0), &alpha, &beta, out.data());
211
+ });
212
+ }
213
+
214
+ /** @brief Allocating scale: result[i] = α × input[i] + β. */
215
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8,
216
+ typename allocator_type_ = aligned_allocator<value_type_>>
217
+ tensor<value_type_, allocator_type_, max_rank_> try_scale(tensor_view<value_type_, max_rank_> input,
218
+ typename value_type_::scale_t alpha,
219
+ typename value_type_::scale_t beta) noexcept {
220
+ using out_tensor_t = tensor<value_type_, allocator_type_, max_rank_>;
221
+ if (input.empty()) return out_tensor_t {};
222
+ auto &input_shape = input.shape();
223
+ auto result = out_tensor_t::try_empty(input_shape.extents, input_shape.rank);
224
+ if (result.empty()) return result;
225
+ if (!scale<value_type_, max_rank_>(input, alpha, beta, result.span())) return out_tensor_t {};
226
+ return result;
227
+ }
228
+
229
+ /** @brief Blend: output[i] = α × lhs[i] + β × rhs[i]. */
230
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8>
231
+ bool blend(tensor_view<value_type_, max_rank_> lhs, tensor_view<value_type_, max_rank_> rhs,
232
+ typename value_type_::scale_t alpha, typename value_type_::scale_t beta,
233
+ tensor_span<value_type_, max_rank_> output) noexcept {
234
+ return elementwise_into_<value_type_, max_rank_>(
235
+ lhs, rhs, output,
236
+ [&](tensor_view<value_type_, max_rank_> l, tensor_view<value_type_, max_rank_> r,
237
+ tensor_span<value_type_, max_rank_> out) {
238
+ numkong::blend<value_type_>(l.data(), r.data(), l.extent(0), &alpha, &beta, out.data());
239
+ });
240
+ }
241
+
242
+ /** @brief Allocating blend: result[i] = α × lhs[i] + β × rhs[i]. */
243
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8,
244
+ typename allocator_type_ = aligned_allocator<value_type_>>
245
+ tensor<value_type_, allocator_type_, max_rank_> try_blend(tensor_view<value_type_, max_rank_> lhs,
246
+ tensor_view<value_type_, max_rank_> rhs,
247
+ typename value_type_::scale_t alpha,
248
+ typename value_type_::scale_t beta) noexcept {
249
+ using out_tensor_t = tensor<value_type_, allocator_type_, max_rank_>;
250
+ if (!shapes_match_(lhs, rhs) || lhs.empty()) return out_tensor_t {};
251
+ auto &input_shape = lhs.shape();
252
+ auto result = out_tensor_t::try_empty(input_shape.extents, input_shape.rank);
253
+ if (result.empty()) return result;
254
+ if (!blend<value_type_, max_rank_>(lhs, rhs, alpha, beta, result.span())) return out_tensor_t {};
255
+ return result;
256
+ }
257
+
258
+ /** @brief FMA: output[i] = α × lhs[i] × rhs[i] + β × addend[i]. */
259
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8>
260
+ bool fma(tensor_view<value_type_, max_rank_> lhs, tensor_view<value_type_, max_rank_> rhs,
261
+ tensor_view<value_type_, max_rank_> addend, typename value_type_::scale_t alpha,
262
+ typename value_type_::scale_t beta, tensor_span<value_type_, max_rank_> output) noexcept {
263
+ return elementwise_into_<value_type_, max_rank_>(
264
+ lhs, rhs, addend, output,
265
+ [&](tensor_view<value_type_, max_rank_> a, tensor_view<value_type_, max_rank_> b,
266
+ tensor_view<value_type_, max_rank_> c, tensor_span<value_type_, max_rank_> out) {
267
+ numkong::fma<value_type_>(a.data(), b.data(), a.extent(0), c.data(), &alpha, &beta, out.data());
268
+ });
269
+ }
270
+
271
+ /** @brief Allocating FMA: result[i] = α × lhs[i] × rhs[i] + β × addend[i]. */
272
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8,
273
+ typename allocator_type_ = aligned_allocator<value_type_>>
274
+ tensor<value_type_, allocator_type_, max_rank_> try_fma(tensor_view<value_type_, max_rank_> lhs,
275
+ tensor_view<value_type_, max_rank_> rhs,
276
+ tensor_view<value_type_, max_rank_> addend,
277
+ typename value_type_::scale_t alpha,
278
+ typename value_type_::scale_t beta) noexcept {
279
+ using out_tensor_t = tensor<value_type_, allocator_type_, max_rank_>;
280
+ if (!shapes_match_(lhs, rhs) || !shapes_match_(lhs, addend) || lhs.empty()) return out_tensor_t {};
281
+ auto &input_shape = lhs.shape();
282
+ auto result = out_tensor_t::try_empty(input_shape.extents, input_shape.rank);
283
+ if (result.empty()) return result;
284
+ if (!fma<value_type_, max_rank_>(lhs, rhs, addend, alpha, beta, result.span())) return out_tensor_t {};
285
+ return result;
286
+ }
287
+
288
+ /** @brief Elementwise addition: output[i] = lhs[i] + rhs[i]. */
289
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8>
290
+ bool add(tensor_view<value_type_, max_rank_> lhs, tensor_view<value_type_, max_rank_> rhs,
291
+ tensor_span<value_type_, max_rank_> output) noexcept {
292
+ return elementwise_into_<value_type_, max_rank_>(
293
+ lhs, rhs, output,
294
+ [](tensor_view<value_type_, max_rank_> l, tensor_view<value_type_, max_rank_> r,
295
+ tensor_span<value_type_, max_rank_> out) {
296
+ numkong::sum<value_type_>(l.data(), r.data(), l.extent(0), out.data());
297
+ });
298
+ }
299
+
300
+ /** @brief Allocating elementwise add: result = lhs + rhs. */
301
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8,
302
+ typename allocator_type_ = aligned_allocator<value_type_>>
303
+ tensor<value_type_, allocator_type_, max_rank_> try_add(tensor_view<value_type_, max_rank_> lhs,
304
+ tensor_view<value_type_, max_rank_> rhs) noexcept {
305
+ using out_tensor_t = tensor<value_type_, allocator_type_, max_rank_>;
306
+ if (!shapes_match_(lhs, rhs) || lhs.empty()) return out_tensor_t {};
307
+ auto &input_shape = lhs.shape();
308
+ auto result = out_tensor_t::try_empty(input_shape.extents, input_shape.rank);
309
+ if (result.empty()) return result;
310
+ if (!add<value_type_, max_rank_>(lhs, rhs, result.span())) return out_tensor_t {};
311
+ return result;
312
+ }
313
+
314
+ /** @brief Elementwise add scalar: output[i] = input[i] + scalar. */
315
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8>
316
+ bool add(tensor_view<value_type_, max_rank_> input, typename value_type_::scale_t scalar,
317
+ tensor_span<value_type_, max_rank_> output) noexcept {
318
+ typename value_type_::scale_t one {1};
319
+ return scale<value_type_, max_rank_>(input, one, scalar, output);
320
+ }
321
+
322
+ /** @brief Allocating add scalar. */
323
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8,
324
+ typename allocator_type_ = aligned_allocator<value_type_>>
325
+ tensor<value_type_, allocator_type_, max_rank_> try_add(tensor_view<value_type_, max_rank_> input,
326
+ typename value_type_::scale_t scalar) noexcept {
327
+ using out_tensor_t = tensor<value_type_, allocator_type_, max_rank_>;
328
+ if (input.empty()) return out_tensor_t {};
329
+ auto &input_shape = input.shape();
330
+ auto result = out_tensor_t::try_empty(input_shape.extents, input_shape.rank);
331
+ if (result.empty()) return result;
332
+ if (!add<value_type_, max_rank_>(input, scalar, result.span())) return out_tensor_t {};
333
+ return result;
334
+ }
335
+
336
+ /** @brief Elementwise subtraction: output[i] = lhs[i] − rhs[i]. */
337
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8>
338
+ bool sub(tensor_view<value_type_, max_rank_> lhs, tensor_view<value_type_, max_rank_> rhs,
339
+ tensor_span<value_type_, max_rank_> output) noexcept {
340
+ typename value_type_::scale_t alpha {1}, beta {-1};
341
+ return blend<value_type_, max_rank_>(lhs, rhs, alpha, beta, output);
342
+ }
343
+
344
+ /** @brief Allocating elementwise sub. */
345
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8,
346
+ typename allocator_type_ = aligned_allocator<value_type_>>
347
+ tensor<value_type_, allocator_type_, max_rank_> try_sub(tensor_view<value_type_, max_rank_> lhs,
348
+ tensor_view<value_type_, max_rank_> rhs) noexcept {
349
+ using out_tensor_t = tensor<value_type_, allocator_type_, max_rank_>;
350
+ if (!shapes_match_(lhs, rhs) || lhs.empty()) return out_tensor_t {};
351
+ auto &input_shape = lhs.shape();
352
+ auto result = out_tensor_t::try_empty(input_shape.extents, input_shape.rank);
353
+ if (result.empty()) return result;
354
+ if (!sub<value_type_, max_rank_>(lhs, rhs, result.span())) return out_tensor_t {};
355
+ return result;
356
+ }
357
+
358
+ /** @brief Elementwise sub scalar: output[i] = input[i] − scalar. */
359
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8>
360
+ bool sub(tensor_view<value_type_, max_rank_> input, typename value_type_::scale_t scalar,
361
+ tensor_span<value_type_, max_rank_> output) noexcept {
362
+ typename value_type_::scale_t one {1};
363
+ typename value_type_::scale_t neg_scalar = -scalar;
364
+ return scale<value_type_, max_rank_>(input, one, neg_scalar, output);
365
+ }
366
+
367
+ /** @brief Allocating sub scalar. */
368
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8,
369
+ typename allocator_type_ = aligned_allocator<value_type_>>
370
+ tensor<value_type_, allocator_type_, max_rank_> try_sub(tensor_view<value_type_, max_rank_> input,
371
+ typename value_type_::scale_t scalar) noexcept {
372
+ using out_tensor_t = tensor<value_type_, allocator_type_, max_rank_>;
373
+ if (input.empty()) return out_tensor_t {};
374
+ auto &input_shape = input.shape();
375
+ auto result = out_tensor_t::try_empty(input_shape.extents, input_shape.rank);
376
+ if (result.empty()) return result;
377
+ if (!sub<value_type_, max_rank_>(input, scalar, result.span())) return out_tensor_t {};
378
+ return result;
379
+ }
380
+
381
+ /** @brief Elementwise multiplication: output[i] = lhs[i] × rhs[i]. */
382
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8>
383
+ bool mul(tensor_view<value_type_, max_rank_> lhs, tensor_view<value_type_, max_rank_> rhs,
384
+ tensor_span<value_type_, max_rank_> output) noexcept {
385
+ return elementwise_into_<value_type_, max_rank_>(
386
+ lhs, rhs, output,
387
+ [](tensor_view<value_type_, max_rank_> l, tensor_view<value_type_, max_rank_> r,
388
+ tensor_span<value_type_, max_rank_> out) {
389
+ typename value_type_::scale_t alpha {1}, beta {0};
390
+ numkong::fma<value_type_>(l.data(), r.data(), l.extent(0), out.data(), &alpha, &beta, out.data());
391
+ });
392
+ }
393
+
394
+ /** @brief Allocating elementwise multiply. */
395
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8,
396
+ typename allocator_type_ = aligned_allocator<value_type_>>
397
+ tensor<value_type_, allocator_type_, max_rank_> try_mul(tensor_view<value_type_, max_rank_> lhs,
398
+ tensor_view<value_type_, max_rank_> rhs) noexcept {
399
+ using out_tensor_t = tensor<value_type_, allocator_type_, max_rank_>;
400
+ if (!shapes_match_(lhs, rhs) || lhs.empty()) return out_tensor_t {};
401
+ auto &input_shape = lhs.shape();
402
+ auto result = out_tensor_t::try_zeros(input_shape.extents, input_shape.rank);
403
+ if (result.empty()) return result;
404
+ if (!mul<value_type_, max_rank_>(lhs, rhs, result.span())) return out_tensor_t {};
405
+ return result;
406
+ }
407
+
408
+ /** @brief Elementwise multiply by scalar: output[i] = input[i] × scalar. */
409
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8>
410
+ bool mul(tensor_view<value_type_, max_rank_> input, typename value_type_::scale_t scalar,
411
+ tensor_span<value_type_, max_rank_> output) noexcept {
412
+ typename value_type_::scale_t zero {0};
413
+ return scale<value_type_, max_rank_>(input, scalar, zero, output);
414
+ }
415
+
416
+ /** @brief Allocating multiply by scalar. */
417
+ template <numeric_dtype value_type_, std::size_t max_rank_ = 8,
418
+ typename allocator_type_ = aligned_allocator<value_type_>>
419
+ tensor<value_type_, allocator_type_, max_rank_> try_mul(tensor_view<value_type_, max_rank_> input,
420
+ typename value_type_::scale_t scalar) noexcept {
421
+ using out_tensor_t = tensor<value_type_, allocator_type_, max_rank_>;
422
+ if (input.empty()) return out_tensor_t {};
423
+ auto &input_shape = input.shape();
424
+ auto result = out_tensor_t::try_empty(input_shape.extents, input_shape.rank);
425
+ if (result.empty()) return result;
426
+ if (!mul<value_type_, max_rank_>(input, scalar, result.span())) return out_tensor_t {};
427
+ return result;
428
+ }
429
+
430
+ #pragma endregion - Tensor Elementwise
431
+
432
+ } // namespace ashvardanian::numkong
433
+
434
+ #endif // NK_EACH_HPP
@@ -0,0 +1,147 @@
1
+ # Geospatial Distances in NumKong
2
+
3
+ NumKong implements geodesic distance functions for points on Earth's surface: Haversine computes great-circle distance on a perfect sphere, while Vincenty solves the inverse geodesic problem on the WGS-84 oblate spheroid.
4
+ Both operate on arrays of latitude/longitude pairs in radians and produce distances in meters.
5
+
6
+ The Haversine formula computes the great-circle distance between two points:
7
+
8
+ ```math
9
+ \text{haversine}(\phi_1, \lambda_1, \phi_2, \lambda_2) = 2R \arcsin\sqrt{\sin^2\frac{\phi_2 - \phi_1}{2} + \cos\phi_1 \cos\phi_2 \sin^2\frac{\lambda_2 - \lambda_1}{2}}
10
+ ```
11
+
12
+ where $R$ is Earth's mean radius and $(\phi, \lambda)$ are latitude and longitude in radians.
13
+
14
+ Vincenty's formula solves the inverse geodesic problem on an oblate spheroid, iteratively refining the reduced latitude difference until convergence:
15
+
16
+ ```math
17
+ \text{vincenty}(\phi_1, \lambda_1, \phi_2, \lambda_2) = b \cdot A \cdot (\sigma - \Delta\sigma)
18
+ ```
19
+
20
+ where $a$ and $b$ are the equatorial and polar semi-axes of the WGS-84 ellipsoid, $\sigma$ is the angular separation, and $\Delta\sigma$ is the correction term computed through iterative convergence.
21
+
22
+ Reformulating as Python pseudocode:
23
+
24
+ ```python
25
+ import numpy as np
26
+
27
+ def haversine(lat1, lon1, lat2, lon2, R=6371000):
28
+ dlat = lat2 - lat1
29
+ dlon = lon2 - lon1
30
+ a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
31
+ return 2 * R * np.arcsin(np.sqrt(a))
32
+ ```
33
+
34
+ Input coordinates are in radians, output distances are in meters.
35
+
36
+ ## Input & Output Types
37
+
38
+ | Input Type | Output Type | Description |
39
+ | ---------- | ----------- | ----------------------- |
40
+ | `f64` | `f64` | 64-bit double precision |
41
+ | `f32` | `f32` | 32-bit single precision |
42
+
43
+ ## Optimizations
44
+
45
+ ### Trigonometric Polynomial Approximations
46
+
47
+ `nk_haversine_f32_haswell`, `nk_haversine_f32_skylake`, `nk_haversine_f32_neon` replace `libm` sin, cos, atan2, and asin with SIMD polynomial approximations achieving approximately 3.5 ULP accuracy.
48
+ Range reduction maps the input angle to $[-\pi/4, \pi/4]$ using Cody-Waite extended precision constants, then an odd-degree minimax polynomial evaluates sin and an even-degree polynomial evaluates cos.
49
+ The `f32` kernels use 5-term polynomials while `f64` kernels use 11-term polynomials for the extra precision required by double-precision inputs.
50
+ This avoids the latency of scalar `libm` calls — each trigonometric evaluation would otherwise serialize through a single execution port, while the polynomial chains pipeline across multiple FMA units.
51
+
52
+ ### Vincenty Iterative Convergence with Masked Lanes
53
+
54
+ `nk_vincenty_f64_haswell`, `nk_vincenty_f64_skylake`, `nk_vincenty_f64_neon` implement the full Vincenty inverse formula with up to 100 iterations and a convergence threshold of $10^{-12}$ radians (approximately 6 micrometers on Earth's surface).
55
+ Each SIMD lane may converge at a different iteration count, so the kernel accumulates a `converged_mask` via `_mm256_or_pd(converged_mask, newly_converged)` and selectively freezes converged lanes with `_mm256_blendv_pd(lambda_new, lambda, converged_mask)`.
56
+ Early exit uses `_mm256_movemask_pd` for `f64` (4 lane bits) or `_mm256_movemask_ps` for `f32` (8 lane bits) — once every lane's bit is set, the loop breaks.
57
+ Coincident points and equatorial edge cases are handled by blending safe values (ones) into the intermediate terms to avoid division by zero, without requiring branches that would diverge across SIMD lanes.
58
+
59
+ ### Haversine Without Final Arc Conversion
60
+
61
+ `nk_haversine_f32_haswell`, `nk_haversine_f64_haswell` support a similarity mode where the haversine formula involves $2R \cdot \text{asin}(\sqrt{h})$ and the intermediate value $h = \sin^2(\Delta\phi/2) + \cos\phi_1 \cos\phi_2 \cdot \sin^2(\Delta\lambda/2)$ is monotonic with distance.
62
+ For ranking and comparison use cases, comparing $h$ values directly produces the same ordering as comparing full Haversine distances, since both asin and sqrt are monotonically increasing.
63
+ This eliminates the two most expensive operations in the pipeline.
64
+ The kernels compute the full distance by default, but the streaming API can optionally skip the final conversion when only relative ordering is needed.
65
+
66
+ ## Performance
67
+
68
+ The following performance tables are produced by manually re-running the `nk_test` and `nk_bench` internal tools to measure both accuracy and throughput at different input shapes.
69
+ The input size is controlled by the `NK_MAX_COORD_ANGLE` environment variable and set to ≤1°, ≤30°, and ≤180° maximum angular separation between pairs of coordinates.
70
+ The larger the angular separation between pairs, the longer the algorithm may take to converge and the higher the error.
71
+ The throughput is measured in MP/s — millions of pairwise point distances computed per second — amortized over a large batch size, with `NK_DENSE_DIMENSIONS=1536` by default.
72
+ Current `nk_test` output reports geospatial accuracy in two forms: mean/max absolute error in meters against Vincenty's formula computed at double-double (f118) precision, and mean/max ULP against the matching high-precision implementation of the same formula. The historical tables below use the meter-based summary where it has been remeasured; older x86 rows still retain their original ULP figures until rerun.
73
+ Each kernel runs for at least 20 seconds per configuration.
74
+ Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
75
+ Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
76
+
77
+ ### Intel Sapphire Rapids
78
+
79
+ #### Native
80
+
81
+ | Kernel | ≤1° | ≤30° | ≤180° |
82
+ | :------------------------- | -----------------------: | -----------------------: | -----------------------: |
83
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
84
+ | `nk_haversine_f64_serial` | 1.95 mp/s, 0.8 ulp | 2.10 mp/s, 0.8 ulp | 2.02 mp/s, 1.7 ulp |
85
+ | `nk_vincenty_f64_serial` | 0.565 mp/s, 82 ulp | 0.481 mp/s, 3.9 ulp | 0.514 mp/s, 1.1 ulp |
86
+ | `nk_haversine_f64_haswell` | 73.3 mp/s, 0.6 ulp | 68.3 mp/s, 0.6 ulp | 70.4 mp/s, 1.5 ulp |
87
+ | `nk_vincenty_f64_haswell` | 12.2 mp/s, 80 ulp | 10.2 mp/s, 3.6 ulp | 7.15 mp/s, 1.1 ulp |
88
+ | `nk_haversine_f64_skylake` | 106 mp/s, 0.6 ulp | 107 mp/s, 0.6 ulp | 99.8 mp/s, 1.5 ulp |
89
+ | `nk_vincenty_f64_skylake` | 20.4 mp/s, 171K ulp | 17.5 mp/s, 6.57K ulp | 11.2 mp/s, 1.02K ulp |
90
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
91
+ | `nk_haversine_f32_serial` | 56.2 mp/s, 3.4 ulp | 62.3 mp/s, 2.9 ulp | 57.2 mp/s, 55 ulp |
92
+ | `nk_vincenty_f32_serial` | 3.25 mp/s, 58.3K ulp | 2.39 mp/s, 306 ulp | 1.79 mp/s, 103 ulp |
93
+ | `nk_haversine_f32_haswell` | 247 mp/s, 3.2 ulp | 282 mp/s, 2.7 ulp | 281 mp/s, 54 ulp |
94
+ | `nk_vincenty_f32_haswell` | 53.6 mp/s, 26.2K ulp | 46.4 mp/s, 289 ulp | 16.5 mp/s, 61 ulp |
95
+ | `nk_haversine_f32_skylake` | 350 mp/s, 3.1 ulp | 328 mp/s, 2.7 ulp | 356 mp/s, 53 ulp |
96
+ | `nk_vincenty_f32_skylake` | 78.7 mp/s, 7.16K ulp | 73.6 mp/s, 406 ulp | 20.1 mp/s, 105 ulp |
97
+
98
+ #### WASM
99
+
100
+ Measured with Wasmtime v42 (Cranelift backend).
101
+
102
+ | Kernel | ≤1° | ≤30° | ≤180° |
103
+ | :----------------------------- | -----------------------: | -----------------------: | -----------------------: |
104
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
105
+ | `nk_haversine_f64_serial` | ? mp/s, 0.9 ulp | ? mp/s, 0.9 ulp | ? mp/s, 1.8 ulp |
106
+ | `nk_vincenty_f64_serial` | ? mp/s, 102 ulp | ? mp/s, 3.7 ulp | ? mp/s, 1.1 ulp |
107
+ | `nk_haversine_f64_v128relaxed` | ? mp/s, 0.6 ulp | ? mp/s, 0.6 ulp | ? mp/s, 1.7 ulp |
108
+ | `nk_vincenty_f64_v128relaxed` | ? mp/s, 104 ulp | ? mp/s, 3.4 ulp | ? mp/s, 1.1 ulp |
109
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
110
+ | `nk_haversine_f32_serial` | ? mp/s, 3.5 ulp | ? mp/s, 2.9 ulp | ? mp/s, 53.6 ulp |
111
+ | `nk_vincenty_f32_serial` | ? mp/s, 70.5K ulp | ? mp/s, 326 ulp | ? mp/s, 65.5 ulp |
112
+ | `nk_haversine_f32_v128relaxed` | ? mp/s, 6.5 ulp | ? mp/s, 5.6 ulp | ? mp/s, 53.3 ulp |
113
+ | `nk_vincenty_f32_v128relaxed` | ? mp/s, 23.8K ulp | ? mp/s, 323 ulp | ? mp/s, 64.0 ulp |
114
+
115
+ ### Apple M4
116
+
117
+ #### Native
118
+
119
+ | Kernel | ≤1° | ≤30° | ≤180° |
120
+ | :------------------------ | -----------------------: | -----------------------: | -----------------------: |
121
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
122
+ | `nk_haversine_f64_serial` | 3.47 mp/s, 1.12 km | 3.47 mp/s, 32.8 km | 3.48 mp/s, 150 km |
123
+ | `nk_vincenty_f64_serial` | 0.888 mp/s, 2.20 nm | 0.770 mp/s, 2.79 nm | 0.662 mp/s, 622 nm |
124
+ | `nk_haversine_f64_neon` | 72.8 mp/s, 1.12 km | 72.5 mp/s, 32.8 km | 72.8 mp/s, 150 km |
125
+ | `nk_vincenty_f64_neon` | 9.34 mp/s, 2.12 nm | 7.61 mp/s, 2.33 nm | 5.99 mp/s, 622 nm |
126
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
127
+ | `nk_haversine_f32_serial` | 14.1 mp/s, 1.12 km | 13.8 mp/s, 32.8 km | 14.0 mp/s, 146 km |
128
+ | `nk_vincenty_f32_serial` | 4.54 mp/s, 12.0 m | 3.25 mp/s, 12.5 m | 2.42 mp/s, 22.0 m |
129
+ | `nk_haversine_f32_neon` | 247 mp/s, 1.12 km | 235 mp/s, 32.8 km | 252 mp/s, 146 km |
130
+ | `nk_vincenty_f32_neon` | 45.7 mp/s, 12.2 m | 37.9 mp/s, 12.8 m | 15.7 mp/s, 22.0 m |
131
+
132
+ #### WASM
133
+
134
+ Measured with Wasmtime v42 (Cranelift backend).
135
+
136
+ | Kernel | ≤1° | ≤30° | ≤180° |
137
+ | :----------------------------- | -----------------------: | -----------------------: | -----------------------: |
138
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
139
+ | `nk_haversine_f64_serial` | 1.84 mp/s, 1.12 km | 1.83 mp/s, 32.8 km | 1.99 mp/s, 148 km |
140
+ | `nk_vincenty_f64_serial` | 0.481 mp/s, 1.86 nm | 0.419 mp/s, 2.33 nm | 0.422 mp/s, 594 nm |
141
+ | `nk_haversine_f64_v128relaxed` | 35.7 mp/s, 1.12 km | 35.9 mp/s, 32.8 km | 35.9 mp/s, 148 km |
142
+ | `nk_vincenty_f64_v128relaxed` | 4.19 mp/s, 1.89 nm | 3.57 mp/s, 2.33 nm | 2.94 mp/s, 594 nm |
143
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
144
+ | `nk_haversine_f32_serial` | 6.74 mp/s, 20,000 km | 6.80 mp/s, 32.7 km | 7.34 mp/s, 136 km |
145
+ | `nk_vincenty_f32_serial` | 2.25 mp/s, 20,000 km | 1.65 mp/s, 12.0 m | 1.35 mp/s, 22.0 m |
146
+ | `nk_haversine_f32_v128relaxed` | 161 mp/s, 20,000 km | 165 mp/s, 32.7 km | 165 mp/s, 153 km |
147
+ | `nk_vincenty_f32_v128relaxed` | 24.6 mp/s, 12.0 m | 20.5 mp/s, 16.2 m | 9.57 mp/s, 18.0 m |