numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,180 @@
1
+ /**
2
+ * @brief SIMD-accelerated Type Conversions for WASM.
3
+ * @file include/numkong/cast/v128relaxed.h
4
+ */
5
+
6
+ #ifndef NK_CAST_V128RELAXED_H
7
+ #define NK_CAST_V128RELAXED_H
8
+
9
+ #if NK_TARGET_V128RELAXED
10
+
11
+ #include "numkong/types.h"
12
+ #include "numkong/cast/serial.h"
13
+
14
+ #if defined(__cplusplus)
15
+ extern "C" {
16
+ #endif
17
+
18
+ #if defined(__clang__)
19
+ #pragma clang attribute push(__attribute__((target("relaxed-simd"))), apply_to = function)
20
+ #endif
21
+
22
+ /** @brief Native WASM SIMD 128-bit load: a single `wasm_v128_load` from `src` into `dst->v128`. */
23
+ NK_INTERNAL void nk_load_b128_v128relaxed_(void const *src, nk_b128_vec_t *dst) { dst->v128 = wasm_v128_load(src); }
24
+ /** @brief Native WASM SIMD 256-bit load using two v128 loads: `src` fills `dst->v128s[0]`, `src + 16` bytes fills `dst->v128s[1]`. */
25
+ NK_INTERNAL void nk_load_b256_v128relaxed_(void const *src, nk_b256_vec_t *dst) {
26
+ dst->v128s[0] = wasm_v128_load(src); // low 128 bits
27
+ dst->v128s[1] = wasm_v128_load((char const *)src + 16); // high 128 bits, 16 bytes further on
28
+ }
29
+ /** @brief Native WASM SIMD 128-bit store: a single `wasm_v128_store` of `src->v128` to `dst`. */
30
+ NK_INTERNAL void nk_store_b128_v128relaxed_(nk_b128_vec_t const *src, void *dst) { wasm_v128_store(dst, src->v128); }
31
+ /** @brief Native WASM SIMD 256-bit store using two v128 stores: `src->v128s[0]` goes to `dst`, `src->v128s[1]` to `dst + 16` bytes. */
32
+ NK_INTERNAL void nk_store_b256_v128relaxed_(nk_b256_vec_t const *src, void *dst) {
33
+ wasm_v128_store(dst, src->v128s[0]); // low 128 bits
34
+ wasm_v128_store((char *)dst + 16, src->v128s[1]); // high 128 bits, 16 bytes further on
35
+ }
36
+
37
+ /** @brief BF16→F32 for four lanes: BF16 is the upper 16 bits of F32, so zero-extend to u32 and shift left by 16. */
38
+ NK_INTERNAL nk_b128_vec_t nk_bf16x4_to_f32x4_v128relaxed_(nk_b64_vec_t bf16_vec) {
39
+ v128_t bf16_u16x4_in_u64 = wasm_v128_load64_zero(&bf16_vec.u64); // 64-bit load into the low half, upper half zeroed
40
+ v128_t bf16_u32x4_low = wasm_u32x4_extend_low_u16x8(bf16_u16x4_in_u64); // widen the four low u16 lanes to u32
41
+ nk_b128_vec_t result;
42
+ result.v128 = wasm_i32x4_shl(bf16_u32x4_low, 16); // move the BF16 bits into the top half of each 32-bit lane
43
+ return result;
44
+ }
45
+
46
+ /**
47
+ * @brief F16→F32 for four lanes: extract sign/exp/mantissa, rebias exponent (F16 bias=15, F32 bias=127, delta=112),
48
+ * widen mantissa from 10 to 23 bits. Early-exit when all lanes are normal (exp in [1,30]),
49
+ * skipping the expensive f32x4.convert_u32x4 needed for denormal FPU-based normalization.
50
+ * On the slow path zero, denormal and inf/NaN lanes are blended over the normal result; NaN mantissa bits are kept.
51
+ */
52
+ NK_INTERNAL nk_b128_vec_t nk_f16x4_to_f32x4_v128relaxed_(nk_b64_vec_t f16_vec) {
53
+ v128_t f16_u16x4_in_u64 = wasm_v128_load64_zero(&f16_vec.u64); // 64-bit load into the low half, upper half zeroed
54
+ v128_t f16_u32x4 = wasm_u32x4_extend_low_u16x8(f16_u16x4_in_u64); // widen the four low u16 lanes to u32
55
+
56
+ v128_t sign_u32x4 = wasm_v128_and(f16_u32x4, wasm_i32x4_splat(0x8000)); // bit 15
57
+ v128_t exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(f16_u32x4, 10), wasm_i32x4_splat(0x1F)); // bits 14-10
58
+ v128_t mant_u32x4 = wasm_v128_and(f16_u32x4, wasm_i32x4_splat(0x03FF)); // bits 9-0
59
+
60
+ v128_t sign_f32_u32x4 = wasm_i32x4_shl(sign_u32x4, 16); // shift sign to F32 bit 31
61
+
62
+ // Normal path: rebias exponent, widen mantissa
63
+ v128_t exp_rebiased_u32x4 = wasm_i32x4_add(exp_u32x4, wasm_i32x4_splat(112)); // 127 - 15 = 112
64
+ v128_t normal_exp_u32x4 = wasm_i32x4_shl(exp_rebiased_u32x4, 23); // F32 exponent field starts at bit 23
65
+ v128_t normal_mant_u32x4 = wasm_i32x4_shl(mant_u32x4, 13); // 23 - 10 = 13 extra mantissa bits
66
+ v128_t normal_bits_u32x4 = wasm_v128_or(sign_f32_u32x4, wasm_v128_or(normal_exp_u32x4, normal_mant_u32x4));
67
+
68
+ // Early exit: skip zero/denormal/inf/NaN handling when all lanes are normal
69
+ v128_t exp_zero_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(0));
70
+ v128_t exp_max_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(31));
71
+ v128_t exceptional_mask = wasm_v128_or(exp_zero_mask, exp_max_mask);
72
+ if (!wasm_v128_any_true(exceptional_mask)) {
73
+ nk_b128_vec_t result;
74
+ result.v128 = normal_bits_u32x4;
75
+ return result;
76
+ }
77
+
78
+ // Slow path: handle zero (exp=0, mant=0), denormal (exp=0, mant!=0), inf/NaN (exp=31)
79
+ v128_t zero_bits_u32x4 = sign_f32_u32x4; // signed zero: only the sign bit is set
80
+ v128_t inf_nan_bits_u32x4 = wasm_v128_or(
81
+ sign_f32_u32x4, wasm_v128_or(wasm_i32x4_splat(0x7F800000), wasm_i32x4_shl(mant_u32x4, 13))); // all-ones exponent, mantissa carried over
82
+
83
+ // Denormals: convert mantissa to f32 and multiply by 2^-24, letting the FPU normalize.
84
+ // This avoids a manual CLZ+shift loop. The f32x4.convert_u32x4 legalizes to a
85
+ // multi-instruction sequence on x86 (no native u32→f32 until AVX-512), which is why
86
+ // the early exit above is so valuable.
87
+ v128_t mant_f32x4 = wasm_f32x4_convert_u32x4(mant_u32x4);
88
+ v128_t denorm_normalized_f32x4 = wasm_f32x4_mul(mant_f32x4, wasm_f32x4_splat(0x1p-24f));
89
+ v128_t denorm_bits_u32x4 = wasm_v128_or(denorm_normalized_f32x4, sign_f32_u32x4); // re-apply the sign to the magnitude
90
+
91
+ v128_t mant_zero_mask = wasm_i32x4_eq(mant_u32x4, wasm_i32x4_splat(0));
92
+ v128_t is_zero_mask = wasm_v128_and(exp_zero_mask, mant_zero_mask);
93
+ v128_t is_denormal_mask = wasm_v128_andnot(exp_zero_mask, mant_zero_mask); // exp == 0 and mant != 0 (andnot(a, b) = a & ~b)
94
+
95
+ // Blend via relaxed_laneselect (1 instruction: vblendvps on x86, vs 3 for and/andn/or)
96
+ v128_t result_u32x4 = normal_bits_u32x4; // start from the normal result, then overwrite the exceptional lanes
97
+ result_u32x4 = wasm_i32x4_relaxed_laneselect(zero_bits_u32x4, result_u32x4, is_zero_mask);
98
+ result_u32x4 = wasm_i32x4_relaxed_laneselect(denorm_bits_u32x4, result_u32x4, is_denormal_mask);
99
+ result_u32x4 = wasm_i32x4_relaxed_laneselect(inf_nan_bits_u32x4, result_u32x4, exp_max_mask);
100
+
101
+ nk_b128_vec_t result;
102
+ result.v128 = result_u32x4;
103
+ return result;
104
+ }
104
+
105
+ /**
106
+ * @brief E4M3→F32 for four lanes: 4-bit exponent (bias=7→127, delta=120), 3-bit mantissa (shift by 20).
107
+ * Subnormal via FPU: mant * (1/512) = mant * 2^-9. NaN only at exp=15,mant=7; other exp=15 lanes take the normal path.
108
+ */
109
+ NK_INTERNAL nk_b128_vec_t nk_e4m3x4_to_f32x4_v128relaxed_(nk_b32_vec_t e4m3_vec) {
110
+ v128_t e4m3_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_v128_load32_zero(&e4m3_vec.u32))); // 4 bytes: u8 → u16 → u32
111
+ v128_t exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(e4m3_u32x4, 3), wasm_i32x4_splat(0x0F)); // bits 6-3
112
+ v128_t mant_u32x4 = wasm_v128_and(e4m3_u32x4, wasm_i32x4_splat(0x07)); // bits 2-0
113
+ v128_t sign_u32x4 = wasm_i32x4_shl(wasm_u32x4_shr(e4m3_u32x4, 7), 31); // bit 7 moved to F32 bit 31
114
+ v128_t f32_exp_u32x4 = wasm_i32x4_shl(wasm_i32x4_add(exp_u32x4, wasm_i32x4_splat(120)), 23); // 127 - 7 = 120
115
+ v128_t f32_mant_u32x4 = wasm_i32x4_shl(mant_u32x4, 20); // 23 - 3 = 20 extra mantissa bits
116
+ v128_t normal_bits_u32x4 = wasm_v128_or(sign_u32x4, wasm_v128_or(f32_exp_u32x4, f32_mant_u32x4));
117
+ v128_t subnorm_abs_f32x4 = wasm_f32x4_mul(wasm_f32x4_convert_u32x4(mant_u32x4), wasm_f32x4_splat(1.0f / 512.0f)); // computed for every lane, used only where exp == 0
118
+ v128_t subnorm_f32x4 = wasm_v128_or(subnorm_abs_f32x4, sign_u32x4); // re-apply the sign to the magnitude
119
+ v128_t exp_zero_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(0));
120
+ v128_t is_nan_mask = wasm_v128_and(wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(15)),
121
+ wasm_i32x4_eq(mant_u32x4, wasm_i32x4_splat(7)));
122
+ v128_t exceptional_mask = wasm_v128_or(exp_zero_mask, is_nan_mask);
123
+ if (!wasm_v128_any_true(exceptional_mask)) { // fast path: every lane is a normal number
124
+ nk_b128_vec_t result;
125
+ result.v128 = normal_bits_u32x4;
126
+ return result;
127
+ }
128
+ v128_t result_u32x4 = wasm_i32x4_relaxed_laneselect(subnorm_f32x4, normal_bits_u32x4, exp_zero_mask); // exp == 0 lanes (zero and subnormal) take the FPU result
129
+ if (wasm_v128_any_true(is_nan_mask)) { // build the NaN pattern only when some lane needs it
130
+ v128_t nan_bits = wasm_v128_or(sign_u32x4, wasm_i32x4_splat(0x7FC00000)); // quiet NaN pattern with the input sign
131
+ result_u32x4 = wasm_i32x4_relaxed_laneselect(nan_bits, result_u32x4, is_nan_mask);
132
+ }
133
+ nk_b128_vec_t result;
134
+ result.v128 = result_u32x4;
135
+ return result;
136
+ }
137
+
138
+ /**
139
+ * @brief E5M2→F32 for four lanes: same exponent encoding as F16 (5-bit, bias=15, delta=112), 2-bit mantissa (shift by 21).
140
+ * Subnormal via FPU: mant * (1/65536) = mant * 2^-16. Inf at exp=31,mant=0; NaN otherwise (emitted as 0x7FC00000 plus sign).
141
+ */
142
+ NK_INTERNAL nk_b128_vec_t nk_e5m2x4_to_f32x4_v128relaxed_(nk_b32_vec_t e5m2_vec) {
143
+ v128_t e5m2_u32x4 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(wasm_v128_load32_zero(&e5m2_vec.u32))); // 4 bytes: u8 → u16 → u32
144
+ v128_t exp_u32x4 = wasm_v128_and(wasm_u32x4_shr(e5m2_u32x4, 2), wasm_i32x4_splat(0x1F)); // bits 6-2
145
+ v128_t mant_u32x4 = wasm_v128_and(e5m2_u32x4, wasm_i32x4_splat(0x03)); // bits 1-0
146
+ v128_t sign_u32x4 = wasm_i32x4_shl(wasm_u32x4_shr(e5m2_u32x4, 7), 31); // bit 7 moved to F32 bit 31
147
+ v128_t f32_exp_u32x4 = wasm_i32x4_shl(wasm_i32x4_add(exp_u32x4, wasm_i32x4_splat(112)), 23); // 127 - 15 = 112
148
+ v128_t f32_mant_u32x4 = wasm_i32x4_shl(mant_u32x4, 21); // 23 - 2 = 21 extra mantissa bits
149
+ v128_t normal_bits_u32x4 = wasm_v128_or(sign_u32x4, wasm_v128_or(f32_exp_u32x4, f32_mant_u32x4));
150
+ v128_t subnorm_abs_f32x4 = wasm_f32x4_mul(wasm_f32x4_convert_u32x4(mant_u32x4), wasm_f32x4_splat(1.0f / 65536.0f)); // computed for every lane, used only where exp == 0
151
+ v128_t subnorm_f32x4 = wasm_v128_or(subnorm_abs_f32x4, sign_u32x4); // re-apply the sign to the magnitude
152
+ v128_t exp_zero_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(0));
153
+ v128_t exp_max_mask = wasm_i32x4_eq(exp_u32x4, wasm_i32x4_splat(31));
154
+ v128_t exceptional_mask = wasm_v128_or(exp_zero_mask, exp_max_mask);
155
+ if (!wasm_v128_any_true(exceptional_mask)) { // fast path: every lane is a normal number
156
+ nk_b128_vec_t result;
157
+ result.v128 = normal_bits_u32x4;
158
+ return result;
159
+ }
160
+ v128_t result_u32x4 = wasm_i32x4_relaxed_laneselect(subnorm_f32x4, normal_bits_u32x4, exp_zero_mask); // exp == 0 lanes (zero and subnormal) take the FPU result
161
+ v128_t mant_zero_mask = wasm_i32x4_eq(mant_u32x4, wasm_i32x4_splat(0));
162
+ v128_t inf_bits_u32x4 = wasm_v128_or(sign_u32x4, wasm_i32x4_splat(0x7F800000)); // signed infinity
163
+ v128_t nan_bits_u32x4 = wasm_v128_or(sign_u32x4, wasm_i32x4_splat(0x7FC00000)); // quiet NaN pattern; the E5M2 mantissa bits are not carried over
164
+ v128_t special_bits_u32x4 = wasm_i32x4_relaxed_laneselect(inf_bits_u32x4, nan_bits_u32x4, mant_zero_mask); // mant == 0 → inf, otherwise NaN
165
+ result_u32x4 = wasm_i32x4_relaxed_laneselect(special_bits_u32x4, result_u32x4, exp_max_mask); // applied only to exp == 31 lanes
166
+ nk_b128_vec_t result;
167
+ result.v128 = result_u32x4;
168
+ return result;
169
+ }
170
+
171
+ #if defined(__clang__)
172
+ #pragma clang attribute pop
173
+ #endif
174
+
175
+ #if defined(__cplusplus)
176
+ } // extern "C"
177
+ #endif
178
+
179
+ #endif // NK_TARGET_V128RELAXED
180
+ #endif // NK_CAST_V128RELAXED_H
@@ -0,0 +1,230 @@
1
+ /**
2
+ * @brief SIMD-accelerated Type Conversions.
3
+ * @file include/numkong/cast.h
4
+ * @author Ash Vardanian
5
+ * @date January 2, 2026
6
+ *
7
+ * This file focuses on numeric types not uniformly supported across platforms, prioritizing:
8
+ *
9
+ * - `e5m2` & `e4m3` ↔ `f16` & `bf16` - used for low-precision dot-products on modern CPUs,
10
+ * - `e5m2` & `e4m3` ↔ `f32` - used for low-precision dot-products on older CPUs,
11
+ * - `f16` & `bf16` ↔ `f32` - often used for half-precision dot-products on older CPUs,
12
+ *
13
+ * Unlike most operation classes in NumKong, these are dependent on two input types: "from" & "to".
14
+ * It contains scalar helpers named like `nk_f16_to_f32_serial_` as well as buffer-to-buffer
15
+ * `memcpy`-like vectorized operations, such as `nk_cast_f16_to_f32` with `nk_cast_f16_to_f32_serial`,
16
+ * `nk_cast_f16_to_f32_neon`, `nk_cast_f16_to_f32_skylake`, and other platform-specific variants.
17
+ *
18
+ * It also includes "partial load" and "partial store" type-punned helper functions for handling
19
+ * IO between memory and registers, that are extensively reused in reductions, elementwise ops, and
20
+ * dot-products.
21
+ *
22
+ * Float-format narrowing uses round-to-nearest, ties-to-even. Float-to-integer narrowing follows
23
+ * the same tie rule, saturates infinities, and maps NaNs to zero.
24
+ *
25
+ * Given the overall breadth and sparsity of our type system, it's clear that not all type conversions
26
+ * have equivalent relevance. With the 21 numeric types listed below we'd be looking at 21x21=441 conversions for:
27
+ *
28
+ * e4m3 e5m2 bf16 f16 f32 f64
29
+ * bf16c f16c f32c f64c
30
+ * i4 i8 i16 i32 i64
31
+ * u1 u4 u8 u16 u32 u64
32
+ *
33
+ * To simplify the design and make it more broadly applicable in AI workloads, we implement a slower
34
+ * @b "hub-and-spoke" design, guiding most conversions through an intermediate type, like `f64` or `i64`.
35
+ *
36
+ */
37
+ #ifndef NK_CAST_H
38
+ #define NK_CAST_H
39
+
40
+ #include "numkong/types.h"
41
+
42
+ #if defined(__cplusplus)
43
+ extern "C" {
44
+ #endif
45
+
46
+ /**
47
+ * @brief Elementwise type-casting for arrays of entries.
48
+ *
49
+ * @param[in] from The immutable input source array containing `n` elements of `from_type` type.
50
+ * @param[in] from_type The type of elements in the immutable source array.
51
+ * @param[in] n The number of elements in both input and output arrays.
52
+ * @param[out] to The mutable output array that receives `n` elements of `to_type` type.
53
+ * @param[in] to_type The type of elements in the mutable target array.
54
+ */
55
+ NK_DYNAMIC void nk_cast(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
56
+
57
+ /** @copydoc nk_cast */
58
+ NK_PUBLIC void nk_cast_serial(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
59
+
60
+ /** @brief Scalar conversion from f16 to f32. */
61
+ NK_DYNAMIC void nk_f16_to_f32(nk_f16_t const *src, nk_f32_t *dest);
62
+ /** @brief Scalar conversion from bf16 to f32. */
63
+ NK_DYNAMIC void nk_bf16_to_f32(nk_bf16_t const *src, nk_f32_t *dest);
64
+ /** @brief Scalar conversion from e4m3 to f32. */
65
+ NK_DYNAMIC void nk_e4m3_to_f32(nk_e4m3_t const *src, nk_f32_t *dest);
66
+ /** @brief Scalar conversion from e5m2 to f32. */
67
+ NK_DYNAMIC void nk_e5m2_to_f32(nk_e5m2_t const *src, nk_f32_t *dest);
68
+ /** @brief Scalar conversion from e2m3 to f32. */
69
+ NK_DYNAMIC void nk_e2m3_to_f32(nk_e2m3_t const *src, nk_f32_t *dest);
70
+ /** @brief Scalar conversion from e3m2 to f32. */
71
+ NK_DYNAMIC void nk_e3m2_to_f32(nk_e3m2_t const *src, nk_f32_t *dest);
72
+
73
+ /** @brief Scalar conversion from f32 to f16. */
74
+ NK_DYNAMIC void nk_f32_to_f16(nk_f32_t const *src, nk_f16_t *dest);
75
+ /** @brief Scalar conversion from f32 to bf16. */
76
+ NK_DYNAMIC void nk_f32_to_bf16(nk_f32_t const *src, nk_bf16_t *dest);
77
+ /** @brief Scalar conversion from f32 to e4m3. */
78
+ NK_DYNAMIC void nk_f32_to_e4m3(nk_f32_t const *src, nk_e4m3_t *dest);
79
+ /** @brief Scalar conversion from f32 to e5m2. */
80
+ NK_DYNAMIC void nk_f32_to_e5m2(nk_f32_t const *src, nk_e5m2_t *dest);
81
+ /** @brief Scalar conversion from f32 to e2m3. */
82
+ NK_DYNAMIC void nk_f32_to_e2m3(nk_f32_t const *src, nk_e2m3_t *dest);
83
+ /** @brief Scalar conversion from f32 to e3m2. */
84
+ NK_DYNAMIC void nk_f32_to_e3m2(nk_f32_t const *src, nk_e3m2_t *dest);
85
+
86
+ /** @copydoc nk_f16_to_f32 */
87
+ NK_PUBLIC void nk_f16_to_f32_serial(nk_f16_t const *src, nk_f32_t *dest);
88
+ /** @copydoc nk_f32_to_f16 */
89
+ NK_PUBLIC void nk_f32_to_f16_serial(nk_f32_t const *src, nk_f16_t *dest);
90
+ /** @copydoc nk_bf16_to_f32 */
91
+ NK_PUBLIC void nk_bf16_to_f32_serial(nk_bf16_t const *src, nk_f32_t *dest);
92
+ /** @copydoc nk_f32_to_bf16 */
93
+ NK_PUBLIC void nk_f32_to_bf16_serial(nk_f32_t const *src, nk_bf16_t *dest);
94
+ /** @copydoc nk_e4m3_to_f32 */
95
+ NK_PUBLIC void nk_e4m3_to_f32_serial(nk_e4m3_t const *src, nk_f32_t *dest);
96
+ /** @copydoc nk_f32_to_e4m3 */
97
+ NK_PUBLIC void nk_f32_to_e4m3_serial(nk_f32_t const *src, nk_e4m3_t *dest);
98
+ /** @copydoc nk_e5m2_to_f32 */
99
+ NK_PUBLIC void nk_e5m2_to_f32_serial(nk_e5m2_t const *src, nk_f32_t *dest);
100
+ /** @copydoc nk_f32_to_e5m2 */
101
+ NK_PUBLIC void nk_f32_to_e5m2_serial(nk_f32_t const *src, nk_e5m2_t *dest);
102
+ /** @copydoc nk_e2m3_to_f32 */
103
+ NK_PUBLIC void nk_e2m3_to_f32_serial(nk_e2m3_t const *src, nk_f32_t *dest);
104
+ /** @copydoc nk_f32_to_e2m3 */
105
+ NK_PUBLIC void nk_f32_to_e2m3_serial(nk_f32_t const *src, nk_e2m3_t *dest);
106
+ /** @copydoc nk_e3m2_to_f32 */
107
+ NK_PUBLIC void nk_e3m2_to_f32_serial(nk_e3m2_t const *src, nk_f32_t *dest);
108
+ /** @copydoc nk_f32_to_e3m2 */
109
+ NK_PUBLIC void nk_f32_to_e3m2_serial(nk_f32_t const *src, nk_e3m2_t *dest);
110
+
111
+ #if NK_TARGET_NEON
112
+ /** @copydoc nk_cast */
113
+ NK_PUBLIC void nk_cast_neon(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
114
+ /** @copydoc nk_f16_to_f32 */
115
+ NK_PUBLIC void nk_f16_to_f32_neon(nk_f16_t const *src, nk_f32_t *dest);
116
+ /** @copydoc nk_f32_to_f16 */
117
+ NK_PUBLIC void nk_f32_to_f16_neon(nk_f32_t const *src, nk_f16_t *dest);
118
+ #endif // NK_TARGET_NEON
119
+
120
+ #if NK_TARGET_HASWELL
121
+ /** @copydoc nk_cast */
122
+ NK_PUBLIC void nk_cast_haswell(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
123
+ /** @copydoc nk_f16_to_f32 */
124
+ NK_PUBLIC void nk_f16_to_f32_haswell(nk_f16_t const *src, nk_f32_t *dest);
125
+ /** @copydoc nk_f32_to_f16 */
126
+ NK_PUBLIC void nk_f32_to_f16_haswell(nk_f32_t const *src, nk_f16_t *dest);
127
+ #endif // NK_TARGET_HASWELL
128
+
129
+ #if NK_TARGET_SKYLAKE
130
+ /** @copydoc nk_cast */
131
+ NK_PUBLIC void nk_cast_skylake(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
132
+ #endif // NK_TARGET_SKYLAKE
133
+
134
+ #if NK_TARGET_ICELAKE
135
+ /** @copydoc nk_cast */
136
+ NK_PUBLIC void nk_cast_icelake(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
137
+ #endif // NK_TARGET_ICELAKE
138
+
139
+ #if NK_TARGET_SAPPHIRE
140
+ /** @copydoc nk_cast */
141
+ NK_PUBLIC void nk_cast_sapphire(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
142
+ /** @copydoc nk_f16_to_f32 */
143
+ NK_PUBLIC void nk_f16_to_f32_sapphire(nk_f16_t const *src, nk_f32_t *dest);
144
+ /** @copydoc nk_f32_to_f16 */
145
+ NK_PUBLIC void nk_f32_to_f16_sapphire(nk_f32_t const *src, nk_f16_t *dest);
146
+ #endif // NK_TARGET_SAPPHIRE
147
+
148
+ #if NK_TARGET_RVV
149
+ /** @copydoc nk_cast */
150
+ NK_PUBLIC void nk_cast_rvv(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
151
+ #endif // NK_TARGET_RVV
152
+
153
+ #if defined(__cplusplus)
154
+ } // extern "C"
155
+ #endif
156
+
157
+ #include "numkong/cast/serial.h"
158
+ #include "numkong/cast/neon.h"
159
+ #include "numkong/cast/haswell.h"
160
+ #include "numkong/cast/skylake.h"
161
+ #include "numkong/cast/icelake.h"
162
+ #include "numkong/cast/sapphire.h"
163
+ #include "numkong/cast/rvv.h"
164
+
165
+ #if defined(__cplusplus)
166
+ extern "C" {
167
+ #endif
168
+
169
+ #if !NK_DYNAMIC_DISPATCH
170
+
171
+ NK_PUBLIC void nk_cast(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type) { // Static dispatch: the preprocessor keeps exactly one of the calls below.
172
+ #if NK_TARGET_SAPPHIRE // Checked first; each later target is only considered when all earlier macros are off.
173
+ nk_cast_sapphire(from, from_type, n, to, to_type);
174
+ #elif NK_TARGET_ICELAKE
175
+ nk_cast_icelake(from, from_type, n, to, to_type);
176
+ #elif NK_TARGET_SKYLAKE
177
+ nk_cast_skylake(from, from_type, n, to, to_type);
178
+ #elif NK_TARGET_HASWELL
179
+ nk_cast_haswell(from, from_type, n, to, to_type);
180
+ #elif NK_TARGET_RVV
181
+ nk_cast_rvv(from, from_type, n, to, to_type);
182
+ #elif NK_TARGET_NEON
183
+ nk_cast_neon(from, from_type, n, to, to_type);
184
+ #else // None of the NK_TARGET_* macros above is enabled: use the serial implementation.
185
+ nk_cast_serial(from, from_type, n, to, to_type);
186
+ #endif
187
+ }
188
+
189
+ NK_PUBLIC void nk_f16_to_f32(nk_f16_t const *src, nk_f32_t *dest) { // Static dispatch: forwards src/dest unchanged to one backend chosen at compile time.
190
+ #if NK_TARGET_SAPPHIRE // Only Sapphire, Haswell and NEON are checked here; any other configuration takes the serial branch.
191
+ nk_f16_to_f32_sapphire(src, dest);
192
+ #elif NK_TARGET_HASWELL
193
+ nk_f16_to_f32_haswell(src, dest);
194
+ #elif NK_TARGET_NEON
195
+ nk_f16_to_f32_neon(src, dest);
196
+ #else // Serial fallback.
197
+ nk_f16_to_f32_serial(src, dest);
198
+ #endif
199
+ }
200
+
201
+ NK_PUBLIC void nk_f32_to_f16(nk_f32_t const *src, nk_f16_t *dest) { // Static dispatch: forwards src/dest unchanged to one backend chosen at compile time.
202
+ #if NK_TARGET_SAPPHIRE // Same branch order as nk_f16_to_f32: Sapphire, then Haswell, then NEON.
203
+ nk_f32_to_f16_sapphire(src, dest);
204
+ #elif NK_TARGET_HASWELL
205
+ nk_f32_to_f16_haswell(src, dest);
206
+ #elif NK_TARGET_NEON
207
+ nk_f32_to_f16_neon(src, dest);
208
+ #else // Serial fallback.
209
+ nk_f32_to_f16_serial(src, dest);
210
+ #endif
211
+ }
212
+
213
+ NK_PUBLIC void nk_bf16_to_f32(nk_bf16_t const *src, nk_f32_t *dest) { nk_bf16_to_f32_serial(src, dest); } // This and the wrappers below have no NK_TARGET_* branch: each forwards directly to its _serial counterpart.
214
+ NK_PUBLIC void nk_f32_to_bf16(nk_f32_t const *src, nk_bf16_t *dest) { nk_f32_to_bf16_serial(src, dest); }
215
+ NK_PUBLIC void nk_e4m3_to_f32(nk_e4m3_t const *src, nk_f32_t *dest) { nk_e4m3_to_f32_serial(src, dest); }
216
+ NK_PUBLIC void nk_f32_to_e4m3(nk_f32_t const *src, nk_e4m3_t *dest) { nk_f32_to_e4m3_serial(src, dest); }
217
+ NK_PUBLIC void nk_e5m2_to_f32(nk_e5m2_t const *src, nk_f32_t *dest) { nk_e5m2_to_f32_serial(src, dest); }
218
+ NK_PUBLIC void nk_f32_to_e5m2(nk_f32_t const *src, nk_e5m2_t *dest) { nk_f32_to_e5m2_serial(src, dest); }
219
+ NK_PUBLIC void nk_e2m3_to_f32(nk_e2m3_t const *src, nk_f32_t *dest) { nk_e2m3_to_f32_serial(src, dest); }
220
+ NK_PUBLIC void nk_f32_to_e2m3(nk_f32_t const *src, nk_e2m3_t *dest) { nk_f32_to_e2m3_serial(src, dest); }
221
+ NK_PUBLIC void nk_e3m2_to_f32(nk_e3m2_t const *src, nk_f32_t *dest) { nk_e3m2_to_f32_serial(src, dest); }
222
+ NK_PUBLIC void nk_f32_to_e3m2(nk_f32_t const *src, nk_e3m2_t *dest) { nk_f32_to_e3m2_serial(src, dest); }
223
+
224
+ #endif // !NK_DYNAMIC_DISPATCH
225
+
226
+ #if defined(__cplusplus)
227
+ } // extern "C"
228
+ #endif
229
+
230
+ #endif // NK_CAST_H
@@ -0,0 +1,223 @@
1
+ # Curved Space Distances in NumKong
2
+
3
+ NumKong implements distance functions for curved metric spaces: bilinear forms compute $a^T C b$ for an arbitrary metric tensor $C$, while Mahalanobis distance generalizes Euclidean distance to account for correlations between dimensions.
4
+ Complex bilinear forms extend this to Hermitian inner products.
5
+ These operations are central to Gaussian process inference, metric learning, and statistical distance measures.
6
+
7
+ The bilinear form for real vectors is:
8
+
9
+ ```math
10
+ \text{bilinear}(a, b, C) = a^T C b = \sum_{i=0}^{n-1} \sum_{j=0}^{n-1} a_i \cdot c_{ij} \cdot b_j
11
+ ```
12
+
13
+ The Mahalanobis distance is:
14
+
15
+ ```math
16
+ \text{mahalanobis}(a, b, C) = \sqrt{(a - b)^T C (a - b)}
17
+ ```
18
+
19
+ For complex vectors, the bilinear form uses the conjugate transpose:
20
+
21
+ ```math
22
+ \text{bilinear}(a, b, C) = a^H C b = \sum_{i=0}^{n-1} \sum_{j=0}^{n-1} \bar{a_i} \cdot c_{ij} \cdot b_j
23
+ ```
24
+
25
+ Reformulating as Python pseudocode:
26
+
27
+ ```python
28
+ import numpy as np
29
+
30
+ def bilinear(a: np.ndarray, b: np.ndarray, C: np.ndarray) -> float:
31
+ return a @ C @ b
32
+
33
+ def mahalanobis(a: np.ndarray, b: np.ndarray, C: np.ndarray) -> float:
34
+ diff = a - b
35
+ return np.sqrt(diff @ C @ diff)
36
+
37
+ def bilinear_complex(a: np.ndarray, b: np.ndarray, C: np.ndarray) -> complex:
38
+ return np.conj(a) @ C @ b
39
+ ```
40
+
41
+ ## Input & Output Types
42
+
43
+ Real bilinear and Mahalanobis:
44
+
45
+ | Input Type | Output Type | Description |
46
+ | ---------- | ----------- | ---------------------------------------------- |
47
+ | `f64` | `f64` | 64-bit IEEE 754 double precision |
48
+ | `f32` | `f32` | 32-bit IEEE 754 single precision |
49
+ | `f16` | `f32` | 16-bit IEEE 754 half precision, widened output |
50
+ | `bf16` | `f32` | 16-bit brain float, widened output |
51
+
52
+ Complex bilinear:
53
+
54
+ | Input Type | Output Type | Description |
55
+ | ---------- | ----------- | ------------------------------------------ |
56
+ | `f64c` | `f64c` | 64-bit complex pairs |
57
+ | `f32c` | `f32c` | 32-bit complex pairs |
58
+ | `f16c` | `f32c` | 16-bit complex pairs, widened output |
59
+ | `bf16c` | `f32c` | 16-bit brain complex pairs, widened output |
60
+
61
+ ## Optimizations
62
+
63
+ ### Row-Major Streaming with Nested Dot2
64
+
65
+ `nk_bilinear_f64_skylake`, `nk_mahalanobis_f64_skylake` decompose the bilinear form $a^T C b$ as $\sum_i a_i \cdot \text{dot}(C_i, b)$ where $C_i$ is the $i$-th row of the metric tensor.
66
+ Each inner dot product uses Dot2 compensation — TwoProd via FMA captures the rounding error of each $c_{ij} \cdot b_j$ product exactly, and a TwoSum chain propagates it through the accumulator.
67
+ The outer sum over rows uses a second level of compensation, tracking the rounding error of each $a_i \cdot r_i$ accumulation.
68
+ This nested structure gives $O(n)$ cache-friendly sequential access to the $n \times n$ matrix $C$, since each row is read once and discarded.
69
+ `nk_bilinear_f32_neon`, `nk_bilinear_f32_skylake`, `nk_mahalanobis_f32_neon`, `nk_mahalanobis_f32_skylake` use the same row-major streaming pattern but accumulate in `f64` instead of Dot2, which provides sufficient precision for `f32` inputs.
70
+
71
+ ### SME Outer-Product Accumulation
72
+
73
+ `nk_bilinear_f32_smef64`, `nk_bilinear_f64_smef64`, `nk_bilinear_f32c_smef64`, `nk_bilinear_f64c_smef64`, `nk_mahalanobis_f32_smef64`, `nk_mahalanobis_f64_smef64` use the Scalable Matrix Extension to compute the bilinear form as an outer-product accumulation.
74
+ Each `FMOPA` instruction performs a rank-1 update $a_i \cdot b^T$ into the SME ZA tile array, and the matrix $C$ is streamed row-by-row and multiplied into the accumulator.
75
+ This is fundamentally different from the row-major dot approach — it reformulates $a^T C b$ as a matrix-multiply problem where SME's 2D tile registers can exploit the matrix engine's throughput.
76
+ For dimensions that align to the tile size, this approach achieves near-peak throughput; dimensions that do not align fall back to NEON for cleanup of the residual elements.
77
+
78
+ ### Complex Bilinear Decomposition
79
+
80
+ `nk_bilinear_f32c_neon`, `nk_bilinear_f32c_skylake`, `nk_bilinear_f64c_skylake` compute $a^H C b$ where each element involves 4 real multiplications from the complex product $\bar{a_i} \cdot c_{ij} \cdot b_j$.
81
+ The kernel decomposes this into real and imaginary dot products over rows of $C$: for each row $i$, it computes the real part as $a_{i,re} \cdot \text{dot}(C_i, b)_{re} + a_{i,im} \cdot \text{dot}(C_i, b)_{im}$ and the imaginary part with the conjugation baked in as sign flips.
82
+ This fuses the conjugation of $a$ into the sign of the cross terms rather than explicitly negating the imaginary components, saving one negate operation per element.
83
+
84
+ ## Performance
85
+
86
+ The following performance tables are produced by manually re-running the bundled internal tools `nk_test` and `nk_bench` to measure both accuracy and throughput at different input shapes.
87
+ The input size is controlled by the `NK_CURVED_DIMENSIONS` environment variable.
88
+ The metric tensor is a square matrix of side $N$, so each bilinear form $a^T C b$ has $O(N^2)$ arithmetic complexity.
89
+ Columns show matrix side length: 256², 1024², 4096².
90
+ The throughput is measured in GSO/s as Giga Scalar Operations per Second.
91
+ Accuracy is reported as mean ULP (units in last place) averaged over all test pairs — the average number of representable floating-point values between the computed result and the exact answer.
92
+ Each kernel runs for at least 20 seconds per configuration.
93
+ Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
94
+ Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
95
+ Rows marked `🧩` use external BLAS baselines rather than NumKong kernels.
96
+
97
+ ### Intel Sapphire Rapids
98
+
99
+ #### Native
100
+
101
+ | Kernel | 256² | 1024² | 4096² |
102
+ | :---------------------------- | -----------------------: | -----------------------: | -----------------------: |
103
+ | __f64c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
104
+ | `bilinear_f64c_with_blas` 🧩 | 1.25 gso/s | 1.36 gso/s | 1.38 gso/s |
105
+ | `nk_bilinear_f64c_serial` | 0.0862 gso/s, 0.5 ulp | 0.161 gso/s, 0.2 ulp | 0.171 gso/s, 0.5 ulp |
106
+ | `nk_bilinear_f64c_skylake` | 0.583 gso/s, 3.5 ulp | 0.718 gso/s, 3.5 ulp | 0.765 gso/s, 3.5 ulp |
107
+ | __f32c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
108
+ | `bilinear_f32c_with_blas` 🧩 | 2.14 gso/s | 2.61 gso/s | 2.57 gso/s |
109
+ | `nk_bilinear_f32c_serial` | 0.756 gso/s, 0 ulp | 1.37 gso/s, 0 ulp | 1.37 gso/s, 0 ulp |
110
+ | `nk_bilinear_f32c_skylake` | 1.72 gso/s, 0 ulp | 1.75 gso/s, 0 ulp | 1.46 gso/s, 0 ulp |
111
+ | __bf16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
112
+ | `nk_bilinear_bf16c_serial` | 0.154 gso/s, 5 ulp | 0.158 gso/s, 5.8 ulp | 0.155 gso/s, 5 ulp |
113
+ | `nk_bilinear_bf16c_genoa` | 2.81 gso/s, 5 ulp | 4.57 gso/s, 5 ulp | 4.47 gso/s, 5 ulp |
114
+ | __f16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
115
+ | `nk_bilinear_f16c_serial` | 0.585 gso/s, 7.2 ulp | 0.592 gso/s, 7.2 ulp | 0.600 gso/s, 7.2 ulp |
116
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
117
+ | `bilinear_f64_with_blas` 🧩 | 2.84 gso/s | 3.23 gso/s | 3.14 gso/s |
118
+ | `nk_bilinear_f64_serial` | 0.291 gso/s, 0.7 ulp | 0.565 gso/s, 0.4 ulp | 0.577 gso/s, 0.7 ulp |
119
+ | `nk_mahalanobis_f64_serial` | 0.267 gso/s, 0 ulp | 0.537 gso/s, 0 ulp | 0.539 gso/s, 0 ulp |
120
+ | `nk_bilinear_f64_skylake` | 1.79 gso/s, 1.6 ulp | 1.71 gso/s, 1.3 ulp | 1.59 gso/s, 1 ulp |
121
+ | `nk_mahalanobis_f64_skylake` | 1.77 gso/s, 0 ulp | 1.82 gso/s, 0 ulp | 2.12 gso/s, 0.2 ulp |
122
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
123
+ | `bilinear_f32_with_blas` 🧩 | 4.09 gso/s | 5.61 gso/s | 6.59 gso/s |
124
+ | `nk_bilinear_f32_serial` | 1.19 gso/s, 0 ulp | 2.71 gso/s, 0 ulp | 2.68 gso/s, 0 ulp |
125
+ | `nk_mahalanobis_f32_serial` | 2.36 gso/s, 0 ulp | 2.53 gso/s, 0 ulp | 2.40 gso/s, 0 ulp |
126
+ | `nk_bilinear_f32_haswell` | 3.45 gso/s, 0 ulp | 3.66 gso/s, 0 ulp | 3.24 gso/s, 0 ulp |
127
+ | `nk_mahalanobis_f32_haswell` | 3.37 gso/s, 0 ulp | 3.28 gso/s, 0 ulp | 3.30 gso/s, 0 ulp |
128
+ | `nk_bilinear_f32_skylake` | 3.68 gso/s, 0 ulp | 3.08 gso/s, 0 ulp | 2.71 gso/s, 0 ulp |
129
+ | `nk_mahalanobis_f32_skylake` | 3.45 gso/s, 0 ulp | 2.94 gso/s, 0 ulp | 3.32 gso/s, 0 ulp |
130
+ | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
131
+ | `nk_bilinear_bf16_serial` | 0.321 gso/s, 16 ulp | 0.331 gso/s, 13 ulp | 0.314 gso/s, 12 ulp |
132
+ | `nk_mahalanobis_bf16_serial` | 0.216 gso/s, 2.2 ulp | 0.215 gso/s, 2.1 ulp | 0.211 gso/s, 2.3 ulp |
133
+ | `nk_bilinear_bf16_haswell` | 6.75 gso/s, 11 ulp | 7.04 gso/s, 13 ulp | 6.80 gso/s, 13 ulp |
134
+ | `nk_mahalanobis_bf16_haswell` | 5.93 gso/s, 1 ulp | 5.77 gso/s, 1 ulp | 5.86 gso/s, 1 ulp |
135
+ | `nk_bilinear_bf16_genoa` | 6.22 gso/s, 18 ulp | 10.9 gso/s, 18 ulp | 10.3 gso/s, 18 ulp |
136
+ | `nk_mahalanobis_bf16_genoa` | 7.04 gso/s, 8.55K ulp | 8.76 gso/s, 8.41K ulp | 8.57 gso/s, 8.41K ulp |
137
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
138
+ | `nk_bilinear_f16_serial` | 0.654 gso/s, 23 ulp | 0.652 gso/s, 23 ulp | 0.657 gso/s, 23 ulp |
139
+ | `nk_mahalanobis_f16_serial` | 0.510 gso/s, 2.7 ulp | 0.520 gso/s, 3.2 ulp | 0.500 gso/s, 2.7 ulp |
140
+ | `nk_bilinear_f16_haswell` | 7.36 gso/s, 37 ulp | 7.30 gso/s, 37 ulp | 7.29 gso/s, 37 ulp |
141
+ | `nk_mahalanobis_f16_haswell` | 6.75 gso/s, 1 ulp | 6.24 gso/s, 1 ulp | 6.83 gso/s, 1 ulp |
142
+
143
+ #### WASM
144
+
145
+ Measured with Wasmtime v42 (Cranelift backend).
146
+
147
+ | Kernel | 256² | 1024² | 4096² |
148
+ | :------------------------- | -----------------------: | -----------------------: | -----------------------: |
149
+ | __f64c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
150
+ | `nk_bilinear_f64c_serial` | 0.21 gso/s, 1.2 ulp | 0.21 gso/s, 1.2 ulp | 0.21 gso/s, 1.2 ulp |
151
+ | __f32c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
152
+ | `nk_bilinear_f32c_serial` | 1.10 gso/s, 0 ulp | 1.07 gso/s, 0 ulp | 1.10 gso/s, 0 ulp |
153
+ | __bf16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
154
+ | `nk_bilinear_bf16c_serial` | 1.26 gso/s, 9.8 ulp | 1.31 gso/s, 9.8 ulp | 1.27 gso/s, 9.5 ulp |
155
+ | __f16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
156
+ | `nk_bilinear_f16c_serial` | 0.40 gso/s, 39 ulp | 0.38 gso/s, 39 ulp | 0.40 gso/s, 39 ulp |
157
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
158
+ | `nk_bilinear_f64_serial` | 0.49 gso/s, 0.6 ulp | 0.49 gso/s, 0.6 ulp | 0.48 gso/s, 0.6 ulp |
159
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
160
+ | `nk_bilinear_f32_serial` | 2.54 gso/s, 0 ulp | 2.62 gso/s, 0 ulp | 2.53 gso/s, 0 ulp |
161
+ | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
162
+ | `nk_bilinear_bf16_serial` | 2.91 gso/s, 27 ulp | 2.90 gso/s, 22 ulp | 2.98 gso/s, 22 ulp |
163
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
164
+ | `nk_bilinear_f16_serial` | 0.76 gso/s, 74 ulp | 0.76 gso/s, 74 ulp | 0.78 gso/s, 74 ulp |
165
+
166
+ ### Apple M4
167
+
168
+ #### Native
169
+
170
+ | Kernel | 256² | 1024² | 4096² |
171
+ | :------------------------------ | -----------------------: | -----------------------: | -----------------------: |
172
+ | __f64c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
173
+ | `nk_bilinear_f64c_serial` | 0.368 gso/s, 2.2 ulp | 0.371 gso/s, 2.2 ulp | 0.367 gso/s, 2.2 ulp |
174
+ | __f32c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
175
+ | `nk_bilinear_f32c_serial` | 2.33 gso/s, 0 ulp | 2.27 gso/s, 0 ulp | 2.28 gso/s, 0 ulp |
176
+ | `nk_bilinear_f32c_neon` | 2.11 gso/s, 0 ulp | 1.89 gso/s, 0 ulp | 1.85 gso/s, 0 ulp |
177
+ | __bf16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
178
+ | `nk_bilinear_bf16c_serial` | 2.83 gso/s, 33.0 ulp | 2.54 gso/s, 34.5 ulp | 2.49 gso/s, 34.5 ulp |
179
+ | `nk_bilinear_bf16c_neonbfdot` | 5.05 gso/s, 17.0 ulp | 4.20 gso/s, 17.0 ulp | 4.04 gso/s, 17.0 ulp |
180
+ | __f16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
181
+ | `nk_bilinear_f16c_serial` | 2.81 gso/s, 51.8 ulp | 2.54 gso/s, 51.8 ulp | 2.48 gso/s, 51.8 ulp |
182
+ | `nk_bilinear_f16c_neonhalf` | 5.00 gso/s, 17.3 ulp | 4.16 gso/s, 17.3 ulp | 4.00 gso/s, 16.4 ulp |
183
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
184
+ | `nk_bilinear_f64_serial` | 0.717 gso/s, 0.4 ulp | 0.711 gso/s, 0.4 ulp | 0.721 gso/s, 0.4 ulp |
185
+ | `nk_mahalanobis_f64_serial` | 0.664 gso/s, 0.5 ulp | 0.667 gso/s, 0.5 ulp | 0.672 gso/s, 0.5 ulp |
186
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
187
+ | `nk_bilinear_f32_serial` | 3.92 gso/s, 0 ulp | 3.05 gso/s, 0 ulp | 2.87 gso/s, 0 ulp |
188
+ | `nk_mahalanobis_f32_serial` | 3.42 gso/s, 0 ulp | 2.88 gso/s, 0 ulp | 2.74 gso/s, 0 ulp |
189
+ | `nk_bilinear_f32_neon` | 4.90 gso/s, 0 ulp | 3.82 gso/s, 0 ulp | 3.49 gso/s, 0 ulp |
190
+ | `nk_mahalanobis_f32_neon` | 4.68 gso/s, 0 ulp | 3.71 gso/s, 0 ulp | 3.48 gso/s, 0 ulp |
191
+ | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
192
+ | `nk_bilinear_bf16_serial` | 4.17 gso/s, 20.7 ulp | 3.19 gso/s, 21.2 ulp | 2.94 gso/s, 20.7 ulp |
193
+ | `nk_mahalanobis_bf16_serial` | 3.86 gso/s, 2.1 ulp | 2.98 gso/s, 2.2 ulp | 2.79 gso/s, 2.1 ulp |
194
+ | `nk_bilinear_bf16_neonbfdot` | 28.0 gso/s, 28.0 ulp | 23.5 gso/s, 41.2 ulp | 20.4 gso/s, 41.1 ulp |
195
+ | `nk_mahalanobis_bf16_neonbfdot` | 9.14 gso/s, 2.2 ulp | 7.93 gso/s, 2.2 ulp | 7.43 gso/s, 2.2 ulp |
196
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
197
+ | `nk_bilinear_f16_serial` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
198
+ | `nk_mahalanobis_f16_serial` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
199
+ | `nk_bilinear_f16_neonhalf` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
200
+ | `nk_mahalanobis_f16_neonhalf` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
201
+
202
+ #### WASM
203
+
204
+ Measured with Wasmtime v42 (Cranelift backend).
205
+
206
+ | Kernel | 256² | 1024² | 4096² |
207
+ | :------------------------- | -----------------------: | -----------------------: | -----------------------: |
208
+ | __f64c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
209
+ | `nk_bilinear_f64c_serial` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
210
+ | __f32c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
211
+ | `nk_bilinear_f32c_serial` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
212
+ | __bf16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
213
+ | `nk_bilinear_bf16c_serial` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
214
+ | __f16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
215
+ | `nk_bilinear_f16c_serial` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
216
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
217
+ | `nk_bilinear_f64_serial` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
218
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
219
+ | `nk_bilinear_f32_serial` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
220
+ | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
221
+ | `nk_bilinear_bf16_serial` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
222
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
223
+ | `nk_bilinear_f16_serial` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |