numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -6,32 +6,34 @@
6
6
  *
7
7
  * Contains following element-wise operations:
8
8
  *
9
- * - Scale (Multiply) with shift: result[i] = alpha * a[i] + beta
9
+ * - Scale (Multiply) with shift: result[i] = α·a[i] + β
10
10
  * - Sum (Add): result[i] = a[i] + b[i]
11
- * - Blend: result[i] = alpha * a[i] + beta * b[i]
12
- * - FMA (Fused Multiply-Add): result[i] = alpha * a[i] * b[i] + beta * c[i]
11
+ * - Blend: result[i] = α·a[i] + β·b[i]
12
+ * - FMA (Fused Multiply-Add): result[i] = α·a[i]·b[i] + β·c[i]
13
13
  *
14
14
  * Beyond their obvious usecases, those can be reused for vector-scalar math and other operations:
15
15
  *
16
- * - Scale with beta = 0 for a pure multiply.
17
- * - Sum is equivalent to WSum with alpha = beta = 1.
18
- * - Average is WSum with alpha = beta = 0.5.
19
- * - Elementwise multiply is FMA with beta = 0.
16
+ * - Scale with β = 0 for a pure multiply.
17
+ * - Sum is equivalent to WSum with α = β = 1.
18
+ * - Average is WSum with α = β = 0.5.
19
+ * - Elementwise multiply is FMA with β = 0.
20
20
  *
21
21
  * For dtypes:
22
22
  *
23
- * - f64: 64-bit IEEE floating point numbers × 64-bit scales
24
- * - f32: 32-bit IEEE floating point numbers × 32-bit scales
25
- * - f16: 16-bit IEEE floating point numbers × 32-bit scales
26
- * - bf16: 16-bit brain floating point numbers × 32-bit scales
27
- * - e4m3: 8-bit e4m3 floating point numbers × 32-bit scales
28
- * - e5m2: 8-bit e5m2 floating point numbers × 32-bit scales
29
- * - e2m3: 8-bit e2m3 floating point numbers (MX) × 32-bit scales
30
- * - e3m2: 8-bit e3m2 floating point numbers (MX) × 32-bit scales
31
- * - i8/u8: 8-bit signed and unsigned integers × 32-bit scales
32
- * - i16/u16: 16-bit signed and unsigned integers × 32-bit scales
33
- * - i32/u32: 32-bit signed and unsigned integers × 64-bit scales
34
- * - i64/u64: 64-bit signed and unsigned integers × 64-bit scales
23
+ * - f64c: 64-bit complex × 64-bit complex scales
24
+ * - f32c: 32-bit complex × 32-bit complex scales
25
+ * - f64: 64-bit IEEE floating point × 64-bit scales
26
+ * - f32: 32-bit IEEE floating point × 32-bit scales
27
+ * - f16: 16-bit IEEE floating point × 32-bit scales
28
+ * - bf16: 16-bit brain floating point × 32-bit scales
29
+ * - e4m3: 8-bit e4m3 floating point × 32-bit scales
30
+ * - e5m2: 8-bit e5m2 floating point × 32-bit scales
31
+ * - e2m3: 8-bit e2m3 floating point (MX) × 32-bit scales
32
+ * - e3m2: 8-bit e3m2 floating point (MX) × 32-bit scales
33
+ * - i8/u8: 8-bit integers × 32-bit scales
34
+ * - i16/u16: 16-bit integers × 32-bit scales
35
+ * - i32/u32: 32-bit integers × 64-bit scales
36
+ * - i64/u64: 64-bit integers × 64-bit scales
35
37
  *
36
38
  * For hardware architectures:
37
39
  *
@@ -55,13 +57,13 @@
55
57
  * Saturating integer adds (VPADDSW/VPADDUSW) provide overflow protection for i16/u16 sums without
56
58
  * branching. FMA (VFMADD231PS) is the workhorse for scale (alpha*x+beta) and blend (alpha*a+beta*b).
57
59
  *
58
- * Intrinsic Instruction Ice Genoa
59
- * _mm512_cvtph_ps VCVTPH2PS (ZMM, YMM) 7c @ p0+p5 6c @ p12+p23
60
- * _mm512_cvtps_ph VCVTPS2PH (YMM, ZMM, I8) 7c @ p0+p5 7c @ p12+p23
61
- * _mm256_adds_epi16 VPADDSW (YMM, YMM, YMM) 1c @ p01 N/A
62
- * _mm256_adds_epu16 VPADDUSW (YMM, YMM, YMM) 1c @ p01 N/A
63
- * _mm512_fpclass_ps_mask VFPCLASSPS (K, ZMM, I8) 3c @ p5 5c @ p01
64
- * _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4c @ p01 4c @ p01
60
+ * Intrinsic Instruction Icelake Genoa
61
+ * _mm512_cvtph_ps VCVTPH2PS (ZMM, YMM) 7cy @ p0+p5 6cy @ p12+p23
62
+ * _mm512_cvtps_ph VCVTPS2PH (YMM, ZMM, I8) 7cy @ p0+p5 7cy @ p12+p23
63
+ * _mm256_adds_epi16 VPADDSW (YMM, YMM, YMM) 1cy @ p01 n/a
64
+ * _mm256_adds_epu16 VPADDUSW (YMM, YMM, YMM) 1cy @ p01 n/a
65
+ * _mm512_fpclass_ps_mask VFPCLASSPS (K, ZMM, I8) 3cy @ p5 5cy @ p01
66
+ * _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
65
67
  *
66
68
  * @section arm_instructions Relevant ARM NEON/SVE Instructions
67
69
  *
@@ -69,16 +71,16 @@
69
71
  * vector throughput (8 elements per 128-bit register vs 4 for f32). Saturating adds (SQADD/UQADD)
70
72
  * handle integer overflow. FMLA provides fused multiply-add for floating-point scale/blend/fma.
71
73
  *
72
- * Intrinsic Instruction M1 Firestorm Graviton 3 Graviton 4
73
- * vfmaq_f32 FMLA.S (vec) 4c @ V0123 4c @ V0123 4c @ V0123
74
- * vqaddq_s16 SQADD (vec) 3c @ V0123 2c @ V0123 2c @ V0123
75
- * vqaddq_u16 UQADD (vec) 3c @ V0123 2c @ V0123 2c @ V0123
76
- * vcvtq_f32_s32 SCVTF (vec) 3c @ V0123 3c @ V01 3c @ V01
77
- * vcvtnq_s32_f32 FCVTNS (vec) 3c @ V0123 3c @ V01 3c @ V01
74
+ * Intrinsic Instruction M1 Firestorm Graviton 3 Graviton 4
75
+ * vfmaq_f32 FMLA.S (vec) 4cy @ V0123 4cy @ V0123 4cy @ V0123
76
+ * vqaddq_s16 SQADD (vec) 3cy @ V0123 2cy @ V0123 2cy @ V0123
77
+ * vqaddq_u16 UQADD (vec) 3cy @ V0123 2cy @ V0123 2cy @ V0123
78
+ * vcvtq_f32_s32 SCVTF (vec) 3cy @ V0123 3cy @ V01 3cy @ V01
79
+ * vcvtnq_s32_f32 FCVTNS (vec) 3cy @ V0123 3cy @ V01 3cy @ V01
78
80
  *
79
81
  * @section references References
80
82
  *
81
- * - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
83
+ * - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
82
84
  * - Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
83
85
  *
84
86
  */
@@ -651,6 +653,11 @@ NK_PUBLIC void nk_each_fma_f32c_neon(nk_f32c_t const *a, nk_f32c_t const *b, nk_
651
653
  /** @copydoc nk_each_fma_f64 */
652
654
  NK_PUBLIC void nk_each_fma_f64c_neon(nk_f64c_t const *a, nk_f64c_t const *b, nk_f64c_t const *c, nk_size_t n,
653
655
  nk_f64c_t const *alpha, nk_f64c_t const *beta, nk_f64c_t *result);
656
+
657
+ /** @copydoc nk_each_sum_i8 */
658
+ NK_PUBLIC void nk_each_sum_i8_neon(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i8_t *result);
659
+ /** @copydoc nk_each_sum_u8 */
660
+ NK_PUBLIC void nk_each_sum_u8_neon(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u8_t *result);
654
661
  #endif // NK_TARGET_NEON
655
662
 
656
663
  #if NK_TARGET_NEONBFDOT
@@ -680,10 +687,6 @@ NK_PUBLIC void nk_each_blend_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b,
680
687
  NK_PUBLIC void nk_each_fma_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n,
681
688
  nk_f32_t const *alpha, nk_f32_t const *beta, nk_f16_t *result);
682
689
 
683
- /** @copydoc nk_each_sum_i8 */
684
- NK_PUBLIC void nk_each_sum_i8_neonhalf(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i8_t *result);
685
- /** @copydoc nk_each_sum_u8 */
686
- NK_PUBLIC void nk_each_sum_u8_neonhalf(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u8_t *result);
687
690
  /** @copydoc nk_each_scale_i8 */
688
691
  NK_PUBLIC void nk_each_scale_i8_neonhalf(nk_i8_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
689
692
  nk_i8_t *result);
@@ -696,13 +699,65 @@ NK_PUBLIC void nk_each_blend_i8_neonhalf(nk_i8_t const *a, nk_i8_t const *b, nk_
696
699
  /** @copydoc nk_each_blend_u8 */
697
700
  NK_PUBLIC void nk_each_blend_u8_neonhalf(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t const *alpha,
698
701
  nk_f32_t const *beta, nk_u8_t *result);
702
+ #endif // NK_TARGET_NEONHALF
703
+
704
+ #if NK_TARGET_V128RELAXED
705
+ /** @copydoc nk_each_sum_f32 */
706
+ NK_PUBLIC void nk_each_sum_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *result);
707
+ /** @copydoc nk_each_scale_f32 */
708
+ NK_PUBLIC void nk_each_scale_f32_v128relaxed(nk_f32_t const *a, nk_size_t n, nk_f32_t const *alpha,
709
+ nk_f32_t const *beta, nk_f32_t *result);
710
+ /** @copydoc nk_each_blend_f32 */
711
+ NK_PUBLIC void nk_each_blend_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t const *alpha,
712
+ nk_f32_t const *beta, nk_f32_t *result);
713
+ /** @copydoc nk_each_fma_f32 */
714
+ NK_PUBLIC void nk_each_fma_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t n,
715
+ nk_f32_t const *alpha, nk_f32_t const *beta, nk_f32_t *result);
716
+ /** @copydoc nk_each_sum_f16 */
717
+ NK_PUBLIC void nk_each_sum_f16_v128relaxed(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f16_t *result);
718
+ /** @copydoc nk_each_scale_f16 */
719
+ NK_PUBLIC void nk_each_scale_f16_v128relaxed(nk_f16_t const *a, nk_size_t n, nk_f32_t const *alpha,
720
+ nk_f32_t const *beta, nk_f16_t *result);
721
+ /** @copydoc nk_each_blend_f16 */
722
+ NK_PUBLIC void nk_each_blend_f16_v128relaxed(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t const *alpha,
723
+ nk_f32_t const *beta, nk_f16_t *result);
724
+ /** @copydoc nk_each_fma_f16 */
725
+ NK_PUBLIC void nk_each_fma_f16_v128relaxed(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n,
726
+ nk_f32_t const *alpha, nk_f32_t const *beta, nk_f16_t *result);
727
+ /** @copydoc nk_each_sum_bf16 */
728
+ NK_PUBLIC void nk_each_sum_bf16_v128relaxed(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_bf16_t *result);
729
+ /** @copydoc nk_each_scale_bf16 */
730
+ NK_PUBLIC void nk_each_scale_bf16_v128relaxed(nk_bf16_t const *a, nk_size_t n, nk_f32_t const *alpha,
731
+ nk_f32_t const *beta, nk_bf16_t *result);
732
+ /** @copydoc nk_each_blend_bf16 */
733
+ NK_PUBLIC void nk_each_blend_bf16_v128relaxed(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n,
734
+ nk_f32_t const *alpha, nk_f32_t const *beta, nk_bf16_t *result);
735
+ /** @copydoc nk_each_fma_bf16 */
736
+ NK_PUBLIC void nk_each_fma_bf16_v128relaxed(nk_bf16_t const *a, nk_bf16_t const *b, nk_bf16_t const *c, nk_size_t n,
737
+ nk_f32_t const *alpha, nk_f32_t const *beta, nk_bf16_t *result);
738
+ /** @copydoc nk_each_sum_i8 */
739
+ NK_PUBLIC void nk_each_sum_i8_v128relaxed(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i8_t *result);
740
+ /** @copydoc nk_each_scale_i8 */
741
+ NK_PUBLIC void nk_each_scale_i8_v128relaxed(nk_i8_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
742
+ nk_i8_t *result);
743
+ /** @copydoc nk_each_blend_i8 */
744
+ NK_PUBLIC void nk_each_blend_i8_v128relaxed(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t const *alpha,
745
+ nk_f32_t const *beta, nk_i8_t *result);
699
746
  /** @copydoc nk_each_fma_i8 */
700
- NK_PUBLIC void nk_each_fma_i8_neonhalf(nk_i8_t const *a, nk_i8_t const *b, nk_i8_t const *c, nk_size_t n,
701
- nk_f32_t const *alpha, nk_f32_t const *beta, nk_i8_t *result);
747
+ NK_PUBLIC void nk_each_fma_i8_v128relaxed(nk_i8_t const *a, nk_i8_t const *b, nk_i8_t const *c, nk_size_t n,
748
+ nk_f32_t const *alpha, nk_f32_t const *beta, nk_i8_t *result);
749
+ /** @copydoc nk_each_sum_u8 */
750
+ NK_PUBLIC void nk_each_sum_u8_v128relaxed(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u8_t *result);
751
+ /** @copydoc nk_each_scale_u8 */
752
+ NK_PUBLIC void nk_each_scale_u8_v128relaxed(nk_u8_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
753
+ nk_u8_t *result);
754
+ /** @copydoc nk_each_blend_u8 */
755
+ NK_PUBLIC void nk_each_blend_u8_v128relaxed(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t const *alpha,
756
+ nk_f32_t const *beta, nk_u8_t *result);
702
757
  /** @copydoc nk_each_fma_u8 */
703
- NK_PUBLIC void nk_each_fma_u8_neonhalf(nk_u8_t const *a, nk_u8_t const *b, nk_u8_t const *c, nk_size_t n,
704
- nk_f32_t const *alpha, nk_f32_t const *beta, nk_u8_t *result);
705
- #endif // NK_TARGET_NEONHALF
758
+ NK_PUBLIC void nk_each_fma_u8_v128relaxed(nk_u8_t const *a, nk_u8_t const *b, nk_u8_t const *c, nk_size_t n,
759
+ nk_f32_t const *alpha, nk_f32_t const *beta, nk_u8_t *result);
760
+ #endif // NK_TARGET_V128RELAXED
706
761
 
707
762
  #if NK_TARGET_HASWELL
708
763
  /** @copydoc nk_each_scale_f64 */
@@ -1026,12 +1081,6 @@ NK_PUBLIC void nk_each_blend_i8_sapphire(nk_i8_t const *a, nk_i8_t const *b, nk_
1026
1081
  NK_PUBLIC void nk_each_blend_u8_sapphire(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t const *alpha,
1027
1082
  nk_f32_t const *beta, nk_u8_t *result);
1028
1083
 
1029
- /** @copydoc nk_each_fma_i8 */
1030
- NK_PUBLIC void nk_each_fma_i8_sapphire(nk_i8_t const *a, nk_i8_t const *b, nk_i8_t const *c, nk_size_t n,
1031
- nk_f32_t const *alpha, nk_f32_t const *beta, nk_i8_t *result);
1032
- /** @copydoc nk_each_fma_u8 */
1033
- NK_PUBLIC void nk_each_fma_u8_sapphire(nk_u8_t const *a, nk_u8_t const *b, nk_u8_t const *c, nk_size_t n,
1034
- nk_f32_t const *alpha, nk_f32_t const *beta, nk_u8_t *result);
1035
1084
  #endif // NK_TARGET_SAPPHIRE
1036
1085
 
1037
1086
  #if NK_TARGET_RVV
@@ -1213,6 +1262,10 @@ NK_INTERNAL nk_dtype_t nk_each_scale_input_dtype(nk_dtype_t dtype) {
1213
1262
  case nk_u16_k: return nk_f32_k;
1214
1263
  case nk_i8_k: return nk_f32_k;
1215
1264
  case nk_u8_k: return nk_f32_k;
1265
+ case nk_e4m3_k: return nk_f32_k;
1266
+ case nk_e5m2_k: return nk_f32_k;
1267
+ case nk_e2m3_k: return nk_f32_k;
1268
+ case nk_e3m2_k: return nk_f32_k;
1216
1269
  default: return nk_dtype_unknown_k;
1217
1270
  }
1218
1271
  }
@@ -1230,6 +1283,7 @@ NK_INTERNAL nk_dtype_t nk_each_scale_input_dtype(nk_dtype_t dtype) {
1230
1283
  #include "numkong/each/icelake.h"
1231
1284
  #include "numkong/each/sapphire.h"
1232
1285
  #include "numkong/each/rvv.h"
1286
+ #include "numkong/each/v128relaxed.h"
1233
1287
 
1234
1288
  #if defined(__cplusplus)
1235
1289
  extern "C" {
@@ -1260,6 +1314,8 @@ NK_PUBLIC void nk_each_sum_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n
1260
1314
  nk_each_sum_f32_neon(a, b, n, r);
1261
1315
  #elif NK_TARGET_RVV
1262
1316
  nk_each_sum_f32_rvv(a, b, n, r);
1317
+ #elif NK_TARGET_V128RELAXED
1318
+ nk_each_sum_f32_v128relaxed(a, b, n, r);
1263
1319
  #else
1264
1320
  nk_each_sum_f32_serial(a, b, n, r);
1265
1321
  #endif
@@ -1274,6 +1330,8 @@ NK_PUBLIC void nk_each_sum_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_
1274
1330
  nk_each_sum_bf16_neonbfdot(a, b, n, r);
1275
1331
  #elif NK_TARGET_RVV
1276
1332
  nk_each_sum_bf16_rvv(a, b, n, r);
1333
+ #elif NK_TARGET_V128RELAXED
1334
+ nk_each_sum_bf16_v128relaxed(a, b, n, r);
1277
1335
  #else
1278
1336
  nk_each_sum_bf16_serial(a, b, n, r);
1279
1337
  #endif
@@ -1288,6 +1346,8 @@ NK_PUBLIC void nk_each_sum_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n
1288
1346
  nk_each_sum_f16_neonhalf(a, b, n, r);
1289
1347
  #elif NK_TARGET_RVV
1290
1348
  nk_each_sum_f16_rvv(a, b, n, r);
1349
+ #elif NK_TARGET_V128RELAXED
1350
+ nk_each_sum_f16_v128relaxed(a, b, n, r);
1291
1351
  #else
1292
1352
  nk_each_sum_f16_serial(a, b, n, r);
1293
1353
  #endif
@@ -1298,10 +1358,12 @@ NK_PUBLIC void nk_each_sum_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, n
1298
1358
  nk_each_sum_i8_icelake(a, b, n, r);
1299
1359
  #elif NK_TARGET_HASWELL
1300
1360
  nk_each_sum_i8_haswell(a, b, n, r);
1301
- #elif NK_TARGET_NEONHALF
1302
- nk_each_sum_i8_neonhalf(a, b, n, r);
1361
+ #elif NK_TARGET_NEON
1362
+ nk_each_sum_i8_neon(a, b, n, r);
1303
1363
  #elif NK_TARGET_RVV
1304
1364
  nk_each_sum_i8_rvv(a, b, n, r);
1365
+ #elif NK_TARGET_V128RELAXED
1366
+ nk_each_sum_i8_v128relaxed(a, b, n, r);
1305
1367
  #else
1306
1368
  nk_each_sum_i8_serial(a, b, n, r);
1307
1369
  #endif
@@ -1312,10 +1374,12 @@ NK_PUBLIC void nk_each_sum_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, n
1312
1374
  nk_each_sum_u8_icelake(a, b, n, r);
1313
1375
  #elif NK_TARGET_HASWELL
1314
1376
  nk_each_sum_u8_haswell(a, b, n, r);
1315
- #elif NK_TARGET_NEONHALF
1316
- nk_each_sum_u8_neonhalf(a, b, n, r);
1377
+ #elif NK_TARGET_NEON
1378
+ nk_each_sum_u8_neon(a, b, n, r);
1317
1379
  #elif NK_TARGET_RVV
1318
1380
  nk_each_sum_u8_rvv(a, b, n, r);
1381
+ #elif NK_TARGET_V128RELAXED
1382
+ nk_each_sum_u8_v128relaxed(a, b, n, r);
1319
1383
  #else
1320
1384
  nk_each_sum_u8_serial(a, b, n, r);
1321
1385
  #endif
@@ -1426,6 +1490,8 @@ NK_PUBLIC void nk_each_scale_f32(nk_f32_t const *a, nk_size_t n, nk_f32_t const
1426
1490
  nk_each_scale_f32_neon(a, n, alpha, beta, r);
1427
1491
  #elif NK_TARGET_RVV
1428
1492
  nk_each_scale_f32_rvv(a, n, alpha, beta, r);
1493
+ #elif NK_TARGET_V128RELAXED
1494
+ nk_each_scale_f32_v128relaxed(a, n, alpha, beta, r);
1429
1495
  #else
1430
1496
  nk_each_scale_f32_serial(a, n, alpha, beta, r);
1431
1497
  #endif
@@ -1441,6 +1507,8 @@ NK_PUBLIC void nk_each_scale_bf16(nk_bf16_t const *a, nk_size_t n, nk_f32_t cons
1441
1507
  nk_each_scale_bf16_neonbfdot(a, n, alpha, beta, r);
1442
1508
  #elif NK_TARGET_RVV
1443
1509
  nk_each_scale_bf16_rvv(a, n, alpha, beta, r);
1510
+ #elif NK_TARGET_V128RELAXED
1511
+ nk_each_scale_bf16_v128relaxed(a, n, alpha, beta, r);
1444
1512
  #else
1445
1513
  nk_each_scale_bf16_serial(a, n, alpha, beta, r);
1446
1514
  #endif
@@ -1456,6 +1524,8 @@ NK_PUBLIC void nk_each_scale_f16(nk_f16_t const *a, nk_size_t n, nk_f32_t const
1456
1524
  nk_each_scale_f16_neonhalf(a, n, alpha, beta, r);
1457
1525
  #elif NK_TARGET_RVV
1458
1526
  nk_each_scale_f16_rvv(a, n, alpha, beta, r);
1527
+ #elif NK_TARGET_V128RELAXED
1528
+ nk_each_scale_f16_v128relaxed(a, n, alpha, beta, r);
1459
1529
  #else
1460
1530
  nk_each_scale_f16_serial(a, n, alpha, beta, r);
1461
1531
  #endif
@@ -1473,6 +1543,8 @@ NK_PUBLIC void nk_each_scale_i8(nk_i8_t const *a, nk_size_t n, nk_f32_t const *a
1473
1543
  nk_each_scale_i8_neonhalf(a, n, alpha, beta, r);
1474
1544
  #elif NK_TARGET_RVV
1475
1545
  nk_each_scale_i8_rvv(a, n, alpha, beta, r);
1546
+ #elif NK_TARGET_V128RELAXED
1547
+ nk_each_scale_i8_v128relaxed(a, n, alpha, beta, r);
1476
1548
  #else
1477
1549
  nk_each_scale_i8_serial(a, n, alpha, beta, r);
1478
1550
  #endif
@@ -1490,6 +1562,8 @@ NK_PUBLIC void nk_each_scale_u8(nk_u8_t const *a, nk_size_t n, nk_f32_t const *a
1490
1562
  nk_each_scale_u8_neonhalf(a, n, alpha, beta, r);
1491
1563
  #elif NK_TARGET_RVV
1492
1564
  nk_each_scale_u8_rvv(a, n, alpha, beta, r);
1565
+ #elif NK_TARGET_V128RELAXED
1566
+ nk_each_scale_u8_v128relaxed(a, n, alpha, beta, r);
1493
1567
  #else
1494
1568
  nk_each_scale_u8_serial(a, n, alpha, beta, r);
1495
1569
  #endif
@@ -1606,6 +1680,8 @@ NK_PUBLIC void nk_each_blend_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t
1606
1680
  nk_each_blend_f32_neon(a, b, n, alpha, beta, r);
1607
1681
  #elif NK_TARGET_RVV
1608
1682
  nk_each_blend_f32_rvv(a, b, n, alpha, beta, r);
1683
+ #elif NK_TARGET_V128RELAXED
1684
+ nk_each_blend_f32_v128relaxed(a, b, n, alpha, beta, r);
1609
1685
  #else
1610
1686
  nk_each_blend_f32_serial(a, b, n, alpha, beta, r);
1611
1687
  #endif
@@ -1621,6 +1697,8 @@ NK_PUBLIC void nk_each_blend_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_siz
1621
1697
  nk_each_blend_bf16_neonbfdot(a, b, n, alpha, beta, r);
1622
1698
  #elif NK_TARGET_RVV
1623
1699
  nk_each_blend_bf16_rvv(a, b, n, alpha, beta, r);
1700
+ #elif NK_TARGET_V128RELAXED
1701
+ nk_each_blend_bf16_v128relaxed(a, b, n, alpha, beta, r);
1624
1702
  #else
1625
1703
  nk_each_blend_bf16_serial(a, b, n, alpha, beta, r);
1626
1704
  #endif
@@ -1636,6 +1714,8 @@ NK_PUBLIC void nk_each_blend_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t
1636
1714
  nk_each_blend_f16_neonhalf(a, b, n, alpha, beta, r);
1637
1715
  #elif NK_TARGET_RVV
1638
1716
  nk_each_blend_f16_rvv(a, b, n, alpha, beta, r);
1717
+ #elif NK_TARGET_V128RELAXED
1718
+ nk_each_blend_f16_v128relaxed(a, b, n, alpha, beta, r);
1639
1719
  #else
1640
1720
  nk_each_blend_f16_serial(a, b, n, alpha, beta, r);
1641
1721
  #endif
@@ -1651,6 +1731,8 @@ NK_PUBLIC void nk_each_blend_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n,
1651
1731
  nk_each_blend_i8_neonhalf(a, b, n, alpha, beta, r);
1652
1732
  #elif NK_TARGET_RVV
1653
1733
  nk_each_blend_i8_rvv(a, b, n, alpha, beta, r);
1734
+ #elif NK_TARGET_V128RELAXED
1735
+ nk_each_blend_i8_v128relaxed(a, b, n, alpha, beta, r);
1654
1736
  #else
1655
1737
  nk_each_blend_i8_serial(a, b, n, alpha, beta, r);
1656
1738
  #endif
@@ -1666,6 +1748,8 @@ NK_PUBLIC void nk_each_blend_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n,
1666
1748
  nk_each_blend_u8_neonhalf(a, b, n, alpha, beta, r);
1667
1749
  #elif NK_TARGET_RVV
1668
1750
  nk_each_blend_u8_rvv(a, b, n, alpha, beta, r);
1751
+ #elif NK_TARGET_V128RELAXED
1752
+ nk_each_blend_u8_v128relaxed(a, b, n, alpha, beta, r);
1669
1753
  #else
1670
1754
  nk_each_blend_u8_serial(a, b, n, alpha, beta, r);
1671
1755
  #endif
@@ -1726,6 +1810,8 @@ NK_PUBLIC void nk_each_fma_f32(nk_f32_t const *a, nk_f32_t const *b, nk_f32_t co
1726
1810
  nk_each_fma_f32_neon(a, b, c, n, alpha, beta, r);
1727
1811
  #elif NK_TARGET_RVV
1728
1812
  nk_each_fma_f32_rvv(a, b, c, n, alpha, beta, r);
1813
+ #elif NK_TARGET_V128RELAXED
1814
+ nk_each_fma_f32_v128relaxed(a, b, c, n, alpha, beta, r);
1729
1815
  #else
1730
1816
  nk_each_fma_f32_serial(a, b, c, n, alpha, beta, r);
1731
1817
  #endif
@@ -1741,6 +1827,8 @@ NK_PUBLIC void nk_each_fma_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_bf16_
1741
1827
  nk_each_fma_bf16_neonbfdot(a, b, c, n, alpha, beta, r);
1742
1828
  #elif NK_TARGET_RVV
1743
1829
  nk_each_fma_bf16_rvv(a, b, c, n, alpha, beta, r);
1830
+ #elif NK_TARGET_V128RELAXED
1831
+ nk_each_fma_bf16_v128relaxed(a, b, c, n, alpha, beta, r);
1744
1832
  #else
1745
1833
  nk_each_fma_bf16_serial(a, b, c, n, alpha, beta, r);
1746
1834
  #endif
@@ -1756,6 +1844,8 @@ NK_PUBLIC void nk_each_fma_f16(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t co
1756
1844
  nk_each_fma_f16_neonhalf(a, b, c, n, alpha, beta, r);
1757
1845
  #elif NK_TARGET_RVV
1758
1846
  nk_each_fma_f16_rvv(a, b, c, n, alpha, beta, r);
1847
+ #elif NK_TARGET_V128RELAXED
1848
+ nk_each_fma_f16_v128relaxed(a, b, c, n, alpha, beta, r);
1759
1849
  #else
1760
1850
  nk_each_fma_f16_serial(a, b, c, n, alpha, beta, r);
1761
1851
  #endif
@@ -1763,16 +1853,14 @@ NK_PUBLIC void nk_each_fma_f16(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t co
1763
1853
 
1764
1854
  NK_PUBLIC void nk_each_fma_i8(nk_i8_t const *a, nk_i8_t const *b, nk_i8_t const *c, nk_size_t n, nk_f32_t const *alpha,
1765
1855
  nk_f32_t const *beta, nk_i8_t *r) {
1766
- #if NK_TARGET_SAPPHIRE
1767
- nk_each_fma_i8_sapphire(a, b, c, n, alpha, beta, r);
1768
- #elif NK_TARGET_SKYLAKE
1856
+ #if NK_TARGET_SKYLAKE
1769
1857
  nk_each_fma_i8_skylake(a, b, c, n, alpha, beta, r);
1770
1858
  #elif NK_TARGET_HASWELL
1771
1859
  nk_each_fma_i8_haswell(a, b, c, n, alpha, beta, r);
1772
- #elif NK_TARGET_NEONHALF
1773
- nk_each_fma_i8_neonhalf(a, b, c, n, alpha, beta, r);
1774
1860
  #elif NK_TARGET_RVV
1775
1861
  nk_each_fma_i8_rvv(a, b, c, n, alpha, beta, r);
1862
+ #elif NK_TARGET_V128RELAXED
1863
+ nk_each_fma_i8_v128relaxed(a, b, c, n, alpha, beta, r);
1776
1864
  #else
1777
1865
  nk_each_fma_i8_serial(a, b, c, n, alpha, beta, r);
1778
1866
  #endif
@@ -1780,16 +1868,14 @@ NK_PUBLIC void nk_each_fma_i8(nk_i8_t const *a, nk_i8_t const *b, nk_i8_t const
1780
1868
 
1781
1869
  NK_PUBLIC void nk_each_fma_u8(nk_u8_t const *a, nk_u8_t const *b, nk_u8_t const *c, nk_size_t n, nk_f32_t const *alpha,
1782
1870
  nk_f32_t const *beta, nk_u8_t *r) {
1783
- #if NK_TARGET_SAPPHIRE
1784
- nk_each_fma_u8_sapphire(a, b, c, n, alpha, beta, r);
1785
- #elif NK_TARGET_SKYLAKE
1871
+ #if NK_TARGET_SKYLAKE
1786
1872
  nk_each_fma_u8_skylake(a, b, c, n, alpha, beta, r);
1787
1873
  #elif NK_TARGET_HASWELL
1788
1874
  nk_each_fma_u8_haswell(a, b, c, n, alpha, beta, r);
1789
- #elif NK_TARGET_NEONHALF
1790
- nk_each_fma_u8_neonhalf(a, b, c, n, alpha, beta, r);
1791
1875
  #elif NK_TARGET_RVV
1792
1876
  nk_each_fma_u8_rvv(a, b, c, n, alpha, beta, r);
1877
+ #elif NK_TARGET_V128RELAXED
1878
+ nk_each_fma_u8_v128relaxed(a, b, c, n, alpha, beta, r);
1793
1879
  #else
1794
1880
  nk_each_fma_u8_serial(a, b, c, n, alpha, beta, r);
1795
1881
  #endif
@@ -199,7 +199,7 @@ void fma(in_type_ const *a, in_type_ const *b, std::size_t d, in_type_ const *c,
199
199
 
200
200
  namespace ashvardanian::numkong {
201
201
 
202
- #pragma region - Tensor Elementwise
202
+ #pragma region Tensor Elementwise
203
203
 
204
204
  /** @brief Scale: output[i] = α × input[i] + β. */
205
205
  template <numeric_dtype value_type_, std::size_t max_rank_ = 8>
@@ -427,7 +427,7 @@ tensor<value_type_, allocator_type_, max_rank_> try_mul(tensor_view<value_type_,
427
427
  return result;
428
428
  }
429
429
 
430
- #pragma endregion - Tensor Elementwise
430
+ #pragma endregion Tensor Elementwise
431
431
 
432
432
  } // namespace ashvardanian::numkong
433
433
 
@@ -5,17 +5,17 @@ Both operate on arrays of latitude/longitude pairs in radians and produce distan
5
5
 
6
6
  The Haversine formula computes the great-circle distance between two points:
7
7
 
8
- ```math
8
+ $$
9
9
  \text{haversine}(\phi_1, \lambda_1, \phi_2, \lambda_2) = 2R \arcsin\sqrt{\sin^2\frac{\phi_2 - \phi_1}{2} + \cos\phi_1 \cos\phi_2 \sin^2\frac{\lambda_2 - \lambda_1}{2}}
10
- ```
10
+ $$
11
11
 
12
12
  where $R$ is Earth's mean radius and $(\phi, \lambda)$ are latitude and longitude in radians.
13
13
 
14
14
  Vincenty's formula solves the inverse geodesic problem on an oblate spheroid, iteratively refining the longitude difference on the auxiliary sphere until convergence:
15
15
 
16
- ```math
16
+ $$
17
17
  \text{vincenty}(\phi_1, \lambda_1, \phi_2, \lambda_2) = b \cdot A \cdot (\sigma - \Delta\sigma)
18
- ```
18
+ $$
19
19
 
20
20
  where $a$ and $b$ are the equatorial and polar semi-axes of the WGS-84 ellipsoid, $A$ is a series coefficient derived from $a$, $b$, and the geodesic's azimuth at the equator, $\sigma$ is the angular separation, and $\Delta\sigma$ is the correction term computed through iterative convergence.
21
21
 
@@ -56,12 +56,12 @@ Each SIMD lane may converge at a different iteration count, so the kernel accumu
56
56
  Early exit uses a movemask of the per-lane convergence flags — `_mm256_movemask_pd` yields 4 bits for `f64`, and the `_ps` variant yields 8 bits for `f32` — and the loop breaks once every bit is set.
57
57
  Coincident points and equatorial edge cases are handled by blending safe values (ones) into the intermediate terms to avoid division by zero, without requiring branches that would diverge across SIMD lanes.
58
58
 
59
- ### Haversine Without Final Arc Conversion
59
+ ### Potential Optimization: Haversine Without Final Arc Conversion
60
60
 
61
- `nk_haversine_f32_haswell`, `nk_haversine_f64_haswell` support a similarity mode where the haversine formula involves $2R \cdot \text{asin}(\sqrt{h})$ and the intermediate value $h = \sin^2(\Delta\phi/2) + \cos\phi_1 \cos\phi_2 \cdot \sin^2(\Delta\lambda/2)$ is monotonic with distance.
62
- For ranking and comparison use cases, comparing $h$ values directly produces the same ordering as comparing full Haversine distances, since both asin and sqrt are monotonically increasing.
63
- This eliminates the two most expensive operations in the pipeline.
64
- The kernels compute the full distance by default, but the streaming API can optionally skip the final conversion when only relative ordering is needed.
61
+ The haversine formula computes $d = 2R \cdot \text{asin}(\sqrt{h})$ where $h = \sin^2(\Delta\phi/2) + \cos\phi_1 \cos\phi_2 \cdot \sin^2(\Delta\lambda/2)$.
62
+ Since both `asin` and `sqrt` are monotonically increasing, comparing $h$ values directly produces the same ordering as comparing full haversine distances.
63
+ For ranking-only use cases, a future "similarity mode" could skip the final `sqrt`/`asin` conversion and return $h$ directly, eliminating the two most expensive operations in the pipeline.
64
+ Currently, all kernels compute the full distance.
65
65
 
66
66
  ## Performance
67
67
 
@@ -131,17 +131,17 @@ Measured with Wasmtime v42 (Cranelift backend).
131
131
 
132
132
  #### WASM
133
133
 
134
- Measured with Wasmtime v42 (Cranelift backend).
134
+ Measured with Wasmtime v43 (Cranelift backend).
135
135
 
136
136
  | Kernel | ≤1° | ≤30° | ≤180° |
137
137
  | :----------------------------- | -----------------------: | -----------------------: | -----------------------: |
138
138
  | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
139
- | `nk_haversine_f64_serial` | 1.84 mp/s, 1.12 km | 1.83 mp/s, 32.8 km | 1.99 mp/s, 148 km |
140
- | `nk_vincenty_f64_serial` | 0.481 mp/s, 1.86 nm | 0.419 mp/s, 2.33 nm | 0.422 mp/s, 594 nm |
141
- | `nk_haversine_f64_v128relaxed` | 35.7 mp/s, 1.12 km | 35.9 mp/s, 32.8 km | 35.9 mp/s, 148 km |
142
- | `nk_vincenty_f64_v128relaxed` | 4.19 mp/s, 1.89 nm | 3.57 mp/s, 2.33 nm | 2.94 mp/s, 594 nm |
139
+ | `nk_haversine_f64_serial` | 5.12 mp/s, 1.12 km | 5.12 mp/s, 32.8 km | 5.07 mp/s, 148 km |
140
+ | `nk_vincenty_f64_serial` | 1.27 mp/s, 1.86 nm | 1.11 mp/s, 2.33 nm | 1.02 mp/s, 594 nm |
141
+ | `nk_haversine_f64_v128relaxed` | 109 mp/s, 1.12 km | 109 mp/s, 32.8 km | 109 mp/s, 148 km |
142
+ | `nk_vincenty_f64_v128relaxed` | 12.8 mp/s, 1.89 nm | 10.4 mp/s, 2.33 nm | 8.28 mp/s, 594 nm |
143
143
  | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
144
- | `nk_haversine_f32_serial` | 6.74 mp/s, 20,000 km | 6.80 mp/s, 32.7 km | 7.34 mp/s, 136 km |
145
- | `nk_vincenty_f32_serial` | 2.25 mp/s, 20,000 km | 1.65 mp/s, 12.0 m | 1.35 mp/s, 22.0 m |
146
- | `nk_haversine_f32_v128relaxed` | 161 mp/s, 20,000 km | 165 mp/s, 32.7 km | 165 mp/s, 153 km |
147
- | `nk_vincenty_f32_v128relaxed` | 24.6 mp/s, 12.0 m | 20.5 mp/s, 16.2 m | 9.57 mp/s, 18.0 m |
144
+ | `nk_haversine_f32_serial` | 20.9 mp/s, 20,000 km | 20.9 mp/s, 32.7 km | 21.0 mp/s, 136 km |
145
+ | `nk_vincenty_f32_serial` | 6.11 mp/s, 20,000 km | 4.30 mp/s, 12.0 m | 3.27 mp/s, 22.0 m |
146
+ | `nk_haversine_f32_v128relaxed` | 523 mp/s, 20,000 km | 524 mp/s, 32.7 km | 524 mp/s, 153 km |
147
+ | `nk_vincenty_f32_v128relaxed` | 76.9 mp/s, 12.0 m | 68.2 mp/s, 16.2 m | 26.8 mp/s, 18.0 m |