numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +239 -122
  2. package/binding.gyp +25 -491
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -77,10 +77,10 @@
77
77
  *
78
78
  * Relevant instructions and caveats:
79
79
  *
80
- * Intrinsic Instruction Notes
81
- * _mm_rsqrt_ps VRSQRTPS fast approx; refine with NR
82
- * _mm_maskz_rsqrt14_pd VRSQRT14PD higher-precision approx; MSVC masked-only
83
- * _mm_sqrt_ps/_mm_sqrt_pd VSQRTPS/VSQRTPD higher latency, sqrt/div unit
80
+ * Intrinsic Instruction Notes
81
+ * _mm_rsqrt_ps VRSQRTPS fast approx; refine with NR
82
+ * _mm_maskz_rsqrt14_pd VRSQRT14PD higher-precision approx; MSVC masked-only
83
+ * _mm_sqrt_ps/_mm_sqrt_pd VSQRTPS/VSQRTPD higher latency, sqrt/div unit
84
84
  *
85
85
  * Latency/port notes (rule of thumb):
86
86
  * - On Intel client cores, sqrt/rsqrt execute on the divide/sqrt unit (often
@@ -96,15 +96,15 @@
96
96
  * AVX-512 VNNI replaces that with VPDPWSSD. BF16 uses VDPBF16PS where available to avoid
97
97
  * convert+FMA sequences; if the ISA lacks it, we fall back to f32 FMA in the AVX2/serial paths:
98
98
  *
99
- * Intrinsic Instruction Ice Genoa
100
- * _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4c @ p01 4c @ p01
101
- * _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4c @ p01 4c @ p01
102
- * _mm256_madd_epi16 VPMADDWD (YMM, YMM, YMM) 5c @ p01 3c @ p01
103
- * _mm512_dpwssd_epi32 VPDPWSSD (ZMM, K, ZMM, ZMM) 5c @ p05 4c @ p01
104
- * _mm512_dpbf16_ps VDPBF16PS (ZMM, K, ZMM, ZMM) n/a 6c @ p01
105
- * _mm_rsqrt_ps VRSQRTPS (XMM, XMM) 5c @ p0 4c @ p01
106
- * _mm_maskz_rsqrt14_pd VRSQRT14PD (XMM, K, XMM) 4c @ p0 5c @ p01
107
- * _mm_sqrt_ps VSQRTPS (XMM, XMM) 12c @ p0 15c @ p01
99
+ * Intrinsic Instruction Icelake Genoa
100
+ * _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
101
+ * _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 4cy @ p01 4cy @ p01
102
+ * _mm256_madd_epi16 VPMADDWD (YMM, YMM, YMM) 5cy @ p01 3cy @ p01
103
+ * _mm512_dpwssd_epi32 VPDPWSSD (ZMM, K, ZMM, ZMM) 5cy @ p05 4cy @ p01
104
+ * _mm512_dpbf16_ps VDPBF16PS (ZMM, K, ZMM, ZMM) n/a 6cy @ p01
105
+ * _mm_rsqrt_ps VRSQRTPS (XMM, XMM) 5cy @ p0 4cy @ p01
106
+ * _mm_maskz_rsqrt14_pd VRSQRT14PD (XMM, K, XMM) 4cy @ p0 5cy @ p01
107
+ * _mm_sqrt_ps VSQRTPS (XMM, XMM) 12cy @ p0 15cy @ p01
108
108
  *
109
109
  * @section arm_instructions Relevant Arm Instructions
110
110
  *
@@ -115,18 +115,18 @@
115
115
  * instructions skipping `vbfmlal` and `vbfmlalt` alternatives to limit shuffle overhead
116
116
  * and code complexity.
117
117
  *
118
- * Intrinsic Instruction M1 Firestorm
119
- * vfmaq_f32 FMLA.S (vec) 4c / 4c
120
- * vfmaq_f64 FMLA.D (vec) 4c / 4c
121
- * vdotq_s32 SDOT.B (vec) 3c / 4c
122
- * vbfdotq_f32 BFDOT (vec) n/a
123
- * vrsqrteq_f32 FRSQRTE.S (vec) 3c / 1c
124
- * vrsqrtsq_f32 FRSQRTS.S (vec) 4c / 4c
125
- * vsqrtq_f32 FSQRT.S (vec) 10c / 0.5c
118
+ * Intrinsic Instruction M1 Firestorm
119
+ * vfmaq_f32 FMLA.S (vec) 4c / 4c
120
+ * vfmaq_f64 FMLA.D (vec) 4c / 4c
121
+ * vdotq_s32 SDOT.B (vec) 3c / 4c
122
+ * vbfdotq_f32 BFDOT (vec) n/a
123
+ * vrsqrteq_f32 FRSQRTE.S (vec) 3c / 1c
124
+ * vrsqrtsq_f32 FRSQRTS.S (vec) 4c / 4c
125
+ * vsqrtq_f32 FSQRT.S (vec) 10c / 0.5c
126
126
  *
127
127
  * @section references References
128
128
  *
129
- * - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
129
+ * - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
130
130
  * - Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
131
131
  *
132
132
  */
@@ -332,16 +332,13 @@ NK_PUBLIC void nk_euclidean_bf16_neon(nk_bf16_t const *a, nk_bf16_t const *b, nk
332
332
  NK_PUBLIC void nk_sqeuclidean_bf16_neon(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result);
333
333
  /** @copydoc nk_angular_f64 */
334
334
  NK_PUBLIC void nk_angular_bf16_neon(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result);
335
- #endif // NK_TARGET_NEON
336
-
337
- #if NK_TARGET_NEONHALF
338
335
  /** @copydoc nk_euclidean_f64 */
339
- NK_PUBLIC void nk_euclidean_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
336
+ NK_PUBLIC void nk_euclidean_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
340
337
  /** @copydoc nk_sqeuclidean_f64 */
341
- NK_PUBLIC void nk_sqeuclidean_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
338
+ NK_PUBLIC void nk_sqeuclidean_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
342
339
  /** @copydoc nk_angular_f64 */
343
- NK_PUBLIC void nk_angular_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
344
- #endif // NK_TARGET_NEONHALF
340
+ NK_PUBLIC void nk_angular_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
341
+ #endif // NK_TARGET_NEON
345
342
 
346
343
  #if NK_TARGET_NEONBFDOT
347
344
  /** @copydoc nk_euclidean_f64 */
@@ -365,8 +362,62 @@ NK_PUBLIC void nk_euclidean_u8_neonsdot(nk_u8_t const *a, nk_u8_t const *b, nk_s
365
362
  NK_PUBLIC void nk_sqeuclidean_u8_neonsdot(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
366
363
  /** @copydoc nk_angular_f64 */
367
364
  NK_PUBLIC void nk_angular_u8_neonsdot(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result);
365
+ /** @copydoc nk_euclidean_f64 */
366
+ NK_PUBLIC void nk_euclidean_i4_neonsdot(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_f32_t *result);
367
+ /** @copydoc nk_sqeuclidean_f64 */
368
+ NK_PUBLIC void nk_sqeuclidean_i4_neonsdot(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_u32_t *result);
369
+ /** @copydoc nk_angular_f64 */
370
+ NK_PUBLIC void nk_angular_i4_neonsdot(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_f32_t *result);
371
+ /** @copydoc nk_euclidean_f64 */
372
+ NK_PUBLIC void nk_euclidean_u4_neonsdot(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_f32_t *result);
373
+ /** @copydoc nk_sqeuclidean_f64 */
374
+ NK_PUBLIC void nk_sqeuclidean_u4_neonsdot(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_u32_t *result);
375
+ /** @copydoc nk_angular_f64 */
376
+ NK_PUBLIC void nk_angular_u4_neonsdot(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_f32_t *result);
368
377
  #endif // NK_TARGET_NEONSDOT
369
378
 
379
+ #if NK_TARGET_SVESDOT
380
+ /** @copydoc nk_euclidean_f64 */
381
+ NK_PUBLIC void nk_euclidean_i8_svesdot(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result);
382
+ /** @copydoc nk_sqeuclidean_f64 */
383
+ NK_PUBLIC void nk_sqeuclidean_i8_svesdot(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_u32_t *result);
384
+ /** @copydoc nk_angular_f64 */
385
+ NK_PUBLIC void nk_angular_i8_svesdot(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result);
386
+ /** @copydoc nk_euclidean_f64 */
387
+ NK_PUBLIC void nk_euclidean_u8_svesdot(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result);
388
+ /** @copydoc nk_sqeuclidean_f64 */
389
+ NK_PUBLIC void nk_sqeuclidean_u8_svesdot(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
390
+ /** @copydoc nk_angular_f64 */
391
+ NK_PUBLIC void nk_angular_u8_svesdot(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result);
392
+ #endif // NK_TARGET_SVESDOT
393
+
394
+ #if NK_TARGET_NEONFP8
395
+ /** @copydoc nk_sqeuclidean_f64 */
396
+ NK_PUBLIC void nk_sqeuclidean_e4m3_neonfp8(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
397
+ /** @copydoc nk_euclidean_f64 */
398
+ NK_PUBLIC void nk_euclidean_e4m3_neonfp8(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
399
+ /** @copydoc nk_angular_f64 */
400
+ NK_PUBLIC void nk_angular_e4m3_neonfp8(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
401
+ /** @copydoc nk_sqeuclidean_f64 */
402
+ NK_PUBLIC void nk_sqeuclidean_e5m2_neonfp8(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
403
+ /** @copydoc nk_euclidean_f64 */
404
+ NK_PUBLIC void nk_euclidean_e5m2_neonfp8(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
405
+ /** @copydoc nk_angular_f64 */
406
+ NK_PUBLIC void nk_angular_e5m2_neonfp8(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
407
+ /** @copydoc nk_sqeuclidean_f64 */
408
+ NK_PUBLIC void nk_sqeuclidean_e2m3_neonfp8(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
409
+ /** @copydoc nk_euclidean_f64 */
410
+ NK_PUBLIC void nk_euclidean_e2m3_neonfp8(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
411
+ /** @copydoc nk_angular_f64 */
412
+ NK_PUBLIC void nk_angular_e2m3_neonfp8(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
413
+ /** @copydoc nk_sqeuclidean_f64 */
414
+ NK_PUBLIC void nk_sqeuclidean_e3m2_neonfp8(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
415
+ /** @copydoc nk_euclidean_f64 */
416
+ NK_PUBLIC void nk_euclidean_e3m2_neonfp8(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
417
+ /** @copydoc nk_angular_f64 */
418
+ NK_PUBLIC void nk_angular_e3m2_neonfp8(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
419
+ #endif // NK_TARGET_NEONFP8
420
+
370
421
  /* SIMD-powered backends for Arm SVE, mostly using 32-bit arithmetic over variable-length platform-defined word sizes.
371
422
  * Designed for Arm Graviton 3, Microsoft Cobalt, as well as Nvidia Grace and newer Ampere Altra CPUs.
372
423
  */
@@ -526,6 +577,24 @@ NK_PUBLIC void nk_euclidean_u8_icelake(nk_u8_t const *a, nk_u8_t const *b, nk_si
526
577
  NK_PUBLIC void nk_sqeuclidean_u8_icelake(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
527
578
  /** @copydoc nk_angular_f64 */
528
579
  NK_PUBLIC void nk_angular_u8_icelake(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result);
580
+ /** @copydoc nk_euclidean_f64 */
581
+ NK_PUBLIC void nk_euclidean_e4m3_icelake(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
582
+ /** @copydoc nk_sqeuclidean_f64 */
583
+ NK_PUBLIC void nk_sqeuclidean_e4m3_icelake(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
584
+ /** @copydoc nk_angular_f64 */
585
+ NK_PUBLIC void nk_angular_e4m3_icelake(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
586
+ /** @copydoc nk_euclidean_f64 */
587
+ NK_PUBLIC void nk_euclidean_e2m3_icelake(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
588
+ /** @copydoc nk_sqeuclidean_f64 */
589
+ NK_PUBLIC void nk_sqeuclidean_e2m3_icelake(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
590
+ /** @copydoc nk_angular_f64 */
591
+ NK_PUBLIC void nk_angular_e2m3_icelake(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
592
+ /** @copydoc nk_euclidean_f64 */
593
+ NK_PUBLIC void nk_euclidean_e3m2_icelake(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
594
+ /** @copydoc nk_sqeuclidean_f64 */
595
+ NK_PUBLIC void nk_sqeuclidean_e3m2_icelake(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
596
+ /** @copydoc nk_angular_f64 */
597
+ NK_PUBLIC void nk_angular_e3m2_icelake(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
529
598
  #endif // NK_TARGET_ICELAKE
530
599
 
531
600
  #if NK_TARGET_GENOA
@@ -536,12 +605,6 @@ NK_PUBLIC void nk_sqeuclidean_bf16_genoa(nk_bf16_t const *a, nk_bf16_t const *b,
536
605
  /** @copydoc nk_angular_f64 */
537
606
  NK_PUBLIC void nk_angular_bf16_genoa(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result);
538
607
  /** @copydoc nk_euclidean_f64 */
539
- NK_PUBLIC void nk_euclidean_e4m3_genoa(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
540
- /** @copydoc nk_sqeuclidean_f64 */
541
- NK_PUBLIC void nk_sqeuclidean_e4m3_genoa(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
542
- /** @copydoc nk_angular_f64 */
543
- NK_PUBLIC void nk_angular_e4m3_genoa(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
544
- /** @copydoc nk_euclidean_f64 */
545
608
  NK_PUBLIC void nk_euclidean_e5m2_genoa(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
546
609
  /** @copydoc nk_sqeuclidean_f64 */
547
610
  NK_PUBLIC void nk_sqeuclidean_e5m2_genoa(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
@@ -549,24 +612,26 @@ NK_PUBLIC void nk_sqeuclidean_e5m2_genoa(nk_e5m2_t const *a, nk_e5m2_t const *b,
549
612
  NK_PUBLIC void nk_angular_e5m2_genoa(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
550
613
  #endif // NK_TARGET_GENOA
551
614
 
552
- #if NK_TARGET_SAPPHIRE
615
+ #if NK_TARGET_DIAMOND
553
616
  /** @copydoc nk_euclidean_f64 */
554
- NK_PUBLIC void nk_euclidean_e4m3_sapphire(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
555
- /** @copydoc nk_euclidean_f64 */
556
- NK_PUBLIC void nk_sqeuclidean_e4m3_sapphire(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
557
- /** @copydoc nk_euclidean_f64 */
558
- NK_PUBLIC void nk_sqeuclidean_e2m3_sapphire(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
559
- /** @copydoc nk_euclidean_f64 */
560
- NK_PUBLIC void nk_sqeuclidean_e3m2_sapphire(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
561
- /** @copydoc nk_euclidean_f64 */
562
- NK_PUBLIC void nk_euclidean_e2m3_sapphire(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
617
+ NK_PUBLIC void nk_euclidean_f16_diamond(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
618
+ /** @copydoc nk_sqeuclidean_f64 */
619
+ NK_PUBLIC void nk_sqeuclidean_f16_diamond(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
620
+ /** @copydoc nk_angular_f64 */
621
+ NK_PUBLIC void nk_angular_f16_diamond(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
563
622
  /** @copydoc nk_euclidean_f64 */
564
- NK_PUBLIC void nk_euclidean_e3m2_sapphire(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
623
+ NK_PUBLIC void nk_euclidean_e4m3_diamond(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
624
+ /** @copydoc nk_sqeuclidean_f64 */
625
+ NK_PUBLIC void nk_sqeuclidean_e4m3_diamond(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
565
626
  /** @copydoc nk_angular_f64 */
566
- NK_PUBLIC void nk_angular_e2m3_sapphire(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
627
+ NK_PUBLIC void nk_angular_e4m3_diamond(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
628
+ /** @copydoc nk_euclidean_f64 */
629
+ NK_PUBLIC void nk_euclidean_e5m2_diamond(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
630
+ /** @copydoc nk_sqeuclidean_f64 */
631
+ NK_PUBLIC void nk_sqeuclidean_e5m2_diamond(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
567
632
  /** @copydoc nk_angular_f64 */
568
- NK_PUBLIC void nk_angular_e3m2_sapphire(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
569
- #endif // NK_TARGET_SAPPHIRE
633
+ NK_PUBLIC void nk_angular_e5m2_diamond(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
634
+ #endif // NK_TARGET_DIAMOND
570
635
 
571
636
  /* SIMD-powered backends for AVX-INT8-VNNI extensions on Xeon 6 CPUs, including Sierra Forest and Granite Rapids.
572
637
  * They pack many "efficiency" cores into a single socket, avoiding heavy 512-bit operations, and focusing on
@@ -591,6 +656,12 @@ NK_PUBLIC void nk_angular_e2m3_sierra(nk_e2m3_t const *a, nk_e2m3_t const *b, nk
591
656
  NK_PUBLIC void nk_euclidean_e2m3_sierra(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
592
657
  /** @copydoc nk_sqeuclidean_f64 */
593
658
  NK_PUBLIC void nk_sqeuclidean_e2m3_sierra(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
659
+ /** @copydoc nk_euclidean_f64 */
660
+ NK_PUBLIC void nk_euclidean_e3m2_sierra(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
661
+ /** @copydoc nk_sqeuclidean_f64 */
662
+ NK_PUBLIC void nk_sqeuclidean_e3m2_sierra(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
663
+ /** @copydoc nk_angular_f64 */
664
+ NK_PUBLIC void nk_angular_e3m2_sierra(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
594
665
  #endif // NK_TARGET_SIERRA
595
666
 
596
667
  #if NK_TARGET_ALDER
@@ -657,6 +728,30 @@ NK_PUBLIC void nk_sqeuclidean_i8_v128relaxed(nk_i8_t const *a, nk_i8_t const *b,
657
728
  NK_PUBLIC void nk_euclidean_i8_v128relaxed(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result);
658
729
  /** @copydoc nk_angular_f64 */
659
730
  NK_PUBLIC void nk_angular_i8_v128relaxed(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result);
731
+ /** @copydoc nk_sqeuclidean_f64 */
732
+ NK_PUBLIC void nk_sqeuclidean_e4m3_v128relaxed(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
733
+ /** @copydoc nk_euclidean_f64 */
734
+ NK_PUBLIC void nk_euclidean_e4m3_v128relaxed(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
735
+ /** @copydoc nk_angular_f64 */
736
+ NK_PUBLIC void nk_angular_e4m3_v128relaxed(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
737
+ /** @copydoc nk_sqeuclidean_f64 */
738
+ NK_PUBLIC void nk_sqeuclidean_e5m2_v128relaxed(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
739
+ /** @copydoc nk_euclidean_f64 */
740
+ NK_PUBLIC void nk_euclidean_e5m2_v128relaxed(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
741
+ /** @copydoc nk_angular_f64 */
742
+ NK_PUBLIC void nk_angular_e5m2_v128relaxed(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
743
+ /** @copydoc nk_sqeuclidean_f64 */
744
+ NK_PUBLIC void nk_sqeuclidean_e2m3_v128relaxed(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
745
+ /** @copydoc nk_euclidean_f64 */
746
+ NK_PUBLIC void nk_euclidean_e2m3_v128relaxed(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
747
+ /** @copydoc nk_angular_f64 */
748
+ NK_PUBLIC void nk_angular_e2m3_v128relaxed(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
749
+ /** @copydoc nk_sqeuclidean_f64 */
750
+ NK_PUBLIC void nk_sqeuclidean_e3m2_v128relaxed(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
751
+ /** @copydoc nk_euclidean_f64 */
752
+ NK_PUBLIC void nk_euclidean_e3m2_v128relaxed(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
753
+ /** @copydoc nk_angular_f64 */
754
+ NK_PUBLIC void nk_angular_e3m2_v128relaxed(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
660
755
  #endif // NK_TARGET_V128RELAXED
661
756
 
662
757
  /* SIMD-powered backends for RISC-V Vector extension, using scalable vector arithmetic.
@@ -806,16 +901,17 @@ NK_INTERNAL nk_dtype_t nk_angular_output_dtype(nk_dtype_t dtype) {
806
901
 
807
902
  #include "numkong/spatial/serial.h"
808
903
  #include "numkong/spatial/neon.h"
809
- #include "numkong/spatial/neonhalf.h"
810
904
  #include "numkong/spatial/neonbfdot.h"
811
905
  #include "numkong/spatial/neonsdot.h"
812
906
  #include "numkong/spatial/sve.h"
813
907
  #include "numkong/spatial/svehalf.h"
814
908
  #include "numkong/spatial/svebfdot.h"
909
+ #include "numkong/spatial/svesdot.h"
910
+ #include "numkong/spatial/neonfp8.h"
815
911
  #include "numkong/spatial/haswell.h"
816
912
  #include "numkong/spatial/skylake.h"
817
913
  #include "numkong/spatial/genoa.h"
818
- #include "numkong/spatial/sapphire.h"
914
+ #include "numkong/spatial/diamond.h"
819
915
  #include "numkong/spatial/icelake.h"
820
916
  #include "numkong/spatial/alder.h"
821
917
  #include "numkong/spatial/sierra.h"
@@ -823,6 +919,8 @@ NK_INTERNAL nk_dtype_t nk_angular_output_dtype(nk_dtype_t dtype) {
823
919
  #include "numkong/spatial/rvvhalf.h"
824
920
  #include "numkong/spatial/rvvbf16.h"
825
921
  #include "numkong/spatial/v128relaxed.h"
922
+ #include "numkong/spatial/powervsx.h"
923
+ #include "numkong/spatial/loongsonasx.h"
826
924
 
827
925
  #if defined(__cplusplus)
828
926
  extern "C" {
@@ -833,6 +931,10 @@ extern "C" {
833
931
  NK_PUBLIC void nk_euclidean_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
834
932
  #if NK_TARGET_V128RELAXED
835
933
  nk_euclidean_f64_v128relaxed(a, b, n, result);
934
+ #elif NK_TARGET_POWERVSX
935
+ nk_euclidean_f64_powervsx(a, b, n, result);
936
+ #elif NK_TARGET_LOONGSONASX
937
+ nk_euclidean_f64_loongsonasx(a, b, n, result);
836
938
  #elif NK_TARGET_RVV
837
939
  nk_euclidean_f64_rvv(a, b, n, result);
838
940
  #elif NK_TARGET_SVE
@@ -851,6 +953,10 @@ NK_PUBLIC void nk_euclidean_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t
851
953
  NK_PUBLIC void nk_sqeuclidean_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
852
954
  #if NK_TARGET_V128RELAXED
853
955
  nk_sqeuclidean_f64_v128relaxed(a, b, n, result);
956
+ #elif NK_TARGET_POWERVSX
957
+ nk_sqeuclidean_f64_powervsx(a, b, n, result);
958
+ #elif NK_TARGET_LOONGSONASX
959
+ nk_sqeuclidean_f64_loongsonasx(a, b, n, result);
854
960
  #elif NK_TARGET_RVV
855
961
  nk_sqeuclidean_f64_rvv(a, b, n, result);
856
962
  #elif NK_TARGET_SVE
@@ -869,6 +975,10 @@ NK_PUBLIC void nk_sqeuclidean_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_
869
975
  NK_PUBLIC void nk_angular_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
870
976
  #if NK_TARGET_V128RELAXED
871
977
  nk_angular_f64_v128relaxed(a, b, n, result);
978
+ #elif NK_TARGET_POWERVSX
979
+ nk_angular_f64_powervsx(a, b, n, result);
980
+ #elif NK_TARGET_LOONGSONASX
981
+ nk_angular_f64_loongsonasx(a, b, n, result);
872
982
  #elif NK_TARGET_RVV
873
983
  nk_angular_f64_rvv(a, b, n, result);
874
984
  #elif NK_TARGET_SVE
@@ -887,6 +997,10 @@ NK_PUBLIC void nk_angular_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
887
997
  NK_PUBLIC void nk_euclidean_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
888
998
  #if NK_TARGET_V128RELAXED
889
999
  nk_euclidean_f32_v128relaxed(a, b, n, result);
1000
+ #elif NK_TARGET_POWERVSX
1001
+ nk_euclidean_f32_powervsx(a, b, n, result);
1002
+ #elif NK_TARGET_LOONGSONASX
1003
+ nk_euclidean_f32_loongsonasx(a, b, n, result);
890
1004
  #elif NK_TARGET_RVV
891
1005
  nk_euclidean_f32_rvv(a, b, n, result);
892
1006
  #elif NK_TARGET_SVE
@@ -905,6 +1019,10 @@ NK_PUBLIC void nk_euclidean_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t
905
1019
  NK_PUBLIC void nk_sqeuclidean_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
906
1020
  #if NK_TARGET_V128RELAXED
907
1021
  nk_sqeuclidean_f32_v128relaxed(a, b, n, result);
1022
+ #elif NK_TARGET_POWERVSX
1023
+ nk_sqeuclidean_f32_powervsx(a, b, n, result);
1024
+ #elif NK_TARGET_LOONGSONASX
1025
+ nk_sqeuclidean_f32_loongsonasx(a, b, n, result);
908
1026
  #elif NK_TARGET_RVV
909
1027
  nk_sqeuclidean_f32_rvv(a, b, n, result);
910
1028
  #elif NK_TARGET_SVE
@@ -923,6 +1041,10 @@ NK_PUBLIC void nk_sqeuclidean_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_
923
1041
  NK_PUBLIC void nk_angular_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
924
1042
  #if NK_TARGET_V128RELAXED
925
1043
  nk_angular_f32_v128relaxed(a, b, n, result);
1044
+ #elif NK_TARGET_POWERVSX
1045
+ nk_angular_f32_powervsx(a, b, n, result);
1046
+ #elif NK_TARGET_LOONGSONASX
1047
+ nk_angular_f32_loongsonasx(a, b, n, result);
926
1048
  #elif NK_TARGET_RVV
927
1049
  nk_angular_f32_rvv(a, b, n, result);
928
1050
  #elif NK_TARGET_SVE
@@ -941,14 +1063,18 @@ NK_PUBLIC void nk_angular_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
941
1063
  NK_PUBLIC void nk_euclidean_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
942
1064
  #if NK_TARGET_V128RELAXED
943
1065
  nk_euclidean_f16_v128relaxed(a, b, n, result);
1066
+ #elif NK_TARGET_POWERVSX
1067
+ nk_euclidean_f16_powervsx(a, b, n, result);
944
1068
  #elif NK_TARGET_RVVHALF
945
1069
  nk_euclidean_f16_rvvhalf(a, b, n, result);
946
1070
  #elif NK_TARGET_RVV
947
1071
  nk_euclidean_f16_rvv(a, b, n, result);
948
1072
  #elif NK_TARGET_SVEHALF
949
1073
  nk_euclidean_f16_svehalf(a, b, n, result);
950
- #elif NK_TARGET_NEONHALF
951
- nk_euclidean_f16_neonhalf(a, b, n, result);
1074
+ #elif NK_TARGET_NEON
1075
+ nk_euclidean_f16_neon(a, b, n, result);
1076
+ #elif NK_TARGET_DIAMOND
1077
+ nk_euclidean_f16_diamond(a, b, n, result);
952
1078
  #elif NK_TARGET_SKYLAKE
953
1079
  nk_euclidean_f16_skylake(a, b, n, result);
954
1080
  #elif NK_TARGET_HASWELL
@@ -961,14 +1087,18 @@ NK_PUBLIC void nk_euclidean_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t
961
1087
  NK_PUBLIC void nk_sqeuclidean_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
962
1088
  #if NK_TARGET_V128RELAXED
963
1089
  nk_sqeuclidean_f16_v128relaxed(a, b, n, result);
1090
+ #elif NK_TARGET_POWERVSX
1091
+ nk_sqeuclidean_f16_powervsx(a, b, n, result);
964
1092
  #elif NK_TARGET_RVVHALF
965
1093
  nk_sqeuclidean_f16_rvvhalf(a, b, n, result);
966
1094
  #elif NK_TARGET_RVV
967
1095
  nk_sqeuclidean_f16_rvv(a, b, n, result);
968
1096
  #elif NK_TARGET_SVEHALF
969
1097
  nk_sqeuclidean_f16_svehalf(a, b, n, result);
970
- #elif NK_TARGET_NEONHALF
971
- nk_sqeuclidean_f16_neonhalf(a, b, n, result);
1098
+ #elif NK_TARGET_NEON
1099
+ nk_sqeuclidean_f16_neon(a, b, n, result);
1100
+ #elif NK_TARGET_DIAMOND
1101
+ nk_sqeuclidean_f16_diamond(a, b, n, result);
972
1102
  #elif NK_TARGET_SKYLAKE
973
1103
  nk_sqeuclidean_f16_skylake(a, b, n, result);
974
1104
  #elif NK_TARGET_HASWELL
@@ -981,14 +1111,18 @@ NK_PUBLIC void nk_sqeuclidean_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_
981
1111
  NK_PUBLIC void nk_angular_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
982
1112
  #if NK_TARGET_V128RELAXED
983
1113
  nk_angular_f16_v128relaxed(a, b, n, result);
1114
+ #elif NK_TARGET_POWERVSX
1115
+ nk_angular_f16_powervsx(a, b, n, result);
984
1116
  #elif NK_TARGET_RVVHALF
985
1117
  nk_angular_f16_rvvhalf(a, b, n, result);
986
1118
  #elif NK_TARGET_RVV
987
1119
  nk_angular_f16_rvv(a, b, n, result);
988
1120
  #elif NK_TARGET_SVEHALF
989
1121
  nk_angular_f16_svehalf(a, b, n, result);
990
- #elif NK_TARGET_NEONHALF
991
- nk_angular_f16_neonhalf(a, b, n, result);
1122
+ #elif NK_TARGET_NEON
1123
+ nk_angular_f16_neon(a, b, n, result);
1124
+ #elif NK_TARGET_DIAMOND
1125
+ nk_angular_f16_diamond(a, b, n, result);
992
1126
  #elif NK_TARGET_SKYLAKE
993
1127
  nk_angular_f16_skylake(a, b, n, result);
994
1128
  #elif NK_TARGET_HASWELL
@@ -1001,6 +1135,10 @@ NK_PUBLIC void nk_angular_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
1001
1135
  NK_PUBLIC void nk_euclidean_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
1002
1136
  #if NK_TARGET_V128RELAXED
1003
1137
  nk_euclidean_bf16_v128relaxed(a, b, n, result);
1138
+ #elif NK_TARGET_POWERVSX
1139
+ nk_euclidean_bf16_powervsx(a, b, n, result);
1140
+ #elif NK_TARGET_LOONGSONASX
1141
+ nk_euclidean_bf16_loongsonasx(a, b, n, result);
1004
1142
  #elif NK_TARGET_RVVBF16
1005
1143
  nk_euclidean_bf16_rvvbf16(a, b, n, result);
1006
1144
  #elif NK_TARGET_RVV
@@ -1021,6 +1159,10 @@ NK_PUBLIC void nk_euclidean_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size
1021
1159
  NK_PUBLIC void nk_sqeuclidean_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
1022
1160
  #if NK_TARGET_V128RELAXED
1023
1161
  nk_sqeuclidean_bf16_v128relaxed(a, b, n, result);
1162
+ #elif NK_TARGET_POWERVSX
1163
+ nk_sqeuclidean_bf16_powervsx(a, b, n, result);
1164
+ #elif NK_TARGET_LOONGSONASX
1165
+ nk_sqeuclidean_bf16_loongsonasx(a, b, n, result);
1024
1166
  #elif NK_TARGET_RVVBF16
1025
1167
  nk_sqeuclidean_bf16_rvvbf16(a, b, n, result);
1026
1168
  #elif NK_TARGET_RVV
@@ -1041,6 +1183,10 @@ NK_PUBLIC void nk_sqeuclidean_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_si
1041
1183
  NK_PUBLIC void nk_angular_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
1042
1184
  #if NK_TARGET_V128RELAXED
1043
1185
  nk_angular_bf16_v128relaxed(a, b, n, result);
1186
+ #elif NK_TARGET_POWERVSX
1187
+ nk_angular_bf16_powervsx(a, b, n, result);
1188
+ #elif NK_TARGET_LOONGSONASX
1189
+ nk_angular_bf16_loongsonasx(a, b, n, result);
1044
1190
  #elif NK_TARGET_RVVBF16
1045
1191
  nk_angular_bf16_rvvbf16(a, b, n, result);
1046
1192
  #elif NK_TARGET_RVV
@@ -1059,84 +1205,118 @@ NK_PUBLIC void nk_angular_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t
1059
1205
  }
1060
1206
 
1061
1207
  NK_PUBLIC void nk_euclidean_e4m3(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
1062
- #if NK_TARGET_SAPPHIRE
1063
- nk_euclidean_e4m3_sapphire(a, b, n, result);
1064
- #elif NK_TARGET_GENOA
1065
- nk_euclidean_e4m3_genoa(a, b, n, result);
1208
+ #if NK_TARGET_NEONFP8
1209
+ nk_euclidean_e4m3_neonfp8(a, b, n, result);
1210
+ #elif NK_TARGET_DIAMOND
1211
+ nk_euclidean_e4m3_diamond(a, b, n, result);
1212
+ #elif NK_TARGET_ICELAKE
1213
+ nk_euclidean_e4m3_icelake(a, b, n, result);
1066
1214
  #elif NK_TARGET_SKYLAKE
1067
1215
  nk_euclidean_e4m3_skylake(a, b, n, result);
1068
1216
  #elif NK_TARGET_RVV
1069
1217
  nk_euclidean_e4m3_rvv(a, b, n, result);
1218
+ #elif NK_TARGET_V128RELAXED
1219
+ nk_euclidean_e4m3_v128relaxed(a, b, n, result);
1070
1220
  #else
1071
1221
  nk_euclidean_e4m3_serial(a, b, n, result);
1072
1222
  #endif
1073
1223
  }
1074
1224
 
1075
1225
  NK_PUBLIC void nk_sqeuclidean_e4m3(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
1076
- #if NK_TARGET_SAPPHIRE
1077
- nk_sqeuclidean_e4m3_sapphire(a, b, n, result);
1078
- #elif NK_TARGET_GENOA
1079
- nk_sqeuclidean_e4m3_genoa(a, b, n, result);
1226
+ #if NK_TARGET_NEONFP8
1227
+ nk_sqeuclidean_e4m3_neonfp8(a, b, n, result);
1228
+ #elif NK_TARGET_DIAMOND
1229
+ nk_sqeuclidean_e4m3_diamond(a, b, n, result);
1230
+ #elif NK_TARGET_ICELAKE
1231
+ nk_sqeuclidean_e4m3_icelake(a, b, n, result);
1080
1232
  #elif NK_TARGET_SKYLAKE
1081
1233
  nk_sqeuclidean_e4m3_skylake(a, b, n, result);
1082
1234
  #elif NK_TARGET_RVV
1083
1235
  nk_sqeuclidean_e4m3_rvv(a, b, n, result);
1236
+ #elif NK_TARGET_V128RELAXED
1237
+ nk_sqeuclidean_e4m3_v128relaxed(a, b, n, result);
1084
1238
  #else
1085
1239
  nk_sqeuclidean_e4m3_serial(a, b, n, result);
1086
1240
  #endif
1087
1241
  }
1088
1242
 
1089
1243
  NK_PUBLIC void nk_angular_e4m3(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
1090
- #if NK_TARGET_GENOA
1091
- nk_angular_e4m3_genoa(a, b, n, result);
1244
+ #if NK_TARGET_NEONFP8
1245
+ nk_angular_e4m3_neonfp8(a, b, n, result);
1246
+ #elif NK_TARGET_DIAMOND
1247
+ nk_angular_e4m3_diamond(a, b, n, result);
1248
+ #elif NK_TARGET_ICELAKE
1249
+ nk_angular_e4m3_icelake(a, b, n, result);
1092
1250
  #elif NK_TARGET_SKYLAKE
1093
1251
  nk_angular_e4m3_skylake(a, b, n, result);
1094
1252
  #elif NK_TARGET_RVV
1095
1253
  nk_angular_e4m3_rvv(a, b, n, result);
1254
+ #elif NK_TARGET_V128RELAXED
1255
+ nk_angular_e4m3_v128relaxed(a, b, n, result);
1096
1256
  #else
1097
1257
  nk_angular_e4m3_serial(a, b, n, result);
1098
1258
  #endif
1099
1259
  }
1100
1260
 
1101
1261
  NK_PUBLIC void nk_euclidean_e5m2(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
1102
- #if NK_TARGET_GENOA
1262
+ #if NK_TARGET_NEONFP8
1263
+ nk_euclidean_e5m2_neonfp8(a, b, n, result);
1264
+ #elif NK_TARGET_DIAMOND
1265
+ nk_euclidean_e5m2_diamond(a, b, n, result);
1266
+ #elif NK_TARGET_GENOA
1103
1267
  nk_euclidean_e5m2_genoa(a, b, n, result);
1104
1268
  #elif NK_TARGET_SKYLAKE
1105
1269
  nk_euclidean_e5m2_skylake(a, b, n, result);
1106
1270
  #elif NK_TARGET_RVV
1107
1271
  nk_euclidean_e5m2_rvv(a, b, n, result);
1272
+ #elif NK_TARGET_V128RELAXED
1273
+ nk_euclidean_e5m2_v128relaxed(a, b, n, result);
1108
1274
  #else
1109
1275
  nk_euclidean_e5m2_serial(a, b, n, result);
1110
1276
  #endif
1111
1277
  }
1112
1278
 
1113
1279
  NK_PUBLIC void nk_sqeuclidean_e5m2(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
1114
- #if NK_TARGET_GENOA
1280
+ #if NK_TARGET_NEONFP8
1281
+ nk_sqeuclidean_e5m2_neonfp8(a, b, n, result);
1282
+ #elif NK_TARGET_DIAMOND
1283
+ nk_sqeuclidean_e5m2_diamond(a, b, n, result);
1284
+ #elif NK_TARGET_GENOA
1115
1285
  nk_sqeuclidean_e5m2_genoa(a, b, n, result);
1116
1286
  #elif NK_TARGET_SKYLAKE
1117
1287
  nk_sqeuclidean_e5m2_skylake(a, b, n, result);
1118
1288
  #elif NK_TARGET_RVV
1119
1289
  nk_sqeuclidean_e5m2_rvv(a, b, n, result);
1290
+ #elif NK_TARGET_V128RELAXED
1291
+ nk_sqeuclidean_e5m2_v128relaxed(a, b, n, result);
1120
1292
  #else
1121
1293
  nk_sqeuclidean_e5m2_serial(a, b, n, result);
1122
1294
  #endif
1123
1295
  }
1124
1296
 
1125
1297
  NK_PUBLIC void nk_angular_e5m2(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
1126
- #if NK_TARGET_GENOA
1298
+ #if NK_TARGET_NEONFP8
1299
+ nk_angular_e5m2_neonfp8(a, b, n, result);
1300
+ #elif NK_TARGET_DIAMOND
1301
+ nk_angular_e5m2_diamond(a, b, n, result);
1302
+ #elif NK_TARGET_GENOA
1127
1303
  nk_angular_e5m2_genoa(a, b, n, result);
1128
1304
  #elif NK_TARGET_SKYLAKE
1129
1305
  nk_angular_e5m2_skylake(a, b, n, result);
1130
1306
  #elif NK_TARGET_RVV
1131
1307
  nk_angular_e5m2_rvv(a, b, n, result);
1308
+ #elif NK_TARGET_V128RELAXED
1309
+ nk_angular_e5m2_v128relaxed(a, b, n, result);
1132
1310
  #else
1133
1311
  nk_angular_e5m2_serial(a, b, n, result);
1134
1312
  #endif
1135
1313
  }
1136
1314
 
1137
1315
  NK_PUBLIC void nk_euclidean_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
1138
- #if NK_TARGET_SAPPHIRE
1139
- nk_euclidean_e2m3_sapphire(a, b, n, result);
1316
+ #if NK_TARGET_NEONFP8
1317
+ nk_euclidean_e2m3_neonfp8(a, b, n, result);
1318
+ #elif NK_TARGET_ICELAKE
1319
+ nk_euclidean_e2m3_icelake(a, b, n, result);
1140
1320
  #elif NK_TARGET_SKYLAKE
1141
1321
  nk_euclidean_e2m3_skylake(a, b, n, result);
1142
1322
  #elif NK_TARGET_SIERRA
@@ -1147,14 +1327,18 @@ NK_PUBLIC void nk_euclidean_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size
1147
1327
  nk_euclidean_e2m3_haswell(a, b, n, result);
1148
1328
  #elif NK_TARGET_NEON
1149
1329
  nk_euclidean_e2m3_neon(a, b, n, result);
1330
+ #elif NK_TARGET_V128RELAXED
1331
+ nk_euclidean_e2m3_v128relaxed(a, b, n, result);
1150
1332
  #else
1151
1333
  nk_euclidean_e2m3_serial(a, b, n, result);
1152
1334
  #endif
1153
1335
  }
1154
1336
 
1155
1337
  NK_PUBLIC void nk_sqeuclidean_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
1156
- #if NK_TARGET_SAPPHIRE
1157
- nk_sqeuclidean_e2m3_sapphire(a, b, n, result);
1338
+ #if NK_TARGET_NEONFP8
1339
+ nk_sqeuclidean_e2m3_neonfp8(a, b, n, result);
1340
+ #elif NK_TARGET_ICELAKE
1341
+ nk_sqeuclidean_e2m3_icelake(a, b, n, result);
1158
1342
  #elif NK_TARGET_SKYLAKE
1159
1343
  nk_sqeuclidean_e2m3_skylake(a, b, n, result);
1160
1344
  #elif NK_TARGET_SIERRA
@@ -1165,14 +1349,18 @@ NK_PUBLIC void nk_sqeuclidean_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_si
1165
1349
  nk_sqeuclidean_e2m3_haswell(a, b, n, result);
1166
1350
  #elif NK_TARGET_NEON
1167
1351
  nk_sqeuclidean_e2m3_neon(a, b, n, result);
1352
+ #elif NK_TARGET_V128RELAXED
1353
+ nk_sqeuclidean_e2m3_v128relaxed(a, b, n, result);
1168
1354
  #else
1169
1355
  nk_sqeuclidean_e2m3_serial(a, b, n, result);
1170
1356
  #endif
1171
1357
  }
1172
1358
 
1173
1359
  NK_PUBLIC void nk_angular_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
1174
- #if NK_TARGET_SAPPHIRE
1175
- nk_angular_e2m3_sapphire(a, b, n, result);
1360
+ #if NK_TARGET_NEONFP8
1361
+ nk_angular_e2m3_neonfp8(a, b, n, result);
1362
+ #elif NK_TARGET_ICELAKE
1363
+ nk_angular_e2m3_icelake(a, b, n, result);
1176
1364
  #elif NK_TARGET_SKYLAKE
1177
1365
  nk_angular_e2m3_skylake(a, b, n, result);
1178
1366
  #elif NK_TARGET_SIERRA
@@ -1183,54 +1371,74 @@ NK_PUBLIC void nk_angular_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t
1183
1371
  nk_angular_e2m3_haswell(a, b, n, result);
1184
1372
  #elif NK_TARGET_NEON
1185
1373
  nk_angular_e2m3_neon(a, b, n, result);
1374
+ #elif NK_TARGET_V128RELAXED
1375
+ nk_angular_e2m3_v128relaxed(a, b, n, result);
1186
1376
  #else
1187
1377
  nk_angular_e2m3_serial(a, b, n, result);
1188
1378
  #endif
1189
1379
  }
1190
1380
 
1191
1381
  NK_PUBLIC void nk_euclidean_e3m2(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
1192
- #if NK_TARGET_SAPPHIRE
1193
- nk_euclidean_e3m2_sapphire(a, b, n, result);
1382
+ #if NK_TARGET_NEONFP8
1383
+ nk_euclidean_e3m2_neonfp8(a, b, n, result);
1384
+ #elif NK_TARGET_ICELAKE
1385
+ nk_euclidean_e3m2_icelake(a, b, n, result);
1194
1386
  #elif NK_TARGET_SKYLAKE
1195
1387
  nk_euclidean_e3m2_skylake(a, b, n, result);
1388
+ #elif NK_TARGET_SIERRA
1389
+ nk_euclidean_e3m2_sierra(a, b, n, result);
1196
1390
  #elif NK_TARGET_ALDER
1197
1391
  nk_euclidean_e3m2_alder(a, b, n, result);
1198
1392
  #elif NK_TARGET_HASWELL
1199
1393
  nk_euclidean_e3m2_haswell(a, b, n, result);
1200
1394
  #elif NK_TARGET_NEON
1201
1395
  nk_euclidean_e3m2_neon(a, b, n, result);
1396
+ #elif NK_TARGET_V128RELAXED
1397
+ nk_euclidean_e3m2_v128relaxed(a, b, n, result);
1202
1398
  #else
1203
1399
  nk_euclidean_e3m2_serial(a, b, n, result);
1204
1400
  #endif
1205
1401
  }
1206
1402
 
1207
1403
  NK_PUBLIC void nk_sqeuclidean_e3m2(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
1208
- #if NK_TARGET_SAPPHIRE
1209
- nk_sqeuclidean_e3m2_sapphire(a, b, n, result);
1404
+ #if NK_TARGET_NEONFP8
1405
+ nk_sqeuclidean_e3m2_neonfp8(a, b, n, result);
1406
+ #elif NK_TARGET_ICELAKE
1407
+ nk_sqeuclidean_e3m2_icelake(a, b, n, result);
1210
1408
  #elif NK_TARGET_SKYLAKE
1211
1409
  nk_sqeuclidean_e3m2_skylake(a, b, n, result);
1410
+ #elif NK_TARGET_SIERRA
1411
+ nk_sqeuclidean_e3m2_sierra(a, b, n, result);
1212
1412
  #elif NK_TARGET_ALDER
1213
1413
  nk_sqeuclidean_e3m2_alder(a, b, n, result);
1214
1414
  #elif NK_TARGET_HASWELL
1215
1415
  nk_sqeuclidean_e3m2_haswell(a, b, n, result);
1216
1416
  #elif NK_TARGET_NEON
1217
1417
  nk_sqeuclidean_e3m2_neon(a, b, n, result);
1418
+ #elif NK_TARGET_V128RELAXED
1419
+ nk_sqeuclidean_e3m2_v128relaxed(a, b, n, result);
1218
1420
  #else
1219
1421
  nk_sqeuclidean_e3m2_serial(a, b, n, result);
1220
1422
  #endif
1221
1423
  }
1222
1424
 
1223
1425
  NK_PUBLIC void nk_angular_e3m2(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
1224
- #if NK_TARGET_SAPPHIRE
1225
- nk_angular_e3m2_sapphire(a, b, n, result);
1426
+ #if NK_TARGET_NEONFP8
1427
+ nk_angular_e3m2_neonfp8(a, b, n, result);
1428
+ #elif NK_TARGET_ICELAKE
1429
+ nk_angular_e3m2_icelake(a, b, n, result);
1226
1430
  #elif NK_TARGET_SKYLAKE
1227
1431
  nk_angular_e3m2_skylake(a, b, n, result);
1432
+ #elif NK_TARGET_SIERRA
1433
+ nk_angular_e3m2_sierra(a, b, n, result);
1228
1434
  #elif NK_TARGET_ALDER
1229
1435
  nk_angular_e3m2_alder(a, b, n, result);
1230
1436
  #elif NK_TARGET_HASWELL
1231
1437
  nk_angular_e3m2_haswell(a, b, n, result);
1232
1438
  #elif NK_TARGET_NEON
1233
1439
  nk_angular_e3m2_neon(a, b, n, result);
1440
+ #elif NK_TARGET_V128RELAXED
1441
+ nk_angular_e3m2_v128relaxed(a, b, n, result);
1234
1442
  #else
1235
1443
  nk_angular_e3m2_serial(a, b, n, result);
1236
1444
  #endif
@@ -1239,6 +1447,12 @@ NK_PUBLIC void nk_angular_e3m2(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t
1239
1447
  NK_PUBLIC void nk_euclidean_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result) {
1240
1448
  #if NK_TARGET_RVV
1241
1449
  nk_euclidean_i8_rvv(a, b, n, result);
1450
+ #elif NK_TARGET_POWERVSX
1451
+ nk_euclidean_i8_powervsx(a, b, n, result);
1452
+ #elif NK_TARGET_LOONGSONASX
1453
+ nk_euclidean_i8_loongsonasx(a, b, n, result);
1454
+ #elif NK_TARGET_SVESDOT
1455
+ nk_euclidean_i8_svesdot(a, b, n, result);
1242
1456
  #elif NK_TARGET_NEONSDOT
1243
1457
  nk_euclidean_i8_neonsdot(a, b, n, result);
1244
1458
  #elif NK_TARGET_ICELAKE
@@ -1259,6 +1473,12 @@ NK_PUBLIC void nk_euclidean_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n,
1259
1473
  NK_PUBLIC void nk_sqeuclidean_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_u32_t *result) {
1260
1474
  #if NK_TARGET_RVV
1261
1475
  nk_sqeuclidean_i8_rvv(a, b, n, result);
1476
+ #elif NK_TARGET_POWERVSX
1477
+ nk_sqeuclidean_i8_powervsx(a, b, n, result);
1478
+ #elif NK_TARGET_LOONGSONASX
1479
+ nk_sqeuclidean_i8_loongsonasx(a, b, n, result);
1480
+ #elif NK_TARGET_SVESDOT
1481
+ nk_sqeuclidean_i8_svesdot(a, b, n, result);
1262
1482
  #elif NK_TARGET_NEONSDOT
1263
1483
  nk_sqeuclidean_i8_neonsdot(a, b, n, result);
1264
1484
  #elif NK_TARGET_ICELAKE
@@ -1279,6 +1499,12 @@ NK_PUBLIC void nk_sqeuclidean_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n
1279
1499
  NK_PUBLIC void nk_angular_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t *result) {
1280
1500
  #if NK_TARGET_RVV
1281
1501
  nk_angular_i8_rvv(a, b, n, result);
1502
+ #elif NK_TARGET_POWERVSX
1503
+ nk_angular_i8_powervsx(a, b, n, result);
1504
+ #elif NK_TARGET_LOONGSONASX
1505
+ nk_angular_i8_loongsonasx(a, b, n, result);
1506
+ #elif NK_TARGET_SVESDOT
1507
+ nk_angular_i8_svesdot(a, b, n, result);
1282
1508
  #elif NK_TARGET_NEONSDOT
1283
1509
  nk_angular_i8_neonsdot(a, b, n, result);
1284
1510
  #elif NK_TARGET_ICELAKE
@@ -1299,6 +1525,12 @@ NK_PUBLIC void nk_angular_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk
1299
1525
  NK_PUBLIC void nk_euclidean_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result) {
1300
1526
  #if NK_TARGET_RVV
1301
1527
  nk_euclidean_u8_rvv(a, b, n, result);
1528
+ #elif NK_TARGET_POWERVSX
1529
+ nk_euclidean_u8_powervsx(a, b, n, result);
1530
+ #elif NK_TARGET_LOONGSONASX
1531
+ nk_euclidean_u8_loongsonasx(a, b, n, result);
1532
+ #elif NK_TARGET_SVESDOT
1533
+ nk_euclidean_u8_svesdot(a, b, n, result);
1302
1534
  #elif NK_TARGET_NEONSDOT
1303
1535
  nk_euclidean_u8_neonsdot(a, b, n, result);
1304
1536
  #elif NK_TARGET_ICELAKE
@@ -1319,6 +1551,12 @@ NK_PUBLIC void nk_euclidean_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n,
1319
1551
  NK_PUBLIC void nk_sqeuclidean_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
1320
1552
  #if NK_TARGET_RVV
1321
1553
  nk_sqeuclidean_u8_rvv(a, b, n, result);
1554
+ #elif NK_TARGET_POWERVSX
1555
+ nk_sqeuclidean_u8_powervsx(a, b, n, result);
1556
+ #elif NK_TARGET_LOONGSONASX
1557
+ nk_sqeuclidean_u8_loongsonasx(a, b, n, result);
1558
+ #elif NK_TARGET_SVESDOT
1559
+ nk_sqeuclidean_u8_svesdot(a, b, n, result);
1322
1560
  #elif NK_TARGET_NEONSDOT
1323
1561
  nk_sqeuclidean_u8_neonsdot(a, b, n, result);
1324
1562
  #elif NK_TARGET_ICELAKE
@@ -1339,6 +1577,12 @@ NK_PUBLIC void nk_sqeuclidean_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n
1339
1577
  NK_PUBLIC void nk_angular_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t *result) {
1340
1578
  #if NK_TARGET_RVV
1341
1579
  nk_angular_u8_rvv(a, b, n, result);
1580
+ #elif NK_TARGET_POWERVSX
1581
+ nk_angular_u8_powervsx(a, b, n, result);
1582
+ #elif NK_TARGET_LOONGSONASX
1583
+ nk_angular_u8_loongsonasx(a, b, n, result);
1584
+ #elif NK_TARGET_SVESDOT
1585
+ nk_angular_u8_svesdot(a, b, n, result);
1342
1586
  #elif NK_TARGET_NEONSDOT
1343
1587
  nk_angular_u8_neonsdot(a, b, n, result);
1344
1588
  #elif NK_TARGET_ICELAKE
@@ -1359,6 +1603,8 @@ NK_PUBLIC void nk_angular_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk
1359
1603
  NK_PUBLIC void nk_euclidean_i4(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_f32_t *result) {
1360
1604
  #if NK_TARGET_ICELAKE
1361
1605
  nk_euclidean_i4_icelake(a, b, n, result);
1606
+ #elif NK_TARGET_NEONSDOT
1607
+ nk_euclidean_i4_neonsdot(a, b, n, result);
1362
1608
  #elif NK_TARGET_RVV
1363
1609
  nk_euclidean_i4_rvv(a, b, n, result);
1364
1610
  #else
@@ -1369,6 +1615,8 @@ NK_PUBLIC void nk_euclidean_i4(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t
1369
1615
  NK_PUBLIC void nk_sqeuclidean_i4(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_u32_t *result) {
1370
1616
  #if NK_TARGET_ICELAKE
1371
1617
  nk_sqeuclidean_i4_icelake(a, b, n, result);
1618
+ #elif NK_TARGET_NEONSDOT
1619
+ nk_sqeuclidean_i4_neonsdot(a, b, n, result);
1372
1620
  #elif NK_TARGET_RVV
1373
1621
  nk_sqeuclidean_i4_rvv(a, b, n, result);
1374
1622
  #else
@@ -1379,6 +1627,8 @@ NK_PUBLIC void nk_sqeuclidean_i4(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size
1379
1627
  NK_PUBLIC void nk_angular_i4(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n, nk_f32_t *result) {
1380
1628
  #if NK_TARGET_ICELAKE
1381
1629
  nk_angular_i4_icelake(a, b, n, result);
1630
+ #elif NK_TARGET_NEONSDOT
1631
+ nk_angular_i4_neonsdot(a, b, n, result);
1382
1632
  #elif NK_TARGET_RVV
1383
1633
  nk_angular_i4_rvv(a, b, n, result);
1384
1634
  #else
@@ -1389,6 +1639,8 @@ NK_PUBLIC void nk_angular_i4(nk_i4x2_t const *a, nk_i4x2_t const *b, nk_size_t n
1389
1639
  NK_PUBLIC void nk_euclidean_u4(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_f32_t *result) {
1390
1640
  #if NK_TARGET_ICELAKE
1391
1641
  nk_euclidean_u4_icelake(a, b, n, result);
1642
+ #elif NK_TARGET_NEONSDOT
1643
+ nk_euclidean_u4_neonsdot(a, b, n, result);
1392
1644
  #elif NK_TARGET_RVV
1393
1645
  nk_euclidean_u4_rvv(a, b, n, result);
1394
1646
  #else
@@ -1399,6 +1651,8 @@ NK_PUBLIC void nk_euclidean_u4(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t
1399
1651
  NK_PUBLIC void nk_sqeuclidean_u4(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_u32_t *result) {
1400
1652
  #if NK_TARGET_ICELAKE
1401
1653
  nk_sqeuclidean_u4_icelake(a, b, n, result);
1654
+ #elif NK_TARGET_NEONSDOT
1655
+ nk_sqeuclidean_u4_neonsdot(a, b, n, result);
1402
1656
  #elif NK_TARGET_RVV
1403
1657
  nk_sqeuclidean_u4_rvv(a, b, n, result);
1404
1658
  #else
@@ -1409,6 +1663,8 @@ NK_PUBLIC void nk_sqeuclidean_u4(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size
1409
1663
  NK_PUBLIC void nk_angular_u4(nk_u4x2_t const *a, nk_u4x2_t const *b, nk_size_t n, nk_f32_t *result) {
1410
1664
  #if NK_TARGET_ICELAKE
1411
1665
  nk_angular_u4_icelake(a, b, n, result);
1666
+ #elif NK_TARGET_NEONSDOT
1667
+ nk_angular_u4_neonsdot(a, b, n, result);
1412
1668
  #elif NK_TARGET_RVV
1413
1669
  nk_angular_u4_rvv(a, b, n, result);
1414
1670
  #else