numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294)
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,1121 @@
+ /**
+  * @brief SIMD-accelerated Elementwise Arithmetic for RISC-V.
+  * @file include/numkong/each/rvv.h
+  * @author Ash Vardanian
+  * @date February 6, 2026
+  *
+  * @sa include/numkong/each.h
+  */
+ #ifndef NK_EACH_RVV_H
+ #define NK_EACH_RVV_H
+
+ #if NK_TARGET_RISCV_
+ #if NK_TARGET_RVV
+
+ #include "numkong/types.h"
+ #include "numkong/cast/rvv.h"
+
+ #if defined(__clang__)
+ #pragma clang attribute push(__attribute__((target("arch=+v"))), apply_to = function)
+ #elif defined(__GNUC__)
+ #pragma GCC push_options
+ #pragma GCC target("arch=+v")
+ #endif
+
+ #if defined(__cplusplus)
+ extern "C" {
+ #endif
+
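+ /* Every kernel below uses the same strip-mining loop: `vsetvl` returns how
+  * many lanes the hardware processes this iteration, the pointers advance by
+  * that count, and the final partial vector is handled without a scalar tail
+  * loop. Callers pass plain arrays, e.g. nk_each_sum_f32_rvv(a, b, n, out). */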
+ NK_PUBLIC void nk_each_sum_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e64m4(n);
+         vfloat64m4_t a_f64m4 = __riscv_vle64_v_f64m4(a, vector_length);
+         vfloat64m4_t b_f64m4 = __riscv_vle64_v_f64m4(b, vector_length);
+         __riscv_vse64_v_f64m4(result, __riscv_vfadd_vv_f64m4(a_f64m4, b_f64m4, vector_length), vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_sum_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e32m4(n);
+         vfloat32m4_t a_f32m4 = __riscv_vle32_v_f32m4(a, vector_length);
+         vfloat32m4_t b_f32m4 = __riscv_vle32_v_f32m4(b, vector_length);
+         __riscv_vse32_v_f32m4(result, __riscv_vfadd_vv_f32m4(a_f32m4, b_f32m4, vector_length), vector_length);
+     }
+ }
+
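+ /* f16 and bf16 values are loaded as raw u16 bit patterns, widened to f32
+  * with the cast helpers from "numkong/cast/rvv.h", added in f32, and
+  * narrowed back on the store. */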
+ NK_PUBLIC void nk_each_sum_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f16_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e16m1(n);
+         vuint16m1_t a_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)a, vector_length);
+         vuint16m1_t b_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)b, vector_length);
+         vfloat32m2_t a_f32m2 = nk_f16m1_to_f32m2_rvv_(a_u16m1, vector_length);
+         vfloat32m2_t b_f32m2 = nk_f16m1_to_f32m2_rvv_(b_u16m1, vector_length);
+         vfloat32m2_t result_f32m2 = __riscv_vfadd_vv_f32m2(a_f32m2, b_f32m2, vector_length);
+         vuint16m1_t result_u16m1 = nk_f32m2_to_f16m1_rvv_(result_f32m2, vector_length);
+         __riscv_vse16_v_u16m1((nk_u16_t *)result, result_u16m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_sum_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_bf16_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e16m1(n);
+         vuint16m1_t a_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)a, vector_length);
+         vuint16m1_t b_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)b, vector_length);
+         vfloat32m2_t a_f32m2 = nk_bf16m1_to_f32m2_rvv_(a_u16m1, vector_length);
+         vfloat32m2_t b_f32m2 = nk_bf16m1_to_f32m2_rvv_(b_u16m1, vector_length);
+         vfloat32m2_t result_f32m2 = __riscv_vfadd_vv_f32m2(a_f32m2, b_f32m2, vector_length);
+         vuint16m1_t result_u16m1 = nk_f32m2_to_bf16m1_rvv_(result_f32m2, vector_length);
+         __riscv_vse16_v_u16m1((nk_u16_t *)result, result_u16m1, vector_length);
+     }
+ }
+
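+ /* Integer sums use the saturating vsadd/vsaddu instructions, so overflow
+  * clamps to the type's range instead of wrapping. */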
+ NK_PUBLIC void nk_each_sum_i8_rvv(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i8_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m4(n);
+         vint8m4_t a_i8m4 = __riscv_vle8_v_i8m4(a, vector_length);
+         vint8m4_t b_i8m4 = __riscv_vle8_v_i8m4(b, vector_length);
+         __riscv_vse8_v_i8m4(result, __riscv_vsadd_vv_i8m4(a_i8m4, b_i8m4, vector_length), vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_sum_u8_rvv(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u8_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m4(n);
+         vuint8m4_t a_u8m4 = __riscv_vle8_v_u8m4(a, vector_length);
+         vuint8m4_t b_u8m4 = __riscv_vle8_v_u8m4(b, vector_length);
+         __riscv_vse8_v_u8m4(result, __riscv_vsaddu_vv_u8m4(a_u8m4, b_u8m4, vector_length), vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_sum_i16_rvv(nk_i16_t const *a, nk_i16_t const *b, nk_size_t n, nk_i16_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e16m4(n);
+         vint16m4_t a_i16m4 = __riscv_vle16_v_i16m4(a, vector_length);
+         vint16m4_t b_i16m4 = __riscv_vle16_v_i16m4(b, vector_length);
+         __riscv_vse16_v_i16m4(result, __riscv_vsadd_vv_i16m4(a_i16m4, b_i16m4, vector_length), vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_sum_u16_rvv(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_u16_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e16m4(n);
+         vuint16m4_t a_u16m4 = __riscv_vle16_v_u16m4(a, vector_length);
+         vuint16m4_t b_u16m4 = __riscv_vle16_v_u16m4(b, vector_length);
+         __riscv_vse16_v_u16m4(result, __riscv_vsaddu_vv_u16m4(a_u16m4, b_u16m4, vector_length), vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_sum_i32_rvv(nk_i32_t const *a, nk_i32_t const *b, nk_size_t n, nk_i32_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e32m4(n);
+         vint32m4_t a_i32m4 = __riscv_vle32_v_i32m4(a, vector_length);
+         vint32m4_t b_i32m4 = __riscv_vle32_v_i32m4(b, vector_length);
+         __riscv_vse32_v_i32m4(result, __riscv_vsadd_vv_i32m4(a_i32m4, b_i32m4, vector_length), vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_sum_u32_rvv(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_u32_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e32m4(n);
+         vuint32m4_t a_u32m4 = __riscv_vle32_v_u32m4(a, vector_length);
+         vuint32m4_t b_u32m4 = __riscv_vle32_v_u32m4(b, vector_length);
+         __riscv_vse32_v_u32m4(result, __riscv_vsaddu_vv_u32m4(a_u32m4, b_u32m4, vector_length), vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_sum_i64_rvv(nk_i64_t const *a, nk_i64_t const *b, nk_size_t n, nk_i64_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e64m4(n);
+         vint64m4_t a_i64m4 = __riscv_vle64_v_i64m4(a, vector_length);
+         vint64m4_t b_i64m4 = __riscv_vle64_v_i64m4(b, vector_length);
+         __riscv_vse64_v_i64m4(result, __riscv_vsadd_vv_i64m4(a_i64m4, b_i64m4, vector_length), vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_sum_u64_rvv(nk_u64_t const *a, nk_u64_t const *b, nk_size_t n, nk_u64_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e64m4(n);
+         vuint64m4_t a_u64m4 = __riscv_vle64_v_u64m4(a, vector_length);
+         vuint64m4_t b_u64m4 = __riscv_vle64_v_u64m4(b, vector_length);
+         __riscv_vse64_v_u64m4(result, __riscv_vsaddu_vv_u64m4(a_u64m4, b_u64m4, vector_length), vector_length);
+     }
+ }
+
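+ /* The 8-bit float formats (e4m3, e5m2) round-trip through f32 at LMUL=4:
+  * one u8 load, a conversion helper each way, and a plain f32 add between. */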
+ NK_PUBLIC void nk_each_sum_e4m3_rvv(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_e4m3_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a, vector_length);
+         vuint8m1_t b_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)b, vector_length);
+         vfloat32m4_t a_f32m4 = nk_e4m3m1_to_f32m4_rvv_(a_u8m1, vector_length);
+         vfloat32m4_t b_f32m4 = nk_e4m3m1_to_f32m4_rvv_(b_u8m1, vector_length);
+         vfloat32m4_t result_f32m4 = __riscv_vfadd_vv_f32m4(a_f32m4, b_f32m4, vector_length);
+         vuint8m1_t result_u8m1 = nk_f32m4_to_e4m3m1_rvv_(result_f32m4, vector_length);
+         __riscv_vse8_v_u8m1((nk_u8_t *)result, result_u8m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_sum_e5m2_rvv(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_e5m2_t *result) {
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a, vector_length);
+         vuint8m1_t b_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)b, vector_length);
+         vfloat32m4_t a_f32m4 = nk_e5m2m1_to_f32m4_rvv_(a_u8m1, vector_length);
+         vfloat32m4_t b_f32m4 = nk_e5m2m1_to_f32m4_rvv_(b_u8m1, vector_length);
+         vfloat32m4_t result_f32m4 = __riscv_vfadd_vv_f32m4(a_f32m4, b_f32m4, vector_length);
+         vuint8m1_t result_u8m1 = nk_f32m4_to_e5m2m1_rvv_(result_f32m4, vector_length);
+         __riscv_vse8_v_u8m1((nk_u8_t *)result, result_u8m1, vector_length);
+     }
+ }
+
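+ /* Scaling kernels compute result[i] = alpha * a[i] + beta. The broadcast
+  * beta vector is built once at VLMAX, and vfmadd fuses the multiply and
+  * add into one instruction per strip. */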
+ NK_PUBLIC void nk_each_scale_f64_rvv(nk_f64_t const *a, nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta,
+                                      nk_f64_t *result) {
+     nk_f64_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e64m4();
+     vfloat64m4_t beta_f64m4 = __riscv_vfmv_v_f_f64m4(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e64m4(n);
+         vfloat64m4_t a_f64m4 = __riscv_vle64_v_f64m4(a, vector_length);
+         a_f64m4 = __riscv_vfmadd_vf_f64m4(a_f64m4, alpha_val, beta_f64m4, vector_length);
+         __riscv_vse64_v_f64m4(result, a_f64m4, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_scale_f32_rvv(nk_f32_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
+                                      nk_f32_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e32m4();
+     vfloat32m4_t beta_f32m4 = __riscv_vfmv_v_f_f32m4(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e32m4(n);
+         vfloat32m4_t a_f32m4 = __riscv_vle32_v_f32m4(a, vector_length);
+         a_f32m4 = __riscv_vfmadd_vf_f32m4(a_f32m4, alpha_val, beta_f32m4, vector_length);
+         __riscv_vse32_v_f32m4(result, a_f32m4, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_scale_f16_rvv(nk_f16_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
+                                      nk_f16_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e32m2();
+     vfloat32m2_t beta_f32m2 = __riscv_vfmv_v_f_f32m2(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e16m1(n);
+         vuint16m1_t a_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)a, vector_length);
+         vfloat32m2_t a_f32m2 = nk_f16m1_to_f32m2_rvv_(a_u16m1, vector_length);
+         a_f32m2 = __riscv_vfmadd_vf_f32m2(a_f32m2, alpha_val, beta_f32m2, vector_length);
+         vuint16m1_t result_u16m1 = nk_f32m2_to_f16m1_rvv_(a_f32m2, vector_length);
+         __riscv_vse16_v_u16m1((nk_u16_t *)result, result_u16m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_scale_bf16_rvv(nk_bf16_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
+                                       nk_bf16_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e32m2();
+     vfloat32m2_t beta_f32m2 = __riscv_vfmv_v_f_f32m2(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e16m1(n);
+         vuint16m1_t a_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)a, vector_length);
+         vfloat32m2_t a_f32m2 = nk_bf16m1_to_f32m2_rvv_(a_u16m1, vector_length);
+         a_f32m2 = __riscv_vfmadd_vf_f32m2(a_f32m2, alpha_val, beta_f32m2, vector_length);
+         vuint16m1_t result_u16m1 = nk_f32m2_to_bf16m1_rvv_(a_f32m2, vector_length);
+         __riscv_vse16_v_u16m1((nk_u16_t *)result, result_u16m1, vector_length);
+     }
+ }
+
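+ /* Integer scaling widens each lane to floating point (f32 for 8- and 16-bit
+  * inputs, f64 for wider ones), applies the same fused multiply-add, then
+  * clamps where the destination range requires it before converting back. */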
+ NK_PUBLIC void nk_each_scale_i8_rvv(nk_i8_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
+                                     nk_i8_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e32m4();
+     vfloat32m4_t beta_f32m4 = __riscv_vfmv_v_f_f32m4(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vint8m1_t a_i8m1 = __riscv_vle8_v_i8m1(a, vector_length);
+         vint16m2_t a_i16m2 = __riscv_vwadd_vx_i16m2(a_i8m1, 0, vector_length);
+         vint32m4_t a_i32m4 = __riscv_vwadd_vx_i32m4(a_i16m2, 0, vector_length);
+         vfloat32m4_t a_f32m4 = __riscv_vfcvt_f_x_v_f32m4(a_i32m4, vector_length);
+         a_f32m4 = __riscv_vfmadd_vf_f32m4(a_f32m4, alpha_val, beta_f32m4, vector_length);
+         vint32m4_t result_i32m4 = __riscv_vfcvt_x_f_v_i32m4(a_f32m4, vector_length);
+         result_i32m4 = __riscv_vmax_vx_i32m4(result_i32m4, -128, vector_length);
+         result_i32m4 = __riscv_vmin_vx_i32m4(result_i32m4, 127, vector_length);
+         vint16m2_t result_i16m2 = __riscv_vncvt_x_x_w_i16m2(result_i32m4, vector_length);
+         vint8m1_t result_i8m1 = __riscv_vncvt_x_x_w_i8m1(result_i16m2, vector_length);
+         __riscv_vse8_v_i8m1(result, result_i8m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_scale_u8_rvv(nk_u8_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
+                                     nk_u8_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e32m4();
+     vfloat32m4_t beta_f32m4 = __riscv_vfmv_v_f_f32m4(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1(a, vector_length);
+         vuint16m2_t a_u16m2 = __riscv_vwaddu_vx_u16m2(a_u8m1, 0, vector_length);
+         vuint32m4_t a_u32m4 = __riscv_vwaddu_vx_u32m4(a_u16m2, 0, vector_length);
+         vfloat32m4_t a_f32m4 = __riscv_vfcvt_f_xu_v_f32m4(a_u32m4, vector_length);
+         a_f32m4 = __riscv_vfmadd_vf_f32m4(a_f32m4, alpha_val, beta_f32m4, vector_length);
+         vuint32m4_t result_u32m4 = __riscv_vfcvt_xu_f_v_u32m4(a_f32m4, vector_length);
+         result_u32m4 = __riscv_vmaxu_vx_u32m4(result_u32m4, 0, vector_length);
+         result_u32m4 = __riscv_vminu_vx_u32m4(result_u32m4, 255, vector_length);
+         vuint16m2_t result_u16m2 = __riscv_vncvt_x_x_w_u16m2(result_u32m4, vector_length);
+         vuint8m1_t result_u8m1 = __riscv_vncvt_x_x_w_u8m1(result_u16m2, vector_length);
+         __riscv_vse8_v_u8m1(result, result_u8m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_scale_i16_rvv(nk_i16_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
+                                      nk_i16_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e32m2();
+     vfloat32m2_t beta_f32m2 = __riscv_vfmv_v_f_f32m2(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e16m1(n);
+         vint16m1_t a_i16m1 = __riscv_vle16_v_i16m1(a, vector_length);
+         vint32m2_t a_i32m2 = __riscv_vwadd_vx_i32m2(a_i16m1, 0, vector_length);
+         vfloat32m2_t a_f32m2 = __riscv_vfcvt_f_x_v_f32m2(a_i32m2, vector_length);
+         a_f32m2 = __riscv_vfmadd_vf_f32m2(a_f32m2, alpha_val, beta_f32m2, vector_length);
+         vint32m2_t result_i32m2 = __riscv_vfcvt_x_f_v_i32m2(a_f32m2, vector_length);
+         result_i32m2 = __riscv_vmax_vx_i32m2(result_i32m2, -32768, vector_length);
+         result_i32m2 = __riscv_vmin_vx_i32m2(result_i32m2, 32767, vector_length);
+         vint16m1_t result_i16m1 = __riscv_vncvt_x_x_w_i16m1(result_i32m2, vector_length);
+         __riscv_vse16_v_i16m1(result, result_i16m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_scale_u16_rvv(nk_u16_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
+                                      nk_u16_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e32m2();
+     vfloat32m2_t beta_f32m2 = __riscv_vfmv_v_f_f32m2(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e16m1(n);
+         vuint16m1_t a_u16m1 = __riscv_vle16_v_u16m1(a, vector_length);
+         vuint32m2_t a_u32m2 = __riscv_vwaddu_vx_u32m2(a_u16m1, 0, vector_length);
+         vfloat32m2_t a_f32m2 = __riscv_vfcvt_f_xu_v_f32m2(a_u32m2, vector_length);
+         a_f32m2 = __riscv_vfmadd_vf_f32m2(a_f32m2, alpha_val, beta_f32m2, vector_length);
+         vuint32m2_t result_u32m2 = __riscv_vfcvt_xu_f_v_u32m2(a_f32m2, vector_length);
+         result_u32m2 = __riscv_vmaxu_vx_u32m2(result_u32m2, 0, vector_length);
+         result_u32m2 = __riscv_vminu_vx_u32m2(result_u32m2, 65535, vector_length);
+         vuint16m1_t result_u16m1 = __riscv_vncvt_x_x_w_u16m1(result_u32m2, vector_length);
+         __riscv_vse16_v_u16m1(result, result_u16m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_scale_i32_rvv(nk_i32_t const *a, nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta,
+                                      nk_i32_t *result) {
+     nk_f64_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e64m2();
+     vfloat64m2_t beta_f64m2 = __riscv_vfmv_v_f_f64m2(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e32m1(n);
+         vint32m1_t a_i32m1 = __riscv_vle32_v_i32m1(a, vector_length);
+         vfloat64m2_t a_f64m2 = __riscv_vfwcvt_f_x_v_f64m2(a_i32m1, vector_length);
+         a_f64m2 = __riscv_vfmadd_vf_f64m2(a_f64m2, alpha_val, beta_f64m2, vector_length);
+         a_f64m2 = __riscv_vfmax_vf_f64m2(a_f64m2, -2147483648.0, vector_length);
+         a_f64m2 = __riscv_vfmin_vf_f64m2(a_f64m2, 2147483647.0, vector_length);
+         vint32m1_t result_i32m1 = __riscv_vfncvt_x_f_w_i32m1(a_f64m2, vector_length);
+         __riscv_vse32_v_i32m1(result, result_i32m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_scale_u32_rvv(nk_u32_t const *a, nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta,
+                                      nk_u32_t *result) {
+     nk_f64_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e64m2();
+     vfloat64m2_t beta_f64m2 = __riscv_vfmv_v_f_f64m2(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e32m1(n);
+         vuint32m1_t a_u32m1 = __riscv_vle32_v_u32m1(a, vector_length);
+         vfloat64m2_t a_f64m2 = __riscv_vfwcvt_f_xu_v_f64m2(a_u32m1, vector_length);
+         a_f64m2 = __riscv_vfmadd_vf_f64m2(a_f64m2, alpha_val, beta_f64m2, vector_length);
+         a_f64m2 = __riscv_vfmax_vf_f64m2(a_f64m2, 0.0, vector_length);
+         a_f64m2 = __riscv_vfmin_vf_f64m2(a_f64m2, 4294967295.0, vector_length);
+         vuint32m1_t result_u32m1 = __riscv_vfncvt_xu_f_w_u32m1(a_f64m2, vector_length);
+         __riscv_vse32_v_u32m1(result, result_u32m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_scale_i64_rvv(nk_i64_t const *a, nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta,
+                                      nk_i64_t *result) {
+     nk_f64_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e64m4();
+     vfloat64m4_t beta_f64m4 = __riscv_vfmv_v_f_f64m4(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e64m4(n);
+         vint64m4_t a_i64m4 = __riscv_vle64_v_i64m4(a, vector_length);
+         vfloat64m4_t a_f64m4 = __riscv_vfcvt_f_x_v_f64m4(a_i64m4, vector_length);
+         a_f64m4 = __riscv_vfmadd_vf_f64m4(a_f64m4, alpha_val, beta_f64m4, vector_length);
+         vint64m4_t result_i64m4 = __riscv_vfcvt_x_f_v_i64m4(a_f64m4, vector_length);
+         __riscv_vse64_v_i64m4(result, result_i64m4, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_scale_u64_rvv(nk_u64_t const *a, nk_size_t n, nk_f64_t const *alpha, nk_f64_t const *beta,
+                                      nk_u64_t *result) {
+     nk_f64_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e64m4();
+     vfloat64m4_t beta_f64m4 = __riscv_vfmv_v_f_f64m4(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e64m4(n);
+         vuint64m4_t a_u64m4 = __riscv_vle64_v_u64m4(a, vector_length);
+         vfloat64m4_t a_f64m4 = __riscv_vfcvt_f_xu_v_f64m4(a_u64m4, vector_length);
+         a_f64m4 = __riscv_vfmadd_vf_f64m4(a_f64m4, alpha_val, beta_f64m4, vector_length);
+         a_f64m4 = __riscv_vfmax_vf_f64m4(a_f64m4, 0.0, vector_length);
+         vuint64m4_t result_u64m4 = __riscv_vfcvt_xu_f_v_u64m4(a_f64m4, vector_length);
+         __riscv_vse64_v_u64m4(result, result_u64m4, vector_length);
+     }
+ }
+
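+ /* FP8 scaling reuses the f32 round-trip from the sum kernels: widen,
+  * fused multiply-add, and narrow back through the cast helpers. */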
+ NK_PUBLIC void nk_each_scale_e4m3_rvv(nk_e4m3_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
+                                       nk_e4m3_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e32m4();
+     vfloat32m4_t beta_f32m4 = __riscv_vfmv_v_f_f32m4(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a, vector_length);
+         vfloat32m4_t a_f32m4 = nk_e4m3m1_to_f32m4_rvv_(a_u8m1, vector_length);
+         a_f32m4 = __riscv_vfmadd_vf_f32m4(a_f32m4, alpha_val, beta_f32m4, vector_length);
+         vuint8m1_t result_u8m1 = nk_f32m4_to_e4m3m1_rvv_(a_f32m4, vector_length);
+         __riscv_vse8_v_u8m1((nk_u8_t *)result, result_u8m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_scale_e5m2_rvv(nk_e5m2_t const *a, nk_size_t n, nk_f32_t const *alpha, nk_f32_t const *beta,
+                                       nk_e5m2_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     nk_size_t vlmax = __riscv_vsetvlmax_e32m4();
+     vfloat32m4_t beta_f32m4 = __riscv_vfmv_v_f_f32m4(beta_val, vlmax);
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a, vector_length);
+         vfloat32m4_t a_f32m4 = nk_e5m2m1_to_f32m4_rvv_(a_u8m1, vector_length);
+         a_f32m4 = __riscv_vfmadd_vf_f32m4(a_f32m4, alpha_val, beta_f32m4, vector_length);
+         vuint8m1_t result_u8m1 = nk_f32m4_to_e5m2m1_rvv_(a_f32m4, vector_length);
+         __riscv_vse8_v_u8m1((nk_u8_t *)result, result_u8m1, vector_length);
+     }
+ }
+
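+ /* Blending computes result[i] = alpha * a[i] + beta * b[i]. Two fast paths
+  * short-circuit the general case: alpha == beta == 1 falls back to the plain
+  * sum, and a zero coefficient reduces to a single-operand scale. */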
+ NK_PUBLIC void nk_each_blend_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t const *alpha,
+                                      nk_f64_t const *beta, nk_f64_t *result) {
+     nk_f64_t alpha_val = *alpha, beta_val = *beta;
+     if (alpha_val == 1 && beta_val == 1) {
+         nk_each_sum_f64_rvv(a, b, n, result);
+         return;
+     }
+     else if (alpha_val == 0 || beta_val == 0) {
+         nk_f64_t zero = 0;
+         if (beta_val == 0) { nk_each_scale_f64_rvv(a, n, alpha, &zero, result); }
+         else { nk_each_scale_f64_rvv(b, n, beta, &zero, result); }
+         return;
+     }
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e64m4(n);
+         vfloat64m4_t a_f64m4 = __riscv_vle64_v_f64m4(a, vector_length);
+         vfloat64m4_t b_f64m4 = __riscv_vle64_v_f64m4(b, vector_length);
+         vfloat64m4_t result_f64m4 = __riscv_vfmul_vf_f64m4(a_f64m4, alpha_val, vector_length);
+         result_f64m4 = __riscv_vfmacc_vf_f64m4(result_f64m4, beta_val, b_f64m4, vector_length);
+         __riscv_vse64_v_f64m4(result, result_f64m4, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_blend_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t const *alpha,
+                                      nk_f32_t const *beta, nk_f32_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     if (alpha_val == 1 && beta_val == 1) {
+         nk_each_sum_f32_rvv(a, b, n, result);
+         return;
+     }
+     else if (alpha_val == 0 || beta_val == 0) {
+         nk_f32_t zero = 0;
+         if (beta_val == 0) { nk_each_scale_f32_rvv(a, n, alpha, &zero, result); }
+         else { nk_each_scale_f32_rvv(b, n, beta, &zero, result); }
+         return;
+     }
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e32m4(n);
+         vfloat32m4_t a_f32m4 = __riscv_vle32_v_f32m4(a, vector_length);
+         vfloat32m4_t b_f32m4 = __riscv_vle32_v_f32m4(b, vector_length);
+         vfloat32m4_t a_scaled_f32m4 = __riscv_vfmul_vf_f32m4(a_f32m4, alpha_val, vector_length);
+         vfloat32m4_t result_f32m4 = __riscv_vfmacc_vf_f32m4(a_scaled_f32m4, beta_val, b_f32m4, vector_length);
+         __riscv_vse32_v_f32m4(result, result_f32m4, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_blend_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t const *alpha,
+                                      nk_f32_t const *beta, nk_f16_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     if (alpha_val == 1 && beta_val == 1) {
+         nk_each_sum_f16_rvv(a, b, n, result);
+         return;
+     }
+     else if (alpha_val == 0 || beta_val == 0) {
+         nk_f32_t zero = 0;
+         if (beta_val == 0) { nk_each_scale_f16_rvv(a, n, alpha, &zero, result); }
+         else { nk_each_scale_f16_rvv(b, n, beta, &zero, result); }
+         return;
+     }
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e16m1(n);
+         vuint16m1_t a_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)a, vector_length);
+         vuint16m1_t b_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)b, vector_length);
+         vfloat32m2_t a_f32m2 = nk_f16m1_to_f32m2_rvv_(a_u16m1, vector_length);
+         vfloat32m2_t b_f32m2 = nk_f16m1_to_f32m2_rvv_(b_u16m1, vector_length);
+         vfloat32m2_t result_f32m2 = __riscv_vfmul_vf_f32m2(a_f32m2, alpha_val, vector_length);
+         result_f32m2 = __riscv_vfmacc_vf_f32m2(result_f32m2, beta_val, b_f32m2, vector_length);
+         vuint16m1_t result_u16m1 = nk_f32m2_to_f16m1_rvv_(result_f32m2, vector_length);
+         __riscv_vse16_v_u16m1((nk_u16_t *)result, result_u16m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_blend_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t const *alpha,
+                                       nk_f32_t const *beta, nk_bf16_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     if (alpha_val == 1 && beta_val == 1) {
+         nk_each_sum_bf16_rvv(a, b, n, result);
+         return;
+     }
+     else if (alpha_val == 0 || beta_val == 0) {
+         nk_f32_t zero = 0;
+         if (beta_val == 0) { nk_each_scale_bf16_rvv(a, n, alpha, &zero, result); }
+         else { nk_each_scale_bf16_rvv(b, n, beta, &zero, result); }
+         return;
+     }
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e16m1(n);
+         vuint16m1_t a_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)a, vector_length);
+         vuint16m1_t b_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)b, vector_length);
+         vfloat32m2_t a_f32m2 = nk_bf16m1_to_f32m2_rvv_(a_u16m1, vector_length);
+         vfloat32m2_t b_f32m2 = nk_bf16m1_to_f32m2_rvv_(b_u16m1, vector_length);
+         vfloat32m2_t result_f32m2 = __riscv_vfmul_vf_f32m2(a_f32m2, alpha_val, vector_length);
+         result_f32m2 = __riscv_vfmacc_vf_f32m2(result_f32m2, beta_val, b_f32m2, vector_length);
+         vuint16m1_t result_u16m1 = nk_f32m2_to_bf16m1_rvv_(result_f32m2, vector_length);
+         __riscv_vse16_v_u16m1((nk_u16_t *)result, result_u16m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_blend_i8_rvv(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_f32_t const *alpha,
+                                     nk_f32_t const *beta, nk_i8_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     if (alpha_val == 1 && beta_val == 1) {
+         nk_each_sum_i8_rvv(a, b, n, result);
+         return;
+     }
+     else if (alpha_val == 0 || beta_val == 0) {
+         nk_f32_t zero = 0;
+         if (beta_val == 0) { nk_each_scale_i8_rvv(a, n, alpha, &zero, result); }
+         else { nk_each_scale_i8_rvv(b, n, beta, &zero, result); }
+         return;
+     }
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vint8m1_t a_i8m1 = __riscv_vle8_v_i8m1(a, vector_length);
+         vint8m1_t b_i8m1 = __riscv_vle8_v_i8m1(b, vector_length);
+         vint16m2_t a_i16m2 = __riscv_vwadd_vx_i16m2(a_i8m1, 0, vector_length);
+         vint32m4_t a_i32m4 = __riscv_vwadd_vx_i32m4(a_i16m2, 0, vector_length);
+         vfloat32m4_t a_f32m4 = __riscv_vfcvt_f_x_v_f32m4(a_i32m4, vector_length);
+         vint16m2_t b_i16m2 = __riscv_vwadd_vx_i16m2(b_i8m1, 0, vector_length);
+         vint32m4_t b_i32m4 = __riscv_vwadd_vx_i32m4(b_i16m2, 0, vector_length);
+         vfloat32m4_t b_f32m4 = __riscv_vfcvt_f_x_v_f32m4(b_i32m4, vector_length);
+         vfloat32m4_t result_f32m4 = __riscv_vfmul_vf_f32m4(a_f32m4, alpha_val, vector_length);
+         result_f32m4 = __riscv_vfmacc_vf_f32m4(result_f32m4, beta_val, b_f32m4, vector_length);
+         vint32m4_t result_i32m4 = __riscv_vfcvt_x_f_v_i32m4(result_f32m4, vector_length);
+         result_i32m4 = __riscv_vmax_vx_i32m4(result_i32m4, -128, vector_length);
+         result_i32m4 = __riscv_vmin_vx_i32m4(result_i32m4, 127, vector_length);
+         vint16m2_t result_i16m2 = __riscv_vncvt_x_x_w_i16m2(result_i32m4, vector_length);
+         vint8m1_t result_i8m1 = __riscv_vncvt_x_x_w_i8m1(result_i16m2, vector_length);
+         __riscv_vse8_v_i8m1(result, result_i8m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_blend_u8_rvv(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_f32_t const *alpha,
+                                     nk_f32_t const *beta, nk_u8_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     if (alpha_val == 1 && beta_val == 1) {
+         nk_each_sum_u8_rvv(a, b, n, result);
+         return;
+     }
+     else if (alpha_val == 0 || beta_val == 0) {
+         nk_f32_t zero = 0;
+         if (beta_val == 0) { nk_each_scale_u8_rvv(a, n, alpha, &zero, result); }
+         else { nk_each_scale_u8_rvv(b, n, beta, &zero, result); }
+         return;
+     }
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1(a, vector_length);
+         vuint8m1_t b_u8m1 = __riscv_vle8_v_u8m1(b, vector_length);
+         vuint16m2_t a_u16m2 = __riscv_vwaddu_vx_u16m2(a_u8m1, 0, vector_length);
+         vuint32m4_t a_u32m4 = __riscv_vwaddu_vx_u32m4(a_u16m2, 0, vector_length);
+         vfloat32m4_t a_f32m4 = __riscv_vfcvt_f_xu_v_f32m4(a_u32m4, vector_length);
+         vuint16m2_t b_u16m2 = __riscv_vwaddu_vx_u16m2(b_u8m1, 0, vector_length);
+         vuint32m4_t b_u32m4 = __riscv_vwaddu_vx_u32m4(b_u16m2, 0, vector_length);
+         vfloat32m4_t b_f32m4 = __riscv_vfcvt_f_xu_v_f32m4(b_u32m4, vector_length);
+         vfloat32m4_t result_f32m4 = __riscv_vfmul_vf_f32m4(a_f32m4, alpha_val, vector_length);
+         result_f32m4 = __riscv_vfmacc_vf_f32m4(result_f32m4, beta_val, b_f32m4, vector_length);
+         vuint32m4_t result_u32m4 = __riscv_vfcvt_xu_f_v_u32m4(result_f32m4, vector_length);
+         result_u32m4 = __riscv_vmaxu_vx_u32m4(result_u32m4, 0, vector_length);
+         result_u32m4 = __riscv_vminu_vx_u32m4(result_u32m4, 255, vector_length);
+         vuint16m2_t result_u16m2 = __riscv_vncvt_x_x_w_u16m2(result_u32m4, vector_length);
+         vuint8m1_t result_u8m1 = __riscv_vncvt_x_x_w_u8m1(result_u16m2, vector_length);
+         __riscv_vse8_v_u8m1(result, result_u8m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_blend_e4m3_rvv(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t const *alpha,
+                                       nk_f32_t const *beta, nk_e4m3_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a, vector_length);
+         vuint8m1_t b_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)b, vector_length);
+         vfloat32m4_t a_f32m4 = nk_e4m3m1_to_f32m4_rvv_(a_u8m1, vector_length);
+         vfloat32m4_t b_f32m4 = nk_e4m3m1_to_f32m4_rvv_(b_u8m1, vector_length);
+         vfloat32m4_t result_f32m4 = __riscv_vfmul_vf_f32m4(a_f32m4, alpha_val, vector_length);
+         result_f32m4 = __riscv_vfmacc_vf_f32m4(result_f32m4, beta_val, b_f32m4, vector_length);
+         vuint8m1_t result_u8m1 = nk_f32m4_to_e4m3m1_rvv_(result_f32m4, vector_length);
+         __riscv_vse8_v_u8m1((nk_u8_t *)result, result_u8m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_blend_e5m2_rvv(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t const *alpha,
+                                       nk_f32_t const *beta, nk_e5m2_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a, vector_length);
+         vuint8m1_t b_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)b, vector_length);
+         vfloat32m4_t a_f32m4 = nk_e5m2m1_to_f32m4_rvv_(a_u8m1, vector_length);
+         vfloat32m4_t b_f32m4 = nk_e5m2m1_to_f32m4_rvv_(b_u8m1, vector_length);
+         vfloat32m4_t result_f32m4 = __riscv_vfmul_vf_f32m4(a_f32m4, alpha_val, vector_length);
+         result_f32m4 = __riscv_vfmacc_vf_f32m4(result_f32m4, beta_val, b_f32m4, vector_length);
+         vuint8m1_t result_u8m1 = nk_f32m4_to_e5m2m1_rvv_(result_f32m4, vector_length);
+         __riscv_vse8_v_u8m1((nk_u8_t *)result, result_u8m1, vector_length);
+     }
+ }
+
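+ /* FMA kernels compute result[i] = alpha * a[i] * b[i] + beta * c[i], again
+  * promoting the narrower types to f32 lanes before the arithmetic. */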
+ NK_PUBLIC void nk_each_fma_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, nk_size_t n,
623
+ nk_f64_t const *alpha, nk_f64_t const *beta, nk_f64_t *result) {
624
+ nk_f64_t alpha_val = *alpha, beta_val = *beta;
625
+ for (nk_size_t vector_length; n > 0;
626
+ n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
627
+ vector_length = __riscv_vsetvl_e64m4(n);
628
+ vfloat64m4_t a_f64m4 = __riscv_vle64_v_f64m4(a, vector_length);
629
+ vfloat64m4_t b_f64m4 = __riscv_vle64_v_f64m4(b, vector_length);
630
+ vfloat64m4_t c_f64m4 = __riscv_vle64_v_f64m4(c, vector_length);
631
+ vfloat64m4_t product_f64m4 = __riscv_vfmul_vv_f64m4(a_f64m4, b_f64m4, vector_length);
632
+ vfloat64m4_t scaled_product_f64m4 = __riscv_vfmul_vf_f64m4(product_f64m4, alpha_val, vector_length);
633
+ vfloat64m4_t result_f64m4 = __riscv_vfmacc_vf_f64m4(scaled_product_f64m4, beta_val, c_f64m4, vector_length);
634
+ __riscv_vse64_v_f64m4(result, result_f64m4, vector_length);
635
+ }
636
+ }
637
+
638
+ NK_PUBLIC void nk_each_fma_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t n,
639
+ nk_f32_t const *alpha, nk_f32_t const *beta, nk_f32_t *result) {
640
+ nk_f32_t alpha_val = *alpha, beta_val = *beta;
641
+ for (nk_size_t vector_length; n > 0;
642
+ n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
643
+ vector_length = __riscv_vsetvl_e32m4(n);
644
+ vfloat32m4_t a_f32m4 = __riscv_vle32_v_f32m4(a, vector_length);
645
+ vfloat32m4_t b_f32m4 = __riscv_vle32_v_f32m4(b, vector_length);
646
+ vfloat32m4_t c_f32m4 = __riscv_vle32_v_f32m4(c, vector_length);
647
+ vfloat32m4_t product_f32m4 = __riscv_vfmul_vv_f32m4(a_f32m4, b_f32m4, vector_length);
648
+ vfloat32m4_t scaled_product_f32m4 = __riscv_vfmul_vf_f32m4(product_f32m4, alpha_val, vector_length);
649
+ vfloat32m4_t result_f32m4 = __riscv_vfmacc_vf_f32m4(scaled_product_f32m4, beta_val, c_f32m4, vector_length);
650
+ __riscv_vse32_v_f32m4(result, result_f32m4, vector_length);
651
+ }
652
+ }
653
+
654
+ NK_PUBLIC void nk_each_fma_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n,
655
+ nk_f32_t const *alpha, nk_f32_t const *beta, nk_f16_t *result) {
656
+ nk_f32_t alpha_val = *alpha, beta_val = *beta;
657
+ for (nk_size_t vector_length; n > 0;
658
+ n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
659
+ vector_length = __riscv_vsetvl_e16m1(n);
660
+ vuint16m1_t a_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)a, vector_length);
661
+ vuint16m1_t b_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)b, vector_length);
662
+ vuint16m1_t c_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)c, vector_length);
663
+ vfloat32m2_t a_f32m2 = nk_f16m1_to_f32m2_rvv_(a_u16m1, vector_length);
664
+ vfloat32m2_t b_f32m2 = nk_f16m1_to_f32m2_rvv_(b_u16m1, vector_length);
665
+ vfloat32m2_t c_f32m2 = nk_f16m1_to_f32m2_rvv_(c_u16m1, vector_length);
666
+ vfloat32m2_t product_f32m2 = __riscv_vfmul_vv_f32m2(a_f32m2, b_f32m2, vector_length);
667
+ vfloat32m2_t scaled_product_f32m2 = __riscv_vfmul_vf_f32m2(product_f32m2, alpha_val, vector_length);
668
+ vfloat32m2_t result_f32m2 = __riscv_vfmacc_vf_f32m2(scaled_product_f32m2, beta_val, c_f32m2, vector_length);
669
+ vuint16m1_t result_u16m1 = nk_f32m2_to_f16m1_rvv_(result_f32m2, vector_length);
670
+ __riscv_vse16_v_u16m1((nk_u16_t *)result, result_u16m1, vector_length);
671
+ }
672
+ }
673
+
674
+ NK_PUBLIC void nk_each_fma_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_bf16_t const *c, nk_size_t n,
675
+ nk_f32_t const *alpha, nk_f32_t const *beta, nk_bf16_t *result) {
676
+ nk_f32_t alpha_val = *alpha, beta_val = *beta;
677
+ for (nk_size_t vector_length; n > 0;
678
+ n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
679
+ vector_length = __riscv_vsetvl_e16m1(n);
680
+ vuint16m1_t a_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)a, vector_length);
681
+ vuint16m1_t b_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)b, vector_length);
682
+ vuint16m1_t c_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)c, vector_length);
683
+ vfloat32m2_t a_f32m2 = nk_bf16m1_to_f32m2_rvv_(a_u16m1, vector_length);
684
+ vfloat32m2_t b_f32m2 = nk_bf16m1_to_f32m2_rvv_(b_u16m1, vector_length);
685
+ vfloat32m2_t c_f32m2 = nk_bf16m1_to_f32m2_rvv_(c_u16m1, vector_length);
686
+ vfloat32m2_t product_f32m2 = __riscv_vfmul_vv_f32m2(a_f32m2, b_f32m2, vector_length);
687
+ vfloat32m2_t scaled_product_f32m2 = __riscv_vfmul_vf_f32m2(product_f32m2, alpha_val, vector_length);
688
+ vfloat32m2_t result_f32m2 = __riscv_vfmacc_vf_f32m2(scaled_product_f32m2, beta_val, c_f32m2, vector_length);
689
+ vuint16m1_t result_u16m1 = nk_f32m2_to_bf16m1_rvv_(result_f32m2, vector_length);
690
+ __riscv_vse16_v_u16m1((nk_u16_t *)result, result_u16m1, vector_length);
691
+ }
692
+ }
693
+
694
+ // Integer FMA in f32: widen i8 -> i16 -> i32 -> f32, compute, then saturate back to i8.
+ NK_PUBLIC void nk_each_fma_i8_rvv(nk_i8_t const *a, nk_i8_t const *b, nk_i8_t const *c, nk_size_t n,
+                                   nk_f32_t const *alpha, nk_f32_t const *beta, nk_i8_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vint8m1_t a_i8m1 = __riscv_vle8_v_i8m1(a, vector_length);
+         vint8m1_t b_i8m1 = __riscv_vle8_v_i8m1(b, vector_length);
+         vint8m1_t c_i8m1 = __riscv_vle8_v_i8m1(c, vector_length);
+         // `vwadd.vx` with a zero scalar doubles the element width, sign-extending each lane.
+         vint16m2_t a_i16m2 = __riscv_vwadd_vx_i16m2(a_i8m1, 0, vector_length);
+         vint32m4_t a_i32m4 = __riscv_vwadd_vx_i32m4(a_i16m2, 0, vector_length);
+         vfloat32m4_t a_f32m4 = __riscv_vfcvt_f_x_v_f32m4(a_i32m4, vector_length);
+         vint16m2_t b_i16m2 = __riscv_vwadd_vx_i16m2(b_i8m1, 0, vector_length);
+         vint32m4_t b_i32m4 = __riscv_vwadd_vx_i32m4(b_i16m2, 0, vector_length);
+         vfloat32m4_t b_f32m4 = __riscv_vfcvt_f_x_v_f32m4(b_i32m4, vector_length);
+         vint16m2_t c_i16m2 = __riscv_vwadd_vx_i16m2(c_i8m1, 0, vector_length);
+         vint32m4_t c_i32m4 = __riscv_vwadd_vx_i32m4(c_i16m2, 0, vector_length);
+         vfloat32m4_t c_f32m4 = __riscv_vfcvt_f_x_v_f32m4(c_i32m4, vector_length);
+         vfloat32m4_t product_f32m4 = __riscv_vfmul_vv_f32m4(a_f32m4, b_f32m4, vector_length);
+         vfloat32m4_t scaled_product_f32m4 = __riscv_vfmul_vf_f32m4(product_f32m4, alpha_val, vector_length);
+         vfloat32m4_t result_f32m4 = __riscv_vfmacc_vf_f32m4(scaled_product_f32m4, beta_val, c_f32m4, vector_length);
+         // Round to i32, clamp to the i8 range, and narrow back in two steps.
+         vint32m4_t result_i32m4 = __riscv_vfcvt_x_f_v_i32m4(result_f32m4, vector_length);
+         result_i32m4 = __riscv_vmax_vx_i32m4(result_i32m4, -128, vector_length);
+         result_i32m4 = __riscv_vmin_vx_i32m4(result_i32m4, 127, vector_length);
+         vint16m2_t result_i16m2 = __riscv_vncvt_x_x_w_i16m2(result_i32m4, vector_length);
+         vint8m1_t result_i8m1 = __riscv_vncvt_x_x_w_i8m1(result_i16m2, vector_length);
+         __riscv_vse8_v_i8m1(result, result_i8m1, vector_length);
+     }
+ }
+
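Because alpha and beta are arbitrary f32 scalars, the rounded result can fall outside the i8 range, hence the vmax/vmin clamp before narrowing. A scalar model of one lane, assuming the conversion rounds to nearest (the usual dynamic rounding mode):

#include <math.h>
#include <stdint.h>

/* One lane of the i8 kernel: with alpha = beta = 1, a = 100, b = 2, c = 0,
 * the exact value 200 saturates to 127. */
static int8_t fma_i8_lane(int8_t a, int8_t b, int8_t c, float alpha, float beta) {
    long rounded = lrintf(alpha * ((float)a * (float)b) + beta * (float)c);
    if (rounded < -128) rounded = -128;
    if (rounded > 127) rounded = 127;
    return (int8_t)rounded;
}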
+ // Unsigned counterpart: zero-extend via `vwaddu.vx`, clamp to [0, 255].
+ NK_PUBLIC void nk_each_fma_u8_rvv(nk_u8_t const *a, nk_u8_t const *b, nk_u8_t const *c, nk_size_t n,
+                                   nk_f32_t const *alpha, nk_f32_t const *beta, nk_u8_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1(a, vector_length);
+         vuint8m1_t b_u8m1 = __riscv_vle8_v_u8m1(b, vector_length);
+         vuint8m1_t c_u8m1 = __riscv_vle8_v_u8m1(c, vector_length);
+         vuint16m2_t a_u16m2 = __riscv_vwaddu_vx_u16m2(a_u8m1, 0, vector_length);
+         vuint32m4_t a_u32m4 = __riscv_vwaddu_vx_u32m4(a_u16m2, 0, vector_length);
+         vfloat32m4_t a_f32m4 = __riscv_vfcvt_f_xu_v_f32m4(a_u32m4, vector_length);
+         vuint16m2_t b_u16m2 = __riscv_vwaddu_vx_u16m2(b_u8m1, 0, vector_length);
+         vuint32m4_t b_u32m4 = __riscv_vwaddu_vx_u32m4(b_u16m2, 0, vector_length);
+         vfloat32m4_t b_f32m4 = __riscv_vfcvt_f_xu_v_f32m4(b_u32m4, vector_length);
+         vuint16m2_t c_u16m2 = __riscv_vwaddu_vx_u16m2(c_u8m1, 0, vector_length);
+         vuint32m4_t c_u32m4 = __riscv_vwaddu_vx_u32m4(c_u16m2, 0, vector_length);
+         vfloat32m4_t c_f32m4 = __riscv_vfcvt_f_xu_v_f32m4(c_u32m4, vector_length);
+         vfloat32m4_t product_f32m4 = __riscv_vfmul_vv_f32m4(a_f32m4, b_f32m4, vector_length);
+         vfloat32m4_t scaled_product_f32m4 = __riscv_vfmul_vf_f32m4(product_f32m4, alpha_val, vector_length);
+         vfloat32m4_t result_f32m4 = __riscv_vfmacc_vf_f32m4(scaled_product_f32m4, beta_val, c_f32m4, vector_length);
+         vuint32m4_t result_u32m4 = __riscv_vfcvt_xu_f_v_u32m4(result_f32m4, vector_length);
+         result_u32m4 = __riscv_vmaxu_vx_u32m4(result_u32m4, 0, vector_length);
+         result_u32m4 = __riscv_vminu_vx_u32m4(result_u32m4, 255, vector_length);
+         vuint16m2_t result_u16m2 = __riscv_vncvt_x_x_w_u16m2(result_u32m4, vector_length);
+         vuint8m1_t result_u8m1 = __riscv_vncvt_x_x_w_u8m1(result_u16m2, vector_length);
+         __riscv_vse8_v_u8m1(result, result_u8m1, vector_length);
+     }
+ }
+
+ // 16-bit lanes need only one widening step to reach i32 before the f32 conversion.
+ NK_PUBLIC void nk_each_fma_i16_rvv(nk_i16_t const *a, nk_i16_t const *b, nk_i16_t const *c, nk_size_t n,
+                                    nk_f32_t const *alpha, nk_f32_t const *beta, nk_i16_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e16m1(n);
+         vint16m1_t a_i16m1 = __riscv_vle16_v_i16m1(a, vector_length);
+         vint16m1_t b_i16m1 = __riscv_vle16_v_i16m1(b, vector_length);
+         vint16m1_t c_i16m1 = __riscv_vle16_v_i16m1(c, vector_length);
+         vint32m2_t a_i32m2 = __riscv_vwadd_vx_i32m2(a_i16m1, 0, vector_length);
+         vfloat32m2_t a_f32m2 = __riscv_vfcvt_f_x_v_f32m2(a_i32m2, vector_length);
+         vint32m2_t b_i32m2 = __riscv_vwadd_vx_i32m2(b_i16m1, 0, vector_length);
+         vfloat32m2_t b_f32m2 = __riscv_vfcvt_f_x_v_f32m2(b_i32m2, vector_length);
+         vint32m2_t c_i32m2 = __riscv_vwadd_vx_i32m2(c_i16m1, 0, vector_length);
+         vfloat32m2_t c_f32m2 = __riscv_vfcvt_f_x_v_f32m2(c_i32m2, vector_length);
+         vfloat32m2_t product_f32m2 = __riscv_vfmul_vv_f32m2(a_f32m2, b_f32m2, vector_length);
+         vfloat32m2_t scaled_product_f32m2 = __riscv_vfmul_vf_f32m2(product_f32m2, alpha_val, vector_length);
+         vfloat32m2_t result_f32m2 = __riscv_vfmacc_vf_f32m2(scaled_product_f32m2, beta_val, c_f32m2, vector_length);
+         vint32m2_t result_i32m2 = __riscv_vfcvt_x_f_v_i32m2(result_f32m2, vector_length);
+         result_i32m2 = __riscv_vmax_vx_i32m2(result_i32m2, -32768, vector_length);
+         result_i32m2 = __riscv_vmin_vx_i32m2(result_i32m2, 32767, vector_length);
+         vint16m1_t result_i16m1 = __riscv_vncvt_x_x_w_i16m1(result_i32m2, vector_length);
+         __riscv_vse16_v_i16m1(result, result_i16m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_fma_u16_rvv(nk_u16_t const *a, nk_u16_t const *b, nk_u16_t const *c, nk_size_t n,
+                                    nk_f32_t const *alpha, nk_f32_t const *beta, nk_u16_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e16m1(n);
+         vuint16m1_t a_u16m1 = __riscv_vle16_v_u16m1(a, vector_length);
+         vuint16m1_t b_u16m1 = __riscv_vle16_v_u16m1(b, vector_length);
+         vuint16m1_t c_u16m1 = __riscv_vle16_v_u16m1(c, vector_length);
+         vuint32m2_t a_u32m2 = __riscv_vwaddu_vx_u32m2(a_u16m1, 0, vector_length);
+         vfloat32m2_t a_f32m2 = __riscv_vfcvt_f_xu_v_f32m2(a_u32m2, vector_length);
+         vuint32m2_t b_u32m2 = __riscv_vwaddu_vx_u32m2(b_u16m1, 0, vector_length);
+         vfloat32m2_t b_f32m2 = __riscv_vfcvt_f_xu_v_f32m2(b_u32m2, vector_length);
+         vuint32m2_t c_u32m2 = __riscv_vwaddu_vx_u32m2(c_u16m1, 0, vector_length);
+         vfloat32m2_t c_f32m2 = __riscv_vfcvt_f_xu_v_f32m2(c_u32m2, vector_length);
+         vfloat32m2_t product_f32m2 = __riscv_vfmul_vv_f32m2(a_f32m2, b_f32m2, vector_length);
+         vfloat32m2_t scaled_product_f32m2 = __riscv_vfmul_vf_f32m2(product_f32m2, alpha_val, vector_length);
+         vfloat32m2_t result_f32m2 = __riscv_vfmacc_vf_f32m2(scaled_product_f32m2, beta_val, c_f32m2, vector_length);
+         vuint32m2_t result_u32m2 = __riscv_vfcvt_xu_f_v_u32m2(result_f32m2, vector_length);
+         result_u32m2 = __riscv_vmaxu_vx_u32m2(result_u32m2, 0, vector_length);
+         result_u32m2 = __riscv_vminu_vx_u32m2(result_u32m2, 65535, vector_length);
+         vuint16m1_t result_u16m1 = __riscv_vncvt_x_x_w_u16m1(result_u32m2, vector_length);
+         __riscv_vse16_v_u16m1(result, result_u16m1, vector_length);
+     }
+ }
+
+ // 32-bit lanes take f64 scale factors and accumulate in f64.
+ NK_PUBLIC void nk_each_fma_i32_rvv(nk_i32_t const *a, nk_i32_t const *b, nk_i32_t const *c, nk_size_t n,
+                                    nk_f64_t const *alpha, nk_f64_t const *beta, nk_i32_t *result) {
+     nk_f64_t alpha_val = *alpha, beta_val = *beta;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e32m1(n);
+         vint32m1_t a_i32m1 = __riscv_vle32_v_i32m1(a, vector_length);
+         vint32m1_t b_i32m1 = __riscv_vle32_v_i32m1(b, vector_length);
+         vint32m1_t c_i32m1 = __riscv_vle32_v_i32m1(c, vector_length);
+         vfloat64m2_t a_f64m2 = __riscv_vfwcvt_f_x_v_f64m2(a_i32m1, vector_length);
+         vfloat64m2_t b_f64m2 = __riscv_vfwcvt_f_x_v_f64m2(b_i32m1, vector_length);
+         vfloat64m2_t c_f64m2 = __riscv_vfwcvt_f_x_v_f64m2(c_i32m1, vector_length);
+         vfloat64m2_t product_f64m2 = __riscv_vfmul_vv_f64m2(a_f64m2, b_f64m2, vector_length);
+         vfloat64m2_t scaled_product_f64m2 = __riscv_vfmul_vf_f64m2(product_f64m2, alpha_val, vector_length);
+         vfloat64m2_t result_f64m2 = __riscv_vfmacc_vf_f64m2(scaled_product_f64m2, beta_val, c_f64m2, vector_length);
+         // Clamp while still in f64, then narrow-convert to i32.
+         result_f64m2 = __riscv_vfmax_vf_f64m2(result_f64m2, -2147483648.0, vector_length);
+         result_f64m2 = __riscv_vfmin_vf_f64m2(result_f64m2, 2147483647.0, vector_length);
+         vint32m1_t result_i32m1 = __riscv_vfncvt_x_f_w_i32m1(result_f64m2, vector_length);
+         __riscv_vse32_v_i32m1(result, result_i32m1, vector_length);
+     }
+ }
+
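The 32-bit kernels clamp in f64 rather than in an integer type because every i32 and u32 value is exactly representable in a binary64; bounding the f64 intermediate against -2147483648.0 and 2147483647.0 (or 0.0 and 4294967295.0 below) is therefore lossless and guarantees the narrowing conversion cannot overflow. Scalar model of one i32 lane:

#include <math.h>
#include <stdint.h>

/* One lane of the i32 kernel; the clamp happens while the value is still an f64. */
static int32_t fma_i32_lane(int32_t a, int32_t b, int32_t c, double alpha, double beta) {
    double value = alpha * ((double)a * (double)b) + beta * (double)c;
    if (value < -2147483648.0) value = -2147483648.0;
    if (value > 2147483647.0) value = 2147483647.0;
    return (int32_t)lrint(value);
}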
+ NK_PUBLIC void nk_each_fma_u32_rvv(nk_u32_t const *a, nk_u32_t const *b, nk_u32_t const *c, nk_size_t n,
+                                    nk_f64_t const *alpha, nk_f64_t const *beta, nk_u32_t *result) {
+     nk_f64_t alpha_val = *alpha, beta_val = *beta;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e32m1(n);
+         vuint32m1_t a_u32m1 = __riscv_vle32_v_u32m1(a, vector_length);
+         vuint32m1_t b_u32m1 = __riscv_vle32_v_u32m1(b, vector_length);
+         vuint32m1_t c_u32m1 = __riscv_vle32_v_u32m1(c, vector_length);
+         vfloat64m2_t a_f64m2 = __riscv_vfwcvt_f_xu_v_f64m2(a_u32m1, vector_length);
+         vfloat64m2_t b_f64m2 = __riscv_vfwcvt_f_xu_v_f64m2(b_u32m1, vector_length);
+         vfloat64m2_t c_f64m2 = __riscv_vfwcvt_f_xu_v_f64m2(c_u32m1, vector_length);
+         vfloat64m2_t product_f64m2 = __riscv_vfmul_vv_f64m2(a_f64m2, b_f64m2, vector_length);
+         vfloat64m2_t scaled_product_f64m2 = __riscv_vfmul_vf_f64m2(product_f64m2, alpha_val, vector_length);
+         vfloat64m2_t result_f64m2 = __riscv_vfmacc_vf_f64m2(scaled_product_f64m2, beta_val, c_f64m2, vector_length);
+         result_f64m2 = __riscv_vfmax_vf_f64m2(result_f64m2, 0.0, vector_length);
+         result_f64m2 = __riscv_vfmin_vf_f64m2(result_f64m2, 4294967295.0, vector_length);
+         vuint32m1_t result_u32m1 = __riscv_vfncvt_xu_f_w_u32m1(result_f64m2, vector_length);
+         __riscv_vse32_v_u32m1(result, result_u32m1, vector_length);
+     }
+ }
+
+ // 64-bit lanes convert in place (no wider accumulator exists); float-to-integer
+ // overflow is left to the saturating RVV conversion.
+ NK_PUBLIC void nk_each_fma_i64_rvv(nk_i64_t const *a, nk_i64_t const *b, nk_i64_t const *c, nk_size_t n,
+                                    nk_f64_t const *alpha, nk_f64_t const *beta, nk_i64_t *result) {
+     nk_f64_t alpha_val = *alpha, beta_val = *beta;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e64m4(n);
+         vint64m4_t a_i64m4 = __riscv_vle64_v_i64m4(a, vector_length);
+         vint64m4_t b_i64m4 = __riscv_vle64_v_i64m4(b, vector_length);
+         vint64m4_t c_i64m4 = __riscv_vle64_v_i64m4(c, vector_length);
+         vfloat64m4_t a_f64m4 = __riscv_vfcvt_f_x_v_f64m4(a_i64m4, vector_length);
+         vfloat64m4_t b_f64m4 = __riscv_vfcvt_f_x_v_f64m4(b_i64m4, vector_length);
+         vfloat64m4_t c_f64m4 = __riscv_vfcvt_f_x_v_f64m4(c_i64m4, vector_length);
+         vfloat64m4_t product_f64m4 = __riscv_vfmul_vv_f64m4(a_f64m4, b_f64m4, vector_length);
+         vfloat64m4_t scaled_product_f64m4 = __riscv_vfmul_vf_f64m4(product_f64m4, alpha_val, vector_length);
+         vfloat64m4_t result_f64m4 = __riscv_vfmacc_vf_f64m4(scaled_product_f64m4, beta_val, c_f64m4, vector_length);
+         vint64m4_t result_i64m4 = __riscv_vfcvt_x_f_v_i64m4(result_f64m4, vector_length);
+         __riscv_vse64_v_i64m4(result, result_i64m4, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_fma_u64_rvv(nk_u64_t const *a, nk_u64_t const *b, nk_u64_t const *c, nk_size_t n,
+                                    nk_f64_t const *alpha, nk_f64_t const *beta, nk_u64_t *result) {
+     nk_f64_t alpha_val = *alpha, beta_val = *beta;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e64m4(n);
+         vuint64m4_t a_u64m4 = __riscv_vle64_v_u64m4(a, vector_length);
+         vuint64m4_t b_u64m4 = __riscv_vle64_v_u64m4(b, vector_length);
+         vuint64m4_t c_u64m4 = __riscv_vle64_v_u64m4(c, vector_length);
+         vfloat64m4_t a_f64m4 = __riscv_vfcvt_f_xu_v_f64m4(a_u64m4, vector_length);
+         vfloat64m4_t b_f64m4 = __riscv_vfcvt_f_xu_v_f64m4(b_u64m4, vector_length);
+         vfloat64m4_t c_f64m4 = __riscv_vfcvt_f_xu_v_f64m4(c_u64m4, vector_length);
+         vfloat64m4_t product_f64m4 = __riscv_vfmul_vv_f64m4(a_f64m4, b_f64m4, vector_length);
+         vfloat64m4_t scaled_product_f64m4 = __riscv_vfmul_vf_f64m4(product_f64m4, alpha_val, vector_length);
+         vfloat64m4_t result_f64m4 = __riscv_vfmacc_vf_f64m4(scaled_product_f64m4, beta_val, c_f64m4, vector_length);
+         result_f64m4 = __riscv_vfmax_vf_f64m4(result_f64m4, 0.0, vector_length);
+         vuint64m4_t result_u64m4 = __riscv_vfcvt_xu_f_v_u64m4(result_f64m4, vector_length);
+         __riscv_vse64_v_u64m4(result, result_u64m4, vector_length);
+     }
+ }
+
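The 64-bit kernels are the one family where the accumulator is not strictly wider than the inputs: an f64 significand holds 53 bits, so i64/u64 magnitudes above 2^53 are already rounded by the input conversion, and overflow on the way back is handled by the saturating RVV float-to-integer conversion rather than explicit clamps (the u64 path only floors negatives at 0.0). A two-line demonstration of the precision limit:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint64_t big = (UINT64_C(1) << 53) + 1;            /* 9007199254740993 */
    printf("%" PRIu64 " -> %.0f\n", big, (double)big); /* prints ...992: the +1 is lost */
    return 0;
}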
+ // FP8 kernels reuse the f16 pattern with 8-bit payloads and dedicated e4m3/e5m2 helpers.
+ NK_PUBLIC void nk_each_fma_e4m3_rvv(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_e4m3_t const *c, nk_size_t n,
+                                     nk_f32_t const *alpha, nk_f32_t const *beta, nk_e4m3_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a, vector_length);
+         vuint8m1_t b_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)b, vector_length);
+         vuint8m1_t c_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)c, vector_length);
+         vfloat32m4_t a_f32m4 = nk_e4m3m1_to_f32m4_rvv_(a_u8m1, vector_length);
+         vfloat32m4_t b_f32m4 = nk_e4m3m1_to_f32m4_rvv_(b_u8m1, vector_length);
+         vfloat32m4_t c_f32m4 = nk_e4m3m1_to_f32m4_rvv_(c_u8m1, vector_length);
+         vfloat32m4_t product_f32m4 = __riscv_vfmul_vv_f32m4(a_f32m4, b_f32m4, vector_length);
+         vfloat32m4_t scaled_product_f32m4 = __riscv_vfmul_vf_f32m4(product_f32m4, alpha_val, vector_length);
+         vfloat32m4_t result_f32m4 = __riscv_vfmacc_vf_f32m4(scaled_product_f32m4, beta_val, c_f32m4, vector_length);
+         vuint8m1_t result_u8m1 = nk_f32m4_to_e4m3m1_rvv_(result_f32m4, vector_length);
+         __riscv_vse8_v_u8m1((nk_u8_t *)result, result_u8m1, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_fma_e5m2_rvv(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_e5m2_t const *c, nk_size_t n,
+                                     nk_f32_t const *alpha, nk_f32_t const *beta, nk_e5m2_t *result) {
+     nk_f32_t alpha_val = *alpha, beta_val = *beta;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a += vector_length, b += vector_length, c += vector_length, result += vector_length) {
+         vector_length = __riscv_vsetvl_e8m1(n);
+         vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a, vector_length);
+         vuint8m1_t b_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)b, vector_length);
+         vuint8m1_t c_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)c, vector_length);
+         vfloat32m4_t a_f32m4 = nk_e5m2m1_to_f32m4_rvv_(a_u8m1, vector_length);
+         vfloat32m4_t b_f32m4 = nk_e5m2m1_to_f32m4_rvv_(b_u8m1, vector_length);
+         vfloat32m4_t c_f32m4 = nk_e5m2m1_to_f32m4_rvv_(c_u8m1, vector_length);
+         vfloat32m4_t product_f32m4 = __riscv_vfmul_vv_f32m4(a_f32m4, b_f32m4, vector_length);
+         vfloat32m4_t scaled_product_f32m4 = __riscv_vfmul_vf_f32m4(product_f32m4, alpha_val, vector_length);
+         vfloat32m4_t result_f32m4 = __riscv_vfmacc_vf_f32m4(scaled_product_f32m4, beta_val, c_f32m4, vector_length);
+         vuint8m1_t result_u8m1 = nk_f32m4_to_e5m2m1_rvv_(result_f32m4, vector_length);
+         __riscv_vse8_v_u8m1((nk_u8_t *)result, result_u8m1, vector_length);
+     }
+ }
+
+ // Complex scale: result = alpha * a + beta per element, with complex alpha and beta.
+ // Segmented loads split the interleaved real/imaginary planes, so each complex
+ // multiply becomes a pair of fused multiply-accumulate chains.
+ NK_PUBLIC void nk_each_scale_f32c_rvv(nk_f32c_t const *a, nk_size_t n, nk_f32c_t const *alpha, nk_f32c_t const *beta,
+                                       nk_f32c_t *result) {
+     nk_f32_t alpha_real = alpha->real, alpha_imag = alpha->imag;
+     nk_f32_t beta_real = beta->real, beta_imag = beta->imag;
+     nk_f32_t const *a_f32 = (nk_f32_t const *)a;
+     nk_f32_t *result_f32 = (nk_f32_t *)result;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a_f32 += vector_length * 2, result_f32 += vector_length * 2) {
+         vector_length = __riscv_vsetvl_e32m2(n);
+         vfloat32m2x2_t a_f32m2x2 = __riscv_vlseg2e32_v_f32m2x2(a_f32, vector_length);
+         vfloat32m2_t a_real_f32m2 = __riscv_vget_v_f32m2x2_f32m2(a_f32m2x2, 0);
+         vfloat32m2_t a_imag_f32m2 = __riscv_vget_v_f32m2x2_f32m2(a_f32m2x2, 1);
+         vfloat32m2_t y_real_f32m2 = __riscv_vfmv_v_f_f32m2(beta_real, vector_length);
+         y_real_f32m2 = __riscv_vfmacc_vf_f32m2(y_real_f32m2, alpha_real, a_real_f32m2, vector_length);
+         y_real_f32m2 = __riscv_vfnmsac_vf_f32m2(y_real_f32m2, alpha_imag, a_imag_f32m2, vector_length);
+         vfloat32m2_t y_imag_f32m2 = __riscv_vfmv_v_f_f32m2(beta_imag, vector_length);
+         y_imag_f32m2 = __riscv_vfmacc_vf_f32m2(y_imag_f32m2, alpha_real, a_imag_f32m2, vector_length);
+         y_imag_f32m2 = __riscv_vfmacc_vf_f32m2(y_imag_f32m2, alpha_imag, a_real_f32m2, vector_length);
+         vfloat32m2x2_t out_f32m2x2 = __riscv_vcreate_v_f32m2x2(y_real_f32m2, y_imag_f32m2);
+         __riscv_vsseg2e32_v_f32m2x2(result_f32, out_f32m2x2, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_scale_f64c_rvv(nk_f64c_t const *a, nk_size_t n, nk_f64c_t const *alpha, nk_f64c_t const *beta,
+                                       nk_f64c_t *result) {
+     nk_f64_t alpha_real = alpha->real, alpha_imag = alpha->imag;
+     nk_f64_t beta_real = beta->real, beta_imag = beta->imag;
+     nk_f64_t const *a_f64 = (nk_f64_t const *)a;
+     nk_f64_t *result_f64 = (nk_f64_t *)result;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a_f64 += vector_length * 2, result_f64 += vector_length * 2) {
+         vector_length = __riscv_vsetvl_e64m2(n);
+         vfloat64m2x2_t a_f64m2x2 = __riscv_vlseg2e64_v_f64m2x2(a_f64, vector_length);
+         vfloat64m2_t a_real_f64m2 = __riscv_vget_v_f64m2x2_f64m2(a_f64m2x2, 0);
+         vfloat64m2_t a_imag_f64m2 = __riscv_vget_v_f64m2x2_f64m2(a_f64m2x2, 1);
+         vfloat64m2_t y_real_f64m2 = __riscv_vfmv_v_f_f64m2(beta_real, vector_length);
+         y_real_f64m2 = __riscv_vfmacc_vf_f64m2(y_real_f64m2, alpha_real, a_real_f64m2, vector_length);
+         y_real_f64m2 = __riscv_vfnmsac_vf_f64m2(y_real_f64m2, alpha_imag, a_imag_f64m2, vector_length);
+         vfloat64m2_t y_imag_f64m2 = __riscv_vfmv_v_f_f64m2(beta_imag, vector_length);
+         y_imag_f64m2 = __riscv_vfmacc_vf_f64m2(y_imag_f64m2, alpha_real, a_imag_f64m2, vector_length);
+         y_imag_f64m2 = __riscv_vfmacc_vf_f64m2(y_imag_f64m2, alpha_imag, a_real_f64m2, vector_length);
+         vfloat64m2x2_t out_f64m2x2 = __riscv_vcreate_v_f64m2x2(y_real_f64m2, y_imag_f64m2);
+         __riscv_vsseg2e64_v_f64m2x2(result_f64, out_f64m2x2, vector_length);
+     }
+ }
+
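The complex `scale` kernels compute y = alpha * a + beta per element, with complex alpha and beta; the vfmacc/vfnmsac pairs above are the standard expansion (x + iy)(u + iv) = (xu - yv) + i(xv + yu) applied once. Scalar model of a single f32c element:

/* One element of nk_each_scale_f32c_rvv: y = alpha * a + beta, all complex. */
static nk_f32c_t scale_f32c_element(nk_f32c_t a, nk_f32c_t alpha, nk_f32c_t beta) {
    nk_f32c_t y;
    y.real = beta.real + alpha.real * a.real - alpha.imag * a.imag;
    y.imag = beta.imag + alpha.real * a.imag + alpha.imag * a.real;
    return y;
}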
+ // Complex blend: result = alpha * a + beta * b per element.
+ NK_PUBLIC void nk_each_blend_f32c_rvv(nk_f32c_t const *a, nk_f32c_t const *b, nk_size_t n, nk_f32c_t const *alpha,
+                                       nk_f32c_t const *beta, nk_f32c_t *result) {
+     nk_f32_t alpha_real = alpha->real, alpha_imag = alpha->imag;
+     nk_f32_t beta_real = beta->real, beta_imag = beta->imag;
+     nk_f32_t const *a_f32 = (nk_f32_t const *)a;
+     nk_f32_t const *b_f32 = (nk_f32_t const *)b;
+     nk_f32_t *result_f32 = (nk_f32_t *)result;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a_f32 += vector_length * 2, b_f32 += vector_length * 2, result_f32 += vector_length * 2) {
+         vector_length = __riscv_vsetvl_e32m2(n);
+         vfloat32m2x2_t a_f32m2x2 = __riscv_vlseg2e32_v_f32m2x2(a_f32, vector_length);
+         vfloat32m2x2_t b_f32m2x2 = __riscv_vlseg2e32_v_f32m2x2(b_f32, vector_length);
+         vfloat32m2_t a_real_f32m2 = __riscv_vget_v_f32m2x2_f32m2(a_f32m2x2, 0);
+         vfloat32m2_t a_imag_f32m2 = __riscv_vget_v_f32m2x2_f32m2(a_f32m2x2, 1);
+         vfloat32m2_t b_real_f32m2 = __riscv_vget_v_f32m2x2_f32m2(b_f32m2x2, 0);
+         vfloat32m2_t b_imag_f32m2 = __riscv_vget_v_f32m2x2_f32m2(b_f32m2x2, 1);
+         vfloat32m2_t ya_real_f32m2 = __riscv_vfmul_vf_f32m2(a_real_f32m2, alpha_real, vector_length);
+         ya_real_f32m2 = __riscv_vfnmsac_vf_f32m2(ya_real_f32m2, alpha_imag, a_imag_f32m2, vector_length);
+         vfloat32m2_t ya_imag_f32m2 = __riscv_vfmul_vf_f32m2(a_imag_f32m2, alpha_real, vector_length);
+         ya_imag_f32m2 = __riscv_vfmacc_vf_f32m2(ya_imag_f32m2, alpha_imag, a_real_f32m2, vector_length);
+         vfloat32m2_t y_real_f32m2 = __riscv_vfmacc_vf_f32m2(ya_real_f32m2, beta_real, b_real_f32m2, vector_length);
+         y_real_f32m2 = __riscv_vfnmsac_vf_f32m2(y_real_f32m2, beta_imag, b_imag_f32m2, vector_length);
+         vfloat32m2_t y_imag_f32m2 = __riscv_vfmacc_vf_f32m2(ya_imag_f32m2, beta_real, b_imag_f32m2, vector_length);
+         y_imag_f32m2 = __riscv_vfmacc_vf_f32m2(y_imag_f32m2, beta_imag, b_real_f32m2, vector_length);
+         vfloat32m2x2_t out_f32m2x2 = __riscv_vcreate_v_f32m2x2(y_real_f32m2, y_imag_f32m2);
+         __riscv_vsseg2e32_v_f32m2x2(result_f32, out_f32m2x2, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_blend_f64c_rvv(nk_f64c_t const *a, nk_f64c_t const *b, nk_size_t n, nk_f64c_t const *alpha,
+                                       nk_f64c_t const *beta, nk_f64c_t *result) {
+     nk_f64_t alpha_real = alpha->real, alpha_imag = alpha->imag;
+     nk_f64_t beta_real = beta->real, beta_imag = beta->imag;
+     nk_f64_t const *a_f64 = (nk_f64_t const *)a;
+     nk_f64_t const *b_f64 = (nk_f64_t const *)b;
+     nk_f64_t *result_f64 = (nk_f64_t *)result;
+     for (nk_size_t vector_length; n > 0;
+          n -= vector_length, a_f64 += vector_length * 2, b_f64 += vector_length * 2, result_f64 += vector_length * 2) {
+         vector_length = __riscv_vsetvl_e64m2(n);
+         vfloat64m2x2_t a_f64m2x2 = __riscv_vlseg2e64_v_f64m2x2(a_f64, vector_length);
+         vfloat64m2x2_t b_f64m2x2 = __riscv_vlseg2e64_v_f64m2x2(b_f64, vector_length);
+         vfloat64m2_t a_real_f64m2 = __riscv_vget_v_f64m2x2_f64m2(a_f64m2x2, 0);
+         vfloat64m2_t a_imag_f64m2 = __riscv_vget_v_f64m2x2_f64m2(a_f64m2x2, 1);
+         vfloat64m2_t b_real_f64m2 = __riscv_vget_v_f64m2x2_f64m2(b_f64m2x2, 0);
+         vfloat64m2_t b_imag_f64m2 = __riscv_vget_v_f64m2x2_f64m2(b_f64m2x2, 1);
+         vfloat64m2_t ya_real_f64m2 = __riscv_vfmul_vf_f64m2(a_real_f64m2, alpha_real, vector_length);
+         ya_real_f64m2 = __riscv_vfnmsac_vf_f64m2(ya_real_f64m2, alpha_imag, a_imag_f64m2, vector_length);
+         vfloat64m2_t ya_imag_f64m2 = __riscv_vfmul_vf_f64m2(a_imag_f64m2, alpha_real, vector_length);
+         ya_imag_f64m2 = __riscv_vfmacc_vf_f64m2(ya_imag_f64m2, alpha_imag, a_real_f64m2, vector_length);
+         vfloat64m2_t y_real_f64m2 = __riscv_vfmacc_vf_f64m2(ya_real_f64m2, beta_real, b_real_f64m2, vector_length);
+         y_real_f64m2 = __riscv_vfnmsac_vf_f64m2(y_real_f64m2, beta_imag, b_imag_f64m2, vector_length);
+         vfloat64m2_t y_imag_f64m2 = __riscv_vfmacc_vf_f64m2(ya_imag_f64m2, beta_real, b_imag_f64m2, vector_length);
+         y_imag_f64m2 = __riscv_vfmacc_vf_f64m2(y_imag_f64m2, beta_imag, b_real_f64m2, vector_length);
+         vfloat64m2x2_t out_f64m2x2 = __riscv_vcreate_v_f64m2x2(y_real_f64m2, y_imag_f64m2);
+         __riscv_vsseg2e64_v_f64m2x2(result_f64, out_f64m2x2, vector_length);
+     }
+ }
+
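`blend` generalizes `scale` to two inputs, y = alpha * a + beta * b, i.e. two complex multiplies folded into the same accumulators:

/* One element of nk_each_blend_f32c_rvv: y = alpha * a + beta * b, all complex. */
static nk_f32c_t blend_f32c_element(nk_f32c_t a, nk_f32c_t b, nk_f32c_t alpha, nk_f32c_t beta) {
    nk_f32c_t y;
    y.real = alpha.real * a.real - alpha.imag * a.imag + beta.real * b.real - beta.imag * b.imag;
    y.imag = alpha.real * a.imag + alpha.imag * a.real + beta.real * b.imag + beta.imag * b.real;
    return y;
}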
+ // Complex FMA: result = alpha * (a * b) + beta * c per element.
+ NK_PUBLIC void nk_each_fma_f32c_rvv(nk_f32c_t const *a, nk_f32c_t const *b, nk_f32c_t const *c, nk_size_t n,
+                                     nk_f32c_t const *alpha, nk_f32c_t const *beta, nk_f32c_t *result) {
+     nk_f32_t alpha_real = alpha->real, alpha_imag = alpha->imag;
+     nk_f32_t beta_real = beta->real, beta_imag = beta->imag;
+     nk_f32_t const *a_f32 = (nk_f32_t const *)a;
+     nk_f32_t const *b_f32 = (nk_f32_t const *)b;
+     nk_f32_t const *c_f32 = (nk_f32_t const *)c;
+     nk_f32_t *result_f32 = (nk_f32_t *)result;
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a_f32 += vector_length * 2, b_f32 += vector_length * 2,
+                                          c_f32 += vector_length * 2, result_f32 += vector_length * 2) {
+         vector_length = __riscv_vsetvl_e32m2(n);
+         vfloat32m2x2_t a_f32m2x2 = __riscv_vlseg2e32_v_f32m2x2(a_f32, vector_length);
+         vfloat32m2x2_t b_f32m2x2 = __riscv_vlseg2e32_v_f32m2x2(b_f32, vector_length);
+         vfloat32m2x2_t c_f32m2x2 = __riscv_vlseg2e32_v_f32m2x2(c_f32, vector_length);
+         vfloat32m2_t a_real_f32m2 = __riscv_vget_v_f32m2x2_f32m2(a_f32m2x2, 0);
+         vfloat32m2_t a_imag_f32m2 = __riscv_vget_v_f32m2x2_f32m2(a_f32m2x2, 1);
+         vfloat32m2_t b_real_f32m2 = __riscv_vget_v_f32m2x2_f32m2(b_f32m2x2, 0);
+         vfloat32m2_t b_imag_f32m2 = __riscv_vget_v_f32m2x2_f32m2(b_f32m2x2, 1);
+         vfloat32m2_t c_real_f32m2 = __riscv_vget_v_f32m2x2_f32m2(c_f32m2x2, 0);
+         vfloat32m2_t c_imag_f32m2 = __riscv_vget_v_f32m2x2_f32m2(c_f32m2x2, 1);
+         // ab = a * b, then y = alpha * ab, then y += beta * c, all in complex arithmetic.
+         vfloat32m2_t ab_real_f32m2 = __riscv_vfmul_vv_f32m2(a_real_f32m2, b_real_f32m2, vector_length);
+         ab_real_f32m2 = __riscv_vfnmsac_vv_f32m2(ab_real_f32m2, a_imag_f32m2, b_imag_f32m2, vector_length);
+         vfloat32m2_t ab_imag_f32m2 = __riscv_vfmul_vv_f32m2(a_real_f32m2, b_imag_f32m2, vector_length);
+         ab_imag_f32m2 = __riscv_vfmacc_vv_f32m2(ab_imag_f32m2, a_imag_f32m2, b_real_f32m2, vector_length);
+         vfloat32m2_t y_real_f32m2 = __riscv_vfmul_vf_f32m2(ab_real_f32m2, alpha_real, vector_length);
+         y_real_f32m2 = __riscv_vfnmsac_vf_f32m2(y_real_f32m2, alpha_imag, ab_imag_f32m2, vector_length);
+         vfloat32m2_t y_imag_f32m2 = __riscv_vfmul_vf_f32m2(ab_imag_f32m2, alpha_real, vector_length);
+         y_imag_f32m2 = __riscv_vfmacc_vf_f32m2(y_imag_f32m2, alpha_imag, ab_real_f32m2, vector_length);
+         y_real_f32m2 = __riscv_vfmacc_vf_f32m2(y_real_f32m2, beta_real, c_real_f32m2, vector_length);
+         y_real_f32m2 = __riscv_vfnmsac_vf_f32m2(y_real_f32m2, beta_imag, c_imag_f32m2, vector_length);
+         y_imag_f32m2 = __riscv_vfmacc_vf_f32m2(y_imag_f32m2, beta_real, c_imag_f32m2, vector_length);
+         y_imag_f32m2 = __riscv_vfmacc_vf_f32m2(y_imag_f32m2, beta_imag, c_real_f32m2, vector_length);
+         vfloat32m2x2_t out_f32m2x2 = __riscv_vcreate_v_f32m2x2(y_real_f32m2, y_imag_f32m2);
+         __riscv_vsseg2e32_v_f32m2x2(result_f32, out_f32m2x2, vector_length);
+     }
+ }
+
+ NK_PUBLIC void nk_each_fma_f64c_rvv(nk_f64c_t const *a, nk_f64c_t const *b, nk_f64c_t const *c, nk_size_t n,
+                                     nk_f64c_t const *alpha, nk_f64c_t const *beta, nk_f64c_t *result) {
+     nk_f64_t alpha_real = alpha->real, alpha_imag = alpha->imag;
+     nk_f64_t beta_real = beta->real, beta_imag = beta->imag;
+     nk_f64_t const *a_f64 = (nk_f64_t const *)a;
+     nk_f64_t const *b_f64 = (nk_f64_t const *)b;
+     nk_f64_t const *c_f64 = (nk_f64_t const *)c;
+     nk_f64_t *result_f64 = (nk_f64_t *)result;
+     for (nk_size_t vector_length; n > 0; n -= vector_length, a_f64 += vector_length * 2, b_f64 += vector_length * 2,
+                                          c_f64 += vector_length * 2, result_f64 += vector_length * 2) {
+         vector_length = __riscv_vsetvl_e64m2(n);
+         vfloat64m2x2_t a_f64m2x2 = __riscv_vlseg2e64_v_f64m2x2(a_f64, vector_length);
+         vfloat64m2x2_t b_f64m2x2 = __riscv_vlseg2e64_v_f64m2x2(b_f64, vector_length);
+         vfloat64m2x2_t c_f64m2x2 = __riscv_vlseg2e64_v_f64m2x2(c_f64, vector_length);
+         vfloat64m2_t a_real_f64m2 = __riscv_vget_v_f64m2x2_f64m2(a_f64m2x2, 0);
+         vfloat64m2_t a_imag_f64m2 = __riscv_vget_v_f64m2x2_f64m2(a_f64m2x2, 1);
+         vfloat64m2_t b_real_f64m2 = __riscv_vget_v_f64m2x2_f64m2(b_f64m2x2, 0);
+         vfloat64m2_t b_imag_f64m2 = __riscv_vget_v_f64m2x2_f64m2(b_f64m2x2, 1);
+         vfloat64m2_t c_real_f64m2 = __riscv_vget_v_f64m2x2_f64m2(c_f64m2x2, 0);
+         vfloat64m2_t c_imag_f64m2 = __riscv_vget_v_f64m2x2_f64m2(c_f64m2x2, 1);
+         vfloat64m2_t ab_real_f64m2 = __riscv_vfmul_vv_f64m2(a_real_f64m2, b_real_f64m2, vector_length);
+         ab_real_f64m2 = __riscv_vfnmsac_vv_f64m2(ab_real_f64m2, a_imag_f64m2, b_imag_f64m2, vector_length);
+         vfloat64m2_t ab_imag_f64m2 = __riscv_vfmul_vv_f64m2(a_real_f64m2, b_imag_f64m2, vector_length);
+         ab_imag_f64m2 = __riscv_vfmacc_vv_f64m2(ab_imag_f64m2, a_imag_f64m2, b_real_f64m2, vector_length);
+         vfloat64m2_t y_real_f64m2 = __riscv_vfmul_vf_f64m2(ab_real_f64m2, alpha_real, vector_length);
+         y_real_f64m2 = __riscv_vfnmsac_vf_f64m2(y_real_f64m2, alpha_imag, ab_imag_f64m2, vector_length);
+         vfloat64m2_t y_imag_f64m2 = __riscv_vfmul_vf_f64m2(ab_imag_f64m2, alpha_real, vector_length);
+         y_imag_f64m2 = __riscv_vfmacc_vf_f64m2(y_imag_f64m2, alpha_imag, ab_real_f64m2, vector_length);
+         y_real_f64m2 = __riscv_vfmacc_vf_f64m2(y_real_f64m2, beta_real, c_real_f64m2, vector_length);
+         y_real_f64m2 = __riscv_vfnmsac_vf_f64m2(y_real_f64m2, beta_imag, c_imag_f64m2, vector_length);
+         y_imag_f64m2 = __riscv_vfmacc_vf_f64m2(y_imag_f64m2, beta_real, c_imag_f64m2, vector_length);
+         y_imag_f64m2 = __riscv_vfmacc_vf_f64m2(y_imag_f64m2, beta_imag, c_real_f64m2, vector_length);
+         vfloat64m2x2_t out_f64m2x2 = __riscv_vcreate_v_f64m2x2(y_real_f64m2, y_imag_f64m2);
+         __riscv_vsseg2e64_v_f64m2x2(result_f64, out_f64m2x2, vector_length);
+     }
+ }
+
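The complex FMA composes three complex multiplies, y = alpha * (a * b) + beta * c; the f64c variant is lane-for-lane identical in f64. Scalar model:

/* One element of nk_each_fma_f32c_rvv: y = alpha * (a * b) + beta * c, all complex. */
static nk_f32c_t fma_f32c_element(nk_f32c_t a, nk_f32c_t b, nk_f32c_t c,
                                  nk_f32c_t alpha, nk_f32c_t beta) {
    nk_f32c_t ab, y;
    ab.real = a.real * b.real - a.imag * b.imag;
    ab.imag = a.real * b.imag + a.imag * b.real;
    y.real = alpha.real * ab.real - alpha.imag * ab.imag + beta.real * c.real - beta.imag * c.imag;
    y.imag = alpha.real * ab.imag + alpha.imag * ab.real + beta.real * c.imag + beta.imag * c.real;
    return y;
}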
+ #if defined(__cplusplus)
+ } // extern "C"
+ #endif
+
+ #if defined(__clang__)
+ #pragma clang attribute pop
+ #elif defined(__GNUC__)
+ #pragma GCC pop_options
+ #endif
+
+ #endif // NK_TARGET_RVV
+ #endif // NK_TARGET_RISCV_
+ #endif // NK_EACH_RVV_H