numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315):
  1. package/README.md +239 -122
  2. package/binding.gyp +25 -491
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -59,18 +59,18 @@ extern "C" {
59
59
  * Internal helpers return vector register groups for use by geospatial/rvv.h.
60
60
  */
61
61
 
62
- NK_INTERNAL vfloat32m4_t nk_f32m4_sin_rvv_(vfloat32m4_t angles, nk_size_t vl) {
62
+ NK_INTERNAL vfloat32m4_t nk_f32m4_sin_rvv_(vfloat32m4_t angles_f32m4, nk_size_t vl) {
63
63
  nk_f32_t const pi = 3.14159265358979323846f;
64
64
  nk_f32_t const pi_recip = 0.31830988618379067154f;
65
65
 
66
66
  // Range reduce: round(angle / pi)
67
- vfloat32m4_t quotients_f32m4 = __riscv_vfmul_vf_f32m4(angles, pi_recip, vl);
67
+ vfloat32m4_t quotients_f32m4 = __riscv_vfmul_vf_f32m4(angles_f32m4, pi_recip, vl);
68
68
  // vfcvt_x_f rounds to nearest integer by default (RNE)
69
69
  vint32m4_t rounded_i32m4 = __riscv_vfcvt_x_f_v_i32m4(quotients_f32m4, vl);
70
70
  vfloat32m4_t rounded_f32m4 = __riscv_vfcvt_f_x_v_f32m4(rounded_i32m4, vl);
71
71
 
72
72
  // reduced = angle - rounded * pi
73
- vfloat32m4_t reduced_f32m4 = __riscv_vfnmsac_vf_f32m4(angles, pi, rounded_f32m4, vl);
73
+ vfloat32m4_t reduced_f32m4 = __riscv_vfnmsac_vf_f32m4(angles_f32m4, pi, rounded_f32m4, vl);
74
74
 
75
75
  // Polynomial: sin(x) ~ x + x^3 * (c1 + x^2 * (c3 + x^2 * c5))
76
76
  vfloat32m4_t squared_f32m4 = __riscv_vfmul_vv_f32m4(reduced_f32m4, reduced_f32m4, vl);
@@ -88,19 +88,19 @@ NK_INTERNAL vfloat32m4_t nk_f32m4_sin_rvv_(vfloat32m4_t angles, nk_size_t vl) {
88
88
  return __riscv_vreinterpret_v_u32m4_f32m4(result_bits_u32m4);
89
89
  }
90
90
 
91
- NK_INTERNAL vfloat32m4_t nk_f32m4_cos_rvv_(vfloat32m4_t angles, nk_size_t vl) {
91
+ NK_INTERNAL vfloat32m4_t nk_f32m4_cos_rvv_(vfloat32m4_t angles_f32m4, nk_size_t vl) {
92
92
  nk_f32_t const pi = 3.14159265358979323846f;
93
93
  nk_f32_t const pi_half = 1.57079632679489661923f;
94
94
  nk_f32_t const pi_recip = 0.31830988618379067154f;
95
95
 
96
96
  // Compute round((angle / pi) - 0.5)
97
- vfloat32m4_t quotients_f32m4 = __riscv_vfsub_vf_f32m4(__riscv_vfmul_vf_f32m4(angles, pi_recip, vl), 0.5f, vl);
97
+ vfloat32m4_t quotients_f32m4 = __riscv_vfsub_vf_f32m4(__riscv_vfmul_vf_f32m4(angles_f32m4, pi_recip, vl), 0.5f, vl);
98
98
  vint32m4_t rounded_i32m4 = __riscv_vfcvt_x_f_v_i32m4(quotients_f32m4, vl);
99
99
  vfloat32m4_t rounded_f32m4 = __riscv_vfcvt_f_x_v_f32m4(rounded_i32m4, vl);
100
100
 
101
101
  // Reduce: angle - (rounded * pi + pi/2)
102
102
  vfloat32m4_t offset_f32m4 = __riscv_vfmacc_vf_f32m4(__riscv_vfmv_v_f_f32m4(pi_half, vl), pi, rounded_f32m4, vl);
103
- vfloat32m4_t reduced_f32m4 = __riscv_vfsub_vv_f32m4(angles, offset_f32m4, vl);
103
+ vfloat32m4_t reduced_f32m4 = __riscv_vfsub_vv_f32m4(angles_f32m4, offset_f32m4, vl);
104
104
 
105
105
  // Polynomial: same 3-term approximation
106
106
  vfloat32m4_t squared_f32m4 = __riscv_vfmul_vv_f32m4(reduced_f32m4, reduced_f32m4, vl);
@@ -118,7 +118,7 @@ NK_INTERNAL vfloat32m4_t nk_f32m4_cos_rvv_(vfloat32m4_t angles, nk_size_t vl) {
118
118
  return result_f32m4;
119
119
  }
120
120
 
121
- NK_INTERNAL vfloat32m4_t nk_f32m4_atan_rvv_(vfloat32m4_t inputs, nk_size_t vl) {
121
+ NK_INTERNAL vfloat32m4_t nk_f32m4_atan_rvv_(vfloat32m4_t inputs_f32m4, nk_size_t vl) {
122
122
  // 8-term polynomial coefficients for atan approximation
123
123
  nk_f32_t const c8 = -0.333331018686294555664062f;
124
124
  nk_f32_t const c7 = +0.199926957488059997558594f;
@@ -130,8 +130,8 @@ NK_INTERNAL vfloat32m4_t nk_f32m4_atan_rvv_(vfloat32m4_t inputs, nk_size_t vl) {
130
130
  nk_f32_t const c1 = +0.00282363896258175373077393f;
131
131
 
132
132
  // Detect negative values
133
- vbool8_t negative_mask_b8 = __riscv_vmflt_vf_f32m4_b8(inputs, 0.0f, vl);
134
- vfloat32m4_t values_f32m4 = __riscv_vfabs_v_f32m4(inputs, vl);
133
+ vbool8_t negative_mask_b8 = __riscv_vmflt_vf_f32m4_b8(inputs_f32m4, 0.0f, vl);
134
+ vfloat32m4_t values_f32m4 = __riscv_vfabs_v_f32m4(inputs_f32m4, vl);
135
135
 
136
136
  // Check if values > 1 (need reciprocal)
137
137
  vbool8_t reciprocal_mask_b8 = __riscv_vmfgt_vf_f32m4_b8(values_f32m4, 1.0f, vl);
@@ -163,7 +163,7 @@ NK_INTERNAL vfloat32m4_t nk_f32m4_atan_rvv_(vfloat32m4_t inputs, nk_size_t vl) {
163
163
  return result_f32m4;
164
164
  }
165
165
 
166
- NK_INTERNAL vfloat32m4_t nk_f32m4_atan2_rvv_(vfloat32m4_t ys_inputs, vfloat32m4_t xs_inputs, nk_size_t vl) {
166
+ NK_INTERNAL vfloat32m4_t nk_f32m4_atan2_rvv_(vfloat32m4_t ys_inputs_f32m4, vfloat32m4_t xs_inputs_f32m4, nk_size_t vl) {
167
167
  // 8-term polynomial coefficients (same as atan)
168
168
  nk_f32_t const c8 = -0.333331018686294555664062f;
169
169
  nk_f32_t const c7 = +0.199926957488059997558594f;
@@ -175,9 +175,9 @@ NK_INTERNAL vfloat32m4_t nk_f32m4_atan2_rvv_(vfloat32m4_t ys_inputs, vfloat32m4_
175
175
  nk_f32_t const c1 = +0.00282363896258175373077393f;
176
176
 
177
177
  // Quadrant adjustments - take absolute values
178
- vbool8_t xs_negative_mask_b8 = __riscv_vmflt_vf_f32m4_b8(xs_inputs, 0.0f, vl);
179
- vfloat32m4_t xs_f32m4 = __riscv_vfabs_v_f32m4(xs_inputs, vl);
180
- vfloat32m4_t ys_f32m4 = __riscv_vfabs_v_f32m4(ys_inputs, vl);
178
+ vbool8_t xs_negative_mask_b8 = __riscv_vmflt_vf_f32m4_b8(xs_inputs_f32m4, 0.0f, vl);
179
+ vfloat32m4_t xs_f32m4 = __riscv_vfabs_v_f32m4(xs_inputs_f32m4, vl);
180
+ vfloat32m4_t ys_f32m4 = __riscv_vfabs_v_f32m4(ys_inputs_f32m4, vl);
181
181
 
182
182
  // Ensure proper fraction where numerator < denominator
183
183
  vbool8_t swap_mask_b8 = __riscv_vmfgt_vv_f32m4_b8(ys_f32m4, xs_f32m4, vl);
@@ -214,22 +214,22 @@ NK_INTERNAL vfloat32m4_t nk_f32m4_atan2_rvv_(vfloat32m4_t ys_inputs, vfloat32m4_
214
214
  // Adjust for quadrant: result += quadrant * pi/2
215
215
  results_f32m4 = __riscv_vfmacc_vf_f32m4(results_f32m4, 1.5707963267948966f, quadrant_f32m4, vl);
216
216
 
217
- // Transfer sign from x (XOR with sign bit of xs_inputs)
217
+ // Transfer sign from x (XOR with sign bit of xs_inputs_f32m4)
218
218
  vuint32m4_t sign_mask_u32m4 = __riscv_vreinterpret_v_f32m4_u32m4(__riscv_vfmv_v_f_f32m4(-0.0f, vl));
219
- vuint32m4_t xs_sign_bits_u32m4 = __riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(xs_inputs),
219
+ vuint32m4_t xs_sign_bits_u32m4 = __riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(xs_inputs_f32m4),
220
220
  sign_mask_u32m4, vl);
221
221
  vuint32m4_t result_bits_u32m4 = __riscv_vxor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(results_f32m4),
222
222
  xs_sign_bits_u32m4, vl);
223
223
 
224
- // Transfer sign from y (XOR with sign bit of ys_inputs)
225
- vuint32m4_t ys_sign_bits_u32m4 = __riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(ys_inputs),
224
+ // Transfer sign from y (XOR with sign bit of ys_inputs_f32m4)
225
+ vuint32m4_t ys_sign_bits_u32m4 = __riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(ys_inputs_f32m4),
226
226
  sign_mask_u32m4, vl);
227
227
  result_bits_u32m4 = __riscv_vxor_vv_u32m4(result_bits_u32m4, ys_sign_bits_u32m4, vl);
228
228
 
229
229
  return __riscv_vreinterpret_v_u32m4_f32m4(result_bits_u32m4);
230
230
  }
231
231
 
232
- NK_INTERNAL vfloat64m4_t nk_f64m4_sin_rvv_(vfloat64m4_t angles_radians, nk_size_t vl) {
232
+ NK_INTERNAL vfloat64m4_t nk_f64m4_sin_rvv_(vfloat64m4_t angles_radians_f64m4, nk_size_t vl) {
233
233
  // Constants for two-step Cody-Waite range reduction
234
234
  nk_f64_t const pi_high = 3.141592653589793116;
235
235
  nk_f64_t const pi_low = 1.2246467991473532072e-16;
@@ -247,13 +247,13 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_sin_rvv_(vfloat64m4_t angles_radians, nk_size_
247
247
  nk_f64_t const c8 = -0.166666666666666657414808;
248
248
 
249
249
  // Compute round(angle / pi)
250
- vfloat64m4_t quotients_f64m4 = __riscv_vfmul_vf_f64m4(angles_radians, pi_recip, vl);
250
+ vfloat64m4_t quotients_f64m4 = __riscv_vfmul_vf_f64m4(angles_radians_f64m4, pi_recip, vl);
251
251
  // Round to nearest: vfcvt_x_f rounds to nearest (RNE), then convert back
252
252
  vint64m4_t rounded_i64m4 = __riscv_vfcvt_x_f_v_i64m4(quotients_f64m4, vl);
253
253
  vfloat64m4_t rounded_f64m4 = __riscv_vfcvt_f_x_v_f64m4(rounded_i64m4, vl);
254
254
 
255
255
  // Two-step Cody-Waite reduction: angle - rounded * pi_high - rounded * pi_low
256
- vfloat64m4_t angles_f64m4 = __riscv_vfnmsac_vf_f64m4(angles_radians, pi_high, rounded_f64m4, vl);
256
+ vfloat64m4_t angles_f64m4 = __riscv_vfnmsac_vf_f64m4(angles_radians_f64m4, pi_high, rounded_f64m4, vl);
257
257
  angles_f64m4 = __riscv_vfnmsac_vf_f64m4(angles_f64m4, pi_low, rounded_f64m4, vl);
258
258
 
259
259
  // If rounded is odd, negate the angle
@@ -289,13 +289,13 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_sin_rvv_(vfloat64m4_t angles_radians, nk_size_
289
289
  results_f64m4 = __riscv_vfmacc_vv_f64m4(angles_f64m4, cubed_f64m4, results_f64m4, vl);
290
290
 
291
291
  // Handle zero input (preserve sign of zero)
292
- vbool16_t non_zero_mask_b16 = __riscv_vmfne_vf_f64m4_b16(angles_radians, 0.0, vl);
292
+ vbool16_t non_zero_mask_b16 = __riscv_vmfne_vf_f64m4_b16(angles_radians_f64m4, 0.0, vl);
293
293
  vfloat64m4_t zeros_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, vl);
294
294
  results_f64m4 = __riscv_vmerge_vvm_f64m4(zeros_f64m4, results_f64m4, non_zero_mask_b16, vl);
295
295
  return results_f64m4;
296
296
  }
297
297
 
298
- NK_INTERNAL vfloat64m4_t nk_f64m4_cos_rvv_(vfloat64m4_t angles_radians, nk_size_t vl) {
298
+ NK_INTERNAL vfloat64m4_t nk_f64m4_cos_rvv_(vfloat64m4_t angles_radians_f64m4, nk_size_t vl) {
299
299
  // Constants for two-step Cody-Waite range reduction
300
300
  nk_f64_t const pi_high_half = 3.141592653589793116 * 0.5;
301
301
  nk_f64_t const pi_low_half = 1.2246467991473532072e-16 * 0.5;
@@ -313,8 +313,8 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_cos_rvv_(vfloat64m4_t angles_radians, nk_size_
313
313
  nk_f64_t const c8 = -0.166666666666666657414808;
314
314
 
315
315
  // Compute 2 * round(angle / pi - 0.5) + 1
316
- vfloat64m4_t quotients_f64m4 = __riscv_vfsub_vf_f64m4(__riscv_vfmul_vf_f64m4(angles_radians, pi_recip, vl), 0.5,
317
- vl);
316
+ vfloat64m4_t quotients_f64m4 = __riscv_vfsub_vf_f64m4(__riscv_vfmul_vf_f64m4(angles_radians_f64m4, pi_recip, vl),
317
+ 0.5, vl);
318
318
  vint64m4_t rounded_i64m4 = __riscv_vfcvt_x_f_v_i64m4(quotients_f64m4, vl);
319
319
  vfloat64m4_t rounded_f64m4 = __riscv_vfcvt_f_x_v_f64m4(rounded_i64m4, vl);
320
320
  // rounded_quotients = 2 * rounded + 1
@@ -322,7 +322,8 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_cos_rvv_(vfloat64m4_t angles_radians, nk_size_
322
322
  vl);
323
323
 
324
324
  // Two-step Cody-Waite reduction: angle - rounded_quotients * pi_high_half - rounded_quotients * pi_low_half
325
- vfloat64m4_t angles_f64m4 = __riscv_vfnmsac_vf_f64m4(angles_radians, pi_high_half, rounded_quotients_f64m4, vl);
325
+ vfloat64m4_t angles_f64m4 = __riscv_vfnmsac_vf_f64m4(angles_radians_f64m4, pi_high_half, rounded_quotients_f64m4,
326
+ vl);
326
327
  angles_f64m4 = __riscv_vfnmsac_vf_f64m4(angles_f64m4, pi_low_half, rounded_quotients_f64m4, vl);
327
328
 
328
329
  // If (rounded_quotients & 2) == 0, negate the angle
@@ -352,7 +353,7 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_cos_rvv_(vfloat64m4_t angles_radians, nk_size_
352
353
  return results_f64m4;
353
354
  }
354
355
 
355
- NK_INTERNAL vfloat64m4_t nk_f64m4_atan_rvv_(vfloat64m4_t inputs, nk_size_t vl) {
356
+ NK_INTERNAL vfloat64m4_t nk_f64m4_atan_rvv_(vfloat64m4_t inputs_f64m4, nk_size_t vl) {
356
357
  // 19-term polynomial coefficients
357
358
  nk_f64_t const c19 = -1.88796008463073496563746e-05;
358
359
  nk_f64_t const c18 = +0.000209850076645816976906797;
@@ -375,8 +376,8 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_atan_rvv_(vfloat64m4_t inputs, nk_size_t vl) {
375
376
  nk_f64_t const c1 = -0.333333333333311110369124;
376
377
 
377
378
  // Detect negative values
378
- vbool16_t negative_mask_b16 = __riscv_vmflt_vf_f64m4_b16(inputs, 0.0, vl);
379
- vfloat64m4_t values_f64m4 = __riscv_vfabs_v_f64m4(inputs, vl);
379
+ vbool16_t negative_mask_b16 = __riscv_vmflt_vf_f64m4_b16(inputs_f64m4, 0.0, vl);
380
+ vfloat64m4_t values_f64m4 = __riscv_vfabs_v_f64m4(inputs_f64m4, vl);
380
381
 
381
382
  // Check if values > 1 (need reciprocal)
382
383
  vbool16_t reciprocal_mask_b16 = __riscv_vmfgt_vf_f64m4_b16(values_f64m4, 1.0, vl);
@@ -419,7 +420,7 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_atan_rvv_(vfloat64m4_t inputs, nk_size_t vl) {
419
420
  return result_f64m4;
420
421
  }
421
422
 
422
- NK_INTERNAL vfloat64m4_t nk_f64m4_atan2_rvv_(vfloat64m4_t ys_inputs, vfloat64m4_t xs_inputs, nk_size_t vl) {
423
+ NK_INTERNAL vfloat64m4_t nk_f64m4_atan2_rvv_(vfloat64m4_t ys_inputs_f64m4, vfloat64m4_t xs_inputs_f64m4, nk_size_t vl) {
423
424
  // 19-term polynomial coefficients (same as atan)
424
425
  nk_f64_t const c19 = -1.88796008463073496563746e-05;
425
426
  nk_f64_t const c18 = +0.000209850076645816976906797;
@@ -442,9 +443,9 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_atan2_rvv_(vfloat64m4_t ys_inputs, vfloat64m4_
442
443
  nk_f64_t const c1 = -0.333333333333311110369124;
443
444
 
444
445
  // Quadrant adjustments - take absolute values
445
- vbool16_t xs_negative_mask_b16 = __riscv_vmflt_vf_f64m4_b16(xs_inputs, 0.0, vl);
446
- vfloat64m4_t xs_f64m4 = __riscv_vfabs_v_f64m4(xs_inputs, vl);
447
- vfloat64m4_t ys_f64m4 = __riscv_vfabs_v_f64m4(ys_inputs, vl);
446
+ vbool16_t xs_negative_mask_b16 = __riscv_vmflt_vf_f64m4_b16(xs_inputs_f64m4, 0.0, vl);
447
+ vfloat64m4_t xs_f64m4 = __riscv_vfabs_v_f64m4(xs_inputs_f64m4, vl);
448
+ vfloat64m4_t ys_f64m4 = __riscv_vfabs_v_f64m4(ys_inputs_f64m4, vl);
448
449
 
449
450
  // Ensure proper fraction where numerator < denominator
450
451
  vbool16_t swap_mask_b16 = __riscv_vmfgt_vv_f64m4_b16(ys_f64m4, xs_f64m4, vl);
@@ -492,15 +493,15 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_atan2_rvv_(vfloat64m4_t ys_inputs, vfloat64m4_
492
493
  // Adjust for quadrant: result += quadrant * pi/2
493
494
  results_f64m4 = __riscv_vfmacc_vf_f64m4(results_f64m4, 1.5707963267948966, quadrant_f64m4, vl);
494
495
 
495
- // Transfer sign from x (XOR with sign bit of xs_inputs)
496
+ // Transfer sign from x (XOR with sign bit of xs_inputs_f64m4)
496
497
  vuint64m4_t sign_mask_u64m4 = __riscv_vreinterpret_v_f64m4_u64m4(__riscv_vfmv_v_f_f64m4(-0.0, vl));
497
- vuint64m4_t xs_sign_bits_u64m4 = __riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(xs_inputs),
498
+ vuint64m4_t xs_sign_bits_u64m4 = __riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(xs_inputs_f64m4),
498
499
  sign_mask_u64m4, vl);
499
500
  vuint64m4_t result_bits_u64m4 = __riscv_vxor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(results_f64m4),
500
501
  xs_sign_bits_u64m4, vl);
501
502
 
502
- // Transfer sign from y (XOR with sign bit of ys_inputs)
503
- vuint64m4_t ys_sign_bits_u64m4 = __riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(ys_inputs),
503
+ // Transfer sign from y (XOR with sign bit of ys_inputs_f64m4)
504
+ vuint64m4_t ys_sign_bits_u64m4 = __riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(ys_inputs_f64m4),
504
505
  sign_mask_u64m4, vl);
505
506
  result_bits_u64m4 = __riscv_vxor_vv_u64m4(result_bits_u64m4, ys_sign_bits_u64m4, vl);
506
507
 
@@ -511,15 +512,15 @@ NK_INTERNAL vfloat64m4_t nk_f64m4_atan2_rvv_(vfloat64m4_t ys_inputs, vfloat64m4_
511
512
  * f16 data is loaded as m1 (16-bit), widened to f32 m2, computed, then narrowed back.
512
513
  */
513
514
 
514
- NK_INTERNAL vfloat32m2_t nk_f32m2_sin_rvv_(vfloat32m2_t angles, nk_size_t vl) {
515
+ NK_INTERNAL vfloat32m2_t nk_f32m2_sin_rvv_(vfloat32m2_t angles_f32m2, nk_size_t vl) {
515
516
  nk_f32_t const pi = 3.14159265358979323846f;
516
517
  nk_f32_t const pi_recip = 0.31830988618379067154f;
517
518
 
518
- vfloat32m2_t quotients_f32m2 = __riscv_vfmul_vf_f32m2(angles, pi_recip, vl);
519
+ vfloat32m2_t quotients_f32m2 = __riscv_vfmul_vf_f32m2(angles_f32m2, pi_recip, vl);
519
520
  vint32m2_t rounded_i32m2 = __riscv_vfcvt_x_f_v_i32m2(quotients_f32m2, vl);
520
521
  vfloat32m2_t rounded_f32m2 = __riscv_vfcvt_f_x_v_f32m2(rounded_i32m2, vl);
521
522
 
522
- vfloat32m2_t reduced_f32m2 = __riscv_vfnmsac_vf_f32m2(angles, pi, rounded_f32m2, vl);
523
+ vfloat32m2_t reduced_f32m2 = __riscv_vfnmsac_vf_f32m2(angles_f32m2, pi, rounded_f32m2, vl);
523
524
  vfloat32m2_t squared_f32m2 = __riscv_vfmul_vv_f32m2(reduced_f32m2, reduced_f32m2, vl);
524
525
  vfloat32m2_t cubed_f32m2 = __riscv_vfmul_vv_f32m2(reduced_f32m2, squared_f32m2, vl);
525
526
 
@@ -534,17 +535,17 @@ NK_INTERNAL vfloat32m2_t nk_f32m2_sin_rvv_(vfloat32m2_t angles, nk_size_t vl) {
534
535
  return __riscv_vreinterpret_v_u32m2_f32m2(result_bits_u32m2);
535
536
  }
536
537
 
537
- NK_INTERNAL vfloat32m2_t nk_f32m2_cos_rvv_(vfloat32m2_t angles, nk_size_t vl) {
538
+ NK_INTERNAL vfloat32m2_t nk_f32m2_cos_rvv_(vfloat32m2_t angles_f32m2, nk_size_t vl) {
538
539
  nk_f32_t const pi = 3.14159265358979323846f;
539
540
  nk_f32_t const pi_half = 1.57079632679489661923f;
540
541
  nk_f32_t const pi_recip = 0.31830988618379067154f;
541
542
 
542
- vfloat32m2_t quotients_f32m2 = __riscv_vfsub_vf_f32m2(__riscv_vfmul_vf_f32m2(angles, pi_recip, vl), 0.5f, vl);
543
+ vfloat32m2_t quotients_f32m2 = __riscv_vfsub_vf_f32m2(__riscv_vfmul_vf_f32m2(angles_f32m2, pi_recip, vl), 0.5f, vl);
543
544
  vint32m2_t rounded_i32m2 = __riscv_vfcvt_x_f_v_i32m2(quotients_f32m2, vl);
544
545
  vfloat32m2_t rounded_f32m2 = __riscv_vfcvt_f_x_v_f32m2(rounded_i32m2, vl);
545
546
 
546
547
  vfloat32m2_t offset_f32m2 = __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(pi_half, vl), pi, rounded_f32m2, vl);
547
- vfloat32m2_t reduced_f32m2 = __riscv_vfsub_vv_f32m2(angles, offset_f32m2, vl);
548
+ vfloat32m2_t reduced_f32m2 = __riscv_vfsub_vv_f32m2(angles_f32m2, offset_f32m2, vl);
548
549
 
549
550
  vfloat32m2_t squared_f32m2 = __riscv_vfmul_vv_f32m2(reduced_f32m2, reduced_f32m2, vl);
550
551
  vfloat32m2_t cubed_f32m2 = __riscv_vfmul_vv_f32m2(reduced_f32m2, squared_f32m2, vl);
@@ -560,7 +561,7 @@ NK_INTERNAL vfloat32m2_t nk_f32m2_cos_rvv_(vfloat32m2_t angles, nk_size_t vl) {
560
561
  return result_f32m2;
561
562
  }
562
563
 
563
- NK_INTERNAL vfloat32m2_t nk_f32m2_atan_rvv_(vfloat32m2_t inputs, nk_size_t vl) {
564
+ NK_INTERNAL vfloat32m2_t nk_f32m2_atan_rvv_(vfloat32m2_t inputs_f32m2, nk_size_t vl) {
564
565
  nk_f32_t const c8 = -0.333331018686294555664062f;
565
566
  nk_f32_t const c7 = +0.199926957488059997558594f;
566
567
  nk_f32_t const c6 = -0.142027363181114196777344f;
@@ -570,8 +571,8 @@ NK_INTERNAL vfloat32m2_t nk_f32m2_atan_rvv_(vfloat32m2_t inputs, nk_size_t vl) {
570
571
  nk_f32_t const c2 = -0.0159569028764963150024414f;
571
572
  nk_f32_t const c1 = +0.00282363896258175373077393f;
572
573
 
573
- vbool16_t negative_mask_b16 = __riscv_vmflt_vf_f32m2_b16(inputs, 0.0f, vl);
574
- vfloat32m2_t values_f32m2 = __riscv_vfabs_v_f32m2(inputs, vl);
574
+ vbool16_t negative_mask_b16 = __riscv_vmflt_vf_f32m2_b16(inputs_f32m2, 0.0f, vl);
575
+ vfloat32m2_t values_f32m2 = __riscv_vfabs_v_f32m2(inputs_f32m2, vl);
575
576
 
576
577
  vbool16_t reciprocal_mask_b16 = __riscv_vmfgt_vf_f32m2_b16(values_f32m2, 1.0f, vl);
577
578
  vfloat32m2_t reciprocal_values_f32m2 = nk_f32m2_reciprocal_rvv_(values_f32m2, vl);
@@ -657,8 +658,8 @@ NK_PUBLIC void nk_each_sin_f16_rvv(nk_f16_t const *ins, nk_size_t n, nk_f16_t *o
657
658
  vuint16m1_t f16_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)ins, vector_length);
658
659
  vfloat32m2_t values_f32m2 = nk_f16m1_to_f32m2_rvv_(f16_u16m1, vector_length);
659
660
  vfloat32m2_t results_f32m2 = nk_f32m2_sin_rvv_(values_f32m2, vector_length);
660
- vuint16m1_t f16_results = nk_f32m2_to_f16m1_rvv_(results_f32m2, vector_length);
661
- __riscv_vse16_v_u16m1((nk_u16_t *)outs, f16_results, vector_length);
661
+ vuint16m1_t f16_results_u16m1 = nk_f32m2_to_f16m1_rvv_(results_f32m2, vector_length);
662
+ __riscv_vse16_v_u16m1((nk_u16_t *)outs, f16_results_u16m1, vector_length);
662
663
  }
663
664
  }
664
665
 
@@ -668,8 +669,8 @@ NK_PUBLIC void nk_each_cos_f16_rvv(nk_f16_t const *ins, nk_size_t n, nk_f16_t *o
668
669
  vuint16m1_t f16_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)ins, vector_length);
669
670
  vfloat32m2_t values_f32m2 = nk_f16m1_to_f32m2_rvv_(f16_u16m1, vector_length);
670
671
  vfloat32m2_t results_f32m2 = nk_f32m2_cos_rvv_(values_f32m2, vector_length);
671
- vuint16m1_t f16_results = nk_f32m2_to_f16m1_rvv_(results_f32m2, vector_length);
672
- __riscv_vse16_v_u16m1((nk_u16_t *)outs, f16_results, vector_length);
672
+ vuint16m1_t f16_results_u16m1 = nk_f32m2_to_f16m1_rvv_(results_f32m2, vector_length);
673
+ __riscv_vse16_v_u16m1((nk_u16_t *)outs, f16_results_u16m1, vector_length);
673
674
  }
674
675
  }
675
676
 
@@ -679,8 +680,8 @@ NK_PUBLIC void nk_each_atan_f16_rvv(nk_f16_t const *ins, nk_size_t n, nk_f16_t *
679
680
  vuint16m1_t f16_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)ins, vector_length);
680
681
  vfloat32m2_t values_f32m2 = nk_f16m1_to_f32m2_rvv_(f16_u16m1, vector_length);
681
682
  vfloat32m2_t results_f32m2 = nk_f32m2_atan_rvv_(values_f32m2, vector_length);
682
- vuint16m1_t f16_results = nk_f32m2_to_f16m1_rvv_(results_f32m2, vector_length);
683
- __riscv_vse16_v_u16m1((nk_u16_t *)outs, f16_results, vector_length);
683
+ vuint16m1_t f16_results_u16m1 = nk_f32m2_to_f16m1_rvv_(results_f32m2, vector_length);
684
+ __riscv_vse16_v_u16m1((nk_u16_t *)outs, f16_results_u16m1, vector_length);
684
685
  }
685
686
  }
686
687
 
@@ -27,8 +27,8 @@ extern "C" {
27
27
  NK_PUBLIC nk_f32_t nk_f32_sin(nk_f32_t const angle_radians) {
28
28
 
29
29
  // Cody-Waite constants for argument reduction (pi split into hi + lo)
30
- nk_f32_t const pi_hi = 3.1415927f;
31
- nk_f32_t const pi_lo = -8.742278e-8f;
30
+ nk_f32_t const pi_high = 3.1415927f;
31
+ nk_f32_t const pi_low = -8.742278e-8f;
32
32
  nk_f32_t const pi_reciprocal = 0.31830988618379067154f; /// 1/π
33
33
 
34
34
  // Degree-9 minimax coefficients: sin(x) ≈ x + c3*x³ + c5*x⁵ + c7*x⁷ + c9*x⁹
@@ -41,9 +41,9 @@ NK_PUBLIC nk_f32_t nk_f32_sin(nk_f32_t const angle_radians) {
41
41
  nk_f32_t const quotient = angle_radians * pi_reciprocal;
42
42
  int const multiple_of_pi = (int)(quotient < 0 ? quotient - 0.5f : quotient + 0.5f);
43
43
 
44
- // Cody-Waite range reduction: angle = angle_radians - multiple * (pi_hi + pi_lo)
45
- nk_f32_t angle = angle_radians - multiple_of_pi * pi_hi;
46
- angle -= multiple_of_pi * pi_lo;
44
+ // Cody-Waite range reduction: angle = angle_radians - multiple * (pi_high + pi_low)
45
+ nk_f32_t angle = angle_radians - (nk_f32_t)multiple_of_pi * pi_high;
46
+ angle -= (nk_f32_t)multiple_of_pi * pi_low;
47
47
  nk_f32_t const angle_squared = angle * angle;
48
48
  nk_f32_t const angle_cubed = angle * angle_squared;
49
49
 
@@ -68,8 +68,8 @@ NK_PUBLIC nk_f32_t nk_f32_sin(nk_f32_t const angle_radians) {
68
68
  NK_PUBLIC nk_f32_t nk_f32_cos(nk_f32_t const angle_radians) {
69
69
 
70
70
  // Cody-Waite constants for argument reduction (pi split into hi + lo)
71
- nk_f32_t const pi_hi = 3.1415927f;
72
- nk_f32_t const pi_lo = -8.742278e-8f;
71
+ nk_f32_t const pi_high = 3.1415927f;
72
+ nk_f32_t const pi_low = -8.742278e-8f;
73
73
  nk_f32_t const pi_half = 1.57079632679489661923f; /// π/2
74
74
  nk_f32_t const pi_reciprocal = 0.31830988618379067154f; /// 1/π
75
75
 
@@ -84,9 +84,9 @@ NK_PUBLIC nk_f32_t nk_f32_cos(nk_f32_t const angle_radians) {
84
84
  int const multiple_of_pi = (int)(quotient < 0 ? quotient - 0.5f : quotient + 0.5f);
85
85
 
86
86
  // Cody-Waite range reduction: angle = angle_radians - (multiple * pi + pi/2)
87
- nk_f32_t const offset = pi_half + multiple_of_pi * pi_hi;
87
+ nk_f32_t const offset = pi_half + (nk_f32_t)multiple_of_pi * pi_high;
88
88
  nk_f32_t angle = angle_radians - offset;
89
- angle -= multiple_of_pi * pi_lo;
89
+ angle -= (nk_f32_t)multiple_of_pi * pi_low;
90
90
  nk_f32_t const angle_squared = angle * angle;
91
91
  nk_f32_t const angle_cubed = angle * angle_squared;
92
92
 
@@ -544,8 +544,8 @@ NK_PUBLIC nk_f64_t nk_f64_atan2(nk_f64_t const y_input, nk_f64_t const x_input)
544
544
  NK_PUBLIC nk_f32_t nk_f32_tan(nk_f32_t const angle_radians) {
545
545
 
546
546
  // Cody-Waite constants for argument reduction
547
- nk_f32_t const pi_hi = 3.1415927f;
548
- nk_f32_t const pi_lo = -8.742278e-8f;
547
+ nk_f32_t const pi_high = 3.1415927f;
548
+ nk_f32_t const pi_low = -8.742278e-8f;
549
549
  nk_f32_t const pi_half = 1.57079632679489661923f; /// π/2
550
550
  nk_f32_t const pi_quarter = 0.78539816339744830962f; /// π/4
551
551
  nk_f32_t const pi_reciprocal = 0.31830988618379067154f; /// 1/π
@@ -560,8 +560,8 @@ NK_PUBLIC nk_f32_t nk_f32_tan(nk_f32_t const angle_radians) {
560
560
  int const multiple_of_pi = (int)(quotient < 0 ? quotient - 0.5f : quotient + 0.5f);
561
561
 
562
562
  // Cody-Waite range reduction
563
- nk_f32_t angle = angle_radians - multiple_of_pi * pi_hi;
564
- angle -= multiple_of_pi * pi_lo;
563
+ nk_f32_t angle = angle_radians - (nk_f32_t)multiple_of_pi * pi_high;
564
+ angle -= (nk_f32_t)multiple_of_pi * pi_low;
565
565
 
566
566
  // If |angle| > π/4, use tan(x) = 1/tan(π/2 - x) for better accuracy
567
567
  int reciprocal = 0;