numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -9,20 +9,19 @@
9
9
  *
10
10
  * @section trigonometry_neon_instructions ARM NEON Instructions
11
11
  *
12
- * Intrinsic Instruction Latency Throughput
13
- * A76 M4+/V1+/Oryon
14
- * vfmaq_f32 FMLA (V.4S, V.4S, V.4S) 4cy 2/cy 4/cy
15
- * vfmsq_f32 FMLS (V.4S, V.4S, V.4S) 4cy 2/cy 4/cy
16
- * vmulq_f32 FMUL (V.4S, V.4S, V.4S) 3cy 2/cy 4/cy
17
- * vaddq_f32 FADD (V.4S, V.4S, V.4S) 2cy 2/cy 4/cy
18
- * vsubq_f32 FSUB (V.4S, V.4S, V.4S) 2cy 2/cy 4/cy
19
- * vcvtnq_s32_f32 FCVTNS (V.4S, V.4S) 3cy 2/cy 2/cy
20
- * vcvtq_f32_s32 SCVTF (V.4S, V.4S) 3cy 2/cy 2/cy
21
- * vbslq_f32 BSL (V.16B, V.16B, V.16B) 2cy 2/cy 4/cy
22
- * vrecpeq_f32 FRECPE (V.4S, V.4S) 2cy 2/cy 2/cy
23
- * vrecpsq_f32 FRECPS (V.4S, V.4S, V.4S) 4cy 2/cy 4/cy
24
- * vfmaq_f64 FMLA (V.2D, V.2D, V.2D) 4cy 2/cy 4/cy
25
- * vdivq_f64 FDIV (V.2D, V.2D, V.2D) 15cy 0.5/cy 0.5/cy
12
+ * Intrinsic Instruction A76 M5
13
+ * vfmaq_f32 FMLA (V.4S, V.4S, V.4S) 4cy @ 2p 3cy @ 4p
14
+ * vfmsq_f32 FMLS (V.4S, V.4S, V.4S) 4cy @ 2p 3cy @ 4p
15
+ * vmulq_f32 FMUL (V.4S, V.4S, V.4S) 3cy @ 2p 3cy @ 4p
16
+ * vaddq_f32 FADD (V.4S, V.4S, V.4S) 2cy @ 2p 2cy @ 4p
17
+ * vsubq_f32 FSUB (V.4S, V.4S, V.4S) 2cy @ 2p 2cy @ 4p
18
+ * vcvtnq_s32_f32 FCVTNS (V.4S, V.4S) 3cy @ 2p 3cy @ 4p
19
+ * vcvtq_f32_s32 SCVTF (V.4S, V.4S) 3cy @ 2p 3cy @ 4p
20
+ * vbslq_f32 BSL (V.16B, V.16B, V.16B) 1cy @ 2p 2cy @ 4p
21
+ * vrecpeq_f32 FRECPE (V.4S, V.4S) 2cy @ 2p 3cy @ 1p
22
+ * vrecpsq_f32 FRECPS (V.4S, V.4S, V.4S) 4cy @ 2p 3cy @ 2p
23
+ * vfmaq_f64 FMLA (V.2D, V.2D, V.2D) 4cy @ 2p 3cy @ 4p
24
+ * vdivq_f64 FDIV (V.2D, V.2D, V.2D) 12cy @ 1p 7cy @ 1p
26
25
  *
27
26
  * Polynomial approximations for sin/cos/atan are FMA-dominated. On 4-pipe cores (Apple M4+,
28
27
  * Graviton3+, Oryon), FMA throughput is 4/cy with 4cy latency.
@@ -56,478 +55,478 @@ extern "C" {
56
55
 
57
56
  NK_INTERNAL float32x4_t nk_sin_f32x4_neon_(float32x4_t const angles_radians) {
58
57
  // Cody-Waite constants for argument reduction
59
- float32x4_t const pi_hi_f32x4 = vdupq_n_f32(3.1415927f);
60
- float32x4_t const pi_lo_f32x4 = vdupq_n_f32(-8.742278e-8f);
61
- float32x4_t const pi_reciprocal = vdupq_n_f32(0.31830988618379067154f);
58
+ float32x4_t const pi_high_f32x4 = vdupq_n_f32(3.1415927f);
59
+ float32x4_t const pi_low_f32x4 = vdupq_n_f32(-8.742278e-8f);
60
+ float32x4_t const pi_reciprocal_f32x4 = vdupq_n_f32(0.31830988618379067154f);
62
61
  // Degree-9 minimax coefficients
63
- float32x4_t const coeff_9 = vdupq_n_f32(+2.7557319224e-6f);
64
- float32x4_t const coeff_7 = vdupq_n_f32(-1.9841269841e-4f);
65
- float32x4_t const coeff_5 = vdupq_n_f32(+8.3333293855e-3f);
66
- float32x4_t const coeff_3 = vdupq_n_f32(-1.6666666641e-1f);
62
+ float32x4_t const coeff_9_f32x4 = vdupq_n_f32(+2.7557319224e-6f);
63
+ float32x4_t const coeff_7_f32x4 = vdupq_n_f32(-1.9841269841e-4f);
64
+ float32x4_t const coeff_5_f32x4 = vdupq_n_f32(+8.3333293855e-3f);
65
+ float32x4_t const coeff_3_f32x4 = vdupq_n_f32(-1.6666666641e-1f);
67
66
 
68
- // Compute (multiples_of_pi) = round(angle / π) using vcvtnq which rounds to nearest
69
- float32x4_t quotients = vmulq_f32(angles_radians, pi_reciprocal);
70
- int32x4_t multiples_of_pi = vcvtnq_s32_f32(quotients);
71
- float32x4_t rounded_quotients = vcvtq_f32_s32(multiples_of_pi);
67
+ // Compute (multiples_of_pi_i32x4) = round(angle / π) using vcvtnq which rounds to nearest
68
+ float32x4_t quotients_f32x4 = vmulq_f32(angles_radians, pi_reciprocal_f32x4);
69
+ int32x4_t multiples_of_pi_i32x4 = vcvtnq_s32_f32(quotients_f32x4);
70
+ float32x4_t rounded_quotients_f32x4 = vcvtq_f32_s32(multiples_of_pi_i32x4);
72
71
 
73
72
  // Cody-Waite range reduction
74
- float32x4_t angles = vfmsq_f32(angles_radians, rounded_quotients, pi_hi_f32x4);
75
- angles = vfmsq_f32(angles, rounded_quotients, pi_lo_f32x4);
76
- float32x4_t const angles_squared = vmulq_f32(angles, angles);
77
- float32x4_t const angles_cubed = vmulq_f32(angles, angles_squared);
73
+ float32x4_t angles_f32x4 = vfmsq_f32(angles_radians, rounded_quotients_f32x4, pi_high_f32x4);
74
+ angles_f32x4 = vfmsq_f32(angles_f32x4, rounded_quotients_f32x4, pi_low_f32x4);
75
+ float32x4_t const angles_squared_f32x4 = vmulq_f32(angles_f32x4, angles_f32x4);
76
+ float32x4_t const angles_cubed_f32x4 = vmulq_f32(angles_f32x4, angles_squared_f32x4);
78
77
 
79
78
  // Degree-9 polynomial via Horner's method
80
- float32x4_t polynomials = coeff_9;
81
- polynomials = vfmaq_f32(coeff_7, polynomials, angles_squared);
82
- polynomials = vfmaq_f32(coeff_5, polynomials, angles_squared);
83
- polynomials = vfmaq_f32(coeff_3, polynomials, angles_squared);
84
- float32x4_t results = vfmaq_f32(angles, angles_cubed, polynomials);
85
-
86
- // If multiples_of_pi is odd, flip the sign
87
- int32x4_t parity = vandq_s32(multiples_of_pi, vdupq_n_s32(1));
88
- uint32x4_t odd_mask = vceqq_s32(parity, vdupq_n_s32(1));
89
- float32x4_t negated = vnegq_f32(results);
90
- results = vbslq_f32(odd_mask, negated, results);
91
- return results;
79
+ float32x4_t polynomials_f32x4 = coeff_9_f32x4;
80
+ polynomials_f32x4 = vfmaq_f32(coeff_7_f32x4, polynomials_f32x4, angles_squared_f32x4);
81
+ polynomials_f32x4 = vfmaq_f32(coeff_5_f32x4, polynomials_f32x4, angles_squared_f32x4);
82
+ polynomials_f32x4 = vfmaq_f32(coeff_3_f32x4, polynomials_f32x4, angles_squared_f32x4);
83
+ float32x4_t results_f32x4 = vfmaq_f32(angles_f32x4, angles_cubed_f32x4, polynomials_f32x4);
84
+
85
+ // If multiples_of_pi_i32x4 is odd, flip the sign
86
+ int32x4_t parity_i32x4 = vandq_s32(multiples_of_pi_i32x4, vdupq_n_s32(1));
87
+ uint32x4_t odd_mask_u32x4 = vceqq_s32(parity_i32x4, vdupq_n_s32(1));
88
+ float32x4_t negated_f32x4 = vnegq_f32(results_f32x4);
89
+ results_f32x4 = vbslq_f32(odd_mask_u32x4, negated_f32x4, results_f32x4);
90
+ return results_f32x4;
92
91
  }
93
92
 
94
93
  NK_INTERNAL float32x4_t nk_cos_f32x4_neon_(float32x4_t const angles_radians) {
95
94
  // Cody-Waite constants for argument reduction
96
- float32x4_t const pi_hi_f32x4 = vdupq_n_f32(3.1415927f);
97
- float32x4_t const pi_lo_f32x4 = vdupq_n_f32(-8.742278e-8f);
98
- float32x4_t const pi_half = vdupq_n_f32(1.57079632679489661923f);
99
- float32x4_t const pi_reciprocal = vdupq_n_f32(0.31830988618379067154f);
95
+ float32x4_t const pi_high_f32x4 = vdupq_n_f32(3.1415927f);
96
+ float32x4_t const pi_low_f32x4 = vdupq_n_f32(-8.742278e-8f);
97
+ float32x4_t const pi_half_f32x4 = vdupq_n_f32(1.57079632679489661923f);
98
+ float32x4_t const pi_reciprocal_f32x4 = vdupq_n_f32(0.31830988618379067154f);
100
99
  // Degree-9 minimax coefficients
101
- float32x4_t const coeff_9 = vdupq_n_f32(+2.7557319224e-6f);
102
- float32x4_t const coeff_7 = vdupq_n_f32(-1.9841269841e-4f);
103
- float32x4_t const coeff_5 = vdupq_n_f32(+8.3333293855e-3f);
104
- float32x4_t const coeff_3 = vdupq_n_f32(-1.6666666641e-1f);
100
+ float32x4_t const coeff_9_f32x4 = vdupq_n_f32(+2.7557319224e-6f);
101
+ float32x4_t const coeff_7_f32x4 = vdupq_n_f32(-1.9841269841e-4f);
102
+ float32x4_t const coeff_5_f32x4 = vdupq_n_f32(+8.3333293855e-3f);
103
+ float32x4_t const coeff_3_f32x4 = vdupq_n_f32(-1.6666666641e-1f);
105
104
 
106
105
  // Compute round((angle / π) - 0.5)
107
- float32x4_t quotients = vsubq_f32(vmulq_f32(angles_radians, pi_reciprocal), vdupq_n_f32(0.5f));
108
- int32x4_t multiples_of_pi = vcvtnq_s32_f32(quotients);
109
- float32x4_t rounded_quotients = vcvtq_f32_s32(multiples_of_pi);
106
+ float32x4_t quotients_f32x4 = vsubq_f32(vmulq_f32(angles_radians, pi_reciprocal_f32x4), vdupq_n_f32(0.5f));
107
+ int32x4_t multiples_of_pi_i32x4 = vcvtnq_s32_f32(quotients_f32x4);
108
+ float32x4_t rounded_quotients_f32x4 = vcvtq_f32_s32(multiples_of_pi_i32x4);
110
109
 
111
- // Cody-Waite range reduction: angle = (angle - pi/2) - rounded * (pi_hi + pi_lo)
112
- float32x4_t shifted = vsubq_f32(angles_radians, pi_half);
113
- float32x4_t angles = vfmsq_f32(shifted, rounded_quotients, pi_hi_f32x4);
114
- angles = vfmsq_f32(angles, rounded_quotients, pi_lo_f32x4);
115
- float32x4_t const angles_squared = vmulq_f32(angles, angles);
116
- float32x4_t const angles_cubed = vmulq_f32(angles, angles_squared);
110
+ // Cody-Waite range reduction: angle = (angle - pi/2) - rounded * (pi_high + pi_low)
111
+ float32x4_t shifted_f32x4 = vsubq_f32(angles_radians, pi_half_f32x4);
112
+ float32x4_t angles_f32x4 = vfmsq_f32(shifted_f32x4, rounded_quotients_f32x4, pi_high_f32x4);
113
+ angles_f32x4 = vfmsq_f32(angles_f32x4, rounded_quotients_f32x4, pi_low_f32x4);
114
+ float32x4_t const angles_squared_f32x4 = vmulq_f32(angles_f32x4, angles_f32x4);
115
+ float32x4_t const angles_cubed_f32x4 = vmulq_f32(angles_f32x4, angles_squared_f32x4);
117
116
 
118
117
  // Degree-9 polynomial via Horner's method
119
- float32x4_t polynomials = coeff_9;
120
- polynomials = vfmaq_f32(coeff_7, polynomials, angles_squared);
121
- polynomials = vfmaq_f32(coeff_5, polynomials, angles_squared);
122
- polynomials = vfmaq_f32(coeff_3, polynomials, angles_squared);
123
- float32x4_t results = vfmaq_f32(angles, angles_cubed, polynomials);
124
-
125
- // If multiples_of_pi is even, flip the sign
126
- int32x4_t parity = vandq_s32(multiples_of_pi, vdupq_n_s32(1));
127
- uint32x4_t even_mask = vceqq_s32(parity, vdupq_n_s32(0));
128
- float32x4_t negated = vnegq_f32(results);
129
- results = vbslq_f32(even_mask, negated, results);
130
- return results;
118
+ float32x4_t polynomials_f32x4 = coeff_9_f32x4;
119
+ polynomials_f32x4 = vfmaq_f32(coeff_7_f32x4, polynomials_f32x4, angles_squared_f32x4);
120
+ polynomials_f32x4 = vfmaq_f32(coeff_5_f32x4, polynomials_f32x4, angles_squared_f32x4);
121
+ polynomials_f32x4 = vfmaq_f32(coeff_3_f32x4, polynomials_f32x4, angles_squared_f32x4);
122
+ float32x4_t results_f32x4 = vfmaq_f32(angles_f32x4, angles_cubed_f32x4, polynomials_f32x4);
123
+
124
+ // If multiples_of_pi_i32x4 is even, flip the sign
125
+ int32x4_t parity_i32x4 = vandq_s32(multiples_of_pi_i32x4, vdupq_n_s32(1));
126
+ uint32x4_t even_mask_u32x4 = vceqq_s32(parity_i32x4, vdupq_n_s32(0));
127
+ float32x4_t negated_f32x4 = vnegq_f32(results_f32x4);
128
+ results_f32x4 = vbslq_f32(even_mask_u32x4, negated_f32x4, results_f32x4);
129
+ return results_f32x4;
131
130
  }
132
131
 
133
132
  NK_INTERNAL float32x4_t nk_atan_f32x4_neon_(float32x4_t const inputs) {
134
133
  // Polynomial coefficients for atan approximation (8 terms)
135
- float32x4_t const coeff_8 = vdupq_n_f32(-0.333331018686294555664062f);
136
- float32x4_t const coeff_7 = vdupq_n_f32(+0.199926957488059997558594f);
137
- float32x4_t const coeff_6 = vdupq_n_f32(-0.142027363181114196777344f);
138
- float32x4_t const coeff_5 = vdupq_n_f32(+0.106347933411598205566406f);
139
- float32x4_t const coeff_4 = vdupq_n_f32(-0.0748900920152664184570312f);
140
- float32x4_t const coeff_3 = vdupq_n_f32(+0.0425049886107444763183594f);
141
- float32x4_t const coeff_2 = vdupq_n_f32(-0.0159569028764963150024414f);
142
- float32x4_t const coeff_1 = vdupq_n_f32(+0.00282363896258175373077393f);
143
- float32x4_t const half_pi = vdupq_n_f32(1.5707963267948966f);
144
-
145
- // Detect negative values and take absolute value
146
- float32x4_t const zeros = vdupq_n_f32(0);
147
- uint32x4_t negative_mask = vcltq_f32(inputs, zeros);
148
- float32x4_t values = vabsq_f32(inputs);
149
-
150
- // Check if values > 1 (need reciprocal)
151
- uint32x4_t reciprocal_mask = vcgtq_f32(values, vdupq_n_f32(1.0f));
134
+ float32x4_t const coeff_8_f32x4 = vdupq_n_f32(-0.333331018686294555664062f);
135
+ float32x4_t const coeff_7_f32x4 = vdupq_n_f32(+0.199926957488059997558594f);
136
+ float32x4_t const coeff_6_f32x4 = vdupq_n_f32(-0.142027363181114196777344f);
137
+ float32x4_t const coeff_5_f32x4 = vdupq_n_f32(+0.106347933411598205566406f);
138
+ float32x4_t const coeff_4_f32x4 = vdupq_n_f32(-0.0748900920152664184570312f);
139
+ float32x4_t const coeff_3_f32x4 = vdupq_n_f32(+0.0425049886107444763183594f);
140
+ float32x4_t const coeff_2_f32x4 = vdupq_n_f32(-0.0159569028764963150024414f);
141
+ float32x4_t const coeff_1_f32x4 = vdupq_n_f32(+0.00282363896258175373077393f);
142
+ float32x4_t const half_pi_f32x4 = vdupq_n_f32(1.5707963267948966f);
143
+
144
+ // Detect negative values_f32x4 and take absolute value
145
+ float32x4_t const zeros_f32x4 = vdupq_n_f32(0);
146
+ uint32x4_t negative_mask_u32x4 = vcltq_f32(inputs, zeros_f32x4);
147
+ float32x4_t values_f32x4 = vabsq_f32(inputs);
148
+
149
+ // Check if values_f32x4 > 1 (need reciprocal)
150
+ uint32x4_t reciprocal_mask_u32x4 = vcgtq_f32(values_f32x4, vdupq_n_f32(1.0f));
152
151
 
153
152
  // Fast reciprocal using vrecpeq + Newton-Raphson (faster than vdivq on many Arm cores)
154
- float32x4_t recip = vrecpeq_f32(values);
155
- recip = vmulq_f32(recip, vrecpsq_f32(values, recip));
156
- recip = vmulq_f32(recip, vrecpsq_f32(values, recip));
157
- values = vbslq_f32(reciprocal_mask, recip, values);
153
+ float32x4_t recip_f32x4 = vrecpeq_f32(values_f32x4);
154
+ recip_f32x4 = vmulq_f32(recip_f32x4, vrecpsq_f32(values_f32x4, recip_f32x4));
155
+ recip_f32x4 = vmulq_f32(recip_f32x4, vrecpsq_f32(values_f32x4, recip_f32x4));
156
+ values_f32x4 = vbslq_f32(reciprocal_mask_u32x4, recip_f32x4, values_f32x4);
158
157
 
159
158
  // Compute powers
160
- float32x4_t const values_squared = vmulq_f32(values, values);
161
- float32x4_t const values_cubed = vmulq_f32(values, values_squared);
159
+ float32x4_t const values_squared_f32x4 = vmulq_f32(values_f32x4, values_f32x4);
160
+ float32x4_t const values_cubed_f32x4 = vmulq_f32(values_f32x4, values_squared_f32x4);
162
161
 
163
162
  // Polynomial evaluation using Horner's method
164
- float32x4_t polynomials = coeff_1;
165
- polynomials = vfmaq_f32(coeff_2, polynomials, values_squared);
166
- polynomials = vfmaq_f32(coeff_3, polynomials, values_squared);
167
- polynomials = vfmaq_f32(coeff_4, polynomials, values_squared);
168
- polynomials = vfmaq_f32(coeff_5, polynomials, values_squared);
169
- polynomials = vfmaq_f32(coeff_6, polynomials, values_squared);
170
- polynomials = vfmaq_f32(coeff_7, polynomials, values_squared);
171
- polynomials = vfmaq_f32(coeff_8, polynomials, values_squared);
172
-
173
- // Compute result: atan(x) ≈ x + x³ * P(x²)
174
- float32x4_t result = vfmaq_f32(values, values_cubed, polynomials);
175
-
176
- // Adjust for reciprocal: result = π/2 - result
177
- float32x4_t adjusted = vsubq_f32(half_pi, result);
178
- result = vbslq_f32(reciprocal_mask, adjusted, result);
179
-
180
- // Adjust for negative: result = -result
181
- float32x4_t negated = vnegq_f32(result);
182
- result = vbslq_f32(negative_mask, negated, result);
183
- return result;
163
+ float32x4_t polynomials_f32x4 = coeff_1_f32x4;
164
+ polynomials_f32x4 = vfmaq_f32(coeff_2_f32x4, polynomials_f32x4, values_squared_f32x4);
165
+ polynomials_f32x4 = vfmaq_f32(coeff_3_f32x4, polynomials_f32x4, values_squared_f32x4);
166
+ polynomials_f32x4 = vfmaq_f32(coeff_4_f32x4, polynomials_f32x4, values_squared_f32x4);
167
+ polynomials_f32x4 = vfmaq_f32(coeff_5_f32x4, polynomials_f32x4, values_squared_f32x4);
168
+ polynomials_f32x4 = vfmaq_f32(coeff_6_f32x4, polynomials_f32x4, values_squared_f32x4);
169
+ polynomials_f32x4 = vfmaq_f32(coeff_7_f32x4, polynomials_f32x4, values_squared_f32x4);
170
+ polynomials_f32x4 = vfmaq_f32(coeff_8_f32x4, polynomials_f32x4, values_squared_f32x4);
171
+
172
+ // Compute result_f32x4: atan(x) ≈ x + x³ * P(x²)
173
+ float32x4_t result_f32x4 = vfmaq_f32(values_f32x4, values_cubed_f32x4, polynomials_f32x4);
174
+
175
+ // Adjust for reciprocal: result_f32x4 = π/2 - result_f32x4
176
+ float32x4_t adjusted_f32x4 = vsubq_f32(half_pi_f32x4, result_f32x4);
177
+ result_f32x4 = vbslq_f32(reciprocal_mask_u32x4, adjusted_f32x4, result_f32x4);
178
+
179
+ // Adjust for negative: result_f32x4 = -result_f32x4
180
+ float32x4_t negated_f32x4 = vnegq_f32(result_f32x4);
181
+ result_f32x4 = vbslq_f32(negative_mask_u32x4, negated_f32x4, result_f32x4);
182
+ return result_f32x4;
184
183
  }
185
184
 
186
185
  NK_INTERNAL float32x4_t nk_atan2_f32x4_neon_(float32x4_t const ys_inputs, float32x4_t const xs_inputs) {
187
186
  // Polynomial coefficients (same as atan)
188
- float32x4_t const coeff_8 = vdupq_n_f32(-0.333331018686294555664062f);
189
- float32x4_t const coeff_7 = vdupq_n_f32(+0.199926957488059997558594f);
190
- float32x4_t const coeff_6 = vdupq_n_f32(-0.142027363181114196777344f);
191
- float32x4_t const coeff_5 = vdupq_n_f32(+0.106347933411598205566406f);
192
- float32x4_t const coeff_4 = vdupq_n_f32(-0.0748900920152664184570312f);
193
- float32x4_t const coeff_3 = vdupq_n_f32(+0.0425049886107444763183594f);
194
- float32x4_t const coeff_2 = vdupq_n_f32(-0.0159569028764963150024414f);
195
- float32x4_t const coeff_1 = vdupq_n_f32(+0.00282363896258175373077393f);
196
- float32x4_t const half_pi = vdupq_n_f32(1.5707963267948966f);
197
- float32x4_t const zeros = vdupq_n_f32(0);
187
+ float32x4_t const coeff_8_f32x4 = vdupq_n_f32(-0.333331018686294555664062f);
188
+ float32x4_t const coeff_7_f32x4 = vdupq_n_f32(+0.199926957488059997558594f);
189
+ float32x4_t const coeff_6_f32x4 = vdupq_n_f32(-0.142027363181114196777344f);
190
+ float32x4_t const coeff_5_f32x4 = vdupq_n_f32(+0.106347933411598205566406f);
191
+ float32x4_t const coeff_4_f32x4 = vdupq_n_f32(-0.0748900920152664184570312f);
192
+ float32x4_t const coeff_3_f32x4 = vdupq_n_f32(+0.0425049886107444763183594f);
193
+ float32x4_t const coeff_2_f32x4 = vdupq_n_f32(-0.0159569028764963150024414f);
194
+ float32x4_t const coeff_1_f32x4 = vdupq_n_f32(+0.00282363896258175373077393f);
195
+ float32x4_t const half_pi_f32x4 = vdupq_n_f32(1.5707963267948966f);
196
+ float32x4_t const zeros_f32x4 = vdupq_n_f32(0);
198
197
 
199
198
  // Quadrant adjustments - take absolute values
200
- uint32x4_t xs_negative_mask = vcltq_f32(xs_inputs, zeros);
201
- float32x4_t xs = vabsq_f32(xs_inputs);
202
- float32x4_t ys = vabsq_f32(ys_inputs);
199
+ uint32x4_t xs_negative_mask_u32x4 = vcltq_f32(xs_inputs, zeros_f32x4);
200
+ float32x4_t xs_f32x4 = vabsq_f32(xs_inputs);
201
+ float32x4_t ys_f32x4 = vabsq_f32(ys_inputs);
203
202
 
204
203
  // Ensure proper fraction where numerator < denominator
205
- uint32x4_t swap_mask = vcgtq_f32(ys, xs);
206
- float32x4_t temps = xs;
207
- xs = vbslq_f32(swap_mask, ys, xs);
208
- ys = vbslq_f32(swap_mask, vnegq_f32(temps), ys);
209
-
210
- // Fast reciprocal for division: ratio = ys / xs ≈ ys * recip(xs)
211
- float32x4_t recip = vrecpeq_f32(xs);
212
- recip = vmulq_f32(recip, vrecpsq_f32(xs, recip));
213
- recip = vmulq_f32(recip, vrecpsq_f32(xs, recip));
214
- float32x4_t const ratio = vmulq_f32(ys, recip);
215
- float32x4_t const ratio_squared = vmulq_f32(ratio, ratio);
216
- float32x4_t const ratio_cubed = vmulq_f32(ratio, ratio_squared);
204
+ uint32x4_t swap_mask_u32x4 = vcgtq_f32(ys_f32x4, xs_f32x4);
205
+ float32x4_t temps_f32x4 = xs_f32x4;
206
+ xs_f32x4 = vbslq_f32(swap_mask_u32x4, ys_f32x4, xs_f32x4);
207
+ ys_f32x4 = vbslq_f32(swap_mask_u32x4, vnegq_f32(temps_f32x4), ys_f32x4);
208
+
209
+ // Fast reciprocal for division: ratio_f32x4 = ys_f32x4 / xs_f32x4 ≈ ys_f32x4 * recip_f32x4(xs_f32x4)
210
+ float32x4_t recip_f32x4 = vrecpeq_f32(xs_f32x4);
211
+ recip_f32x4 = vmulq_f32(recip_f32x4, vrecpsq_f32(xs_f32x4, recip_f32x4));
212
+ recip_f32x4 = vmulq_f32(recip_f32x4, vrecpsq_f32(xs_f32x4, recip_f32x4));
213
+ float32x4_t const ratio_f32x4 = vmulq_f32(ys_f32x4, recip_f32x4);
214
+ float32x4_t const ratio_squared_f32x4 = vmulq_f32(ratio_f32x4, ratio_f32x4);
215
+ float32x4_t const ratio_cubed_f32x4 = vmulq_f32(ratio_f32x4, ratio_squared_f32x4);
217
216
 
218
217
  // Polynomial evaluation using Horner's method
219
- float32x4_t polynomials = coeff_1;
220
- polynomials = vfmaq_f32(coeff_2, polynomials, ratio_squared);
221
- polynomials = vfmaq_f32(coeff_3, polynomials, ratio_squared);
222
- polynomials = vfmaq_f32(coeff_4, polynomials, ratio_squared);
223
- polynomials = vfmaq_f32(coeff_5, polynomials, ratio_squared);
224
- polynomials = vfmaq_f32(coeff_6, polynomials, ratio_squared);
225
- polynomials = vfmaq_f32(coeff_7, polynomials, ratio_squared);
226
- polynomials = vfmaq_f32(coeff_8, polynomials, ratio_squared);
218
+ float32x4_t polynomials_f32x4 = coeff_1_f32x4;
219
+ polynomials_f32x4 = vfmaq_f32(coeff_2_f32x4, polynomials_f32x4, ratio_squared_f32x4);
220
+ polynomials_f32x4 = vfmaq_f32(coeff_3_f32x4, polynomials_f32x4, ratio_squared_f32x4);
221
+ polynomials_f32x4 = vfmaq_f32(coeff_4_f32x4, polynomials_f32x4, ratio_squared_f32x4);
222
+ polynomials_f32x4 = vfmaq_f32(coeff_5_f32x4, polynomials_f32x4, ratio_squared_f32x4);
223
+ polynomials_f32x4 = vfmaq_f32(coeff_6_f32x4, polynomials_f32x4, ratio_squared_f32x4);
224
+ polynomials_f32x4 = vfmaq_f32(coeff_7_f32x4, polynomials_f32x4, ratio_squared_f32x4);
225
+ polynomials_f32x4 = vfmaq_f32(coeff_8_f32x4, polynomials_f32x4, ratio_squared_f32x4);
227
226
 
228
227
  // Compute the result
229
- float32x4_t results = vfmaq_f32(ratio, ratio_cubed, polynomials);
228
+ float32x4_t results_f32x4 = vfmaq_f32(ratio_f32x4, ratio_cubed_f32x4, polynomials_f32x4);
230
229
 
231
- // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
230
+ // Compute quadrant_f32x4 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
232
231
  // -2 for x<0 && !swap, -1 for x<0 && swap
233
- float32x4_t quadrant = vdupq_n_f32(0.0f);
234
- float32x4_t neg_two = vdupq_n_f32(-2.0f);
235
- quadrant = vbslq_f32(xs_negative_mask, neg_two, quadrant);
236
- float32x4_t quadrant_incremented = vaddq_f32(quadrant, vdupq_n_f32(1.0f));
237
- quadrant = vbslq_f32(swap_mask, quadrant_incremented, quadrant);
232
+ float32x4_t quadrant_f32x4 = vdupq_n_f32(0.0f);
233
+ float32x4_t neg_two_f32x4 = vdupq_n_f32(-2.0f);
234
+ quadrant_f32x4 = vbslq_f32(xs_negative_mask_u32x4, neg_two_f32x4, quadrant_f32x4);
235
+ float32x4_t quadrant_incremented_f32x4 = vaddq_f32(quadrant_f32x4, vdupq_n_f32(1.0f));
236
+ quadrant_f32x4 = vbslq_f32(swap_mask_u32x4, quadrant_incremented_f32x4, quadrant_f32x4);
238
237
 
239
- // Adjust for quadrant: result += quadrant * π/2
240
- results = vfmaq_f32(results, quadrant, half_pi);
238
+ // Adjust for quadrant_f32x4: result += quadrant_f32x4 * π/2
239
+ results_f32x4 = vfmaq_f32(results_f32x4, quadrant_f32x4, half_pi_f32x4);
241
240
 
242
241
  // Transfer sign from x and y by XOR with sign bits
243
- uint32x4_t sign_mask = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
244
- uint32x4_t xs_sign = vandq_u32(vreinterpretq_u32_f32(xs_inputs), sign_mask);
245
- uint32x4_t ys_sign = vandq_u32(vreinterpretq_u32_f32(ys_inputs), sign_mask);
246
- uint32x4_t result_bits = vreinterpretq_u32_f32(results);
247
- result_bits = veorq_u32(result_bits, xs_sign);
248
- result_bits = veorq_u32(result_bits, ys_sign);
249
- results = vreinterpretq_f32_u32(result_bits);
250
-
251
- return results;
242
+ uint32x4_t sign_mask_u32x4 = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
243
+ uint32x4_t xs_sign_u32x4 = vandq_u32(vreinterpretq_u32_f32(xs_inputs), sign_mask_u32x4);
244
+ uint32x4_t ys_sign_u32x4 = vandq_u32(vreinterpretq_u32_f32(ys_inputs), sign_mask_u32x4);
245
+ uint32x4_t result_bits_u32x4 = vreinterpretq_u32_f32(results_f32x4);
246
+ result_bits_u32x4 = veorq_u32(result_bits_u32x4, xs_sign_u32x4);
247
+ result_bits_u32x4 = veorq_u32(result_bits_u32x4, ys_sign_u32x4);
248
+ results_f32x4 = vreinterpretq_f32_u32(result_bits_u32x4);
249
+
250
+ return results_f32x4;
252
251
  }
253
252
 
254
253
  NK_INTERNAL float64x2_t nk_sin_f64x2_neon_(float64x2_t const angles_radians) {
255
254
  // Constants for argument reduction
256
- float64x2_t const pi_high = vdupq_n_f64(3.141592653589793116);
257
- float64x2_t const pi_low = vdupq_n_f64(1.2246467991473532072e-16);
258
- float64x2_t const pi_reciprocal = vdupq_n_f64(0.31830988618379067154);
255
+ float64x2_t const pi_high_f64x2 = vdupq_n_f64(3.141592653589793116);
256
+ float64x2_t const pi_low_f64x2 = vdupq_n_f64(1.2246467991473532072e-16);
257
+ float64x2_t const pi_reciprocal_f64x2 = vdupq_n_f64(0.31830988618379067154);
259
258
 
260
259
  // Polynomial coefficients for sine approximation
261
- float64x2_t const coeff_0 = vdupq_n_f64(+0.00833333333333332974823815);
262
- float64x2_t const coeff_1 = vdupq_n_f64(-0.000198412698412696162806809);
263
- float64x2_t const coeff_2 = vdupq_n_f64(+2.75573192239198747630416e-06);
264
- float64x2_t const coeff_3 = vdupq_n_f64(-2.50521083763502045810755e-08);
265
- float64x2_t const coeff_4 = vdupq_n_f64(+1.60590430605664501629054e-10);
266
- float64x2_t const coeff_5 = vdupq_n_f64(-7.64712219118158833288484e-13);
267
- float64x2_t const coeff_6 = vdupq_n_f64(+2.81009972710863200091251e-15);
268
- float64x2_t const coeff_7 = vdupq_n_f64(-7.97255955009037868891952e-18);
269
- float64x2_t const coeff_8 = vdupq_n_f64(-0.166666666666666657414808);
260
+ float64x2_t const coeff_0_f64x2 = vdupq_n_f64(+0.00833333333333332974823815);
261
+ float64x2_t const coeff_1_f64x2 = vdupq_n_f64(-0.000198412698412696162806809);
262
+ float64x2_t const coeff_2_f64x2 = vdupq_n_f64(+2.75573192239198747630416e-06);
263
+ float64x2_t const coeff_3_f64x2 = vdupq_n_f64(-2.50521083763502045810755e-08);
264
+ float64x2_t const coeff_4_f64x2 = vdupq_n_f64(+1.60590430605664501629054e-10);
265
+ float64x2_t const coeff_5_f64x2 = vdupq_n_f64(-7.64712219118158833288484e-13);
266
+ float64x2_t const coeff_6_f64x2 = vdupq_n_f64(+2.81009972710863200091251e-15);
267
+ float64x2_t const coeff_7_f64x2 = vdupq_n_f64(-7.97255955009037868891952e-18);
268
+ float64x2_t const coeff_8_f64x2 = vdupq_n_f64(-0.166666666666666657414808);
270
269
 
271
270
  // Compute round(angle / π)
272
- float64x2_t const quotients = vmulq_f64(angles_radians, pi_reciprocal);
273
- int64x2_t multiples_of_pi = vcvtnq_s64_f64(quotients);
274
- float64x2_t rounded_quotients = vcvtq_f64_s64(multiples_of_pi);
271
+ float64x2_t const quotients_f64x2 = vmulq_f64(angles_radians, pi_reciprocal_f64x2);
272
+ int64x2_t multiples_of_pi_i64x2 = vcvtnq_s64_f64(quotients_f64x2);
273
+ float64x2_t rounded_quotients_f64x2 = vcvtq_f64_s64(multiples_of_pi_i64x2);
275
274
 
276
275
  // Two-step Cody-Waite reduction: angle - rounded * π_high - rounded * π_low
277
- float64x2_t angles = angles_radians;
278
- angles = vfmsq_f64(angles, rounded_quotients, pi_high);
279
- angles = vfmsq_f64(angles, rounded_quotients, pi_low);
276
+ float64x2_t angles_f64x2 = angles_radians;
277
+ angles_f64x2 = vfmsq_f64(angles_f64x2, rounded_quotients_f64x2, pi_high_f64x2);
278
+ angles_f64x2 = vfmsq_f64(angles_f64x2, rounded_quotients_f64x2, pi_low_f64x2);
280
279
 
281
- // If multiples_of_pi is odd, negate the angle
282
- int64x2_t parity = vandq_s64(multiples_of_pi, vdupq_n_s64(1));
283
- uint64x2_t odd_mask = vceqq_s64(parity, vdupq_n_s64(1));
284
- float64x2_t negated_angles = vnegq_f64(angles);
285
- angles = vbslq_f64(odd_mask, negated_angles, angles);
280
+ // If multiples_of_pi_i64x2 is odd, negate the angle
281
+ int64x2_t parity_i64x2 = vandq_s64(multiples_of_pi_i64x2, vdupq_n_s64(1));
282
+ uint64x2_t odd_mask_u64x2 = vceqq_s64(parity_i64x2, vdupq_n_s64(1));
283
+ float64x2_t negated_angles_f64x2 = vnegq_f64(angles_f64x2);
284
+ angles_f64x2 = vbslq_f64(odd_mask_u64x2, negated_angles_f64x2, angles_f64x2);
286
285
 
287
- float64x2_t const angles_squared = vmulq_f64(angles, angles);
288
- float64x2_t const angles_cubed = vmulq_f64(angles, angles_squared);
289
- float64x2_t const angles_quadratic = vmulq_f64(angles_squared, angles_squared);
290
- float64x2_t const angles_octic = vmulq_f64(angles_quadratic, angles_quadratic);
286
+ float64x2_t const angles_squared_f64x2 = vmulq_f64(angles_f64x2, angles_f64x2);
287
+ float64x2_t const angles_cubed_f64x2 = vmulq_f64(angles_f64x2, angles_squared_f64x2);
288
+ float64x2_t const angles_quadratic_f64x2 = vmulq_f64(angles_squared_f64x2, angles_squared_f64x2);
289
+ float64x2_t const angles_octic_f64x2 = vmulq_f64(angles_quadratic_f64x2, angles_quadratic_f64x2);
291
290
 
292
291
  // Compute polynomial terms using Estrin's scheme for better ILP
293
- float64x2_t const poly_67 = vfmaq_f64(coeff_6, angles_squared, coeff_7);
294
- float64x2_t const poly_45 = vfmaq_f64(coeff_4, angles_squared, coeff_5);
295
- float64x2_t const poly_4567 = vfmaq_f64(poly_45, angles_quadratic, poly_67);
292
+ float64x2_t const poly_67_f64x2 = vfmaq_f64(coeff_6_f64x2, angles_squared_f64x2, coeff_7_f64x2);
293
+ float64x2_t const poly_45_f64x2 = vfmaq_f64(coeff_4_f64x2, angles_squared_f64x2, coeff_5_f64x2);
294
+ float64x2_t const poly_4567_f64x2 = vfmaq_f64(poly_45_f64x2, angles_quadratic_f64x2, poly_67_f64x2);
296
295
 
297
- float64x2_t const poly_23 = vfmaq_f64(coeff_2, angles_squared, coeff_3);
298
- float64x2_t const poly_01 = vfmaq_f64(coeff_0, angles_squared, coeff_1);
299
- float64x2_t const poly_0123 = vfmaq_f64(poly_01, angles_quadratic, poly_23);
296
+ float64x2_t const poly_23_f64x2 = vfmaq_f64(coeff_2_f64x2, angles_squared_f64x2, coeff_3_f64x2);
297
+ float64x2_t const poly_01_f64x2 = vfmaq_f64(coeff_0_f64x2, angles_squared_f64x2, coeff_1_f64x2);
298
+ float64x2_t const poly_0123_f64x2 = vfmaq_f64(poly_01_f64x2, angles_quadratic_f64x2, poly_23_f64x2);
300
299
 
301
300
  // Combine polynomial terms
302
- float64x2_t results = vfmaq_f64(poly_0123, angles_octic, poly_4567);
303
- results = vfmaq_f64(coeff_8, results, angles_squared);
304
- results = vfmaq_f64(angles, results, angles_cubed);
301
+ float64x2_t results_f64x2 = vfmaq_f64(poly_0123_f64x2, angles_octic_f64x2, poly_4567_f64x2);
302
+ results_f64x2 = vfmaq_f64(coeff_8_f64x2, results_f64x2, angles_squared_f64x2);
303
+ results_f64x2 = vfmaq_f64(angles_f64x2, results_f64x2, angles_cubed_f64x2);
305
304
 
306
305
  // Handle zero input (preserve sign of zero)
307
- uint64x2_t const non_zero_mask = vceqq_f64(angles_radians, vdupq_n_f64(0));
308
- results = vbslq_f64(non_zero_mask, angles_radians, results);
309
- return results;
306
+ uint64x2_t const non_zero_mask_u64x2 = vceqq_f64(angles_radians, vdupq_n_f64(0));
307
+ results_f64x2 = vbslq_f64(non_zero_mask_u64x2, angles_radians, results_f64x2);
308
+ return results_f64x2;
310
309
  }
311
310
 
312
311
  NK_INTERNAL float64x2_t nk_cos_f64x2_neon_(float64x2_t const angles_radians) {
313
312
  // Constants for argument reduction
314
- float64x2_t const pi_high_half = vdupq_n_f64(3.141592653589793116 * 0.5);
315
- float64x2_t const pi_low_half = vdupq_n_f64(1.2246467991473532072e-16 * 0.5);
316
- float64x2_t const pi_reciprocal = vdupq_n_f64(0.31830988618379067154);
313
+ float64x2_t const pi_high_half_f64x2 = vdupq_n_f64(3.141592653589793116 * 0.5);
314
+ float64x2_t const pi_low_half_f64x2 = vdupq_n_f64(1.2246467991473532072e-16 * 0.5);
315
+ float64x2_t const pi_reciprocal_f64x2 = vdupq_n_f64(0.31830988618379067154);
317
316
 
318
317
  // Polynomial coefficients for cosine approximation
319
- float64x2_t const coeff_0 = vdupq_n_f64(+0.00833333333333332974823815);
320
- float64x2_t const coeff_1 = vdupq_n_f64(-0.000198412698412696162806809);
321
- float64x2_t const coeff_2 = vdupq_n_f64(+2.75573192239198747630416e-06);
322
- float64x2_t const coeff_3 = vdupq_n_f64(-2.50521083763502045810755e-08);
323
- float64x2_t const coeff_4 = vdupq_n_f64(+1.60590430605664501629054e-10);
324
- float64x2_t const coeff_5 = vdupq_n_f64(-7.64712219118158833288484e-13);
325
- float64x2_t const coeff_6 = vdupq_n_f64(+2.81009972710863200091251e-15);
326
- float64x2_t const coeff_7 = vdupq_n_f64(-7.97255955009037868891952e-18);
327
- float64x2_t const coeff_8 = vdupq_n_f64(-0.166666666666666657414808);
318
+ float64x2_t const coeff_0_f64x2 = vdupq_n_f64(+0.00833333333333332974823815);
319
+ float64x2_t const coeff_1_f64x2 = vdupq_n_f64(-0.000198412698412696162806809);
320
+ float64x2_t const coeff_2_f64x2 = vdupq_n_f64(+2.75573192239198747630416e-06);
321
+ float64x2_t const coeff_3_f64x2 = vdupq_n_f64(-2.50521083763502045810755e-08);
322
+ float64x2_t const coeff_4_f64x2 = vdupq_n_f64(+1.60590430605664501629054e-10);
323
+ float64x2_t const coeff_5_f64x2 = vdupq_n_f64(-7.64712219118158833288484e-13);
324
+ float64x2_t const coeff_6_f64x2 = vdupq_n_f64(+2.81009972710863200091251e-15);
325
+ float64x2_t const coeff_7_f64x2 = vdupq_n_f64(-7.97255955009037868891952e-18);
326
+ float64x2_t const coeff_8_f64x2 = vdupq_n_f64(-0.166666666666666657414808);
328
327
 
329
328
  // Compute 2 * round(angle / π - 0.5) + 1
330
- float64x2_t const quotients = vsubq_f64(vmulq_f64(angles_radians, pi_reciprocal), vdupq_n_f64(0.5));
331
- float64x2_t const rounded = vcvtq_f64_s64(vcvtnq_s64_f64(quotients));
332
- float64x2_t const rounded_quotients = vfmaq_f64(vdupq_n_f64(1.0), vdupq_n_f64(2.0), rounded);
333
- int64x2_t quotients_i64 = vcvtnq_s64_f64(rounded_quotients);
329
+ float64x2_t const quotients_f64x2 = vsubq_f64(vmulq_f64(angles_radians, pi_reciprocal_f64x2), vdupq_n_f64(0.5));
330
+ float64x2_t const rounded_f64x2 = vcvtq_f64_s64(vcvtnq_s64_f64(quotients_f64x2));
331
+ float64x2_t const rounded_quotients_f64x2 = vfmaq_f64(vdupq_n_f64(1.0), vdupq_n_f64(2.0), rounded_f64x2);
332
+ int64x2_t quotients_i64_i64x2 = vcvtnq_s64_f64(rounded_quotients_f64x2);
334
333
 
335
334
  // Two-step Cody-Waite reduction
336
- float64x2_t angles = angles_radians;
337
- angles = vfmsq_f64(angles, rounded_quotients, pi_high_half);
338
- angles = vfmsq_f64(angles, rounded_quotients, pi_low_half);
335
+ float64x2_t angles_f64x2 = angles_radians;
336
+ angles_f64x2 = vfmsq_f64(angles_f64x2, rounded_quotients_f64x2, pi_high_half_f64x2);
337
+ angles_f64x2 = vfmsq_f64(angles_f64x2, rounded_quotients_f64x2, pi_low_half_f64x2);
339
338
 
340
- // If (rounded_quotients & 2) == 0, negate the angle
341
- int64x2_t bit2 = vandq_s64(quotients_i64, vdupq_n_s64(2));
342
- uint64x2_t flip_mask = vceqq_s64(bit2, vdupq_n_s64(0));
343
- float64x2_t negated_angles = vnegq_f64(angles);
344
- angles = vbslq_f64(flip_mask, negated_angles, angles);
339
+ // If (rounded_quotients_f64x2 & 2) == 0, negate the angle
340
+ int64x2_t bit2_i64x2 = vandq_s64(quotients_i64_i64x2, vdupq_n_s64(2));
341
+ uint64x2_t flip_mask_u64x2 = vceqq_s64(bit2_i64x2, vdupq_n_s64(0));
342
+ float64x2_t negated_angles_f64x2 = vnegq_f64(angles_f64x2);
343
+ angles_f64x2 = vbslq_f64(flip_mask_u64x2, negated_angles_f64x2, angles_f64x2);
345
344
 
346
- float64x2_t const angles_squared = vmulq_f64(angles, angles);
347
- float64x2_t const angles_cubed = vmulq_f64(angles, angles_squared);
348
- float64x2_t const angles_quadratic = vmulq_f64(angles_squared, angles_squared);
349
- float64x2_t const angles_octic = vmulq_f64(angles_quadratic, angles_quadratic);
345
+ float64x2_t const angles_squared_f64x2 = vmulq_f64(angles_f64x2, angles_f64x2);
346
+ float64x2_t const angles_cubed_f64x2 = vmulq_f64(angles_f64x2, angles_squared_f64x2);
347
+ float64x2_t const angles_quadratic_f64x2 = vmulq_f64(angles_squared_f64x2, angles_squared_f64x2);
348
+ float64x2_t const angles_octic_f64x2 = vmulq_f64(angles_quadratic_f64x2, angles_quadratic_f64x2);
350
349
 
351
350
  // Compute polynomial terms using Estrin's scheme
352
- float64x2_t const poly_67 = vfmaq_f64(coeff_6, angles_squared, coeff_7);
353
- float64x2_t const poly_45 = vfmaq_f64(coeff_4, angles_squared, coeff_5);
354
- float64x2_t const poly_4567 = vfmaq_f64(poly_45, angles_quadratic, poly_67);
351
+ float64x2_t const poly_67_f64x2 = vfmaq_f64(coeff_6_f64x2, angles_squared_f64x2, coeff_7_f64x2);
352
+ float64x2_t const poly_45_f64x2 = vfmaq_f64(coeff_4_f64x2, angles_squared_f64x2, coeff_5_f64x2);
353
+ float64x2_t const poly_4567_f64x2 = vfmaq_f64(poly_45_f64x2, angles_quadratic_f64x2, poly_67_f64x2);
355
354
 
356
- float64x2_t const poly_23 = vfmaq_f64(coeff_2, angles_squared, coeff_3);
357
- float64x2_t const poly_01 = vfmaq_f64(coeff_0, angles_squared, coeff_1);
358
- float64x2_t const poly_0123 = vfmaq_f64(poly_01, angles_quadratic, poly_23);
355
+ float64x2_t const poly_23_f64x2 = vfmaq_f64(coeff_2_f64x2, angles_squared_f64x2, coeff_3_f64x2);
356
+ float64x2_t const poly_01_f64x2 = vfmaq_f64(coeff_0_f64x2, angles_squared_f64x2, coeff_1_f64x2);
357
+ float64x2_t const poly_0123_f64x2 = vfmaq_f64(poly_01_f64x2, angles_quadratic_f64x2, poly_23_f64x2);
359
358
 
360
359
  // Combine polynomial terms
361
- float64x2_t results = vfmaq_f64(poly_0123, angles_octic, poly_4567);
362
- results = vfmaq_f64(coeff_8, results, angles_squared);
363
- results = vfmaq_f64(angles, results, angles_cubed);
364
- return results;
360
+ float64x2_t results_f64x2 = vfmaq_f64(poly_0123_f64x2, angles_octic_f64x2, poly_4567_f64x2);
361
+ results_f64x2 = vfmaq_f64(coeff_8_f64x2, results_f64x2, angles_squared_f64x2);
362
+ results_f64x2 = vfmaq_f64(angles_f64x2, results_f64x2, angles_cubed_f64x2);
363
+ return results_f64x2;
365
364
  }
366
365
 
367
366
  NK_INTERNAL float64x2_t nk_atan_f64x2_neon_(float64x2_t const inputs) {
368
367
  // Polynomial coefficients for atan approximation (19 terms)
369
- float64x2_t const coeff_19 = vdupq_n_f64(-1.88796008463073496563746e-05);
370
- float64x2_t const coeff_18 = vdupq_n_f64(+0.000209850076645816976906797);
371
- float64x2_t const coeff_17 = vdupq_n_f64(-0.00110611831486672482563471);
372
- float64x2_t const coeff_16 = vdupq_n_f64(+0.00370026744188713119232403);
373
- float64x2_t const coeff_15 = vdupq_n_f64(-0.00889896195887655491740809);
374
- float64x2_t const coeff_14 = vdupq_n_f64(+0.016599329773529201970117);
375
- float64x2_t const coeff_13 = vdupq_n_f64(-0.0254517624932312641616861);
376
- float64x2_t const coeff_12 = vdupq_n_f64(+0.0337852580001353069993897);
377
- float64x2_t const coeff_11 = vdupq_n_f64(-0.0407629191276836500001934);
378
- float64x2_t const coeff_10 = vdupq_n_f64(+0.0466667150077840625632675);
379
- float64x2_t const coeff_9 = vdupq_n_f64(-0.0523674852303482457616113);
380
- float64x2_t const coeff_8 = vdupq_n_f64(+0.0587666392926673580854313);
381
- float64x2_t const coeff_7 = vdupq_n_f64(-0.0666573579361080525984562);
382
- float64x2_t const coeff_6 = vdupq_n_f64(+0.0769219538311769618355029);
383
- float64x2_t const coeff_5 = vdupq_n_f64(-0.090908995008245008229153);
384
- float64x2_t const coeff_4 = vdupq_n_f64(+0.111111105648261418443745);
385
- float64x2_t const coeff_3 = vdupq_n_f64(-0.14285714266771329383765);
386
- float64x2_t const coeff_2 = vdupq_n_f64(+0.199999999996591265594148);
387
- float64x2_t const coeff_1 = vdupq_n_f64(-0.333333333333311110369124);
388
- float64x2_t const half_pi = vdupq_n_f64(1.5707963267948966);
389
- float64x2_t const zeros = vdupq_n_f64(0);
368
+ float64x2_t const coeff_19_f64x2 = vdupq_n_f64(-1.88796008463073496563746e-05);
369
+ float64x2_t const coeff_18_f64x2 = vdupq_n_f64(+0.000209850076645816976906797);
370
+ float64x2_t const coeff_17_f64x2 = vdupq_n_f64(-0.00110611831486672482563471);
371
+ float64x2_t const coeff_16_f64x2 = vdupq_n_f64(+0.00370026744188713119232403);
372
+ float64x2_t const coeff_15_f64x2 = vdupq_n_f64(-0.00889896195887655491740809);
373
+ float64x2_t const coeff_14_f64x2 = vdupq_n_f64(+0.016599329773529201970117);
374
+ float64x2_t const coeff_13_f64x2 = vdupq_n_f64(-0.0254517624932312641616861);
375
+ float64x2_t const coeff_12_f64x2 = vdupq_n_f64(+0.0337852580001353069993897);
376
+ float64x2_t const coeff_11_f64x2 = vdupq_n_f64(-0.0407629191276836500001934);
377
+ float64x2_t const coeff_10_f64x2 = vdupq_n_f64(+0.0466667150077840625632675);
378
+ float64x2_t const coeff_9_f64x2 = vdupq_n_f64(-0.0523674852303482457616113);
379
+ float64x2_t const coeff_8_f64x2 = vdupq_n_f64(+0.0587666392926673580854313);
380
+ float64x2_t const coeff_7_f64x2 = vdupq_n_f64(-0.0666573579361080525984562);
381
+ float64x2_t const coeff_6_f64x2 = vdupq_n_f64(+0.0769219538311769618355029);
382
+ float64x2_t const coeff_5_f64x2 = vdupq_n_f64(-0.090908995008245008229153);
383
+ float64x2_t const coeff_4_f64x2 = vdupq_n_f64(+0.111111105648261418443745);
384
+ float64x2_t const coeff_3_f64x2 = vdupq_n_f64(-0.14285714266771329383765);
385
+ float64x2_t const coeff_2_f64x2 = vdupq_n_f64(+0.199999999996591265594148);
386
+ float64x2_t const coeff_1_f64x2 = vdupq_n_f64(-0.333333333333311110369124);
387
+ float64x2_t const half_pi_f64x2 = vdupq_n_f64(1.5707963267948966);
388
+ float64x2_t const zeros_f64x2 = vdupq_n_f64(0);
390
389
 
391
390
  // Detect negative and take absolute value
392
- uint64x2_t negative_mask = vcltq_f64(inputs, zeros);
393
- float64x2_t values = vabsq_f64(inputs);
391
+ uint64x2_t negative_mask_u64x2 = vcltq_f64(inputs, zeros_f64x2);
392
+ float64x2_t values_f64x2 = vabsq_f64(inputs);
394
393
 
395
- // Check if values > 1 (need reciprocal) - use division for f64 precision
396
- uint64x2_t reciprocal_mask = vcgtq_f64(values, vdupq_n_f64(1.0));
397
- float64x2_t reciprocal_values = vdivq_f64(vdupq_n_f64(1.0), values);
398
- values = vbslq_f64(reciprocal_mask, reciprocal_values, values);
394
+ // Check if values_f64x2 > 1 (need reciprocal) - use division for f64 precision
395
+ uint64x2_t reciprocal_mask_u64x2 = vcgtq_f64(values_f64x2, vdupq_n_f64(1.0));
396
+ float64x2_t reciprocal_values_f64x2 = vdivq_f64(vdupq_n_f64(1.0), values_f64x2);
397
+ values_f64x2 = vbslq_f64(reciprocal_mask_u64x2, reciprocal_values_f64x2, values_f64x2);
399
398
 
400
399
  // Compute powers
401
- float64x2_t const values_squared = vmulq_f64(values, values);
402
- float64x2_t const values_cubed = vmulq_f64(values, values_squared);
400
+ float64x2_t const values_squared_f64x2 = vmulq_f64(values_f64x2, values_f64x2);
401
+ float64x2_t const values_cubed_f64x2 = vmulq_f64(values_f64x2, values_squared_f64x2);
403
402
 
404
403
  // Polynomial evaluation using Horner's method
405
- float64x2_t polynomials = coeff_19;
406
- polynomials = vfmaq_f64(coeff_18, polynomials, values_squared);
407
- polynomials = vfmaq_f64(coeff_17, polynomials, values_squared);
408
- polynomials = vfmaq_f64(coeff_16, polynomials, values_squared);
409
- polynomials = vfmaq_f64(coeff_15, polynomials, values_squared);
410
- polynomials = vfmaq_f64(coeff_14, polynomials, values_squared);
411
- polynomials = vfmaq_f64(coeff_13, polynomials, values_squared);
412
- polynomials = vfmaq_f64(coeff_12, polynomials, values_squared);
413
- polynomials = vfmaq_f64(coeff_11, polynomials, values_squared);
414
- polynomials = vfmaq_f64(coeff_10, polynomials, values_squared);
415
- polynomials = vfmaq_f64(coeff_9, polynomials, values_squared);
416
- polynomials = vfmaq_f64(coeff_8, polynomials, values_squared);
417
- polynomials = vfmaq_f64(coeff_7, polynomials, values_squared);
418
- polynomials = vfmaq_f64(coeff_6, polynomials, values_squared);
419
- polynomials = vfmaq_f64(coeff_5, polynomials, values_squared);
420
- polynomials = vfmaq_f64(coeff_4, polynomials, values_squared);
421
- polynomials = vfmaq_f64(coeff_3, polynomials, values_squared);
422
- polynomials = vfmaq_f64(coeff_2, polynomials, values_squared);
423
- polynomials = vfmaq_f64(coeff_1, polynomials, values_squared);
424
-
425
- // Compute result
426
- float64x2_t result = vfmaq_f64(values, values_cubed, polynomials);
427
-
428
- // Adjust for reciprocal: result = π/2 - result
429
- float64x2_t adjusted = vsubq_f64(half_pi, result);
430
- result = vbslq_f64(reciprocal_mask, adjusted, result);
431
-
432
- // Adjust for negative: result = -result
433
- float64x2_t negated = vnegq_f64(result);
434
- result = vbslq_f64(negative_mask, negated, result);
435
- return result;
404
+ float64x2_t polynomials_f64x2 = coeff_19_f64x2;
405
+ polynomials_f64x2 = vfmaq_f64(coeff_18_f64x2, polynomials_f64x2, values_squared_f64x2);
406
+ polynomials_f64x2 = vfmaq_f64(coeff_17_f64x2, polynomials_f64x2, values_squared_f64x2);
407
+ polynomials_f64x2 = vfmaq_f64(coeff_16_f64x2, polynomials_f64x2, values_squared_f64x2);
408
+ polynomials_f64x2 = vfmaq_f64(coeff_15_f64x2, polynomials_f64x2, values_squared_f64x2);
409
+ polynomials_f64x2 = vfmaq_f64(coeff_14_f64x2, polynomials_f64x2, values_squared_f64x2);
410
+ polynomials_f64x2 = vfmaq_f64(coeff_13_f64x2, polynomials_f64x2, values_squared_f64x2);
411
+ polynomials_f64x2 = vfmaq_f64(coeff_12_f64x2, polynomials_f64x2, values_squared_f64x2);
412
+ polynomials_f64x2 = vfmaq_f64(coeff_11_f64x2, polynomials_f64x2, values_squared_f64x2);
413
+ polynomials_f64x2 = vfmaq_f64(coeff_10_f64x2, polynomials_f64x2, values_squared_f64x2);
414
+ polynomials_f64x2 = vfmaq_f64(coeff_9_f64x2, polynomials_f64x2, values_squared_f64x2);
415
+ polynomials_f64x2 = vfmaq_f64(coeff_8_f64x2, polynomials_f64x2, values_squared_f64x2);
416
+ polynomials_f64x2 = vfmaq_f64(coeff_7_f64x2, polynomials_f64x2, values_squared_f64x2);
417
+ polynomials_f64x2 = vfmaq_f64(coeff_6_f64x2, polynomials_f64x2, values_squared_f64x2);
418
+ polynomials_f64x2 = vfmaq_f64(coeff_5_f64x2, polynomials_f64x2, values_squared_f64x2);
419
+ polynomials_f64x2 = vfmaq_f64(coeff_4_f64x2, polynomials_f64x2, values_squared_f64x2);
420
+ polynomials_f64x2 = vfmaq_f64(coeff_3_f64x2, polynomials_f64x2, values_squared_f64x2);
421
+ polynomials_f64x2 = vfmaq_f64(coeff_2_f64x2, polynomials_f64x2, values_squared_f64x2);
422
+ polynomials_f64x2 = vfmaq_f64(coeff_1_f64x2, polynomials_f64x2, values_squared_f64x2);
423
+
424
+ // Compute result_f64x2
425
+ float64x2_t result_f64x2 = vfmaq_f64(values_f64x2, values_cubed_f64x2, polynomials_f64x2);
426
+
427
+ // Adjust for reciprocal: result_f64x2 = π/2 - result_f64x2
428
+ float64x2_t adjusted_f64x2 = vsubq_f64(half_pi_f64x2, result_f64x2);
429
+ result_f64x2 = vbslq_f64(reciprocal_mask_u64x2, adjusted_f64x2, result_f64x2);
430
+
431
+ // Adjust for negative: result_f64x2 = -result_f64x2
432
+ float64x2_t negated_f64x2 = vnegq_f64(result_f64x2);
433
+ result_f64x2 = vbslq_f64(negative_mask_u64x2, negated_f64x2, result_f64x2);
434
+ return result_f64x2;
436
435
  }
437
436
 
438
437
  NK_INTERNAL float64x2_t nk_atan2_f64x2_neon_(float64x2_t const ys_inputs, float64x2_t const xs_inputs) {
439
438
  // Polynomial coefficients (same as atan)
440
- float64x2_t const coeff_19 = vdupq_n_f64(-1.88796008463073496563746e-05);
441
- float64x2_t const coeff_18 = vdupq_n_f64(+0.000209850076645816976906797);
442
- float64x2_t const coeff_17 = vdupq_n_f64(-0.00110611831486672482563471);
443
- float64x2_t const coeff_16 = vdupq_n_f64(+0.00370026744188713119232403);
444
- float64x2_t const coeff_15 = vdupq_n_f64(-0.00889896195887655491740809);
445
- float64x2_t const coeff_14 = vdupq_n_f64(+0.016599329773529201970117);
446
- float64x2_t const coeff_13 = vdupq_n_f64(-0.0254517624932312641616861);
447
- float64x2_t const coeff_12 = vdupq_n_f64(+0.0337852580001353069993897);
448
- float64x2_t const coeff_11 = vdupq_n_f64(-0.0407629191276836500001934);
449
- float64x2_t const coeff_10 = vdupq_n_f64(+0.0466667150077840625632675);
450
- float64x2_t const coeff_9 = vdupq_n_f64(-0.0523674852303482457616113);
451
- float64x2_t const coeff_8 = vdupq_n_f64(+0.0587666392926673580854313);
452
- float64x2_t const coeff_7 = vdupq_n_f64(-0.0666573579361080525984562);
453
- float64x2_t const coeff_6 = vdupq_n_f64(+0.0769219538311769618355029);
454
- float64x2_t const coeff_5 = vdupq_n_f64(-0.090908995008245008229153);
455
- float64x2_t const coeff_4 = vdupq_n_f64(+0.111111105648261418443745);
456
- float64x2_t const coeff_3 = vdupq_n_f64(-0.14285714266771329383765);
457
- float64x2_t const coeff_2 = vdupq_n_f64(+0.199999999996591265594148);
458
- float64x2_t const coeff_1 = vdupq_n_f64(-0.333333333333311110369124);
459
- float64x2_t const half_pi = vdupq_n_f64(1.5707963267948966);
460
- float64x2_t const zeros = vdupq_n_f64(0);
439
+ float64x2_t const coeff_19_f64x2 = vdupq_n_f64(-1.88796008463073496563746e-05);
440
+ float64x2_t const coeff_18_f64x2 = vdupq_n_f64(+0.000209850076645816976906797);
441
+ float64x2_t const coeff_17_f64x2 = vdupq_n_f64(-0.00110611831486672482563471);
442
+ float64x2_t const coeff_16_f64x2 = vdupq_n_f64(+0.00370026744188713119232403);
443
+ float64x2_t const coeff_15_f64x2 = vdupq_n_f64(-0.00889896195887655491740809);
444
+ float64x2_t const coeff_14_f64x2 = vdupq_n_f64(+0.016599329773529201970117);
445
+ float64x2_t const coeff_13_f64x2 = vdupq_n_f64(-0.0254517624932312641616861);
446
+ float64x2_t const coeff_12_f64x2 = vdupq_n_f64(+0.0337852580001353069993897);
447
+ float64x2_t const coeff_11_f64x2 = vdupq_n_f64(-0.0407629191276836500001934);
448
+ float64x2_t const coeff_10_f64x2 = vdupq_n_f64(+0.0466667150077840625632675);
449
+ float64x2_t const coeff_9_f64x2 = vdupq_n_f64(-0.0523674852303482457616113);
450
+ float64x2_t const coeff_8_f64x2 = vdupq_n_f64(+0.0587666392926673580854313);
451
+ float64x2_t const coeff_7_f64x2 = vdupq_n_f64(-0.0666573579361080525984562);
452
+ float64x2_t const coeff_6_f64x2 = vdupq_n_f64(+0.0769219538311769618355029);
453
+ float64x2_t const coeff_5_f64x2 = vdupq_n_f64(-0.090908995008245008229153);
454
+ float64x2_t const coeff_4_f64x2 = vdupq_n_f64(+0.111111105648261418443745);
455
+ float64x2_t const coeff_3_f64x2 = vdupq_n_f64(-0.14285714266771329383765);
456
+ float64x2_t const coeff_2_f64x2 = vdupq_n_f64(+0.199999999996591265594148);
457
+ float64x2_t const coeff_1_f64x2 = vdupq_n_f64(-0.333333333333311110369124);
458
+ float64x2_t const half_pi_f64x2 = vdupq_n_f64(1.5707963267948966);
459
+ float64x2_t const zeros_f64x2 = vdupq_n_f64(0);
461
460
 
462
461
  // Quadrant adjustments - take absolute values
463
- uint64x2_t xs_negative_mask = vcltq_f64(xs_inputs, zeros);
464
- float64x2_t xs = vabsq_f64(xs_inputs);
465
- float64x2_t ys = vabsq_f64(ys_inputs);
462
+ uint64x2_t xs_negative_mask_u64x2 = vcltq_f64(xs_inputs, zeros_f64x2);
463
+ float64x2_t xs_f64x2 = vabsq_f64(xs_inputs);
464
+ float64x2_t ys_f64x2 = vabsq_f64(ys_inputs);
466
465
 
467
466
  // Ensure proper fraction where numerator < denominator
468
- uint64x2_t swap_mask = vcgtq_f64(ys, xs);
469
- float64x2_t temps = xs;
470
- xs = vbslq_f64(swap_mask, ys, xs);
471
- ys = vbslq_f64(swap_mask, vnegq_f64(temps), ys);
467
+ uint64x2_t swap_mask_u64x2 = vcgtq_f64(ys_f64x2, xs_f64x2);
468
+ float64x2_t temps_f64x2 = xs_f64x2;
469
+ xs_f64x2 = vbslq_f64(swap_mask_u64x2, ys_f64x2, xs_f64x2);
470
+ ys_f64x2 = vbslq_f64(swap_mask_u64x2, vnegq_f64(temps_f64x2), ys_f64x2);
472
471
 
473
472
  // Division for f64 precision
474
- float64x2_t const ratio = vdivq_f64(ys, xs);
475
- float64x2_t const ratio_squared = vmulq_f64(ratio, ratio);
476
- float64x2_t const ratio_cubed = vmulq_f64(ratio, ratio_squared);
473
+ float64x2_t const ratio_f64x2 = vdivq_f64(ys_f64x2, xs_f64x2);
474
+ float64x2_t const ratio_squared_f64x2 = vmulq_f64(ratio_f64x2, ratio_f64x2);
475
+ float64x2_t const ratio_cubed_f64x2 = vmulq_f64(ratio_f64x2, ratio_squared_f64x2);
477
476
 
478
477
  // Polynomial evaluation using Horner's method
479
- float64x2_t polynomials = coeff_19;
480
- polynomials = vfmaq_f64(coeff_18, polynomials, ratio_squared);
481
- polynomials = vfmaq_f64(coeff_17, polynomials, ratio_squared);
482
- polynomials = vfmaq_f64(coeff_16, polynomials, ratio_squared);
483
- polynomials = vfmaq_f64(coeff_15, polynomials, ratio_squared);
484
- polynomials = vfmaq_f64(coeff_14, polynomials, ratio_squared);
485
- polynomials = vfmaq_f64(coeff_13, polynomials, ratio_squared);
486
- polynomials = vfmaq_f64(coeff_12, polynomials, ratio_squared);
487
- polynomials = vfmaq_f64(coeff_11, polynomials, ratio_squared);
488
- polynomials = vfmaq_f64(coeff_10, polynomials, ratio_squared);
489
- polynomials = vfmaq_f64(coeff_9, polynomials, ratio_squared);
490
- polynomials = vfmaq_f64(coeff_8, polynomials, ratio_squared);
491
- polynomials = vfmaq_f64(coeff_7, polynomials, ratio_squared);
492
- polynomials = vfmaq_f64(coeff_6, polynomials, ratio_squared);
493
- polynomials = vfmaq_f64(coeff_5, polynomials, ratio_squared);
494
- polynomials = vfmaq_f64(coeff_4, polynomials, ratio_squared);
495
- polynomials = vfmaq_f64(coeff_3, polynomials, ratio_squared);
496
- polynomials = vfmaq_f64(coeff_2, polynomials, ratio_squared);
497
- polynomials = vfmaq_f64(coeff_1, polynomials, ratio_squared);
478
+ float64x2_t polynomials_f64x2 = coeff_19_f64x2;
479
+ polynomials_f64x2 = vfmaq_f64(coeff_18_f64x2, polynomials_f64x2, ratio_squared_f64x2);
480
+ polynomials_f64x2 = vfmaq_f64(coeff_17_f64x2, polynomials_f64x2, ratio_squared_f64x2);
481
+ polynomials_f64x2 = vfmaq_f64(coeff_16_f64x2, polynomials_f64x2, ratio_squared_f64x2);
482
+ polynomials_f64x2 = vfmaq_f64(coeff_15_f64x2, polynomials_f64x2, ratio_squared_f64x2);
483
+ polynomials_f64x2 = vfmaq_f64(coeff_14_f64x2, polynomials_f64x2, ratio_squared_f64x2);
484
+ polynomials_f64x2 = vfmaq_f64(coeff_13_f64x2, polynomials_f64x2, ratio_squared_f64x2);
485
+ polynomials_f64x2 = vfmaq_f64(coeff_12_f64x2, polynomials_f64x2, ratio_squared_f64x2);
486
+ polynomials_f64x2 = vfmaq_f64(coeff_11_f64x2, polynomials_f64x2, ratio_squared_f64x2);
487
+ polynomials_f64x2 = vfmaq_f64(coeff_10_f64x2, polynomials_f64x2, ratio_squared_f64x2);
488
+ polynomials_f64x2 = vfmaq_f64(coeff_9_f64x2, polynomials_f64x2, ratio_squared_f64x2);
489
+ polynomials_f64x2 = vfmaq_f64(coeff_8_f64x2, polynomials_f64x2, ratio_squared_f64x2);
490
+ polynomials_f64x2 = vfmaq_f64(coeff_7_f64x2, polynomials_f64x2, ratio_squared_f64x2);
491
+ polynomials_f64x2 = vfmaq_f64(coeff_6_f64x2, polynomials_f64x2, ratio_squared_f64x2);
492
+ polynomials_f64x2 = vfmaq_f64(coeff_5_f64x2, polynomials_f64x2, ratio_squared_f64x2);
493
+ polynomials_f64x2 = vfmaq_f64(coeff_4_f64x2, polynomials_f64x2, ratio_squared_f64x2);
494
+ polynomials_f64x2 = vfmaq_f64(coeff_3_f64x2, polynomials_f64x2, ratio_squared_f64x2);
495
+ polynomials_f64x2 = vfmaq_f64(coeff_2_f64x2, polynomials_f64x2, ratio_squared_f64x2);
496
+ polynomials_f64x2 = vfmaq_f64(coeff_1_f64x2, polynomials_f64x2, ratio_squared_f64x2);
498
497
 
499
498
  // Compute the result
500
- float64x2_t results = vfmaq_f64(ratio, ratio_cubed, polynomials);
499
+ float64x2_t results_f64x2 = vfmaq_f64(ratio_f64x2, ratio_cubed_f64x2, polynomials_f64x2);
501
500
 
502
- // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
501
+ // Compute quadrant_f64x2 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
503
502
  // -2 for x<0 && !swap, -1 for x<0 && swap
504
- float64x2_t quadrant = vdupq_n_f64(0.0);
505
- float64x2_t neg_two = vdupq_n_f64(-2.0);
506
- quadrant = vbslq_f64(xs_negative_mask, neg_two, quadrant);
507
- float64x2_t quadrant_incremented = vaddq_f64(quadrant, vdupq_n_f64(1.0));
508
- quadrant = vbslq_f64(swap_mask, quadrant_incremented, quadrant);
503
+ float64x2_t quadrant_f64x2 = vdupq_n_f64(0.0);
504
+ float64x2_t neg_two_f64x2 = vdupq_n_f64(-2.0);
505
+ quadrant_f64x2 = vbslq_f64(xs_negative_mask_u64x2, neg_two_f64x2, quadrant_f64x2);
506
+ float64x2_t quadrant_incremented_f64x2 = vaddq_f64(quadrant_f64x2, vdupq_n_f64(1.0));
507
+ quadrant_f64x2 = vbslq_f64(swap_mask_u64x2, quadrant_incremented_f64x2, quadrant_f64x2);
509
508
 
510
- // Adjust for quadrant: result += quadrant * π/2
511
- results = vfmaq_f64(results, quadrant, half_pi);
509
+ // Adjust for quadrant_f64x2: result += quadrant_f64x2 * π/2
510
+ results_f64x2 = vfmaq_f64(results_f64x2, quadrant_f64x2, half_pi_f64x2);
512
511
 
513
512
  // Transfer sign from x and y by XOR with sign bits
514
- uint64x2_t sign_mask = vreinterpretq_u64_f64(vdupq_n_f64(-0.0));
515
- uint64x2_t xs_sign = vandq_u64(vreinterpretq_u64_f64(xs_inputs), sign_mask);
516
- uint64x2_t ys_sign = vandq_u64(vreinterpretq_u64_f64(ys_inputs), sign_mask);
517
- uint64x2_t result_bits = vreinterpretq_u64_f64(results);
518
- result_bits = veorq_u64(result_bits, xs_sign);
519
- result_bits = veorq_u64(result_bits, ys_sign);
520
- results = vreinterpretq_f64_u64(result_bits);
521
-
522
- return results;
513
+ uint64x2_t sign_mask_u64x2 = vreinterpretq_u64_f64(vdupq_n_f64(-0.0));
514
+ uint64x2_t xs_sign_u64x2 = vandq_u64(vreinterpretq_u64_f64(xs_inputs), sign_mask_u64x2);
515
+ uint64x2_t ys_sign_u64x2 = vandq_u64(vreinterpretq_u64_f64(ys_inputs), sign_mask_u64x2);
516
+ uint64x2_t result_bits_u64x2 = vreinterpretq_u64_f64(results_f64x2);
517
+ result_bits_u64x2 = veorq_u64(result_bits_u64x2, xs_sign_u64x2);
518
+ result_bits_u64x2 = veorq_u64(result_bits_u64x2, ys_sign_u64x2);
519
+ results_f64x2 = vreinterpretq_f64_u64(result_bits_u64x2);
520
+
521
+ return results_f64x2;
523
522
  }
524
523
 
525
524
  NK_PUBLIC void nk_each_sin_f32_neon(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
526
525
  nk_size_t i = 0;
527
526
  for (; i + 4 <= n; i += 4) {
528
- float32x4_t angles = vld1q_f32(ins + i);
529
- float32x4_t results = nk_sin_f32x4_neon_(angles);
530
- vst1q_f32(outs + i, results);
527
+ float32x4_t angles_f32x4 = vld1q_f32(ins + i);
528
+ float32x4_t results_f32x4 = nk_sin_f32x4_neon_(angles_f32x4);
529
+ vst1q_f32(outs + i, results_f32x4);
531
530
  }
532
531
  if (i < n) {
533
532
  nk_size_t remaining = n - i;
@@ -542,9 +541,9 @@ NK_PUBLIC void nk_each_sin_f32_neon(nk_f32_t const *ins, nk_size_t n, nk_f32_t *
542
541
  NK_PUBLIC void nk_each_cos_f32_neon(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
543
542
  nk_size_t i = 0;
544
543
  for (; i + 4 <= n; i += 4) {
545
- float32x4_t angles = vld1q_f32(ins + i);
546
- float32x4_t results = nk_cos_f32x4_neon_(angles);
547
- vst1q_f32(outs + i, results);
544
+ float32x4_t angles_f32x4 = vld1q_f32(ins + i);
545
+ float32x4_t results_f32x4 = nk_cos_f32x4_neon_(angles_f32x4);
546
+ vst1q_f32(outs + i, results_f32x4);
548
547
  }
549
548
  if (i < n) {
550
549
  nk_size_t remaining = n - i;
@@ -559,9 +558,9 @@ NK_PUBLIC void nk_each_cos_f32_neon(nk_f32_t const *ins, nk_size_t n, nk_f32_t *
559
558
  NK_PUBLIC void nk_each_atan_f32_neon(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
560
559
  nk_size_t i = 0;
561
560
  for (; i + 4 <= n; i += 4) {
562
- float32x4_t values = vld1q_f32(ins + i);
563
- float32x4_t results = nk_atan_f32x4_neon_(values);
564
- vst1q_f32(outs + i, results);
561
+ float32x4_t values_f32x4 = vld1q_f32(ins + i);
562
+ float32x4_t results_f32x4 = nk_atan_f32x4_neon_(values_f32x4);
563
+ vst1q_f32(outs + i, results_f32x4);
565
564
  }
566
565
  if (i < n) {
567
566
  nk_size_t remaining = n - i;
@@ -576,9 +575,9 @@ NK_PUBLIC void nk_each_atan_f32_neon(nk_f32_t const *ins, nk_size_t n, nk_f32_t
576
575
  NK_PUBLIC void nk_each_sin_f64_neon(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
577
576
  nk_size_t i = 0;
578
577
  for (; i + 2 <= n; i += 2) {
579
- float64x2_t angles = vld1q_f64(ins + i);
580
- float64x2_t results = nk_sin_f64x2_neon_(angles);
581
- vst1q_f64(outs + i, results);
578
+ float64x2_t angles_f64x2 = vld1q_f64(ins + i);
579
+ float64x2_t results_f64x2 = nk_sin_f64x2_neon_(angles_f64x2);
580
+ vst1q_f64(outs + i, results_f64x2);
582
581
  }
583
582
  if (i < n) {
584
583
  nk_size_t remaining = n - i;
@@ -593,9 +592,9 @@ NK_PUBLIC void nk_each_sin_f64_neon(nk_f64_t const *ins, nk_size_t n, nk_f64_t *
593
592
  NK_PUBLIC void nk_each_cos_f64_neon(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
594
593
  nk_size_t i = 0;
595
594
  for (; i + 2 <= n; i += 2) {
596
- float64x2_t angles = vld1q_f64(ins + i);
597
- float64x2_t results = nk_cos_f64x2_neon_(angles);
598
- vst1q_f64(outs + i, results);
595
+ float64x2_t angles_f64x2 = vld1q_f64(ins + i);
596
+ float64x2_t results_f64x2 = nk_cos_f64x2_neon_(angles_f64x2);
597
+ vst1q_f64(outs + i, results_f64x2);
599
598
  }
600
599
  if (i < n) {
601
600
  nk_size_t remaining = n - i;
@@ -610,9 +609,9 @@ NK_PUBLIC void nk_each_cos_f64_neon(nk_f64_t const *ins, nk_size_t n, nk_f64_t *
610
609
  NK_PUBLIC void nk_each_atan_f64_neon(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
611
610
  nk_size_t i = 0;
612
611
  for (; i + 2 <= n; i += 2) {
613
- float64x2_t values = vld1q_f64(ins + i);
614
- float64x2_t results = nk_atan_f64x2_neon_(values);
615
- vst1q_f64(outs + i, results);
612
+ float64x2_t values_f64x2 = vld1q_f64(ins + i);
613
+ float64x2_t results_f64x2 = nk_atan_f64x2_neon_(values_f64x2);
614
+ vst1q_f64(outs + i, results_f64x2);
616
615
  }
617
616
  if (i < n) {
618
617
  nk_size_t remaining = n - i;