numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315):
  1. package/README.md +239 -122
  2. package/binding.gyp +25 -491
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -9,12 +9,12 @@
9
9
  *
10
10
  * @section skylake_trig_instructions Key AVX-512 Trigonometry Instructions
11
11
  *
12
- * Intrinsic Instruction Latency Throughput Ports
13
- * _mm512_fmadd_ps VFMADD132PS (ZMM, ZMM, ZMM) 4cy 0.5/cy p05
14
- * _mm512_mul_ps VMULPS (ZMM, ZMM, ZMM) 4cy 0.5/cy p05
15
- * _mm512_and_ps VANDPS (ZMM, ZMM, ZMM) 1cy 0.33/cy p015
16
- * _mm512_cmp_ps_mask VCMPPS (K, ZMM, ZMM, I8) 3cy 1/cy p01
17
- * _mm512_roundscale_ps VRNDSCALEPS (ZMM, ZMM, I8) 8cy 0.5/cy p01
12
+ * Intrinsic Instruction Skylake-X Genoa
13
+ * _mm512_fmadd_ps VFMADD132PS (ZMM, ZMM, ZMM) 4cy @ p05 4cy @ p01
14
+ * _mm512_mul_ps VMULPS (ZMM, ZMM, ZMM) 4cy @ p05 3cy @ p01
15
+ * _mm512_and_ps VANDPS (ZMM, ZMM, ZMM) 1cy @ p05 1cy @ p0123
16
+ * _mm512_cmp_ps_mask VCMPPS (K, ZMM, ZMM, I8) 4cy @ p5 5cy @ p01
17
+ * _mm512_roundscale_ps VRNDSCALEPS (ZMM, ZMM, I8) 8cy @ p05+p05 3cy @ p23
18
18
  *
19
19
  * Trigonometric functions use polynomial approximations evaluated via Horner's method with FMA chains.
20
20
  * AVX-512 mask registers enable branchless range reduction and sign handling without blend overhead.
@@ -42,394 +42,398 @@ extern "C" {
42
42
 
43
43
  NK_INTERNAL __m512 nk_sin_f32x16_skylake_(__m512 const angles_radians) {
44
44
  // Cody-Waite constants for argument reduction
45
- __m512 const pi_hi_f32x16 = _mm512_set1_ps(3.1415927f);
46
- __m512 const pi_lo_f32x16 = _mm512_set1_ps(-8.742278e-8f);
47
- __m512 const pi_reciprocal = _mm512_set1_ps(0.31830988618379067154f); // 1/π
45
+ __m512 const pi_high_f32x16 = _mm512_set1_ps(3.1415927f);
46
+ __m512 const pi_low_f32x16 = _mm512_set1_ps(-8.742278e-8f);
47
+ __m512 const pi_reciprocal_f32x16 = _mm512_set1_ps(0.31830988618379067154f); // 1/π
48
48
  // Degree-9 minimax coefficients
49
- __m512 const coeff_9 = _mm512_set1_ps(+2.7557319224e-6f);
50
- __m512 const coeff_7 = _mm512_set1_ps(-1.9841269841e-4f);
51
- __m512 const coeff_5 = _mm512_set1_ps(+8.3333293855e-3f);
52
- __m512 const coeff_3 = _mm512_set1_ps(-1.6666666641e-1f);
53
-
54
- // Compute (multiples_of_pi) = round(angle / π)
55
- __m512 quotients = _mm512_mul_ps(angles_radians, pi_reciprocal);
56
- __m512 rounded_quotients = _mm512_roundscale_ps(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
49
+ __m512 const coeff_9_f32x16 = _mm512_set1_ps(+2.7557319224e-6f);
50
+ __m512 const coeff_7_f32x16 = _mm512_set1_ps(-1.9841269841e-4f);
51
+ __m512 const coeff_5_f32x16 = _mm512_set1_ps(+8.3333293855e-3f);
52
+ __m512 const coeff_3_f32x16 = _mm512_set1_ps(-1.6666666641e-1f);
53
+
54
+ // Compute (multiples_of_pi_i32x16) = round(angle / π)
55
+ __m512 quotients_f32x16 = _mm512_mul_ps(angles_radians, pi_reciprocal_f32x16);
56
+ __m512 rounded_quotients_f32x16 = _mm512_roundscale_ps(quotients_f32x16,
57
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
57
58
  // Use explicit rounding to match roundscale (MXCSR-independent)
58
- __m512i multiples_of_pi = _mm512_cvt_roundps_epi32(rounded_quotients,
59
- _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
59
+ __m512i multiples_of_pi_i32x16 = _mm512_cvt_roundps_epi32(rounded_quotients_f32x16,
60
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
60
61
 
61
62
  // Cody-Waite range reduction
62
- __m512 angles = _mm512_fnmadd_ps(rounded_quotients, pi_hi_f32x16, angles_radians);
63
- angles = _mm512_fnmadd_ps(rounded_quotients, pi_lo_f32x16, angles);
64
- __m512 const angles_squared = _mm512_mul_ps(angles, angles);
65
- __m512 const angles_cubed = _mm512_mul_ps(angles, angles_squared);
63
+ __m512 angles_f32x16 = _mm512_fnmadd_ps(rounded_quotients_f32x16, pi_high_f32x16, angles_radians);
64
+ angles_f32x16 = _mm512_fnmadd_ps(rounded_quotients_f32x16, pi_low_f32x16, angles_f32x16);
65
+ __m512 const angles_squared_f32x16 = _mm512_mul_ps(angles_f32x16, angles_f32x16);
66
+ __m512 const angles_cubed_f32x16 = _mm512_mul_ps(angles_f32x16, angles_squared_f32x16);
66
67
 
67
68
  // Degree-9 polynomial via Horner's method
68
- __m512 polynomials = coeff_9;
69
- polynomials = _mm512_fmadd_ps(polynomials, angles_squared, coeff_7);
70
- polynomials = _mm512_fmadd_ps(polynomials, angles_squared, coeff_5);
71
- polynomials = _mm512_fmadd_ps(polynomials, angles_squared, coeff_3);
72
-
73
- // If multiples_of_pi is odd, flip the sign of the results
74
- __mmask16 odd_mask = _mm512_test_epi32_mask(multiples_of_pi, _mm512_set1_epi32(1));
75
- __m512 results = _mm512_fmadd_ps(angles_cubed, polynomials, angles);
76
- results = _mm512_mask_sub_ps(results, odd_mask, _mm512_setzero_ps(), results);
77
- return results;
69
+ __m512 polynomials_f32x16 = coeff_9_f32x16;
70
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_7_f32x16);
71
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_5_f32x16);
72
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_3_f32x16);
73
+
74
+ // If multiples_of_pi_i32x16 is odd, flip the sign of the results_f32x16
75
+ __mmask16 odd_mask = _mm512_test_epi32_mask(multiples_of_pi_i32x16, _mm512_set1_epi32(1));
76
+ __m512 results_f32x16 = _mm512_fmadd_ps(angles_cubed_f32x16, polynomials_f32x16, angles_f32x16);
77
+ results_f32x16 = _mm512_mask_sub_ps(results_f32x16, odd_mask, _mm512_setzero_ps(), results_f32x16);
78
+ return results_f32x16;
78
79
  }
79
80
 
80
81
  NK_INTERNAL __m512 nk_cos_f32x16_skylake_(__m512 const angles_radians) {
81
82
  // Cody-Waite constants for argument reduction
82
- __m512 const pi_hi_f32x16 = _mm512_set1_ps(3.1415927f);
83
- __m512 const pi_lo_f32x16 = _mm512_set1_ps(-8.742278e-8f);
84
- __m512 const pi_half = _mm512_set1_ps(1.57079632679489661923f); // π/2
85
- __m512 const pi_reciprocal = _mm512_set1_ps(0.31830988618379067154f); // 1/π
83
+ __m512 const pi_high_f32x16 = _mm512_set1_ps(3.1415927f);
84
+ __m512 const pi_low_f32x16 = _mm512_set1_ps(-8.742278e-8f);
85
+ __m512 const pi_half_f32x16 = _mm512_set1_ps(1.57079632679489661923f); // π/2
86
+ __m512 const pi_reciprocal_f32x16 = _mm512_set1_ps(0.31830988618379067154f); // 1/π
86
87
  // Degree-9 minimax coefficients
87
- __m512 const coeff_9 = _mm512_set1_ps(+2.7557319224e-6f);
88
- __m512 const coeff_7 = _mm512_set1_ps(-1.9841269841e-4f);
89
- __m512 const coeff_5 = _mm512_set1_ps(+8.3333293855e-3f);
90
- __m512 const coeff_3 = _mm512_set1_ps(-1.6666666641e-1f);
91
-
92
- // Compute (multiples_of_pi) = round((angle / π) - 0.5)
93
- __m512 quotients = _mm512_fmsub_ps(angles_radians, pi_reciprocal, _mm512_set1_ps(0.5f));
94
- __m512 rounded_quotients = _mm512_roundscale_ps(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
88
+ __m512 const coeff_9_f32x16 = _mm512_set1_ps(+2.7557319224e-6f);
89
+ __m512 const coeff_7_f32x16 = _mm512_set1_ps(-1.9841269841e-4f);
90
+ __m512 const coeff_5_f32x16 = _mm512_set1_ps(+8.3333293855e-3f);
91
+ __m512 const coeff_3_f32x16 = _mm512_set1_ps(-1.6666666641e-1f);
92
+
93
+ // Compute (multiples_of_pi_i32x16) = round((angle / π) - 0.5)
94
+ __m512 quotients_f32x16 = _mm512_fmsub_ps(angles_radians, pi_reciprocal_f32x16, _mm512_set1_ps(0.5f));
95
+ __m512 rounded_quotients_f32x16 = _mm512_roundscale_ps(quotients_f32x16,
96
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
95
97
  // Use explicit rounding to match roundscale (MXCSR-independent)
96
- __m512i multiples_of_pi = _mm512_cvt_roundps_epi32(rounded_quotients,
97
- _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
98
+ __m512i multiples_of_pi_i32x16 = _mm512_cvt_roundps_epi32(rounded_quotients_f32x16,
99
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
98
100
 
99
101
  // Cody-Waite range reduction: angle = angle_radians - (multiples * pi + pi/2)
100
- __m512 const offset = _mm512_fmadd_ps(rounded_quotients, pi_hi_f32x16, pi_half);
101
- __m512 angles = _mm512_sub_ps(angles_radians, offset);
102
- angles = _mm512_fnmadd_ps(rounded_quotients, pi_lo_f32x16, angles);
103
- __m512 const angles_squared = _mm512_mul_ps(angles, angles);
104
- __m512 const angles_cubed = _mm512_mul_ps(angles, angles_squared);
102
+ __m512 const offset_f32x16 = _mm512_fmadd_ps(rounded_quotients_f32x16, pi_high_f32x16, pi_half_f32x16);
103
+ __m512 angles_f32x16 = _mm512_sub_ps(angles_radians, offset_f32x16);
104
+ angles_f32x16 = _mm512_fnmadd_ps(rounded_quotients_f32x16, pi_low_f32x16, angles_f32x16);
105
+ __m512 const angles_squared_f32x16 = _mm512_mul_ps(angles_f32x16, angles_f32x16);
106
+ __m512 const angles_cubed_f32x16 = _mm512_mul_ps(angles_f32x16, angles_squared_f32x16);
105
107
 
106
108
  // Degree-9 polynomial via Horner's method
107
- __m512 polynomials = coeff_9;
108
- polynomials = _mm512_fmadd_ps(polynomials, angles_squared, coeff_7);
109
- polynomials = _mm512_fmadd_ps(polynomials, angles_squared, coeff_5);
110
- polynomials = _mm512_fmadd_ps(polynomials, angles_squared, coeff_3);
111
- __m512 results = _mm512_fmadd_ps(angles_cubed, polynomials, angles);
112
-
113
- // If multiples_of_pi is even, flip the sign of the results
114
- __mmask16 even_mask = _mm512_testn_epi32_mask(multiples_of_pi, _mm512_set1_epi32(1));
115
- results = _mm512_mask_sub_ps(results, even_mask, _mm512_setzero_ps(), results);
116
- return results;
109
+ __m512 polynomials_f32x16 = coeff_9_f32x16;
110
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_7_f32x16);
111
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_5_f32x16);
112
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, angles_squared_f32x16, coeff_3_f32x16);
113
+ __m512 results_f32x16 = _mm512_fmadd_ps(angles_cubed_f32x16, polynomials_f32x16, angles_f32x16);
114
+
115
+ // If multiples_of_pi_i32x16 is even, flip the sign of the results_f32x16
116
+ __mmask16 even_mask = _mm512_testn_epi32_mask(multiples_of_pi_i32x16, _mm512_set1_epi32(1));
117
+ results_f32x16 = _mm512_mask_sub_ps(results_f32x16, even_mask, _mm512_setzero_ps(), results_f32x16);
118
+ return results_f32x16;
117
119
  }
118
120
 
119
121
  NK_INTERNAL __m512 nk_atan_f32x16_skylake_(__m512 const inputs) {
120
122
  // Polynomial coefficients
121
- __m512 const coeff_8 = _mm512_set1_ps(-0.333331018686294555664062f);
122
- __m512 const coeff_7 = _mm512_set1_ps(+0.199926957488059997558594f);
123
- __m512 const coeff_6 = _mm512_set1_ps(-0.142027363181114196777344f);
124
- __m512 const coeff_5 = _mm512_set1_ps(+0.106347933411598205566406f);
125
- __m512 const coeff_4 = _mm512_set1_ps(-0.0748900920152664184570312f);
126
- __m512 const coeff_3 = _mm512_set1_ps(+0.0425049886107444763183594f);
127
- __m512 const coeff_2 = _mm512_set1_ps(-0.0159569028764963150024414f);
128
- __m512 const coeff_1 = _mm512_set1_ps(+0.00282363896258175373077393f);
123
+ __m512 const coeff_8_f32x16 = _mm512_set1_ps(-0.333331018686294555664062f);
124
+ __m512 const coeff_7_f32x16 = _mm512_set1_ps(+0.199926957488059997558594f);
125
+ __m512 const coeff_6_f32x16 = _mm512_set1_ps(-0.142027363181114196777344f);
126
+ __m512 const coeff_5_f32x16 = _mm512_set1_ps(+0.106347933411598205566406f);
127
+ __m512 const coeff_4_f32x16 = _mm512_set1_ps(-0.0748900920152664184570312f);
128
+ __m512 const coeff_3_f32x16 = _mm512_set1_ps(+0.0425049886107444763183594f);
129
+ __m512 const coeff_2_f32x16 = _mm512_set1_ps(-0.0159569028764963150024414f);
130
+ __m512 const coeff_1_f32x16 = _mm512_set1_ps(+0.00282363896258175373077393f);
129
131
 
130
132
  // Adjust for quadrant
131
- __m512 values = inputs;
132
- __mmask16 const negative_mask = _mm512_fpclass_ps_mask(values, 0x40);
133
- values = _mm512_abs_ps(values);
134
- __mmask16 const reciprocal_mask = _mm512_cmp_ps_mask(values, _mm512_set1_ps(1.0f), _CMP_GT_OS);
135
- values = _mm512_mask_div_ps(values, reciprocal_mask, _mm512_set1_ps(1.0f), values);
133
+ __m512 values_f32x16 = inputs;
134
+ __mmask16 const negative_mask = _mm512_fpclass_ps_mask(values_f32x16, 0x40);
135
+ values_f32x16 = _mm512_abs_ps(values_f32x16);
136
+ __mmask16 const reciprocal_mask = _mm512_cmp_ps_mask(values_f32x16, _mm512_set1_ps(1.0f), _CMP_GT_OS);
137
+ values_f32x16 = _mm512_mask_div_ps(values_f32x16, reciprocal_mask, _mm512_set1_ps(1.0f), values_f32x16);
136
138
 
137
139
  // Argument reduction
138
- __m512 const values_squared = _mm512_mul_ps(values, values);
139
- __m512 const values_cubed = _mm512_mul_ps(values, values_squared);
140
+ __m512 const values_squared_f32x16 = _mm512_mul_ps(values_f32x16, values_f32x16);
141
+ __m512 const values_cubed_f32x16 = _mm512_mul_ps(values_f32x16, values_squared_f32x16);
140
142
 
141
143
  // Polynomial evaluation
142
- __m512 polynomials = coeff_1;
143
- polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_2);
144
- polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_3);
145
- polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_4);
146
- polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_5);
147
- polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_6);
148
- polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_7);
149
- polynomials = _mm512_fmadd_ps(polynomials, values_squared, coeff_8);
150
-
151
- // Adjust result for quadrants
152
- __m512 result = _mm512_fmadd_ps(values_cubed, polynomials, values);
153
- result = _mm512_mask_sub_ps(result, reciprocal_mask, _mm512_set1_ps(1.5707963267948966f), result);
154
- result = _mm512_mask_sub_ps(result, negative_mask, _mm512_setzero_ps(), result);
155
- return result;
144
+ __m512 polynomials_f32x16 = coeff_1_f32x16;
145
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_2_f32x16);
146
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_3_f32x16);
147
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_4_f32x16);
148
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_5_f32x16);
149
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_6_f32x16);
150
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_7_f32x16);
151
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, values_squared_f32x16, coeff_8_f32x16);
152
+
153
+ // Adjust result_f32x16 for quadrants
154
+ __m512 result_f32x16 = _mm512_fmadd_ps(values_cubed_f32x16, polynomials_f32x16, values_f32x16);
155
+ result_f32x16 = _mm512_mask_sub_ps(result_f32x16, reciprocal_mask, _mm512_set1_ps(1.5707963267948966f),
156
+ result_f32x16);
157
+ result_f32x16 = _mm512_mask_sub_ps(result_f32x16, negative_mask, _mm512_setzero_ps(), result_f32x16);
158
+ return result_f32x16;
156
159
  }
157
160
 
158
161
  NK_INTERNAL __m512 nk_atan2_f32x16_skylake_(__m512 const ys_inputs, __m512 const xs_inputs) {
159
162
  // Polynomial coefficients
160
- __m512 const coeff_8 = _mm512_set1_ps(-0.333331018686294555664062f);
161
- __m512 const coeff_7 = _mm512_set1_ps(+0.199926957488059997558594f);
162
- __m512 const coeff_6 = _mm512_set1_ps(-0.142027363181114196777344f);
163
- __m512 const coeff_5 = _mm512_set1_ps(+0.106347933411598205566406f);
164
- __m512 const coeff_4 = _mm512_set1_ps(-0.0748900920152664184570312f);
165
- __m512 const coeff_3 = _mm512_set1_ps(+0.0425049886107444763183594f);
166
- __m512 const coeff_2 = _mm512_set1_ps(-0.0159569028764963150024414f);
167
- __m512 const coeff_1 = _mm512_set1_ps(+0.00282363896258175373077393f);
163
+ __m512 const coeff_8_f32x16 = _mm512_set1_ps(-0.333331018686294555664062f);
164
+ __m512 const coeff_7_f32x16 = _mm512_set1_ps(+0.199926957488059997558594f);
165
+ __m512 const coeff_6_f32x16 = _mm512_set1_ps(-0.142027363181114196777344f);
166
+ __m512 const coeff_5_f32x16 = _mm512_set1_ps(+0.106347933411598205566406f);
167
+ __m512 const coeff_4_f32x16 = _mm512_set1_ps(-0.0748900920152664184570312f);
168
+ __m512 const coeff_3_f32x16 = _mm512_set1_ps(+0.0425049886107444763183594f);
169
+ __m512 const coeff_2_f32x16 = _mm512_set1_ps(-0.0159569028764963150024414f);
170
+ __m512 const coeff_1_f32x16 = _mm512_set1_ps(+0.00282363896258175373077393f);
168
171
 
169
172
  // Quadrant adjustments normalizing to absolute values of x and y
170
173
  __mmask16 const xs_negative_mask = _mm512_fpclass_ps_mask(xs_inputs, 0x40);
171
- __m512 xs = _mm512_abs_ps(xs_inputs);
172
- __m512 ys = _mm512_abs_ps(ys_inputs);
174
+ __m512 xs_f32x16 = _mm512_abs_ps(xs_inputs);
175
+ __m512 ys_f32x16 = _mm512_abs_ps(ys_inputs);
173
176
  // Ensure proper fraction where the numerator is smaller than the denominator
174
- __mmask16 const swap_mask = _mm512_cmp_ps_mask(ys, xs, _CMP_GT_OS);
175
- __m512 temps = xs;
176
- xs = _mm512_mask_blend_ps(swap_mask, xs, ys);
177
- ys = _mm512_mask_sub_ps(ys, swap_mask, _mm512_setzero_ps(), temps);
177
+ __mmask16 const swap_mask = _mm512_cmp_ps_mask(ys_f32x16, xs_f32x16, _CMP_GT_OS);
178
+ __m512 temps_f32x16 = xs_f32x16;
179
+ xs_f32x16 = _mm512_mask_blend_ps(swap_mask, xs_f32x16, ys_f32x16);
180
+ ys_f32x16 = _mm512_mask_sub_ps(ys_f32x16, swap_mask, _mm512_setzero_ps(), temps_f32x16);
178
181
 
179
- // Compute ratio and ratio²
180
- __m512 const ratio = _mm512_div_ps(ys, xs);
181
- __m512 const ratio_squared = _mm512_mul_ps(ratio, ratio);
182
- __m512 const ratio_cubed = _mm512_mul_ps(ratio, ratio_squared);
182
+ // Compute ratio_f32x16 and ratio²
183
+ __m512 const ratio_f32x16 = _mm512_div_ps(ys_f32x16, xs_f32x16);
184
+ __m512 const ratio_squared_f32x16 = _mm512_mul_ps(ratio_f32x16, ratio_f32x16);
185
+ __m512 const ratio_cubed_f32x16 = _mm512_mul_ps(ratio_f32x16, ratio_squared_f32x16);
183
186
 
184
187
  // Polynomial evaluation
185
- __m512 polynomials = coeff_1;
186
- polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_2);
187
- polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_3);
188
- polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_4);
189
- polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_5);
190
- polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_6);
191
- polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_7);
192
- polynomials = _mm512_fmadd_ps(polynomials, ratio_squared, coeff_8);
193
-
194
- // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
188
+ __m512 polynomials_f32x16 = coeff_1_f32x16;
189
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_2_f32x16);
190
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_3_f32x16);
191
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_4_f32x16);
192
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_5_f32x16);
193
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_6_f32x16);
194
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_7_f32x16);
195
+ polynomials_f32x16 = _mm512_fmadd_ps(polynomials_f32x16, ratio_squared_f32x16, coeff_8_f32x16);
196
+
197
+ // Compute quadrant_f32x16 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
195
198
  // -2 for x<0 && !swap, -1 for x<0 && swap
196
- __m512 results = _mm512_fmadd_ps(ratio_cubed, polynomials, ratio);
197
- __m512 quadrant = _mm512_setzero_ps();
198
- __m512 neg_two = _mm512_set1_ps(-2.0f);
199
- quadrant = _mm512_mask_blend_ps(xs_negative_mask, quadrant, neg_two);
200
- __m512 one = _mm512_set1_ps(1.0f);
201
- __m512 quadrant_incremented = _mm512_add_ps(quadrant, one);
202
- quadrant = _mm512_mask_blend_ps(swap_mask, quadrant, quadrant_incremented);
203
-
204
- // Adjust for quadrant: result += quadrant * π/2
205
- __m512 pi_half = _mm512_set1_ps(1.5707963267948966f);
206
- results = _mm512_fmadd_ps(quadrant, pi_half, results);
199
+ __m512 results_f32x16 = _mm512_fmadd_ps(ratio_cubed_f32x16, polynomials_f32x16, ratio_f32x16);
200
+ __m512 quadrant_f32x16 = _mm512_setzero_ps();
201
+ __m512 neg_two_f32x16 = _mm512_set1_ps(-2.0f);
202
+ quadrant_f32x16 = _mm512_mask_blend_ps(xs_negative_mask, quadrant_f32x16, neg_two_f32x16);
203
+ __m512 one_f32x16 = _mm512_set1_ps(1.0f);
204
+ __m512 quadrant_incremented_f32x16 = _mm512_add_ps(quadrant_f32x16, one_f32x16);
205
+ quadrant_f32x16 = _mm512_mask_blend_ps(swap_mask, quadrant_f32x16, quadrant_incremented_f32x16);
206
+
207
+ // Adjust for quadrant_f32x16: result += quadrant_f32x16 * π/2
208
+ __m512 pi_half_f32x16 = _mm512_set1_ps(1.5707963267948966f);
209
+ results_f32x16 = _mm512_fmadd_ps(quadrant_f32x16, pi_half_f32x16, results_f32x16);
207
210
 
208
211
  // Transfer sign from x (XOR with sign bit of x_input)
209
- __m512 xs_sign_bits = _mm512_and_ps(xs_inputs, _mm512_set1_ps(-0.0f));
210
- results = _mm512_xor_ps(results, xs_sign_bits);
212
+ __m512 xs_sign_bits_f32x16 = _mm512_and_ps(xs_inputs, _mm512_set1_ps(-0.0f));
213
+ results_f32x16 = _mm512_xor_ps(results_f32x16, xs_sign_bits_f32x16);
211
214
 
212
215
  // Transfer sign from y (XOR with sign bit of y_input)
213
- __m512 ys_sign_bits = _mm512_and_ps(ys_inputs, _mm512_set1_ps(-0.0f));
214
- results = _mm512_xor_ps(results, ys_sign_bits);
216
+ __m512 ys_sign_bits_f32x16 = _mm512_and_ps(ys_inputs, _mm512_set1_ps(-0.0f));
217
+ results_f32x16 = _mm512_xor_ps(results_f32x16, ys_sign_bits_f32x16);
215
218
 
216
- return results;
219
+ return results_f32x16;
217
220
  }
218
221
 
219
222
  NK_PUBLIC void nk_each_sin_f32_skylake(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
220
223
  nk_size_t i = 0;
221
224
  for (; i + 16 <= n; i += 16) {
222
- __m512 angles = _mm512_loadu_ps(ins + i);
223
- __m512 results = nk_sin_f32x16_skylake_(angles);
224
- _mm512_storeu_ps(outs + i, results);
225
+ __m512 angles_f32x16 = _mm512_loadu_ps(ins + i);
226
+ __m512 results_f32x16 = nk_sin_f32x16_skylake_(angles_f32x16);
227
+ _mm512_storeu_ps(outs + i, results_f32x16);
225
228
  }
226
229
  if (i < n) {
227
230
  __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n - i);
228
- __m512 angles = _mm512_maskz_loadu_ps(mask, ins + i);
229
- __m512 results = nk_sin_f32x16_skylake_(angles);
230
- _mm512_mask_storeu_ps(outs + i, mask, results);
231
+ __m512 angles_f32x16 = _mm512_maskz_loadu_ps(mask, ins + i);
232
+ __m512 results_f32x16 = nk_sin_f32x16_skylake_(angles_f32x16);
233
+ _mm512_mask_storeu_ps(outs + i, mask, results_f32x16);
231
234
  }
232
235
  }
233
236
  NK_PUBLIC void nk_each_cos_f32_skylake(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
234
237
  nk_size_t i = 0;
235
238
  for (; i + 16 <= n; i += 16) {
236
- __m512 angles = _mm512_loadu_ps(ins + i);
237
- __m512 results = nk_cos_f32x16_skylake_(angles);
238
- _mm512_storeu_ps(outs + i, results);
239
+ __m512 angles_f32x16 = _mm512_loadu_ps(ins + i);
240
+ __m512 results_f32x16 = nk_cos_f32x16_skylake_(angles_f32x16);
241
+ _mm512_storeu_ps(outs + i, results_f32x16);
239
242
  }
240
243
  if (i < n) {
241
244
  __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n - i);
242
- __m512 angles = _mm512_maskz_loadu_ps(mask, ins + i);
243
- __m512 results = nk_cos_f32x16_skylake_(angles);
244
- _mm512_mask_storeu_ps(outs + i, mask, results);
245
+ __m512 angles_f32x16 = _mm512_maskz_loadu_ps(mask, ins + i);
246
+ __m512 results_f32x16 = nk_cos_f32x16_skylake_(angles_f32x16);
247
+ _mm512_mask_storeu_ps(outs + i, mask, results_f32x16);
245
248
  }
246
249
  }
247
250
  NK_PUBLIC void nk_each_atan_f32_skylake(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
248
251
  nk_size_t i = 0;
249
252
  for (; i + 16 <= n; i += 16) {
250
- __m512 angles = _mm512_loadu_ps(ins + i);
251
- __m512 results = nk_atan_f32x16_skylake_(angles);
252
- _mm512_storeu_ps(outs + i, results);
253
+ __m512 angles_f32x16 = _mm512_loadu_ps(ins + i);
254
+ __m512 results_f32x16 = nk_atan_f32x16_skylake_(angles_f32x16);
255
+ _mm512_storeu_ps(outs + i, results_f32x16);
253
256
  }
254
257
  if (i < n) {
255
258
  __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, n - i);
256
- __m512 angles = _mm512_maskz_loadu_ps(mask, ins + i);
257
- __m512 results = nk_atan_f32x16_skylake_(angles);
258
- _mm512_mask_storeu_ps(outs + i, mask, results);
259
+ __m512 angles_f32x16 = _mm512_maskz_loadu_ps(mask, ins + i);
260
+ __m512 results_f32x16 = nk_atan_f32x16_skylake_(angles_f32x16);
261
+ _mm512_mask_storeu_ps(outs + i, mask, results_f32x16);
259
262
  }
260
263
  }
261
264
 
262
265
  NK_INTERNAL __m512d nk_sin_f64x8_skylake_(__m512d const angles_radians) {
263
266
  // Constants for argument reduction
264
- __m512d const pi_high = _mm512_set1_pd(3.141592653589793116); // High-digits part of π
265
- __m512d const pi_low = _mm512_set1_pd(1.2246467991473532072e-16); // Low-digits part of π
266
- __m512d const pi_reciprocal = _mm512_set1_pd(0.31830988618379067154); // 1/π
267
+ __m512d const pi_high_f64x8 = _mm512_set1_pd(3.141592653589793116); // High-digits part of π
268
+ __m512d const pi_low_f64x8 = _mm512_set1_pd(1.2246467991473532072e-16); // Low-digits part of π
269
+ __m512d const pi_reciprocal_f64x8 = _mm512_set1_pd(0.31830988618379067154); // 1/π
267
270
 
268
271
  // Polynomial coefficients for sine/cosine approximation (minimax polynomial)
269
- __m512d const coeff_0 = _mm512_set1_pd(+0.00833333333333332974823815);
270
- __m512d const coeff_1 = _mm512_set1_pd(-0.000198412698412696162806809);
271
- __m512d const coeff_2 = _mm512_set1_pd(+2.75573192239198747630416e-06);
272
- __m512d const coeff_3 = _mm512_set1_pd(-2.50521083763502045810755e-08);
273
- __m512d const coeff_4 = _mm512_set1_pd(+1.60590430605664501629054e-10);
274
- __m512d const coeff_5 = _mm512_set1_pd(-7.64712219118158833288484e-13);
275
- __m512d const coeff_6 = _mm512_set1_pd(+2.81009972710863200091251e-15);
276
- __m512d const coeff_7 = _mm512_set1_pd(-7.97255955009037868891952e-18);
277
- __m512d const coeff_8 = _mm512_set1_pd(-0.166666666666666657414808);
278
-
279
- // Compute (rounded_quotients) = round(angle / π)
280
- __m512d const quotients = _mm512_mul_pd(angles_radians, pi_reciprocal);
281
- __m512d const rounded_quotients = _mm512_roundscale_pd(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
282
-
283
- // Reduce the angle to: angle - (rounded_quotients * π_high + rounded_quotients * π_low)
284
- __m512d angles = angles_radians;
285
- angles = _mm512_fnmadd_pd(rounded_quotients, pi_high, angles);
286
- angles = _mm512_fnmadd_pd(rounded_quotients, pi_low, angles);
287
-
288
- // If rounded_quotients is odd (bit 0 set), negate the angle
272
+ __m512d const coeff_0_f64x8 = _mm512_set1_pd(+0.00833333333333332974823815);
273
+ __m512d const coeff_1_f64x8 = _mm512_set1_pd(-0.000198412698412696162806809);
274
+ __m512d const coeff_2_f64x8 = _mm512_set1_pd(+2.75573192239198747630416e-06);
275
+ __m512d const coeff_3_f64x8 = _mm512_set1_pd(-2.50521083763502045810755e-08);
276
+ __m512d const coeff_4_f64x8 = _mm512_set1_pd(+1.60590430605664501629054e-10);
277
+ __m512d const coeff_5_f64x8 = _mm512_set1_pd(-7.64712219118158833288484e-13);
278
+ __m512d const coeff_6_f64x8 = _mm512_set1_pd(+2.81009972710863200091251e-15);
279
+ __m512d const coeff_7_f64x8 = _mm512_set1_pd(-7.97255955009037868891952e-18);
280
+ __m512d const coeff_8_f64x8 = _mm512_set1_pd(-0.166666666666666657414808);
281
+
282
+ // Compute (rounded_quotients_f64x8) = round(angle / π)
283
+ __m512d const quotients_f64x8 = _mm512_mul_pd(angles_radians, pi_reciprocal_f64x8);
284
+ __m512d const rounded_quotients_f64x8 = _mm512_roundscale_pd(quotients_f64x8,
285
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
286
+
287
+ // Reduce the angle to: angle - (rounded_quotients_f64x8 * π_high + rounded_quotients_f64x8 * π_low)
288
+ __m512d angles_f64x8 = angles_radians;
289
+ angles_f64x8 = _mm512_fnmadd_pd(rounded_quotients_f64x8, pi_high_f64x8, angles_f64x8);
290
+ angles_f64x8 = _mm512_fnmadd_pd(rounded_quotients_f64x8, pi_low_f64x8, angles_f64x8);
291
+
292
+ // If rounded_quotients_f64x8 is odd (bit 0 set), negate the angle
289
293
  // Use explicit rounding to match roundscale (MXCSR-independent)
290
294
  __mmask8 const sign_flip_mask = _mm256_test_epi32_mask(
291
- _mm512_cvt_roundpd_epi32(rounded_quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC),
295
+ _mm512_cvt_roundpd_epi32(rounded_quotients_f64x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC),
292
296
  _mm256_set1_epi32(1));
293
- angles = _mm512_mask_sub_pd(angles, sign_flip_mask, _mm512_setzero_pd(), angles);
297
+ angles_f64x8 = _mm512_mask_sub_pd(angles_f64x8, sign_flip_mask, _mm512_setzero_pd(), angles_f64x8);
294
298
 
295
- __m512d const angles_squared = _mm512_mul_pd(angles, angles);
296
- __m512d const angles_cubed = _mm512_mul_pd(angles, angles_squared);
297
- __m512d const angles_quadratic = _mm512_mul_pd(angles_squared, angles_squared);
298
- __m512d const angles_octic = _mm512_mul_pd(angles_quadratic, angles_quadratic);
299
+ __m512d const angles_squared_f64x8 = _mm512_mul_pd(angles_f64x8, angles_f64x8);
300
+ __m512d const angles_cubed_f64x8 = _mm512_mul_pd(angles_f64x8, angles_squared_f64x8);
301
+ __m512d const angles_quadratic_f64x8 = _mm512_mul_pd(angles_squared_f64x8, angles_squared_f64x8);
302
+ __m512d const angles_octic_f64x8 = _mm512_mul_pd(angles_quadratic_f64x8, angles_quadratic_f64x8);
299
303
 
300
304
  // Compute higher-degree polynomial terms
301
- __m512d const poly_67 = _mm512_fmadd_pd(angles_squared, coeff_7, coeff_6);
302
- __m512d const poly_45 = _mm512_fmadd_pd(angles_squared, coeff_5, coeff_4);
303
- __m512d const poly_4567 = _mm512_fmadd_pd(angles_quadratic, poly_67, poly_45);
305
+ __m512d const poly_67_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_7_f64x8, coeff_6_f64x8);
306
+ __m512d const poly_45_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_5_f64x8, coeff_4_f64x8);
307
+ __m512d const poly_4567_f64x8 = _mm512_fmadd_pd(angles_quadratic_f64x8, poly_67_f64x8, poly_45_f64x8);
304
308
 
305
309
  // Compute lower-degree polynomial terms
306
- __m512d const poly_23 = _mm512_fmadd_pd(angles_squared, coeff_3, coeff_2);
307
- __m512d const poly_01 = _mm512_fmadd_pd(angles_squared, coeff_1, coeff_0);
308
- __m512d const poly_0123 = _mm512_fmadd_pd(angles_quadratic, poly_23, poly_01);
310
+ __m512d const poly_23_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_3_f64x8, coeff_2_f64x8);
311
+ __m512d const poly_01_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_1_f64x8, coeff_0_f64x8);
312
+ __m512d const poly_0123_f64x8 = _mm512_fmadd_pd(angles_quadratic_f64x8, poly_23_f64x8, poly_01_f64x8);
309
313
 
310
314
  // Combine polynomial terms
311
- __m512d results = _mm512_fmadd_pd(angles_octic, poly_4567, poly_0123);
312
- results = _mm512_fmadd_pd(results, angles_squared, coeff_8);
313
- results = _mm512_fmadd_pd(results, angles_cubed, angles);
315
+ __m512d results_f64x8 = _mm512_fmadd_pd(angles_octic_f64x8, poly_4567_f64x8, poly_0123_f64x8);
316
+ results_f64x8 = _mm512_fmadd_pd(results_f64x8, angles_squared_f64x8, coeff_8_f64x8);
317
+ results_f64x8 = _mm512_fmadd_pd(results_f64x8, angles_cubed_f64x8, angles_f64x8);
314
318
 
315
319
  // Handle the special case of negative zero input
316
320
  __mmask8 const non_zero_mask = _mm512_cmpneq_pd_mask(angles_radians, _mm512_setzero_pd());
317
- results = _mm512_maskz_mov_pd(non_zero_mask, results);
318
- return results;
321
+ results_f64x8 = _mm512_maskz_mov_pd(non_zero_mask, results_f64x8);
322
+ return results_f64x8;
319
323
  }
320
324
 
321
325
  NK_INTERNAL __m512d nk_cos_f64x8_skylake_(__m512d const angles_radians) {
322
326
  // Constants for argument reduction
323
- __m512d const pi_high_half = _mm512_set1_pd(3.141592653589793116 * 0.5); // High-digits part of π
324
- __m512d const pi_low_half = _mm512_set1_pd(1.2246467991473532072e-16 * 0.5); // Low-digits part of π
325
- __m512d const pi_reciprocal = _mm512_set1_pd(0.31830988618379067154); // 1/π
327
+ __m512d const pi_high_half_f64x8 = _mm512_set1_pd(3.141592653589793116 * 0.5); // High-digits part of π
328
+ __m512d const pi_low_half_f64x8 = _mm512_set1_pd(1.2246467991473532072e-16 * 0.5); // Low-digits part of π
329
+ __m512d const pi_reciprocal_f64x8 = _mm512_set1_pd(0.31830988618379067154); // 1/π
326
330
 
327
331
  // Polynomial coefficients for sine/cosine approximation (minimax polynomial)
328
- __m512d const coeff_0 = _mm512_set1_pd(+0.00833333333333332974823815);
329
- __m512d const coeff_1 = _mm512_set1_pd(-0.000198412698412696162806809);
330
- __m512d const coeff_2 = _mm512_set1_pd(+2.75573192239198747630416e-06);
331
- __m512d const coeff_3 = _mm512_set1_pd(-2.50521083763502045810755e-08);
332
- __m512d const coeff_4 = _mm512_set1_pd(+1.60590430605664501629054e-10);
333
- __m512d const coeff_5 = _mm512_set1_pd(-7.64712219118158833288484e-13);
334
- __m512d const coeff_6 = _mm512_set1_pd(+2.81009972710863200091251e-15);
335
- __m512d const coeff_7 = _mm512_set1_pd(-7.97255955009037868891952e-18);
336
- __m512d const coeff_8 = _mm512_set1_pd(-0.166666666666666657414808);
337
-
338
- // Compute (rounded_quotients) = 2 * round(angle / π - 0.5) + 1
339
- // Use fmsub: a*b - c = angles * (1/π) - 0.5
340
- __m512d const quotients = _mm512_fmsub_pd(angles_radians, pi_reciprocal, _mm512_set1_pd(0.5));
341
- __m512d const rounded_quotients = _mm512_fmadd_pd( //
342
- _mm512_set1_pd(2), //
343
- _mm512_roundscale_pd(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), //
332
+ __m512d const coeff_0_f64x8 = _mm512_set1_pd(+0.00833333333333332974823815);
333
+ __m512d const coeff_1_f64x8 = _mm512_set1_pd(-0.000198412698412696162806809);
334
+ __m512d const coeff_2_f64x8 = _mm512_set1_pd(+2.75573192239198747630416e-06);
335
+ __m512d const coeff_3_f64x8 = _mm512_set1_pd(-2.50521083763502045810755e-08);
336
+ __m512d const coeff_4_f64x8 = _mm512_set1_pd(+1.60590430605664501629054e-10);
337
+ __m512d const coeff_5_f64x8 = _mm512_set1_pd(-7.64712219118158833288484e-13);
338
+ __m512d const coeff_6_f64x8 = _mm512_set1_pd(+2.81009972710863200091251e-15);
339
+ __m512d const coeff_7_f64x8 = _mm512_set1_pd(-7.97255955009037868891952e-18);
340
+ __m512d const coeff_8_f64x8 = _mm512_set1_pd(-0.166666666666666657414808);
341
+
342
+ // Compute (rounded_quotients_f64x8) = 2 * round(angle / π - 0.5) + 1
343
+ // Use fmsub: a*b - c = angles_f64x8 * (1/π) - 0.5
344
+ __m512d const quotients_f64x8 = _mm512_fmsub_pd(angles_radians, pi_reciprocal_f64x8, _mm512_set1_pd(0.5));
345
+ __m512d const rounded_quotients_f64x8 = _mm512_fmadd_pd( //
346
+ _mm512_set1_pd(2), //
347
+ _mm512_roundscale_pd(quotients_f64x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), //
344
348
  _mm512_set1_pd(1));
345
349
 
346
- // Reduce the angle to: angle - (rounded_quotients * π_high + rounded_quotients * π_low)
347
- __m512d angles = angles_radians;
348
- angles = _mm512_fnmadd_pd(rounded_quotients, pi_high_half, angles);
349
- angles = _mm512_fnmadd_pd(rounded_quotients, pi_low_half, angles);
350
+ // Reduce the angle to: angle - (rounded_quotients_f64x8 * π_high + rounded_quotients_f64x8 * π_low)
351
+ __m512d angles_f64x8 = angles_radians;
352
+ angles_f64x8 = _mm512_fnmadd_pd(rounded_quotients_f64x8, pi_high_half_f64x8, angles_f64x8);
353
+ angles_f64x8 = _mm512_fnmadd_pd(rounded_quotients_f64x8, pi_low_half_f64x8, angles_f64x8);
350
354
  // Use explicit rounding to match roundscale (MXCSR-independent)
351
355
  __mmask8 const sign_flip_mask = _mm256_testn_epi32_mask(
352
- _mm512_cvt_roundpd_epi32(rounded_quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC),
356
+ _mm512_cvt_roundpd_epi32(rounded_quotients_f64x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC),
353
357
  _mm256_set1_epi32(2));
354
- angles = _mm512_mask_sub_pd(angles, sign_flip_mask, _mm512_setzero_pd(), angles);
355
- __m512d const angles_squared = _mm512_mul_pd(angles, angles);
356
- __m512d const angles_cubed = _mm512_mul_pd(angles, angles_squared);
357
- __m512d const angles_quadratic = _mm512_mul_pd(angles_squared, angles_squared);
358
- __m512d const angles_octic = _mm512_mul_pd(angles_quadratic, angles_quadratic);
358
+ angles_f64x8 = _mm512_mask_sub_pd(angles_f64x8, sign_flip_mask, _mm512_setzero_pd(), angles_f64x8);
359
+ __m512d const angles_squared_f64x8 = _mm512_mul_pd(angles_f64x8, angles_f64x8);
360
+ __m512d const angles_cubed_f64x8 = _mm512_mul_pd(angles_f64x8, angles_squared_f64x8);
361
+ __m512d const angles_quadratic_f64x8 = _mm512_mul_pd(angles_squared_f64x8, angles_squared_f64x8);
362
+ __m512d const angles_octic_f64x8 = _mm512_mul_pd(angles_quadratic_f64x8, angles_quadratic_f64x8);
359
363
 
360
364
  // Compute higher-degree polynomial terms
361
- __m512d const poly_67 = _mm512_fmadd_pd(angles_squared, coeff_7, coeff_6);
362
- __m512d const poly_45 = _mm512_fmadd_pd(angles_squared, coeff_5, coeff_4);
363
- __m512d const poly_4567 = _mm512_fmadd_pd(angles_quadratic, poly_67, poly_45);
365
+ __m512d const poly_67_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_7_f64x8, coeff_6_f64x8);
366
+ __m512d const poly_45_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_5_f64x8, coeff_4_f64x8);
367
+ __m512d const poly_4567_f64x8 = _mm512_fmadd_pd(angles_quadratic_f64x8, poly_67_f64x8, poly_45_f64x8);
364
368
 
365
369
  // Compute lower-degree polynomial terms
366
- __m512d const poly_23 = _mm512_fmadd_pd(angles_squared, coeff_3, coeff_2);
367
- __m512d const poly_01 = _mm512_fmadd_pd(angles_squared, coeff_1, coeff_0);
368
- __m512d const poly_0123 = _mm512_fmadd_pd(angles_quadratic, poly_23, poly_01);
370
+ __m512d const poly_23_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_3_f64x8, coeff_2_f64x8);
371
+ __m512d const poly_01_f64x8 = _mm512_fmadd_pd(angles_squared_f64x8, coeff_1_f64x8, coeff_0_f64x8);
372
+ __m512d const poly_0123_f64x8 = _mm512_fmadd_pd(angles_quadratic_f64x8, poly_23_f64x8, poly_01_f64x8);
369
373
 
370
374
  // Combine polynomial terms
371
- __m512d results = _mm512_fmadd_pd(angles_octic, poly_4567, poly_0123);
372
- results = _mm512_fmadd_pd(results, angles_squared, coeff_8);
373
- results = _mm512_fmadd_pd(results, angles_cubed, angles);
374
- return results;
375
+ __m512d results_f64x8 = _mm512_fmadd_pd(angles_octic_f64x8, poly_4567_f64x8, poly_0123_f64x8);
376
+ results_f64x8 = _mm512_fmadd_pd(results_f64x8, angles_squared_f64x8, coeff_8_f64x8);
377
+ results_f64x8 = _mm512_fmadd_pd(results_f64x8, angles_cubed_f64x8, angles_f64x8);
378
+ return results_f64x8;
375
379
  }
376
380
 
377
381
  NK_INTERNAL __m512d nk_atan_f64x8_skylake_(__m512d const inputs) {
378
382
  // Polynomial coefficients for atan approximation
379
- __m512d const coeff_19 = _mm512_set1_pd(-1.88796008463073496563746e-05);
380
- __m512d const coeff_18 = _mm512_set1_pd(+0.000209850076645816976906797);
381
- __m512d const coeff_17 = _mm512_set1_pd(-0.00110611831486672482563471);
382
- __m512d const coeff_16 = _mm512_set1_pd(+0.00370026744188713119232403);
383
- __m512d const coeff_15 = _mm512_set1_pd(-0.00889896195887655491740809);
384
- __m512d const coeff_14 = _mm512_set1_pd(+0.016599329773529201970117);
385
- __m512d const coeff_13 = _mm512_set1_pd(-0.0254517624932312641616861);
386
- __m512d const coeff_12 = _mm512_set1_pd(+0.0337852580001353069993897);
387
- __m512d const coeff_11 = _mm512_set1_pd(-0.0407629191276836500001934);
388
- __m512d const coeff_10 = _mm512_set1_pd(+0.0466667150077840625632675);
389
- __m512d const coeff_9 = _mm512_set1_pd(-0.0523674852303482457616113);
390
- __m512d const coeff_8 = _mm512_set1_pd(+0.0587666392926673580854313);
391
- __m512d const coeff_7 = _mm512_set1_pd(-0.0666573579361080525984562);
392
- __m512d const coeff_6 = _mm512_set1_pd(+0.0769219538311769618355029);
393
- __m512d const coeff_5 = _mm512_set1_pd(-0.090908995008245008229153);
394
- __m512d const coeff_4 = _mm512_set1_pd(+0.111111105648261418443745);
395
- __m512d const coeff_3 = _mm512_set1_pd(-0.14285714266771329383765);
396
- __m512d const coeff_2 = _mm512_set1_pd(+0.199999999996591265594148);
397
- __m512d const coeff_1 = _mm512_set1_pd(-0.333333333333311110369124);
383
+ __m512d const coeff_19_f64x8 = _mm512_set1_pd(-1.88796008463073496563746e-05);
384
+ __m512d const coeff_18_f64x8 = _mm512_set1_pd(+0.000209850076645816976906797);
385
+ __m512d const coeff_17_f64x8 = _mm512_set1_pd(-0.00110611831486672482563471);
386
+ __m512d const coeff_16_f64x8 = _mm512_set1_pd(+0.00370026744188713119232403);
387
+ __m512d const coeff_15_f64x8 = _mm512_set1_pd(-0.00889896195887655491740809);
388
+ __m512d const coeff_14_f64x8 = _mm512_set1_pd(+0.016599329773529201970117);
389
+ __m512d const coeff_13_f64x8 = _mm512_set1_pd(-0.0254517624932312641616861);
390
+ __m512d const coeff_12_f64x8 = _mm512_set1_pd(+0.0337852580001353069993897);
391
+ __m512d const coeff_11_f64x8 = _mm512_set1_pd(-0.0407629191276836500001934);
392
+ __m512d const coeff_10_f64x8 = _mm512_set1_pd(+0.0466667150077840625632675);
393
+ __m512d const coeff_9_f64x8 = _mm512_set1_pd(-0.0523674852303482457616113);
394
+ __m512d const coeff_8_f64x8 = _mm512_set1_pd(+0.0587666392926673580854313);
395
+ __m512d const coeff_7_f64x8 = _mm512_set1_pd(-0.0666573579361080525984562);
396
+ __m512d const coeff_6_f64x8 = _mm512_set1_pd(+0.0769219538311769618355029);
397
+ __m512d const coeff_5_f64x8 = _mm512_set1_pd(-0.090908995008245008229153);
398
+ __m512d const coeff_4_f64x8 = _mm512_set1_pd(+0.111111105648261418443745);
399
+ __m512d const coeff_3_f64x8 = _mm512_set1_pd(-0.14285714266771329383765);
400
+ __m512d const coeff_2_f64x8 = _mm512_set1_pd(+0.199999999996591265594148);
401
+ __m512d const coeff_1_f64x8 = _mm512_set1_pd(-0.333333333333311110369124);
398
402
 
399
403
  // Quadrant adjustments
400
404
  __mmask8 negative_mask = _mm512_cmp_pd_mask(inputs, _mm512_setzero_pd(), _CMP_LT_OS);
401
- __m512d values = _mm512_abs_pd(inputs);
402
- __mmask8 reciprocal_mask = _mm512_cmp_pd_mask(values, _mm512_set1_pd(1.0), _CMP_GT_OS);
403
- values = _mm512_mask_div_pd(values, reciprocal_mask, _mm512_set1_pd(1.0), values);
404
- __m512d const values_squared = _mm512_mul_pd(values, values);
405
- __m512d const values_cubed = _mm512_mul_pd(values, values_squared);
405
+ __m512d values_f64x8 = _mm512_abs_pd(inputs);
406
+ __mmask8 reciprocal_mask = _mm512_cmp_pd_mask(values_f64x8, _mm512_set1_pd(1.0), _CMP_GT_OS);
407
+ values_f64x8 = _mm512_mask_div_pd(values_f64x8, reciprocal_mask, _mm512_set1_pd(1.0), values_f64x8);
408
+ __m512d const values_squared_f64x8 = _mm512_mul_pd(values_f64x8, values_f64x8);
409
+ __m512d const values_cubed_f64x8 = _mm512_mul_pd(values_f64x8, values_squared_f64x8);
406
410
 
407
411
  // Polynomial evaluation (argument reduction and approximation)
408
- __m512d polynomials = coeff_19;
409
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_18);
410
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_17);
411
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_16);
412
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_15);
413
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_14);
414
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_13);
415
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_12);
416
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_11);
417
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_10);
418
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_9);
419
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_8);
420
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_7);
421
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_6);
422
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_5);
423
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_4);
424
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_3);
425
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_2);
426
- polynomials = _mm512_fmadd_pd(polynomials, values_squared, coeff_1);
412
+ __m512d polynomials_f64x8 = coeff_19_f64x8;
413
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_18_f64x8);
414
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_17_f64x8);
415
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_16_f64x8);
416
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_15_f64x8);
417
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_14_f64x8);
418
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_13_f64x8);
419
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_12_f64x8);
420
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_11_f64x8);
421
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_10_f64x8);
422
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_9_f64x8);
423
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_8_f64x8);
424
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_7_f64x8);
425
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_6_f64x8);
426
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_5_f64x8);
427
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_4_f64x8);
428
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_3_f64x8);
429
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_2_f64x8);
430
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, values_squared_f64x8, coeff_1_f64x8);
427
431
 
428
432
  // Compute atan approximation
429
- __m512d result = _mm512_fmadd_pd(values_cubed, polynomials, values);
430
- result = _mm512_mask_sub_pd(result, reciprocal_mask, _mm512_set1_pd(1.5707963267948966), result);
431
- result = _mm512_mask_sub_pd(result, negative_mask, _mm512_setzero_pd(), result);
432
- return result;
433
+ __m512d result_f64x8 = _mm512_fmadd_pd(values_cubed_f64x8, polynomials_f64x8, values_f64x8);
434
+ result_f64x8 = _mm512_mask_sub_pd(result_f64x8, reciprocal_mask, _mm512_set1_pd(1.5707963267948966), result_f64x8);
435
+ result_f64x8 = _mm512_mask_sub_pd(result_f64x8, negative_mask, _mm512_setzero_pd(), result_f64x8);
436
+ return result_f64x8;
433
437
  }
434
438
 
435
439
  /**
@@ -438,126 +442,126 @@ NK_INTERNAL __m512d nk_atan_f64x8_skylake_(__m512d const inputs) {
438
442
  */
439
443
  NK_INTERNAL __m512d nk_atan2_f64x8_skylake_(__m512d const ys_inputs, __m512d const xs_inputs) {
440
444
  // Polynomial coefficients for atan approximation (higher precision than f32)
441
- __m512d const coeff_19 = _mm512_set1_pd(-1.88796008463073496563746e-05);
442
- __m512d const coeff_18 = _mm512_set1_pd(+0.000209850076645816976906797);
443
- __m512d const coeff_17 = _mm512_set1_pd(-0.00110611831486672482563471);
444
- __m512d const coeff_16 = _mm512_set1_pd(+0.00370026744188713119232403);
445
- __m512d const coeff_15 = _mm512_set1_pd(-0.00889896195887655491740809);
446
- __m512d const coeff_14 = _mm512_set1_pd(+0.016599329773529201970117);
447
- __m512d const coeff_13 = _mm512_set1_pd(-0.0254517624932312641616861);
448
- __m512d const coeff_12 = _mm512_set1_pd(+0.0337852580001353069993897);
449
- __m512d const coeff_11 = _mm512_set1_pd(-0.0407629191276836500001934);
450
- __m512d const coeff_10 = _mm512_set1_pd(+0.0466667150077840625632675);
451
- __m512d const coeff_9 = _mm512_set1_pd(-0.0523674852303482457616113);
452
- __m512d const coeff_8 = _mm512_set1_pd(+0.0587666392926673580854313);
453
- __m512d const coeff_7 = _mm512_set1_pd(-0.0666573579361080525984562);
454
- __m512d const coeff_6 = _mm512_set1_pd(+0.0769219538311769618355029);
455
- __m512d const coeff_5 = _mm512_set1_pd(-0.090908995008245008229153);
456
- __m512d const coeff_4 = _mm512_set1_pd(+0.111111105648261418443745);
457
- __m512d const coeff_3 = _mm512_set1_pd(-0.14285714266771329383765);
458
- __m512d const coeff_2 = _mm512_set1_pd(+0.199999999996591265594148);
459
- __m512d const coeff_1 = _mm512_set1_pd(-0.333333333333311110369124);
445
+ __m512d const coeff_19_f64x8 = _mm512_set1_pd(-1.88796008463073496563746e-05);
446
+ __m512d const coeff_18_f64x8 = _mm512_set1_pd(+0.000209850076645816976906797);
447
+ __m512d const coeff_17_f64x8 = _mm512_set1_pd(-0.00110611831486672482563471);
448
+ __m512d const coeff_16_f64x8 = _mm512_set1_pd(+0.00370026744188713119232403);
449
+ __m512d const coeff_15_f64x8 = _mm512_set1_pd(-0.00889896195887655491740809);
450
+ __m512d const coeff_14_f64x8 = _mm512_set1_pd(+0.016599329773529201970117);
451
+ __m512d const coeff_13_f64x8 = _mm512_set1_pd(-0.0254517624932312641616861);
452
+ __m512d const coeff_12_f64x8 = _mm512_set1_pd(+0.0337852580001353069993897);
453
+ __m512d const coeff_11_f64x8 = _mm512_set1_pd(-0.0407629191276836500001934);
454
+ __m512d const coeff_10_f64x8 = _mm512_set1_pd(+0.0466667150077840625632675);
455
+ __m512d const coeff_9_f64x8 = _mm512_set1_pd(-0.0523674852303482457616113);
456
+ __m512d const coeff_8_f64x8 = _mm512_set1_pd(+0.0587666392926673580854313);
457
+ __m512d const coeff_7_f64x8 = _mm512_set1_pd(-0.0666573579361080525984562);
458
+ __m512d const coeff_6_f64x8 = _mm512_set1_pd(+0.0769219538311769618355029);
459
+ __m512d const coeff_5_f64x8 = _mm512_set1_pd(-0.090908995008245008229153);
460
+ __m512d const coeff_4_f64x8 = _mm512_set1_pd(+0.111111105648261418443745);
461
+ __m512d const coeff_3_f64x8 = _mm512_set1_pd(-0.14285714266771329383765);
462
+ __m512d const coeff_2_f64x8 = _mm512_set1_pd(+0.199999999996591265594148);
463
+ __m512d const coeff_1_f64x8 = _mm512_set1_pd(-0.333333333333311110369124);
460
464
 
461
465
  // Quadrant adjustments normalizing to absolute values of x and y
462
466
  __mmask8 const xs_negative_mask = _mm512_cmp_pd_mask(xs_inputs, _mm512_setzero_pd(), _CMP_LT_OS);
463
- __m512d xs = _mm512_abs_pd(xs_inputs);
464
- __m512d ys = _mm512_abs_pd(ys_inputs);
467
+ __m512d xs_f64x8 = _mm512_abs_pd(xs_inputs);
468
+ __m512d ys_f64x8 = _mm512_abs_pd(ys_inputs);
465
469
  // Ensure proper fraction where the numerator is smaller than the denominator
466
- __mmask8 const swap_mask = _mm512_cmp_pd_mask(ys, xs, _CMP_GT_OS);
467
- __m512d temps = xs;
468
- xs = _mm512_mask_blend_pd(swap_mask, xs, ys);
469
- ys = _mm512_mask_sub_pd(ys, swap_mask, _mm512_setzero_pd(), temps);
470
+ __mmask8 const swap_mask = _mm512_cmp_pd_mask(ys_f64x8, xs_f64x8, _CMP_GT_OS);
471
+ __m512d temps_f64x8 = xs_f64x8;
472
+ xs_f64x8 = _mm512_mask_blend_pd(swap_mask, xs_f64x8, ys_f64x8);
473
+ ys_f64x8 = _mm512_mask_sub_pd(ys_f64x8, swap_mask, _mm512_setzero_pd(), temps_f64x8);
470
474
 
471
- // Compute ratio and ratio²
472
- __m512d const ratio = _mm512_div_pd(ys, xs);
473
- __m512d const ratio_squared = _mm512_mul_pd(ratio, ratio);
474
- __m512d const ratio_cubed = _mm512_mul_pd(ratio, ratio_squared);
475
+ // Compute ratio_f64x8 and ratio²
476
+ __m512d const ratio_f64x8 = _mm512_div_pd(ys_f64x8, xs_f64x8);
477
+ __m512d const ratio_squared_f64x8 = _mm512_mul_pd(ratio_f64x8, ratio_f64x8);
478
+ __m512d const ratio_cubed_f64x8 = _mm512_mul_pd(ratio_f64x8, ratio_squared_f64x8);
475
479
 
476
480
  // Polynomial evaluation
477
- __m512d polynomials = coeff_19;
478
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_18);
479
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_17);
480
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_16);
481
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_15);
482
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_14);
483
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_13);
484
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_12);
485
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_11);
486
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_10);
487
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_9);
488
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_8);
489
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_7);
490
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_6);
491
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_5);
492
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_4);
493
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_3);
494
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_2);
495
- polynomials = _mm512_fmadd_pd(polynomials, ratio_squared, coeff_1);
496
-
497
- // Compute the result with quadrant adjustments
498
- __m512d results = _mm512_fmadd_pd(ratio_cubed, polynomials, ratio);
499
-
500
- // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
481
+ __m512d polynomials_f64x8 = coeff_19_f64x8;
482
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_18_f64x8);
483
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_17_f64x8);
484
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_16_f64x8);
485
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_15_f64x8);
486
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_14_f64x8);
487
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_13_f64x8);
488
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_12_f64x8);
489
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_11_f64x8);
490
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_10_f64x8);
491
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_9_f64x8);
492
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_8_f64x8);
493
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_7_f64x8);
494
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_6_f64x8);
495
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_5_f64x8);
496
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_4_f64x8);
497
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_3_f64x8);
498
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_2_f64x8);
499
+ polynomials_f64x8 = _mm512_fmadd_pd(polynomials_f64x8, ratio_squared_f64x8, coeff_1_f64x8);
500
+
501
+ // Compute the result with quadrant_f64x8 adjustments
502
+ __m512d results_f64x8 = _mm512_fmadd_pd(ratio_cubed_f64x8, polynomials_f64x8, ratio_f64x8);
503
+
504
+ // Compute quadrant_f64x8 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
501
505
  // -2 for x<0 && !swap, -1 for x<0 && swap
502
- __m512d quadrant = _mm512_setzero_pd();
503
- quadrant = _mm512_mask_blend_pd(xs_negative_mask, quadrant, _mm512_set1_pd(-2.0));
504
- __m512d quadrant_incremented = _mm512_add_pd(quadrant, _mm512_set1_pd(1.0));
505
- quadrant = _mm512_mask_blend_pd(swap_mask, quadrant, quadrant_incremented);
506
+ __m512d quadrant_f64x8 = _mm512_setzero_pd();
507
+ quadrant_f64x8 = _mm512_mask_blend_pd(xs_negative_mask, quadrant_f64x8, _mm512_set1_pd(-2.0));
508
+ __m512d quadrant_incremented_f64x8 = _mm512_add_pd(quadrant_f64x8, _mm512_set1_pd(1.0));
509
+ quadrant_f64x8 = _mm512_mask_blend_pd(swap_mask, quadrant_f64x8, quadrant_incremented_f64x8);
506
510
 
507
- // Adjust for quadrant: result += quadrant * π/2
508
- results = _mm512_fmadd_pd(quadrant, _mm512_set1_pd(1.5707963267948966), results);
511
+ // Adjust for quadrant_f64x8: result += quadrant_f64x8 * π/2
512
+ results_f64x8 = _mm512_fmadd_pd(quadrant_f64x8, _mm512_set1_pd(1.5707963267948966), results_f64x8);
509
513
 
510
514
  // Transfer sign from x (XOR with sign bit of x_input)
511
- __m512d xs_sign = _mm512_and_pd(xs_inputs, _mm512_set1_pd(-0.0));
512
- results = _mm512_xor_pd(results, xs_sign);
515
+ __m512d xs_sign_f64x8 = _mm512_and_pd(xs_inputs, _mm512_set1_pd(-0.0));
516
+ results_f64x8 = _mm512_xor_pd(results_f64x8, xs_sign_f64x8);
513
517
 
514
518
  // Transfer sign from y (XOR with sign bit of y_input)
515
- __m512d ys_sign = _mm512_and_pd(ys_inputs, _mm512_set1_pd(-0.0));
516
- results = _mm512_xor_pd(results, ys_sign);
519
+ __m512d ys_sign_f64x8 = _mm512_and_pd(ys_inputs, _mm512_set1_pd(-0.0));
520
+ results_f64x8 = _mm512_xor_pd(results_f64x8, ys_sign_f64x8);
517
521
 
518
- return results;
522
+ return results_f64x8;
519
523
  }
520
524
 
521
525
  NK_PUBLIC void nk_each_sin_f64_skylake(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
522
526
  nk_size_t i = 0;
523
527
  for (; i + 8 <= n; i += 8) {
524
- __m512d angles = _mm512_loadu_pd(ins + i);
525
- __m512d results = nk_sin_f64x8_skylake_(angles);
526
- _mm512_storeu_pd(outs + i, results);
528
+ __m512d angles_f64x8 = _mm512_loadu_pd(ins + i);
529
+ __m512d results_f64x8 = nk_sin_f64x8_skylake_(angles_f64x8);
530
+ _mm512_storeu_pd(outs + i, results_f64x8);
527
531
  }
528
532
  if (i < n) {
529
533
  __mmask8 mask = (__mmask8)_bzhi_u32(0xFFFF, n - i);
530
- __m512d angles = _mm512_maskz_loadu_pd(mask, ins + i);
531
- __m512d results = nk_sin_f64x8_skylake_(angles);
532
- _mm512_mask_storeu_pd(outs + i, mask, results);
534
+ __m512d angles_f64x8 = _mm512_maskz_loadu_pd(mask, ins + i);
535
+ __m512d results_f64x8 = nk_sin_f64x8_skylake_(angles_f64x8);
536
+ _mm512_mask_storeu_pd(outs + i, mask, results_f64x8);
533
537
  }
534
538
  }
535
539
  NK_PUBLIC void nk_each_cos_f64_skylake(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
536
540
  nk_size_t i = 0;
537
541
  for (; i + 8 <= n; i += 8) {
538
- __m512d angles = _mm512_loadu_pd(ins + i);
539
- __m512d results = nk_cos_f64x8_skylake_(angles);
540
- _mm512_storeu_pd(outs + i, results);
542
+ __m512d angles_f64x8 = _mm512_loadu_pd(ins + i);
543
+ __m512d results_f64x8 = nk_cos_f64x8_skylake_(angles_f64x8);
544
+ _mm512_storeu_pd(outs + i, results_f64x8);
541
545
  }
542
546
  if (i < n) {
543
547
  __mmask8 mask = (__mmask8)_bzhi_u32(0xFFFF, n - i);
544
- __m512d angles = _mm512_maskz_loadu_pd(mask, ins + i);
545
- __m512d results = nk_cos_f64x8_skylake_(angles);
546
- _mm512_mask_storeu_pd(outs + i, mask, results);
548
+ __m512d angles_f64x8 = _mm512_maskz_loadu_pd(mask, ins + i);
549
+ __m512d results_f64x8 = nk_cos_f64x8_skylake_(angles_f64x8);
550
+ _mm512_mask_storeu_pd(outs + i, mask, results_f64x8);
547
551
  }
548
552
  }
549
553
  NK_PUBLIC void nk_each_atan_f64_skylake(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
550
554
  nk_size_t i = 0;
551
555
  for (; i + 8 <= n; i += 8) {
552
- __m512d angles = _mm512_loadu_pd(ins + i);
553
- __m512d results = nk_atan_f64x8_skylake_(angles);
554
- _mm512_storeu_pd(outs + i, results);
556
+ __m512d angles_f64x8 = _mm512_loadu_pd(ins + i);
557
+ __m512d results_f64x8 = nk_atan_f64x8_skylake_(angles_f64x8);
558
+ _mm512_storeu_pd(outs + i, results_f64x8);
555
559
  }
556
560
  if (i < n) {
557
561
  __mmask8 mask = (__mmask8)_bzhi_u32(0xFFFF, n - i);
558
- __m512d angles = _mm512_maskz_loadu_pd(mask, ins + i);
559
- __m512d results = nk_atan_f64x8_skylake_(angles);
560
- _mm512_mask_storeu_pd(outs + i, mask, results);
562
+ __m512d angles_f64x8 = _mm512_maskz_loadu_pd(mask, ins + i);
563
+ __m512d results_f64x8 = nk_atan_f64x8_skylake_(angles_f64x8);
564
+ _mm512_mask_storeu_pd(outs + i, mask, results_f64x8);
561
565
  }
562
566
  }
563
567
 
@@ -570,8 +574,8 @@ NK_PUBLIC void nk_each_atan_f64_skylake(nk_f64_t const *ins, nk_size_t n, nk_f64
570
574
  NK_INTERNAL __m256i nk_sin_f16x16_skylake_(__m256i angles_f16x16) {
571
575
  __m512 angles_f32x16 = _mm512_cvtph_ps(angles_f16x16);
572
576
  // Cody-Waite range reduction constants
573
- __m512 pi_hi_f32x16 = _mm512_set1_ps(3.1415927f);
574
- __m512 pi_lo_f32x16 = _mm512_set1_ps(-8.742278e-8f);
577
+ __m512 pi_high_f32x16 = _mm512_set1_ps(3.1415927f);
578
+ __m512 pi_low_f32x16 = _mm512_set1_ps(-8.742278e-8f);
575
579
  __m512 pi_recip_f32x16 = _mm512_set1_ps(0.31830988618f);
576
580
  __m512 c3_f32x16 = _mm512_set1_ps(-1.6666666641e-1f);
577
581
  __m512 c5_f32x16 = _mm512_set1_ps(8.3333293855e-3f);
@@ -581,8 +585,8 @@ NK_INTERNAL __m256i nk_sin_f16x16_skylake_(__m256i angles_f16x16) {
581
585
  // Use explicit rounding to match roundscale (MXCSR-independent)
582
586
  __m512i multiple_i32x16 = _mm512_cvt_roundps_epi32(rounded_f32x16, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
583
587
 
584
- angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_hi_f32x16, angles_f32x16);
585
- angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_lo_f32x16, angles_f32x16);
588
+ angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_high_f32x16, angles_f32x16);
589
+ angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_low_f32x16, angles_f32x16);
586
590
 
587
591
  __m512 x2_f32x16 = _mm512_mul_ps(angles_f32x16, angles_f32x16);
588
592
  __m512 poly_f32x16 = _mm512_fmadd_ps(c5_f32x16, x2_f32x16, c3_f32x16);
@@ -601,8 +605,8 @@ NK_INTERNAL __m256i nk_sin_f16x16_skylake_(__m256i angles_f16x16) {
601
605
  */
602
606
  NK_INTERNAL __m256i nk_cos_f16x16_skylake_(__m256i angles_f16x16) {
603
607
  __m512 angles_f32x16 = _mm512_cvtph_ps(angles_f16x16);
604
- __m512 pi_hi_f32x16 = _mm512_set1_ps(3.1415927f);
605
- __m512 pi_lo_f32x16 = _mm512_set1_ps(-8.742278e-8f);
608
+ __m512 pi_high_f32x16 = _mm512_set1_ps(3.1415927f);
609
+ __m512 pi_low_f32x16 = _mm512_set1_ps(-8.742278e-8f);
606
610
  __m512 pi_half_f32x16 = _mm512_set1_ps(1.5707963268f);
607
611
  __m512 pi_recip_f32x16 = _mm512_set1_ps(0.31830988618f);
608
612
  __m512 half_f32x16 = _mm512_set1_ps(0.5f);
@@ -614,9 +618,9 @@ NK_INTERNAL __m256i nk_cos_f16x16_skylake_(__m256i angles_f16x16) {
614
618
  // Use explicit rounding to match roundscale (MXCSR-independent)
615
619
  __m512i multiple_i32x16 = _mm512_cvt_roundps_epi32(rounded_f32x16, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
616
620
 
617
- __m512 shift_f32x16 = _mm512_fmadd_ps(rounded_f32x16, pi_hi_f32x16, pi_half_f32x16);
621
+ __m512 shift_f32x16 = _mm512_fmadd_ps(rounded_f32x16, pi_high_f32x16, pi_half_f32x16);
618
622
  angles_f32x16 = _mm512_sub_ps(angles_f32x16, shift_f32x16);
619
- angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_lo_f32x16, angles_f32x16);
623
+ angles_f32x16 = _mm512_fnmadd_ps(rounded_f32x16, pi_low_f32x16, angles_f32x16);
620
624
 
621
625
  __m512 x2_f32x16 = _mm512_mul_ps(angles_f32x16, angles_f32x16);
622
626
  __m512 poly_f32x16 = _mm512_fmadd_ps(c5_f32x16, x2_f32x16, c3_f32x16);