numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -9,12 +9,12 @@
9
9
  *
10
10
  * @section haswell_trig_instructions Key AVX2 Trigonometry Instructions
11
11
  *
12
- * Intrinsic Instruction Latency Throughput Ports
13
- * _mm256_fmadd_ps/pd VFMADD (YMM, YMM, YMM) 5cy 0.5/cy p01
14
- * _mm256_mul_ps/pd VMULPS/PD (YMM, YMM, YMM) 5cy 0.5/cy p01
15
- * _mm256_blendv_ps/pd VBLENDVPS/PD (YMM, YMM, YMM) 2cy 1/cy p015
16
- * _mm256_round_ps/pd VROUNDPS/PD (YMM, YMM, I8) 6cy 1/cy p01
17
- * _mm256_div_ps VDIVPS (YMM, YMM, YMM) 13cy 5/cy p0
12
+ * Intrinsic Instruction Haswell Genoa
13
+ * _mm256_fmadd_ps/pd VFMADD (YMM, YMM, YMM) 5cy @ p01 4cy @ p01
14
+ * _mm256_mul_ps/pd VMULPS/PD (YMM, YMM, YMM) 5cy @ p01 3cy @ p01
15
+ * _mm256_blendv_ps/pd VBLENDVPS/PD (YMM, YMM, YMM) 2cy @ p015 1cy @ p01
16
+ * _mm256_round_ps/pd VROUNDPS/PD (YMM, YMM, I8) 6cy @ p01 3cy @ p23
17
+ * _mm256_div_ps VDIVPS (YMM, YMM, YMM) 13cy @ p0 11cy @ p01
18
18
  *
19
19
  * Polynomial evaluation uses Horner's method with FMA for sin/cos/atan approximation. For large
20
20
  * arrays, out-of-order execution across loop iterations hides FMA latency better than Estrin's
@@ -46,501 +46,502 @@ extern "C" {
46
46
 
47
47
  NK_INTERNAL __m256 nk_sin_f32x8_haswell_(__m256 const angles_radians) {
48
48
  // Cody-Waite constants for argument reduction
49
- __m256 const pi_hi_f32x8 = _mm256_set1_ps(3.1415927f);
50
- __m256 const pi_lo_f32x8 = _mm256_set1_ps(-8.742278e-8f);
51
- __m256 const pi_reciprocal = _mm256_set1_ps(0.31830988618379067154f); // 1/π
49
+ __m256 const pi_high_f32x8 = _mm256_set1_ps(3.1415927f);
50
+ __m256 const pi_low_f32x8 = _mm256_set1_ps(-8.742278e-8f);
51
+ __m256 const pi_reciprocal_f32x8 = _mm256_set1_ps(0.31830988618379067154f); // 1/π
52
52
  // Degree-9 minimax coefficients
53
- __m256 const coeff_9 = _mm256_set1_ps(+2.7557319224e-6f);
54
- __m256 const coeff_7 = _mm256_set1_ps(-1.9841269841e-4f);
55
- __m256 const coeff_5 = _mm256_set1_ps(+8.3333293855e-3f);
56
- __m256 const coeff_3 = _mm256_set1_ps(-1.6666666641e-1f);
53
+ __m256 const coeff_9_f32x8 = _mm256_set1_ps(+2.7557319224e-6f);
54
+ __m256 const coeff_7_f32x8 = _mm256_set1_ps(-1.9841269841e-4f);
55
+ __m256 const coeff_5_f32x8 = _mm256_set1_ps(+8.3333293855e-3f);
56
+ __m256 const coeff_3_f32x8 = _mm256_set1_ps(-1.6666666641e-1f);
57
57
 
58
- // Compute (multiples_of_pi) = round(angle / π)
59
- __m256 quotients = _mm256_mul_ps(angles_radians, pi_reciprocal);
60
- __m256 rounded_quotients = _mm256_round_ps(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
61
- // Use truncation (MXCSR-independent) since rounded_quotients is already integer-valued
62
- __m256i multiples_of_pi = _mm256_cvttps_epi32(rounded_quotients);
58
+ // Compute (multiples_of_pi_i32x8) = round(angle / π)
59
+ __m256 quotients_f32x8 = _mm256_mul_ps(angles_radians, pi_reciprocal_f32x8);
60
+ __m256 rounded_quotients_f32x8 = _mm256_round_ps(quotients_f32x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
61
+ // Use truncation (MXCSR-independent) since rounded_quotients_f32x8 is already integer-valued
62
+ __m256i multiples_of_pi_i32x8 = _mm256_cvttps_epi32(rounded_quotients_f32x8);
63
63
 
64
64
  // Cody-Waite range reduction
65
- __m256 angles = _mm256_fnmadd_ps(rounded_quotients, pi_hi_f32x8, angles_radians);
66
- angles = _mm256_fnmadd_ps(rounded_quotients, pi_lo_f32x8, angles);
67
- __m256 const angles_squared = _mm256_mul_ps(angles, angles);
68
- __m256 const angles_cubed = _mm256_mul_ps(angles, angles_squared);
65
+ __m256 angles_f32x8 = _mm256_fnmadd_ps(rounded_quotients_f32x8, pi_high_f32x8, angles_radians);
66
+ angles_f32x8 = _mm256_fnmadd_ps(rounded_quotients_f32x8, pi_low_f32x8, angles_f32x8);
67
+ __m256 const angles_squared_f32x8 = _mm256_mul_ps(angles_f32x8, angles_f32x8);
68
+ __m256 const angles_cubed_f32x8 = _mm256_mul_ps(angles_f32x8, angles_squared_f32x8);
69
69
 
70
70
  // Degree-9 polynomial via Horner's method
71
- __m256 polynomials = coeff_9;
72
- polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_7);
73
- polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_5);
74
- polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_3);
75
- __m256 results = _mm256_fmadd_ps(angles_cubed, polynomials, angles);
76
-
77
- // If multiples_of_pi is odd, flip the sign of the results
78
- __m256i parity = _mm256_and_si256(multiples_of_pi, _mm256_set1_epi32(1));
79
- __m256i odd_mask = _mm256_cmpeq_epi32(parity, _mm256_set1_epi32(1));
80
- __m256 float_mask = _mm256_castsi256_ps(odd_mask);
81
- __m256 negated = _mm256_sub_ps(_mm256_setzero_ps(), results);
82
- results = _mm256_blendv_ps(results, negated, float_mask);
83
- return results;
71
+ __m256 polynomials_f32x8 = coeff_9_f32x8;
72
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_7_f32x8);
73
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_5_f32x8);
74
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_3_f32x8);
75
+ __m256 results_f32x8 = _mm256_fmadd_ps(angles_cubed_f32x8, polynomials_f32x8, angles_f32x8);
76
+
77
+ // If multiples_of_pi_i32x8 is odd, flip the sign of the results_f32x8
78
+ __m256i parity_i32x8 = _mm256_and_si256(multiples_of_pi_i32x8, _mm256_set1_epi32(1));
79
+ __m256i odd_mask_i32x8 = _mm256_cmpeq_epi32(parity_i32x8, _mm256_set1_epi32(1));
80
+ __m256 float_mask_f32x8 = _mm256_castsi256_ps(odd_mask_i32x8);
81
+ __m256 negated_f32x8 = _mm256_sub_ps(_mm256_setzero_ps(), results_f32x8);
82
+ results_f32x8 = _mm256_blendv_ps(results_f32x8, negated_f32x8, float_mask_f32x8);
83
+ return results_f32x8;
84
84
  }
85
85
 
86
86
  NK_INTERNAL __m256 nk_cos_f32x8_haswell_(__m256 const angles_radians) {
87
87
  // Cody-Waite constants for argument reduction
88
- __m256 const pi_hi_f32x8 = _mm256_set1_ps(3.1415927f);
89
- __m256 const pi_lo_f32x8 = _mm256_set1_ps(-8.742278e-8f);
90
- __m256 const pi_half = _mm256_set1_ps(1.57079632679489661923f); // π/2
91
- __m256 const pi_reciprocal = _mm256_set1_ps(0.31830988618379067154f); // 1/π
88
+ __m256 const pi_high_f32x8 = _mm256_set1_ps(3.1415927f);
89
+ __m256 const pi_low_f32x8 = _mm256_set1_ps(-8.742278e-8f);
90
+ __m256 const pi_half_f32x8 = _mm256_set1_ps(1.57079632679489661923f); // π/2
91
+ __m256 const pi_reciprocal_f32x8 = _mm256_set1_ps(0.31830988618379067154f); // 1/π
92
92
  // Degree-9 minimax coefficients
93
- __m256 const coeff_9 = _mm256_set1_ps(+2.7557319224e-6f);
94
- __m256 const coeff_7 = _mm256_set1_ps(-1.9841269841e-4f);
95
- __m256 const coeff_5 = _mm256_set1_ps(+8.3333293855e-3f);
96
- __m256 const coeff_3 = _mm256_set1_ps(-1.6666666641e-1f);
93
+ __m256 const coeff_9_f32x8 = _mm256_set1_ps(+2.7557319224e-6f);
94
+ __m256 const coeff_7_f32x8 = _mm256_set1_ps(-1.9841269841e-4f);
95
+ __m256 const coeff_5_f32x8 = _mm256_set1_ps(+8.3333293855e-3f);
96
+ __m256 const coeff_3_f32x8 = _mm256_set1_ps(-1.6666666641e-1f);
97
97
 
98
- // Compute (multiples_of_pi) = round((angle / π) - 0.5)
99
- __m256 quotients = _mm256_fmsub_ps(angles_radians, pi_reciprocal, _mm256_set1_ps(0.5f));
100
- __m256 rounded_quotients = _mm256_round_ps(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
101
- // Use truncation (MXCSR-independent) since rounded_quotients is already integer-valued
102
- __m256i multiples_of_pi = _mm256_cvttps_epi32(rounded_quotients);
98
+ // Compute (multiples_of_pi_i32x8) = round((angle / π) - 0.5)
99
+ __m256 quotients_f32x8 = _mm256_fmsub_ps(angles_radians, pi_reciprocal_f32x8, _mm256_set1_ps(0.5f));
100
+ __m256 rounded_quotients_f32x8 = _mm256_round_ps(quotients_f32x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
101
+ // Use truncation (MXCSR-independent) since rounded_quotients_f32x8 is already integer-valued
102
+ __m256i multiples_of_pi_i32x8 = _mm256_cvttps_epi32(rounded_quotients_f32x8);
103
103
 
104
104
  // Cody-Waite range reduction: angle = angle_radians - (multiples * pi + pi/2)
105
- __m256 const offset = _mm256_fmadd_ps(rounded_quotients, pi_hi_f32x8, pi_half);
106
- __m256 angles = _mm256_sub_ps(angles_radians, offset);
107
- angles = _mm256_fnmadd_ps(rounded_quotients, pi_lo_f32x8, angles);
108
- __m256 const angles_squared = _mm256_mul_ps(angles, angles);
109
- __m256 const angles_cubed = _mm256_mul_ps(angles, angles_squared);
105
+ __m256 const offset_f32x8 = _mm256_fmadd_ps(rounded_quotients_f32x8, pi_high_f32x8, pi_half_f32x8);
106
+ __m256 angles_f32x8 = _mm256_sub_ps(angles_radians, offset_f32x8);
107
+ angles_f32x8 = _mm256_fnmadd_ps(rounded_quotients_f32x8, pi_low_f32x8, angles_f32x8);
108
+ __m256 const angles_squared_f32x8 = _mm256_mul_ps(angles_f32x8, angles_f32x8);
109
+ __m256 const angles_cubed_f32x8 = _mm256_mul_ps(angles_f32x8, angles_squared_f32x8);
110
110
 
111
111
  // Degree-9 polynomial via Horner's method
112
- __m256 polynomials = coeff_9;
113
- polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_7);
114
- polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_5);
115
- polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_3);
116
- __m256 results = _mm256_fmadd_ps(angles_cubed, polynomials, angles);
117
-
118
- // If multiples_of_pi is even, flip the sign of the results
119
- __m256i parity = _mm256_and_si256(multiples_of_pi, _mm256_set1_epi32(1));
120
- __m256i even_mask = _mm256_cmpeq_epi32(parity, _mm256_setzero_si256());
121
- __m256 float_mask = _mm256_castsi256_ps(even_mask);
122
- __m256 negated = _mm256_sub_ps(_mm256_setzero_ps(), results);
123
- results = _mm256_blendv_ps(results, negated, float_mask);
124
- return results;
112
+ __m256 polynomials_f32x8 = coeff_9_f32x8;
113
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_7_f32x8);
114
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_5_f32x8);
115
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, angles_squared_f32x8, coeff_3_f32x8);
116
+ __m256 results_f32x8 = _mm256_fmadd_ps(angles_cubed_f32x8, polynomials_f32x8, angles_f32x8);
117
+
118
+ // If multiples_of_pi_i32x8 is even, flip the sign of the results_f32x8
119
+ __m256i parity_i32x8 = _mm256_and_si256(multiples_of_pi_i32x8, _mm256_set1_epi32(1));
120
+ __m256i even_mask_i32x8 = _mm256_cmpeq_epi32(parity_i32x8, _mm256_setzero_si256());
121
+ __m256 float_mask_f32x8 = _mm256_castsi256_ps(even_mask_i32x8);
122
+ __m256 negated_f32x8 = _mm256_sub_ps(_mm256_setzero_ps(), results_f32x8);
123
+ results_f32x8 = _mm256_blendv_ps(results_f32x8, negated_f32x8, float_mask_f32x8);
124
+ return results_f32x8;
125
125
  }
126
126
 
127
127
  NK_INTERNAL __m256 nk_atan_f32x8_haswell_(__m256 const inputs) {
128
128
  // Polynomial coefficients for atan approximation (8 terms)
129
129
  // These coefficients approximate: atan(x) ≈ x + c8 × x³ + c7 × x⁵ + c6 × x⁷ + ... + c1 × x¹⁵
130
- __m256 const coeff_8 = _mm256_set1_ps(-0.333331018686294555664062f);
131
- __m256 const coeff_7 = _mm256_set1_ps(+0.199926957488059997558594f);
132
- __m256 const coeff_6 = _mm256_set1_ps(-0.142027363181114196777344f);
133
- __m256 const coeff_5 = _mm256_set1_ps(+0.106347933411598205566406f);
134
- __m256 const coeff_4 = _mm256_set1_ps(-0.0748900920152664184570312f);
135
- __m256 const coeff_3 = _mm256_set1_ps(+0.0425049886107444763183594f);
136
- __m256 const coeff_2 = _mm256_set1_ps(-0.0159569028764963150024414f);
137
- __m256 const coeff_1 = _mm256_set1_ps(+0.00282363896258175373077393f);
138
- __m256 const sign_mask = _mm256_set1_ps(-0.0f);
139
-
140
- // Adjust for quadrant - detect negative values
141
- __m256 values = inputs;
142
- __m256 negative_mask = _mm256_cmp_ps(values, _mm256_setzero_ps(), _CMP_LT_OS);
143
- values = _mm256_andnot_ps(sign_mask, values); // abs(values)
144
-
145
- // Check if values > 1 (need reciprocal)
146
- __m256 reciprocal_mask = _mm256_cmp_ps(values, _mm256_set1_ps(1.0f), _CMP_GT_OS);
147
- __m256 reciprocal_values = _mm256_div_ps(_mm256_set1_ps(1.0f), values);
148
- values = _mm256_blendv_ps(values, reciprocal_values, reciprocal_mask);
130
+ __m256 const coeff_8_f32x8 = _mm256_set1_ps(-0.333331018686294555664062f);
131
+ __m256 const coeff_7_f32x8 = _mm256_set1_ps(+0.199926957488059997558594f);
132
+ __m256 const coeff_6_f32x8 = _mm256_set1_ps(-0.142027363181114196777344f);
133
+ __m256 const coeff_5_f32x8 = _mm256_set1_ps(+0.106347933411598205566406f);
134
+ __m256 const coeff_4_f32x8 = _mm256_set1_ps(-0.0748900920152664184570312f);
135
+ __m256 const coeff_3_f32x8 = _mm256_set1_ps(+0.0425049886107444763183594f);
136
+ __m256 const coeff_2_f32x8 = _mm256_set1_ps(-0.0159569028764963150024414f);
137
+ __m256 const coeff_1_f32x8 = _mm256_set1_ps(+0.00282363896258175373077393f);
138
+ __m256 const sign_mask_f32x8 = _mm256_set1_ps(-0.0f);
139
+
140
+ // Adjust for quadrant - detect negative values_f32x8
141
+ __m256 values_f32x8 = inputs;
142
+ __m256 negative_mask_f32x8 = _mm256_cmp_ps(values_f32x8, _mm256_setzero_ps(), _CMP_LT_OS);
143
+ values_f32x8 = _mm256_andnot_ps(sign_mask_f32x8, values_f32x8); // abs(values_f32x8)
144
+
145
+ // Check if values_f32x8 > 1 (need reciprocal)
146
+ __m256 reciprocal_mask_f32x8 = _mm256_cmp_ps(values_f32x8, _mm256_set1_ps(1.0f), _CMP_GT_OS);
147
+ __m256 reciprocal_values_f32x8 = _mm256_div_ps(_mm256_set1_ps(1.0f), values_f32x8);
148
+ values_f32x8 = _mm256_blendv_ps(values_f32x8, reciprocal_values_f32x8, reciprocal_mask_f32x8);
149
149
 
150
150
  // Argument reduction
151
- __m256 const values_squared = _mm256_mul_ps(values, values);
152
- __m256 const values_cubed = _mm256_mul_ps(values, values_squared);
151
+ __m256 const values_squared_f32x8 = _mm256_mul_ps(values_f32x8, values_f32x8);
152
+ __m256 const values_cubed_f32x8 = _mm256_mul_ps(values_f32x8, values_squared_f32x8);
153
153
 
154
154
  // Polynomial evaluation using Horner's method.
155
155
  // For large arrays, out-of-order execution across loop iterations already hides
156
156
  // FMA latency. Estrin's scheme was tested but showed ~20% regression because
157
157
  // the extra power computations (y², y⁴) hurt throughput more than the reduced
158
158
  // dependency depth helps latency.
159
- __m256 polynomials = coeff_1;
160
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_2);
161
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_3);
162
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_4);
163
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_5);
164
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_6);
165
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_7);
166
- polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_8);
167
-
168
- // Compute result: atan(x) ≈ x + x³ * P(x²)
169
- __m256 result = _mm256_fmadd_ps(values_cubed, polynomials, values);
170
-
171
- // Adjust for reciprocal: result = π/2 - result
172
- __m256 adjusted = _mm256_sub_ps(_mm256_set1_ps(1.5707963267948966f), result);
173
- result = _mm256_blendv_ps(result, adjusted, reciprocal_mask);
174
-
175
- // Adjust for negative: result = -result
176
- __m256 negated = _mm256_sub_ps(_mm256_setzero_ps(), result);
177
- result = _mm256_blendv_ps(result, negated, negative_mask);
178
- return result;
159
+ __m256 polynomials_f32x8 = coeff_1_f32x8;
160
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_2_f32x8);
161
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_3_f32x8);
162
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_4_f32x8);
163
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_5_f32x8);
164
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_6_f32x8);
165
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_7_f32x8);
166
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, values_squared_f32x8, coeff_8_f32x8);
167
+
168
+ // Compute result_f32x8: atan(x) ≈ x + x³ * P(x²)
169
+ __m256 result_f32x8 = _mm256_fmadd_ps(values_cubed_f32x8, polynomials_f32x8, values_f32x8);
170
+
171
+ // Adjust for reciprocal: result_f32x8 = π/2 - result_f32x8
172
+ __m256 adjusted_f32x8 = _mm256_sub_ps(_mm256_set1_ps(1.5707963267948966f), result_f32x8);
173
+ result_f32x8 = _mm256_blendv_ps(result_f32x8, adjusted_f32x8, reciprocal_mask_f32x8);
174
+
175
+ // Adjust for negative: result_f32x8 = -result_f32x8
176
+ __m256 negated_f32x8 = _mm256_sub_ps(_mm256_setzero_ps(), result_f32x8);
177
+ result_f32x8 = _mm256_blendv_ps(result_f32x8, negated_f32x8, negative_mask_f32x8);
178
+ return result_f32x8;
179
179
  }
180
180
 
181
181
  NK_INTERNAL __m256 nk_atan2_f32x8_haswell_(__m256 const ys_inputs, __m256 const xs_inputs) {
182
182
  // Polynomial coefficients (same as atan)
183
- __m256 const coeff_8 = _mm256_set1_ps(-0.333331018686294555664062f);
184
- __m256 const coeff_7 = _mm256_set1_ps(+0.199926957488059997558594f);
185
- __m256 const coeff_6 = _mm256_set1_ps(-0.142027363181114196777344f);
186
- __m256 const coeff_5 = _mm256_set1_ps(+0.106347933411598205566406f);
187
- __m256 const coeff_4 = _mm256_set1_ps(-0.0748900920152664184570312f);
188
- __m256 const coeff_3 = _mm256_set1_ps(+0.0425049886107444763183594f);
189
- __m256 const coeff_2 = _mm256_set1_ps(-0.0159569028764963150024414f);
190
- __m256 const coeff_1 = _mm256_set1_ps(+0.00282363896258175373077393f);
191
- __m256 const sign_mask = _mm256_set1_ps(-0.0f);
183
+ __m256 const coeff_8_f32x8 = _mm256_set1_ps(-0.333331018686294555664062f);
184
+ __m256 const coeff_7_f32x8 = _mm256_set1_ps(+0.199926957488059997558594f);
185
+ __m256 const coeff_6_f32x8 = _mm256_set1_ps(-0.142027363181114196777344f);
186
+ __m256 const coeff_5_f32x8 = _mm256_set1_ps(+0.106347933411598205566406f);
187
+ __m256 const coeff_4_f32x8 = _mm256_set1_ps(-0.0748900920152664184570312f);
188
+ __m256 const coeff_3_f32x8 = _mm256_set1_ps(+0.0425049886107444763183594f);
189
+ __m256 const coeff_2_f32x8 = _mm256_set1_ps(-0.0159569028764963150024414f);
190
+ __m256 const coeff_1_f32x8 = _mm256_set1_ps(+0.00282363896258175373077393f);
191
+ __m256 const sign_mask_f32x8 = _mm256_set1_ps(-0.0f);
192
192
 
193
193
  // Quadrant adjustments normalizing to absolute values of x and y
194
- __m256 xs_negative_mask = _mm256_cmp_ps(xs_inputs, _mm256_setzero_ps(), _CMP_LT_OS);
195
- __m256 xs = _mm256_andnot_ps(sign_mask, xs_inputs); // abs(xs_inputs)
196
- __m256 ys = _mm256_andnot_ps(sign_mask, ys_inputs); // abs(ys_inputs)
194
+ __m256 xs_negative_mask_f32x8 = _mm256_cmp_ps(xs_inputs, _mm256_setzero_ps(), _CMP_LT_OS);
195
+ __m256 xs_f32x8 = _mm256_andnot_ps(sign_mask_f32x8, xs_inputs); // abs(xs_inputs)
196
+ __m256 ys_f32x8 = _mm256_andnot_ps(sign_mask_f32x8, ys_inputs); // abs(ys_inputs)
197
197
 
198
198
  // Ensure proper fraction where the numerator is smaller than the denominator
199
- __m256 swap_mask = _mm256_cmp_ps(ys, xs, _CMP_GT_OS);
200
- __m256 temps = xs;
201
- xs = _mm256_blendv_ps(xs, ys, swap_mask);
202
- __m256 neg_temps = _mm256_sub_ps(_mm256_setzero_ps(), temps);
203
- ys = _mm256_blendv_ps(ys, neg_temps, swap_mask);
199
+ __m256 swap_mask_f32x8 = _mm256_cmp_ps(ys_f32x8, xs_f32x8, _CMP_GT_OS);
200
+ __m256 temps_f32x8 = xs_f32x8;
201
+ xs_f32x8 = _mm256_blendv_ps(xs_f32x8, ys_f32x8, swap_mask_f32x8);
202
+ __m256 neg_temps_f32x8 = _mm256_sub_ps(_mm256_setzero_ps(), temps_f32x8);
203
+ ys_f32x8 = _mm256_blendv_ps(ys_f32x8, neg_temps_f32x8, swap_mask_f32x8);
204
204
 
205
- // Compute ratio and powers
206
- __m256 const ratio = _mm256_div_ps(ys, xs);
207
- __m256 const ratio_squared = _mm256_mul_ps(ratio, ratio);
208
- __m256 const ratio_cubed = _mm256_mul_ps(ratio, ratio_squared);
205
+ // Compute ratio_f32x8 and powers
206
+ __m256 const ratio_f32x8 = _mm256_div_ps(ys_f32x8, xs_f32x8);
207
+ __m256 const ratio_squared_f32x8 = _mm256_mul_ps(ratio_f32x8, ratio_f32x8);
208
+ __m256 const ratio_cubed_f32x8 = _mm256_mul_ps(ratio_f32x8, ratio_squared_f32x8);
209
209
 
210
210
  // Polynomial evaluation using Horner's method
211
- __m256 polynomials = coeff_1;
212
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_2);
213
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_3);
214
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_4);
215
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_5);
216
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_6);
217
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_7);
218
- polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_8);
219
-
220
- // Compute the result using masks for quadrant adjustments
221
- __m256 results = _mm256_fmadd_ps(ratio_cubed, polynomials, ratio);
222
-
223
- // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
211
+ __m256 polynomials_f32x8 = coeff_1_f32x8;
212
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_2_f32x8);
213
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_3_f32x8);
214
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_4_f32x8);
215
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_5_f32x8);
216
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_6_f32x8);
217
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_7_f32x8);
218
+ polynomials_f32x8 = _mm256_fmadd_ps(polynomials_f32x8, ratio_squared_f32x8, coeff_8_f32x8);
219
+
220
+ // Compute the result using masks for quadrant_f32x8 adjustments
221
+ __m256 results_f32x8 = _mm256_fmadd_ps(ratio_cubed_f32x8, polynomials_f32x8, ratio_f32x8);
222
+
223
+ // Compute quadrant_f32x8 value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
224
224
  // -2 for x<0 && !swap, -1 for x<0 && swap
225
- __m256 quadrant = _mm256_setzero_ps();
226
- __m256 neg_two = _mm256_set1_ps(-2.0f);
227
- quadrant = _mm256_blendv_ps(quadrant, neg_two, xs_negative_mask);
228
- __m256 one = _mm256_set1_ps(1.0f);
229
- __m256 quadrant_incremented = _mm256_add_ps(quadrant, one);
230
- quadrant = _mm256_blendv_ps(quadrant, quadrant_incremented, swap_mask);
225
+ __m256 quadrant_f32x8 = _mm256_setzero_ps();
226
+ __m256 neg_two_f32x8 = _mm256_set1_ps(-2.0f);
227
+ quadrant_f32x8 = _mm256_blendv_ps(quadrant_f32x8, neg_two_f32x8, xs_negative_mask_f32x8);
228
+ __m256 one_f32x8 = _mm256_set1_ps(1.0f);
229
+ __m256 quadrant_incremented_f32x8 = _mm256_add_ps(quadrant_f32x8, one_f32x8);
230
+ quadrant_f32x8 = _mm256_blendv_ps(quadrant_f32x8, quadrant_incremented_f32x8, swap_mask_f32x8);
231
231
 
232
- // Adjust for quadrant: result += quadrant * π/2
233
- __m256 pi_half = _mm256_set1_ps(1.5707963267948966f);
234
- results = _mm256_fmadd_ps(quadrant, pi_half, results);
232
+ // Adjust for quadrant_f32x8: result += quadrant_f32x8 * π/2
233
+ __m256 pi_half_f32x8 = _mm256_set1_ps(1.5707963267948966f);
234
+ results_f32x8 = _mm256_fmadd_ps(quadrant_f32x8, pi_half_f32x8, results_f32x8);
235
235
 
236
236
  // Transfer sign from x (XOR with sign bit of x_input)
237
- __m256 xs_sign_bits = _mm256_and_ps(xs_inputs, sign_mask);
238
- results = _mm256_xor_ps(results, xs_sign_bits);
237
+ __m256 xs_sign_bits_f32x8 = _mm256_and_ps(xs_inputs, sign_mask_f32x8);
238
+ results_f32x8 = _mm256_xor_ps(results_f32x8, xs_sign_bits_f32x8);
239
239
 
240
240
  // Transfer sign from y (XOR with sign bit of y_input)
241
- __m256 ys_sign_bits = _mm256_and_ps(ys_inputs, sign_mask);
242
- results = _mm256_xor_ps(results, ys_sign_bits);
241
+ __m256 ys_sign_bits_f32x8 = _mm256_and_ps(ys_inputs, sign_mask_f32x8);
242
+ results_f32x8 = _mm256_xor_ps(results_f32x8, ys_sign_bits_f32x8);
243
243
 
244
- return results;
244
+ return results_f32x8;
245
245
  }
246
246
 
247
247
  NK_INTERNAL __m256d nk_sin_f64x4_haswell_(__m256d const angles_radians) {
248
248
  // Constants for argument reduction
249
- __m256d const pi_high = _mm256_set1_pd(3.141592653589793116); // High-digits part of π
250
- __m256d const pi_low = _mm256_set1_pd(1.2246467991473532072e-16); // Low-digits part of π
251
- __m256d const pi_reciprocal = _mm256_set1_pd(0.31830988618379067154); // 1/π
249
+ __m256d const pi_high_f64x4 = _mm256_set1_pd(3.141592653589793116); // High-digits part of π
250
+ __m256d const pi_low_f64x4 = _mm256_set1_pd(1.2246467991473532072e-16); // Low-digits part of π
251
+ __m256d const pi_reciprocal_f64x4 = _mm256_set1_pd(0.31830988618379067154); // 1/π
252
252
 
253
253
  // Polynomial coefficients for sine approximation (minimax polynomial)
254
- __m256d const coeff_0 = _mm256_set1_pd(+0.00833333333333332974823815);
255
- __m256d const coeff_1 = _mm256_set1_pd(-0.000198412698412696162806809);
256
- __m256d const coeff_2 = _mm256_set1_pd(+2.75573192239198747630416e-06);
257
- __m256d const coeff_3 = _mm256_set1_pd(-2.50521083763502045810755e-08);
258
- __m256d const coeff_4 = _mm256_set1_pd(+1.60590430605664501629054e-10);
259
- __m256d const coeff_5 = _mm256_set1_pd(-7.64712219118158833288484e-13);
260
- __m256d const coeff_6 = _mm256_set1_pd(+2.81009972710863200091251e-15);
261
- __m256d const coeff_7 = _mm256_set1_pd(-7.97255955009037868891952e-18);
262
- __m256d const coeff_8 = _mm256_set1_pd(-0.166666666666666657414808);
263
-
264
- // Compute (rounded_quotients) = round(angle / π)
265
- __m256d const quotients = _mm256_mul_pd(angles_radians, pi_reciprocal);
266
- __m256d const rounded_quotients = _mm256_round_pd(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
267
-
268
- // Reduce the angle: angle - (rounded_quotients * π_high + rounded_quotients * π_low)
269
- __m256d angles = angles_radians;
270
- angles = _mm256_fnmadd_pd(rounded_quotients, pi_high, angles);
271
- angles = _mm256_fnmadd_pd(rounded_quotients, pi_low, angles);
272
-
273
- // If rounded_quotients is odd (bit 0 set), negate the angle
254
+ __m256d const coeff_0_f64x4 = _mm256_set1_pd(+0.00833333333333332974823815);
255
+ __m256d const coeff_1_f64x4 = _mm256_set1_pd(-0.000198412698412696162806809);
256
+ __m256d const coeff_2_f64x4 = _mm256_set1_pd(+2.75573192239198747630416e-06);
257
+ __m256d const coeff_3_f64x4 = _mm256_set1_pd(-2.50521083763502045810755e-08);
258
+ __m256d const coeff_4_f64x4 = _mm256_set1_pd(+1.60590430605664501629054e-10);
259
+ __m256d const coeff_5_f64x4 = _mm256_set1_pd(-7.64712219118158833288484e-13);
260
+ __m256d const coeff_6_f64x4 = _mm256_set1_pd(+2.81009972710863200091251e-15);
261
+ __m256d const coeff_7_f64x4 = _mm256_set1_pd(-7.97255955009037868891952e-18);
262
+ __m256d const coeff_8_f64x4 = _mm256_set1_pd(-0.166666666666666657414808);
263
+
264
+ // Compute (rounded_quotients_f64x4) = round(angle / π)
265
+ __m256d const quotients_f64x4 = _mm256_mul_pd(angles_radians, pi_reciprocal_f64x4);
266
+ __m256d const rounded_quotients_f64x4 = _mm256_round_pd(quotients_f64x4,
267
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
268
+
269
+ // Reduce the angle: angle - (rounded_quotients_f64x4 * π_high + rounded_quotients_f64x4 * π_low)
270
+ __m256d angles_f64x4 = angles_radians;
271
+ angles_f64x4 = _mm256_fnmadd_pd(rounded_quotients_f64x4, pi_high_f64x4, angles_f64x4);
272
+ angles_f64x4 = _mm256_fnmadd_pd(rounded_quotients_f64x4, pi_low_f64x4, angles_f64x4);
273
+
274
+ // If rounded_quotients_f64x4 is odd (bit 0 set), negate the angle
274
275
  // Convert to 32-bit int (returns __m128i with 4 x 32-bit ints)
275
- // Use truncation (MXCSR-independent) since rounded_quotients is already integer-valued
276
- __m128i quotients_i32 = _mm256_cvttpd_epi32(rounded_quotients);
277
- __m128i parity = _mm_and_si128(quotients_i32, _mm_set1_epi32(1));
278
- __m128i odd_mask_i32 = _mm_cmpeq_epi32(parity, _mm_set1_epi32(1));
276
+ // Use truncation (MXCSR-independent) since rounded_quotients_f64x4 is already integer-valued
277
+ __m128i quotients_i32_i32x4 = _mm256_cvttpd_epi32(rounded_quotients_f64x4);
278
+ __m128i parity_i32x4 = _mm_and_si128(quotients_i32_i32x4, _mm_set1_epi32(1));
279
+ __m128i odd_mask_i32_i32x4 = _mm_cmpeq_epi32(parity_i32x4, _mm_set1_epi32(1));
279
280
  // Expand 32-bit mask to 64-bit by shuffling
280
- __m256i odd_mask_i64 = _mm256_cvtepi32_epi64(odd_mask_i32);
281
- __m256d float_mask = _mm256_castsi256_pd(odd_mask_i64);
282
- __m256d negated_angles = _mm256_sub_pd(_mm256_setzero_pd(), angles);
283
- angles = _mm256_blendv_pd(angles, negated_angles, float_mask);
281
+ __m256i odd_mask_i64_i32x8 = _mm256_cvtepi32_epi64(odd_mask_i32_i32x4);
282
+ __m256d float_mask_f64x4 = _mm256_castsi256_pd(odd_mask_i64_i32x8);
283
+ __m256d negated_angles_f64x4 = _mm256_sub_pd(_mm256_setzero_pd(), angles_f64x4);
284
+ angles_f64x4 = _mm256_blendv_pd(angles_f64x4, negated_angles_f64x4, float_mask_f64x4);
284
285
 
285
- __m256d const angles_squared = _mm256_mul_pd(angles, angles);
286
- __m256d const angles_cubed = _mm256_mul_pd(angles, angles_squared);
287
- __m256d const angles_quadratic = _mm256_mul_pd(angles_squared, angles_squared);
288
- __m256d const angles_octic = _mm256_mul_pd(angles_quadratic, angles_quadratic);
286
+ __m256d const angles_squared_f64x4 = _mm256_mul_pd(angles_f64x4, angles_f64x4);
287
+ __m256d const angles_cubed_f64x4 = _mm256_mul_pd(angles_f64x4, angles_squared_f64x4);
288
+ __m256d const angles_quadratic_f64x4 = _mm256_mul_pd(angles_squared_f64x4, angles_squared_f64x4);
289
+ __m256d const angles_octic_f64x4 = _mm256_mul_pd(angles_quadratic_f64x4, angles_quadratic_f64x4);
289
290
 
290
291
  // Compute higher-degree polynomial terms
291
- __m256d const poly_67 = _mm256_fmadd_pd(angles_squared, coeff_7, coeff_6);
292
- __m256d const poly_45 = _mm256_fmadd_pd(angles_squared, coeff_5, coeff_4);
293
- __m256d const poly_4567 = _mm256_fmadd_pd(angles_quadratic, poly_67, poly_45);
292
+ __m256d const poly_67_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_7_f64x4, coeff_6_f64x4);
293
+ __m256d const poly_45_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_5_f64x4, coeff_4_f64x4);
294
+ __m256d const poly_4567_f64x4 = _mm256_fmadd_pd(angles_quadratic_f64x4, poly_67_f64x4, poly_45_f64x4);
294
295
 
295
296
  // Compute lower-degree polynomial terms
296
- __m256d const poly_23 = _mm256_fmadd_pd(angles_squared, coeff_3, coeff_2);
297
- __m256d const poly_01 = _mm256_fmadd_pd(angles_squared, coeff_1, coeff_0);
298
- __m256d const poly_0123 = _mm256_fmadd_pd(angles_quadratic, poly_23, poly_01);
297
+ __m256d const poly_23_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_3_f64x4, coeff_2_f64x4);
298
+ __m256d const poly_01_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_1_f64x4, coeff_0_f64x4);
299
+ __m256d const poly_0123_f64x4 = _mm256_fmadd_pd(angles_quadratic_f64x4, poly_23_f64x4, poly_01_f64x4);
299
300
 
300
301
  // Combine polynomial terms
301
- __m256d results = _mm256_fmadd_pd(angles_octic, poly_4567, poly_0123);
302
- results = _mm256_fmadd_pd(results, angles_squared, coeff_8);
303
- results = _mm256_fmadd_pd(results, angles_cubed, angles);
302
+ __m256d results_f64x4 = _mm256_fmadd_pd(angles_octic_f64x4, poly_4567_f64x4, poly_0123_f64x4);
303
+ results_f64x4 = _mm256_fmadd_pd(results_f64x4, angles_squared_f64x4, coeff_8_f64x4);
304
+ results_f64x4 = _mm256_fmadd_pd(results_f64x4, angles_cubed_f64x4, angles_f64x4);
304
305
 
305
306
  // Handle the special case of negative zero input
306
- __m256d const non_zero_mask = _mm256_cmp_pd(angles_radians, _mm256_setzero_pd(), _CMP_NEQ_UQ);
307
- results = _mm256_and_pd(results, non_zero_mask);
308
- return results;
307
+ __m256d const non_zero_mask_f64x4 = _mm256_cmp_pd(angles_radians, _mm256_setzero_pd(), _CMP_NEQ_UQ);
308
+ results_f64x4 = _mm256_and_pd(results_f64x4, non_zero_mask_f64x4);
309
+ return results_f64x4;
309
310
  }
310
311
 
311
312
  NK_INTERNAL __m256d nk_cos_f64x4_haswell_(__m256d const angles_radians) {
312
313
  // Constants for argument reduction
313
- __m256d const pi_high_half = _mm256_set1_pd(3.141592653589793116 * 0.5); // High-digits part of π/2
314
- __m256d const pi_low_half = _mm256_set1_pd(1.2246467991473532072e-16 * 0.5); // Low-digits part of π/2
315
- __m256d const pi_reciprocal = _mm256_set1_pd(0.31830988618379067154); // 1/π
314
+ __m256d const pi_high_half_f64x4 = _mm256_set1_pd(3.141592653589793116 * 0.5); // High-digits part of π/2
315
+ __m256d const pi_low_half_f64x4 = _mm256_set1_pd(1.2246467991473532072e-16 * 0.5); // Low-digits part of π/2
316
+ __m256d const pi_reciprocal_f64x4 = _mm256_set1_pd(0.31830988618379067154); // 1/π
316
317
 
317
318
  // Polynomial coefficients for cosine approximation
318
- __m256d const coeff_0 = _mm256_set1_pd(+0.00833333333333332974823815);
319
- __m256d const coeff_1 = _mm256_set1_pd(-0.000198412698412696162806809);
320
- __m256d const coeff_2 = _mm256_set1_pd(+2.75573192239198747630416e-06);
321
- __m256d const coeff_3 = _mm256_set1_pd(-2.50521083763502045810755e-08);
322
- __m256d const coeff_4 = _mm256_set1_pd(+1.60590430605664501629054e-10);
323
- __m256d const coeff_5 = _mm256_set1_pd(-7.64712219118158833288484e-13);
324
- __m256d const coeff_6 = _mm256_set1_pd(+2.81009972710863200091251e-15);
325
- __m256d const coeff_7 = _mm256_set1_pd(-7.97255955009037868891952e-18);
326
- __m256d const coeff_8 = _mm256_set1_pd(-0.166666666666666657414808);
327
-
328
- // Compute (rounded_quotients) = 2 * round(angle / π - 0.5) + 1
329
- // Use fmsub: a*b - c = angles * (1/π) - 0.5
330
- __m256d const quotients = _mm256_fmsub_pd(angles_radians, pi_reciprocal, _mm256_set1_pd(0.5));
331
- __m256d const rounded_quotients = _mm256_fmadd_pd( //
332
- _mm256_set1_pd(2.0), //
333
- _mm256_round_pd(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), //
319
+ __m256d const coeff_0_f64x4 = _mm256_set1_pd(+0.00833333333333332974823815);
320
+ __m256d const coeff_1_f64x4 = _mm256_set1_pd(-0.000198412698412696162806809);
321
+ __m256d const coeff_2_f64x4 = _mm256_set1_pd(+2.75573192239198747630416e-06);
322
+ __m256d const coeff_3_f64x4 = _mm256_set1_pd(-2.50521083763502045810755e-08);
323
+ __m256d const coeff_4_f64x4 = _mm256_set1_pd(+1.60590430605664501629054e-10);
324
+ __m256d const coeff_5_f64x4 = _mm256_set1_pd(-7.64712219118158833288484e-13);
325
+ __m256d const coeff_6_f64x4 = _mm256_set1_pd(+2.81009972710863200091251e-15);
326
+ __m256d const coeff_7_f64x4 = _mm256_set1_pd(-7.97255955009037868891952e-18);
327
+ __m256d const coeff_8_f64x4 = _mm256_set1_pd(-0.166666666666666657414808);
328
+
329
+ // Compute (rounded_quotients_f64x4) = 2 * round(angle / π - 0.5) + 1
330
+ // Use fmsub: a*b - c = angles_f64x4 * (1/π) - 0.5
331
+ __m256d const quotients_f64x4 = _mm256_fmsub_pd(angles_radians, pi_reciprocal_f64x4, _mm256_set1_pd(0.5));
332
+ __m256d const rounded_quotients_f64x4 = _mm256_fmadd_pd( //
333
+ _mm256_set1_pd(2.0), //
334
+ _mm256_round_pd(quotients_f64x4, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), //
334
335
  _mm256_set1_pd(1.0));
335
336
 
336
- // Reduce the angle: angle - (rounded_quotients * π_high_half + rounded_quotients * π_low_half)
337
- __m256d angles = angles_radians;
338
- angles = _mm256_fnmadd_pd(rounded_quotients, pi_high_half, angles);
339
- angles = _mm256_fnmadd_pd(rounded_quotients, pi_low_half, angles);
340
-
341
- // If (rounded_quotients & 2) == 0, negate the angle
342
- // Use truncation (MXCSR-independent) since rounded_quotients is already integer-valued
343
- __m128i quotients_i32 = _mm256_cvttpd_epi32(rounded_quotients);
344
- __m128i bit2 = _mm_and_si128(quotients_i32, _mm_set1_epi32(2));
345
- __m128i flip_mask_i32 = _mm_cmpeq_epi32(bit2, _mm_setzero_si128());
346
- __m256i flip_mask_i64 = _mm256_cvtepi32_epi64(flip_mask_i32);
347
- __m256d float_mask = _mm256_castsi256_pd(flip_mask_i64);
348
- __m256d negated_angles = _mm256_sub_pd(_mm256_setzero_pd(), angles);
349
- angles = _mm256_blendv_pd(angles, negated_angles, float_mask);
350
-
351
- __m256d const angles_squared = _mm256_mul_pd(angles, angles);
352
- __m256d const angles_cubed = _mm256_mul_pd(angles, angles_squared);
353
- __m256d const angles_quadratic = _mm256_mul_pd(angles_squared, angles_squared);
354
- __m256d const angles_octic = _mm256_mul_pd(angles_quadratic, angles_quadratic);
337
+ // Reduce the angle: angle - (rounded_quotients_f64x4 * π_high_half + rounded_quotients_f64x4 * π_low_half)
338
+ __m256d angles_f64x4 = angles_radians;
339
+ angles_f64x4 = _mm256_fnmadd_pd(rounded_quotients_f64x4, pi_high_half_f64x4, angles_f64x4);
340
+ angles_f64x4 = _mm256_fnmadd_pd(rounded_quotients_f64x4, pi_low_half_f64x4, angles_f64x4);
341
+
342
+ // If (rounded_quotients_f64x4 & 2) == 0, negate the angle
343
+ // Use truncation (MXCSR-independent) since rounded_quotients_f64x4 is already integer-valued
344
+ __m128i quotients_i32_i32x4 = _mm256_cvttpd_epi32(rounded_quotients_f64x4);
345
+ __m128i bit2_i32x4 = _mm_and_si128(quotients_i32_i32x4, _mm_set1_epi32(2));
346
+ __m128i flip_mask_i32_i32x4 = _mm_cmpeq_epi32(bit2_i32x4, _mm_setzero_si128());
347
+ __m256i flip_mask_i64_i32x8 = _mm256_cvtepi32_epi64(flip_mask_i32_i32x4);
348
+ __m256d float_mask_f64x4 = _mm256_castsi256_pd(flip_mask_i64_i32x8);
349
+ __m256d negated_angles_f64x4 = _mm256_sub_pd(_mm256_setzero_pd(), angles_f64x4);
350
+ angles_f64x4 = _mm256_blendv_pd(angles_f64x4, negated_angles_f64x4, float_mask_f64x4);
351
+
352
+ __m256d const angles_squared_f64x4 = _mm256_mul_pd(angles_f64x4, angles_f64x4);
353
+ __m256d const angles_cubed_f64x4 = _mm256_mul_pd(angles_f64x4, angles_squared_f64x4);
354
+ __m256d const angles_quadratic_f64x4 = _mm256_mul_pd(angles_squared_f64x4, angles_squared_f64x4);
355
+ __m256d const angles_octic_f64x4 = _mm256_mul_pd(angles_quadratic_f64x4, angles_quadratic_f64x4);
355
356
 
356
357
  // Compute higher-degree polynomial terms
357
- __m256d const poly_67 = _mm256_fmadd_pd(angles_squared, coeff_7, coeff_6);
358
- __m256d const poly_45 = _mm256_fmadd_pd(angles_squared, coeff_5, coeff_4);
359
- __m256d const poly_4567 = _mm256_fmadd_pd(angles_quadratic, poly_67, poly_45);
358
+ __m256d const poly_67_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_7_f64x4, coeff_6_f64x4);
359
+ __m256d const poly_45_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_5_f64x4, coeff_4_f64x4);
360
+ __m256d const poly_4567_f64x4 = _mm256_fmadd_pd(angles_quadratic_f64x4, poly_67_f64x4, poly_45_f64x4);
360
361
 
361
362
  // Compute lower-degree polynomial terms
362
- __m256d const poly_23 = _mm256_fmadd_pd(angles_squared, coeff_3, coeff_2);
363
- __m256d const poly_01 = _mm256_fmadd_pd(angles_squared, coeff_1, coeff_0);
364
- __m256d const poly_0123 = _mm256_fmadd_pd(angles_quadratic, poly_23, poly_01);
363
+ __m256d const poly_23_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_3_f64x4, coeff_2_f64x4);
364
+ __m256d const poly_01_f64x4 = _mm256_fmadd_pd(angles_squared_f64x4, coeff_1_f64x4, coeff_0_f64x4);
365
+ __m256d const poly_0123_f64x4 = _mm256_fmadd_pd(angles_quadratic_f64x4, poly_23_f64x4, poly_01_f64x4);
365
366
 
366
367
  // Combine polynomial terms
367
- __m256d results = _mm256_fmadd_pd(angles_octic, poly_4567, poly_0123);
368
- results = _mm256_fmadd_pd(results, angles_squared, coeff_8);
369
- results = _mm256_fmadd_pd(results, angles_cubed, angles);
370
- return results;
368
+ __m256d results_f64x4 = _mm256_fmadd_pd(angles_octic_f64x4, poly_4567_f64x4, poly_0123_f64x4);
369
+ results_f64x4 = _mm256_fmadd_pd(results_f64x4, angles_squared_f64x4, coeff_8_f64x4);
370
+ results_f64x4 = _mm256_fmadd_pd(results_f64x4, angles_cubed_f64x4, angles_f64x4);
371
+ return results_f64x4;
371
372
  }
372
373
 
373
374
  NK_INTERNAL __m256d nk_atan_f64x4_haswell_(__m256d const inputs) {
374
375
  // Polynomial coefficients for atan approximation (19 coefficients)
375
376
  // The polynomial approximates: atan(x) ≈ x + x³ * P(x²) where P has 19 terms
376
- __m256d const coeff_19 = _mm256_set1_pd(-1.88796008463073496563746e-05);
377
- __m256d const coeff_18 = _mm256_set1_pd(+0.000209850076645816976906797);
378
- __m256d const coeff_17 = _mm256_set1_pd(-0.00110611831486672482563471);
379
- __m256d const coeff_16 = _mm256_set1_pd(+0.00370026744188713119232403);
380
- __m256d const coeff_15 = _mm256_set1_pd(-0.00889896195887655491740809);
381
- __m256d const coeff_14 = _mm256_set1_pd(+0.016599329773529201970117);
382
- __m256d const coeff_13 = _mm256_set1_pd(-0.0254517624932312641616861);
383
- __m256d const coeff_12 = _mm256_set1_pd(+0.0337852580001353069993897);
384
- __m256d const coeff_11 = _mm256_set1_pd(-0.0407629191276836500001934);
385
- __m256d const coeff_10 = _mm256_set1_pd(+0.0466667150077840625632675);
386
- __m256d const coeff_9 = _mm256_set1_pd(-0.0523674852303482457616113);
387
- __m256d const coeff_8 = _mm256_set1_pd(+0.0587666392926673580854313);
388
- __m256d const coeff_7 = _mm256_set1_pd(-0.0666573579361080525984562);
389
- __m256d const coeff_6 = _mm256_set1_pd(+0.0769219538311769618355029);
390
- __m256d const coeff_5 = _mm256_set1_pd(-0.090908995008245008229153);
391
- __m256d const coeff_4 = _mm256_set1_pd(+0.111111105648261418443745);
392
- __m256d const coeff_3 = _mm256_set1_pd(-0.14285714266771329383765);
393
- __m256d const coeff_2 = _mm256_set1_pd(+0.199999999996591265594148);
394
- __m256d const coeff_1 = _mm256_set1_pd(-0.333333333333311110369124);
395
- __m256d const sign_mask = _mm256_set1_pd(-0.0);
396
-
397
- // Adjust for quadrant - detect negative values
398
- __m256d values = inputs;
399
- __m256d negative_mask = _mm256_cmp_pd(values, _mm256_setzero_pd(), _CMP_LT_OS);
400
- values = _mm256_andnot_pd(sign_mask, values); // abs(values)
401
-
402
- // Check if values > 1 (need reciprocal)
377
+ __m256d const coeff_19_f64x4 = _mm256_set1_pd(-1.88796008463073496563746e-05);
378
+ __m256d const coeff_18_f64x4 = _mm256_set1_pd(+0.000209850076645816976906797);
379
+ __m256d const coeff_17_f64x4 = _mm256_set1_pd(-0.00110611831486672482563471);
380
+ __m256d const coeff_16_f64x4 = _mm256_set1_pd(+0.00370026744188713119232403);
381
+ __m256d const coeff_15_f64x4 = _mm256_set1_pd(-0.00889896195887655491740809);
382
+ __m256d const coeff_14_f64x4 = _mm256_set1_pd(+0.016599329773529201970117);
383
+ __m256d const coeff_13_f64x4 = _mm256_set1_pd(-0.0254517624932312641616861);
384
+ __m256d const coeff_12_f64x4 = _mm256_set1_pd(+0.0337852580001353069993897);
385
+ __m256d const coeff_11_f64x4 = _mm256_set1_pd(-0.0407629191276836500001934);
386
+ __m256d const coeff_10_f64x4 = _mm256_set1_pd(+0.0466667150077840625632675);
387
+ __m256d const coeff_9_f64x4 = _mm256_set1_pd(-0.0523674852303482457616113);
388
+ __m256d const coeff_8_f64x4 = _mm256_set1_pd(+0.0587666392926673580854313);
389
+ __m256d const coeff_7_f64x4 = _mm256_set1_pd(-0.0666573579361080525984562);
390
+ __m256d const coeff_6_f64x4 = _mm256_set1_pd(+0.0769219538311769618355029);
391
+ __m256d const coeff_5_f64x4 = _mm256_set1_pd(-0.090908995008245008229153);
392
+ __m256d const coeff_4_f64x4 = _mm256_set1_pd(+0.111111105648261418443745);
393
+ __m256d const coeff_3_f64x4 = _mm256_set1_pd(-0.14285714266771329383765);
394
+ __m256d const coeff_2_f64x4 = _mm256_set1_pd(+0.199999999996591265594148);
395
+ __m256d const coeff_1_f64x4 = _mm256_set1_pd(-0.333333333333311110369124);
396
+ __m256d const sign_mask_f64x4 = _mm256_set1_pd(-0.0);
397
+
398
+ // Adjust for quadrant - detect negative values_f64x4
399
+ __m256d values_f64x4 = inputs;
400
+ __m256d negative_mask_f64x4 = _mm256_cmp_pd(values_f64x4, _mm256_setzero_pd(), _CMP_LT_OS);
401
+ values_f64x4 = _mm256_andnot_pd(sign_mask_f64x4, values_f64x4); // abs(values_f64x4)
402
+
403
+ // Check if values_f64x4 > 1 (need reciprocal)
403
404
  // Note: For f64, we keep VDIVPD since RCPPD doesn't exist and Newton-Raphson
404
405
  // would need 2 iterations for sufficient precision (~44 bits needed for f64)
405
- __m256d reciprocal_mask = _mm256_cmp_pd(values, _mm256_set1_pd(1.0), _CMP_GT_OS);
406
- __m256d reciprocal_values = _mm256_div_pd(_mm256_set1_pd(1.0), values);
407
- values = _mm256_blendv_pd(values, reciprocal_values, reciprocal_mask);
406
+ __m256d reciprocal_mask_f64x4 = _mm256_cmp_pd(values_f64x4, _mm256_set1_pd(1.0), _CMP_GT_OS);
407
+ __m256d reciprocal_values_f64x4 = _mm256_div_pd(_mm256_set1_pd(1.0), values_f64x4);
408
+ values_f64x4 = _mm256_blendv_pd(values_f64x4, reciprocal_values_f64x4, reciprocal_mask_f64x4);
408
409
 
409
410
  // Argument reduction
410
- __m256d const values_squared = _mm256_mul_pd(values, values);
411
- __m256d const values_cubed = _mm256_mul_pd(values, values_squared);
411
+ __m256d const values_squared_f64x4 = _mm256_mul_pd(values_f64x4, values_f64x4);
412
+ __m256d const values_cubed_f64x4 = _mm256_mul_pd(values_f64x4, values_squared_f64x4);
412
413
 
413
414
  // Polynomial evaluation using Horner's method.
414
415
  // For large arrays, out-of-order execution across loop iterations already hides
415
416
  // FMA latency. Estrin's scheme was tested but showed minimal improvement (~1%)
416
417
  // while adding complexity. Keeping Horner for maintainability.
417
- __m256d polynomials = coeff_19;
418
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_18);
419
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_17);
420
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_16);
421
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_15);
422
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_14);
423
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_13);
424
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_12);
425
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_11);
426
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_10);
427
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_9);
428
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_8);
429
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_7);
430
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_6);
431
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_5);
432
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_4);
433
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_3);
434
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_2);
435
- polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_1);
436
-
437
- // Compute result
438
- __m256d result = _mm256_fmadd_pd(values_cubed, polynomials, values);
439
-
440
- // Adjust for reciprocal: result = π/2 - result
441
- __m256d adjusted = _mm256_sub_pd(_mm256_set1_pd(1.5707963267948966), result);
442
- result = _mm256_blendv_pd(result, adjusted, reciprocal_mask);
443
-
444
- // Adjust for negative: result = -result
445
- __m256d negated = _mm256_sub_pd(_mm256_setzero_pd(), result);
446
- result = _mm256_blendv_pd(result, negated, negative_mask);
447
- return result;
418
+ __m256d polynomials_f64x4 = coeff_19_f64x4;
419
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_18_f64x4);
420
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_17_f64x4);
421
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_16_f64x4);
422
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_15_f64x4);
423
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_14_f64x4);
424
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_13_f64x4);
425
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_12_f64x4);
426
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_11_f64x4);
427
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_10_f64x4);
428
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_9_f64x4);
429
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_8_f64x4);
430
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_7_f64x4);
431
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_6_f64x4);
432
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_5_f64x4);
433
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_4_f64x4);
434
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_3_f64x4);
435
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_2_f64x4);
436
+ polynomials_f64x4 = _mm256_fmadd_pd(polynomials_f64x4, values_squared_f64x4, coeff_1_f64x4);
437
+
438
+ // Compute result_f64x4
439
+ __m256d result_f64x4 = _mm256_fmadd_pd(values_cubed_f64x4, polynomials_f64x4, values_f64x4);
440
+
441
+ // Adjust for reciprocal: result_f64x4 = π/2 - result_f64x4
442
+ __m256d adjusted_f64x4 = _mm256_sub_pd(_mm256_set1_pd(1.5707963267948966), result_f64x4);
443
+ result_f64x4 = _mm256_blendv_pd(result_f64x4, adjusted_f64x4, reciprocal_mask_f64x4);
444
+
445
+ // Adjust for negative: result_f64x4 = -result_f64x4
446
+ __m256d negated_f64x4 = _mm256_sub_pd(_mm256_setzero_pd(), result_f64x4);
447
+ result_f64x4 = _mm256_blendv_pd(result_f64x4, negated_f64x4, negative_mask_f64x4);
448
+ return result_f64x4;
448
449
  }
449
450
 
450
451
NK_INTERNAL __m256d nk_atan2_f64x4_haswell_(__m256d const ys_inputs, __m256d const xs_inputs) {
    // atan2 over 4 packed doubles: reduce to atan on a ratio in [0, 1], evaluate a
    // degree-39 odd minimax polynomial (19 coefficients in r^2) via Horner's scheme,
    // then undo the reductions with quadrant offsets and sign transfer.
    __m256d const c19 = _mm256_set1_pd(-1.88796008463073496563746e-05);
    __m256d const c18 = _mm256_set1_pd(+0.000209850076645816976906797);
    __m256d const c17 = _mm256_set1_pd(-0.00110611831486672482563471);
    __m256d const c16 = _mm256_set1_pd(+0.00370026744188713119232403);
    __m256d const c15 = _mm256_set1_pd(-0.00889896195887655491740809);
    __m256d const c14 = _mm256_set1_pd(+0.016599329773529201970117);
    __m256d const c13 = _mm256_set1_pd(-0.0254517624932312641616861);
    __m256d const c12 = _mm256_set1_pd(+0.0337852580001353069993897);
    __m256d const c11 = _mm256_set1_pd(-0.0407629191276836500001934);
    __m256d const c10 = _mm256_set1_pd(+0.0466667150077840625632675);
    __m256d const c9 = _mm256_set1_pd(-0.0523674852303482457616113);
    __m256d const c8 = _mm256_set1_pd(+0.0587666392926673580854313);
    __m256d const c7 = _mm256_set1_pd(-0.0666573579361080525984562);
    __m256d const c6 = _mm256_set1_pd(+0.0769219538311769618355029);
    __m256d const c5 = _mm256_set1_pd(-0.090908995008245008229153);
    __m256d const c4 = _mm256_set1_pd(+0.111111105648261418443745);
    __m256d const c3 = _mm256_set1_pd(-0.14285714266771329383765);
    __m256d const c2 = _mm256_set1_pd(+0.199999999996591265594148);
    __m256d const c1 = _mm256_set1_pd(-0.333333333333311110369124);
    __m256d const sign_bit = _mm256_set1_pd(-0.0); // only the IEEE-754 sign bit set

    // Remember which x lanes are negative, then strip both signs for the reduction.
    __m256d x_is_negative = _mm256_cmp_pd(xs_inputs, _mm256_setzero_pd(), _CMP_LT_OS);
    __m256d abs_xs = _mm256_andnot_pd(sign_bit, xs_inputs); // |x|
    __m256d abs_ys = _mm256_andnot_pd(sign_bit, ys_inputs); // |y|

    // Where |y| > |x|, substitute (x, y) <- (|y|, -|x|) so the ratio below
    // stays inside the polynomial's convergence range.
    __m256d swapped = _mm256_cmp_pd(abs_ys, abs_xs, _CMP_GT_OS);
    __m256d saved_xs = abs_xs;
    abs_xs = _mm256_blendv_pd(abs_xs, abs_ys, swapped);
    __m256d negated_saved = _mm256_sub_pd(_mm256_setzero_pd(), saved_xs);
    abs_ys = _mm256_blendv_pd(abs_ys, negated_saved, swapped);

    // The ratio and its powers feed Horner evaluation.
    __m256d const r = _mm256_div_pd(abs_ys, abs_xs);
    __m256d const r2 = _mm256_mul_pd(r, r);
    __m256d const r3 = _mm256_mul_pd(r, r2);

    // Horner's method in r^2, highest coefficient first.
    __m256d poly = c19;
    poly = _mm256_fmadd_pd(poly, r2, c18);
    poly = _mm256_fmadd_pd(poly, r2, c17);
    poly = _mm256_fmadd_pd(poly, r2, c16);
    poly = _mm256_fmadd_pd(poly, r2, c15);
    poly = _mm256_fmadd_pd(poly, r2, c14);
    poly = _mm256_fmadd_pd(poly, r2, c13);
    poly = _mm256_fmadd_pd(poly, r2, c12);
    poly = _mm256_fmadd_pd(poly, r2, c11);
    poly = _mm256_fmadd_pd(poly, r2, c10);
    poly = _mm256_fmadd_pd(poly, r2, c9);
    poly = _mm256_fmadd_pd(poly, r2, c8);
    poly = _mm256_fmadd_pd(poly, r2, c7);
    poly = _mm256_fmadd_pd(poly, r2, c6);
    poly = _mm256_fmadd_pd(poly, r2, c5);
    poly = _mm256_fmadd_pd(poly, r2, c4);
    poly = _mm256_fmadd_pd(poly, r2, c3);
    poly = _mm256_fmadd_pd(poly, r2, c2);
    poly = _mm256_fmadd_pd(poly, r2, c1);

    // atan(r) ~= r + r^3 * poly(r^2)
    __m256d outs = _mm256_fmadd_pd(r3, poly, r);

    // Quadrant index per lane: 0 (x>=0, no swap), 1 (x>=0, swap),
    // -2 (x<0, no swap), -1 (x<0, swap).
    __m256d quadrants = _mm256_setzero_pd();
    __m256d minus_two = _mm256_set1_pd(-2.0);
    quadrants = _mm256_blendv_pd(quadrants, minus_two, x_is_negative);
    __m256d ones = _mm256_set1_pd(1.0);
    __m256d quadrants_plus_one = _mm256_add_pd(quadrants, ones);
    quadrants = _mm256_blendv_pd(quadrants, quadrants_plus_one, swapped);

    // Fold the quadrant back in: outs += quadrants * pi/2.
    __m256d half_pi = _mm256_set1_pd(1.5707963267948966);
    outs = _mm256_fmadd_pd(quadrants, half_pi, outs);

    // Transfer the sign of the raw x input (XOR with its sign bit).
    __m256d x_signs = _mm256_and_pd(xs_inputs, sign_bit);
    outs = _mm256_xor_pd(outs, x_signs);

    // Transfer the sign of the raw y input the same way.
    __m256d y_signs = _mm256_and_pd(ys_inputs, sign_bit);
    outs = _mm256_xor_pd(outs, y_signs);

    return outs;
}
537
538
 
538
539
  NK_PUBLIC void nk_each_sin_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
539
540
  nk_size_t i = 0;
540
541
  for (; i + 8 <= n; i += 8) {
541
- __m256 angles = _mm256_loadu_ps(ins + i);
542
- __m256 results = nk_sin_f32x8_haswell_(angles);
543
- _mm256_storeu_ps(outs + i, results);
542
+ __m256 angles_f32x8 = _mm256_loadu_ps(ins + i);
543
+ __m256 results_f32x8 = nk_sin_f32x8_haswell_(angles_f32x8);
544
+ _mm256_storeu_ps(outs + i, results_f32x8);
544
545
  }
545
546
  if (i < n) {
546
547
  nk_size_t remaining = n - i;
@@ -555,9 +556,9 @@ NK_PUBLIC void nk_each_sin_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_
555
556
  NK_PUBLIC void nk_each_cos_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
556
557
  nk_size_t i = 0;
557
558
  for (; i + 8 <= n; i += 8) {
558
- __m256 angles = _mm256_loadu_ps(ins + i);
559
- __m256 results = nk_cos_f32x8_haswell_(angles);
560
- _mm256_storeu_ps(outs + i, results);
559
+ __m256 angles_f32x8 = _mm256_loadu_ps(ins + i);
560
+ __m256 results_f32x8 = nk_cos_f32x8_haswell_(angles_f32x8);
561
+ _mm256_storeu_ps(outs + i, results_f32x8);
561
562
  }
562
563
  if (i < n) {
563
564
  nk_size_t remaining = n - i;
@@ -572,9 +573,9 @@ NK_PUBLIC void nk_each_cos_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_
572
573
  NK_PUBLIC void nk_each_atan_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
573
574
  nk_size_t i = 0;
574
575
  for (; i + 8 <= n; i += 8) {
575
- __m256 values = _mm256_loadu_ps(ins + i);
576
- __m256 results = nk_atan_f32x8_haswell_(values);
577
- _mm256_storeu_ps(outs + i, results);
576
+ __m256 values_f32x8 = _mm256_loadu_ps(ins + i);
577
+ __m256 results_f32x8 = nk_atan_f32x8_haswell_(values_f32x8);
578
+ _mm256_storeu_ps(outs + i, results_f32x8);
578
579
  }
579
580
  if (i < n) {
580
581
  nk_size_t remaining = n - i;
@@ -589,9 +590,9 @@ NK_PUBLIC void nk_each_atan_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32
589
590
  NK_PUBLIC void nk_each_sin_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
590
591
  nk_size_t i = 0;
591
592
  for (; i + 4 <= n; i += 4) {
592
- __m256d angles = _mm256_loadu_pd(ins + i);
593
- __m256d results = nk_sin_f64x4_haswell_(angles);
594
- _mm256_storeu_pd(outs + i, results);
593
+ __m256d angles_f64x4 = _mm256_loadu_pd(ins + i);
594
+ __m256d results_f64x4 = nk_sin_f64x4_haswell_(angles_f64x4);
595
+ _mm256_storeu_pd(outs + i, results_f64x4);
595
596
  }
596
597
  if (i < n) {
597
598
  nk_size_t remaining = n - i;
@@ -606,9 +607,9 @@ NK_PUBLIC void nk_each_sin_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_
606
607
  NK_PUBLIC void nk_each_cos_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
607
608
  nk_size_t i = 0;
608
609
  for (; i + 4 <= n; i += 4) {
609
- __m256d angles = _mm256_loadu_pd(ins + i);
610
- __m256d results = nk_cos_f64x4_haswell_(angles);
611
- _mm256_storeu_pd(outs + i, results);
610
+ __m256d angles_f64x4 = _mm256_loadu_pd(ins + i);
611
+ __m256d results_f64x4 = nk_cos_f64x4_haswell_(angles_f64x4);
612
+ _mm256_storeu_pd(outs + i, results_f64x4);
612
613
  }
613
614
  if (i < n) {
614
615
  nk_size_t remaining = n - i;
@@ -623,9 +624,9 @@ NK_PUBLIC void nk_each_cos_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_
623
624
  NK_PUBLIC void nk_each_atan_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
624
625
  nk_size_t i = 0;
625
626
  for (; i + 4 <= n; i += 4) {
626
- __m256d values = _mm256_loadu_pd(ins + i);
627
- __m256d results = nk_atan_f64x4_haswell_(values);
628
- _mm256_storeu_pd(outs + i, results);
627
+ __m256d values_f64x4 = _mm256_loadu_pd(ins + i);
628
+ __m256d results_f64x4 = nk_atan_f64x4_haswell_(values_f64x4);
629
+ _mm256_storeu_pd(outs + i, results_f64x4);
629
630
  }
630
631
  if (i < n) {
631
632
  nk_size_t remaining = n - i;