numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,652 @@
1
+ /**
2
+ * @brief SIMD-accelerated Trigonometric Functions for Haswell.
3
+ * @file include/numkong/trigonometry/haswell.h
4
+ * @author Ash Vardanian
5
+ * @date December 27, 2025
6
+ *
7
+ * @sa include/numkong/trigonometry.h
8
+ * @see https://sleef.org
9
+ *
10
+ * @section haswell_trig_instructions Key AVX2 Trigonometry Instructions
11
+ *
12
+ * Intrinsic Instruction Latency Throughput Ports
13
+ * _mm256_fmadd_ps/pd VFMADD (YMM, YMM, YMM) 5cy 0.5/cy p01
14
+ * _mm256_mul_ps/pd VMULPS/PD (YMM, YMM, YMM) 5cy 0.5/cy p01
15
+ * _mm256_blendv_ps/pd VBLENDVPS/PD (YMM, YMM, YMM) 2cy 1/cy p015
16
+ * _mm256_round_ps/pd VROUNDPS/PD (YMM, YMM, I8) 6cy 1/cy p01
17
+ * _mm256_div_ps VDIVPS (YMM, YMM, YMM) 13cy 5/cy p0
18
+ *
19
+ * Polynomial evaluation uses Horner's method with FMA for sin/cos/atan approximation. For large
20
+ * arrays, out-of-order execution across loop iterations hides FMA latency better than Estrin's
21
+ * scheme. Range reduction uses argument folding modulo pi with high/low precision constants.
22
+ */
23
+ #ifndef NK_TRIGONOMETRY_HASWELL_H
24
+ #define NK_TRIGONOMETRY_HASWELL_H
25
+
26
+ #if NK_TARGET_X86_
27
+ #if NK_TARGET_HASWELL
28
+
29
+ #include "numkong/types.h"
30
+ #include "numkong/reduce/haswell.h"
31
+
32
+ #if defined(__cplusplus)
33
+ extern "C" {
34
+ #endif
35
+
36
+ #if defined(__clang__)
37
+ #pragma clang attribute push(__attribute__((target("avx2,f16c,fma,bmi,bmi2"))), apply_to = function)
38
+ #elif defined(__GNUC__)
39
+ #pragma GCC push_options
40
+ #pragma GCC target("avx2", "f16c", "fma", "bmi", "bmi2")
41
+ #endif
42
+
43
+ /* Haswell AVX2 trigonometry kernels (8-way f32, 4-way f64)
44
+ * These implement the same polynomial approximations as Skylake but with 256-bit vectors.
45
+ */
46
+
47
+ NK_INTERNAL __m256 nk_sin_f32x8_haswell_(__m256 const angles_radians) {
48
+ // sin(x) for each of the 8 packed f32 lanes. Cody-Waite constants for argument reduction: π is split into a high part and a small low-order correction
49
+ __m256 const pi_hi_f32x8 = _mm256_set1_ps(3.1415927f);
50
+ __m256 const pi_lo_f32x8 = _mm256_set1_ps(-8.742278e-8f);
51
+ __m256 const pi_reciprocal = _mm256_set1_ps(0.31830988618379067154f); // 1/π
52
+ // Degree-9 minimax coefficients, each named after the odd power it multiplies (r³, r⁵, r⁷, r⁹)
53
+ __m256 const coeff_9 = _mm256_set1_ps(+2.7557319224e-6f);
54
+ __m256 const coeff_7 = _mm256_set1_ps(-1.9841269841e-4f);
55
+ __m256 const coeff_5 = _mm256_set1_ps(+8.3333293855e-3f);
56
+ __m256 const coeff_3 = _mm256_set1_ps(-1.6666666641e-1f);
57
+
58
+ // Compute (multiples_of_pi) = round(angle / π)
59
+ __m256 quotients = _mm256_mul_ps(angles_radians, pi_reciprocal);
60
+ __m256 rounded_quotients = _mm256_round_ps(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
61
+ // Use truncation (MXCSR-independent) since rounded_quotients is already integer-valued
62
+ __m256i multiples_of_pi = _mm256_cvttps_epi32(rounded_quotients);
63
+
64
+ // Cody-Waite range reduction in two FMA steps: r = angle - k × π_hi, then r = r - k × π_lo
65
+ __m256 angles = _mm256_fnmadd_ps(rounded_quotients, pi_hi_f32x8, angles_radians);
66
+ angles = _mm256_fnmadd_ps(rounded_quotients, pi_lo_f32x8, angles);
67
+ __m256 const angles_squared = _mm256_mul_ps(angles, angles);
68
+ __m256 const angles_cubed = _mm256_mul_ps(angles, angles_squared);
69
+
70
+ // Degree-9 polynomial via Horner's method: sin(r) ≈ r + r³ × (c3 + c5 × r² + c7 × r⁴ + c9 × r⁶)
71
+ __m256 polynomials = coeff_9;
72
+ polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_7);
73
+ polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_5);
74
+ polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_3);
75
+ __m256 results = _mm256_fmadd_ps(angles_cubed, polynomials, angles);
76
+
77
+ // If multiples_of_pi is odd, flip the sign of the results, because sin(r + k × π) = (-1)^k × sin(r)
78
+ __m256i parity = _mm256_and_si256(multiples_of_pi, _mm256_set1_epi32(1));
79
+ __m256i odd_mask = _mm256_cmpeq_epi32(parity, _mm256_set1_epi32(1));
80
+ __m256 float_mask = _mm256_castsi256_ps(odd_mask);
81
+ __m256 negated = _mm256_sub_ps(_mm256_setzero_ps(), results);
82
+ results = _mm256_blendv_ps(results, negated, float_mask);
83
+ return results;
84
+ }
85
+
86
+ NK_INTERNAL __m256 nk_cos_f32x8_haswell_(__m256 const angles_radians) {
87
+ // cos(x) for each of the 8 packed f32 lanes, evaluated as a sign-adjusted sine of the angle shifted by π/2. Cody-Waite constants for argument reduction
88
+ __m256 const pi_hi_f32x8 = _mm256_set1_ps(3.1415927f);
89
+ __m256 const pi_lo_f32x8 = _mm256_set1_ps(-8.742278e-8f);
90
+ __m256 const pi_half = _mm256_set1_ps(1.57079632679489661923f); // π/2
91
+ __m256 const pi_reciprocal = _mm256_set1_ps(0.31830988618379067154f); // 1/π
92
+ // Degree-9 minimax coefficients, each named after the odd power it multiplies (r³, r⁵, r⁷, r⁹)
93
+ __m256 const coeff_9 = _mm256_set1_ps(+2.7557319224e-6f);
94
+ __m256 const coeff_7 = _mm256_set1_ps(-1.9841269841e-4f);
95
+ __m256 const coeff_5 = _mm256_set1_ps(+8.3333293855e-3f);
96
+ __m256 const coeff_3 = _mm256_set1_ps(-1.6666666641e-1f);
97
+
98
+ // Compute (multiples_of_pi) = round((angle / π) - 0.5)
99
+ __m256 quotients = _mm256_fmsub_ps(angles_radians, pi_reciprocal, _mm256_set1_ps(0.5f));
100
+ __m256 rounded_quotients = _mm256_round_ps(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
101
+ // Use truncation (MXCSR-independent) since rounded_quotients is already integer-valued
102
+ __m256i multiples_of_pi = _mm256_cvttps_epi32(rounded_quotients);
103
+
104
+ // Cody-Waite range reduction: r = angle_radians - (k × π_hi + π/2), followed by the low-order correction r = r - k × π_lo
105
+ __m256 const offset = _mm256_fmadd_ps(rounded_quotients, pi_hi_f32x8, pi_half);
106
+ __m256 angles = _mm256_sub_ps(angles_radians, offset);
107
+ angles = _mm256_fnmadd_ps(rounded_quotients, pi_lo_f32x8, angles);
108
+ __m256 const angles_squared = _mm256_mul_ps(angles, angles);
109
+ __m256 const angles_cubed = _mm256_mul_ps(angles, angles_squared);
110
+
111
+ // Degree-9 polynomial via Horner's method: sin(r) ≈ r + r³ × (c3 + c5 × r² + c7 × r⁴ + c9 × r⁶)
112
+ __m256 polynomials = coeff_9;
113
+ polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_7);
114
+ polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_5);
115
+ polynomials = _mm256_fmadd_ps(polynomials, angles_squared, coeff_3);
116
+ __m256 results = _mm256_fmadd_ps(angles_cubed, polynomials, angles);
117
+
118
+ // If multiples_of_pi is even, flip the sign of the results, because cos(r + k × π + π/2) = -(-1)^k × sin(r)
119
+ __m256i parity = _mm256_and_si256(multiples_of_pi, _mm256_set1_epi32(1));
120
+ __m256i even_mask = _mm256_cmpeq_epi32(parity, _mm256_setzero_si256());
121
+ __m256 float_mask = _mm256_castsi256_ps(even_mask);
122
+ __m256 negated = _mm256_sub_ps(_mm256_setzero_ps(), results);
123
+ results = _mm256_blendv_ps(results, negated, float_mask);
124
+ return results;
125
+ }
126
+
127
+ NK_INTERNAL __m256 nk_atan_f32x8_haswell_(__m256 const inputs) {
128
+ // atan(x) for each of the 8 packed f32 lanes. Polynomial coefficients for atan approximation (8 terms)
129
+ // These coefficients approximate: atan(x) ≈ x + c8 × x³ + c7 × x⁵ + c6 × x⁷ + ... + c1 × x¹⁷
130
+ __m256 const coeff_8 = _mm256_set1_ps(-0.333331018686294555664062f);
131
+ __m256 const coeff_7 = _mm256_set1_ps(+0.199926957488059997558594f);
132
+ __m256 const coeff_6 = _mm256_set1_ps(-0.142027363181114196777344f);
133
+ __m256 const coeff_5 = _mm256_set1_ps(+0.106347933411598205566406f);
134
+ __m256 const coeff_4 = _mm256_set1_ps(-0.0748900920152664184570312f);
135
+ __m256 const coeff_3 = _mm256_set1_ps(+0.0425049886107444763183594f);
136
+ __m256 const coeff_2 = _mm256_set1_ps(-0.0159569028764963150024414f);
137
+ __m256 const coeff_1 = _mm256_set1_ps(+0.00282363896258175373077393f);
138
+ __m256 const sign_mask = _mm256_set1_ps(-0.0f);
139
+
140
+ // Record which lanes are negative (the sign is re-applied at the end), then continue with absolute values
141
+ __m256 values = inputs;
142
+ __m256 negative_mask = _mm256_cmp_ps(values, _mm256_setzero_ps(), _CMP_LT_OS);
143
+ values = _mm256_andnot_ps(sign_mask, values); // abs(values)
144
+
145
+ // Check if values > 1 (need reciprocal): those lanes evaluate the polynomial at 1/values and are corrected below
146
+ __m256 reciprocal_mask = _mm256_cmp_ps(values, _mm256_set1_ps(1.0f), _CMP_GT_OS);
147
+ __m256 reciprocal_values = _mm256_div_ps(_mm256_set1_ps(1.0f), values);
148
+ values = _mm256_blendv_ps(values, reciprocal_values, reciprocal_mask);
149
+
150
+ // Powers of the reduced argument consumed by the polynomial below
151
+ __m256 const values_squared = _mm256_mul_ps(values, values);
152
+ __m256 const values_cubed = _mm256_mul_ps(values, values_squared);
153
+
154
+ // Polynomial evaluation using Horner's method.
155
+ // For large arrays, out-of-order execution across loop iterations already hides
156
+ // FMA latency. Estrin's scheme was tested but showed ~20% regression because
157
+ // the extra power computations (y², y⁴) hurt throughput more than the reduced
158
+ // dependency depth helps latency.
159
+ __m256 polynomials = coeff_1;
160
+ polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_2);
161
+ polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_3);
162
+ polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_4);
163
+ polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_5);
164
+ polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_6);
165
+ polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_7);
166
+ polynomials = _mm256_fmadd_ps(polynomials, values_squared, coeff_8);
167
+
168
+ // Compute result: atan(x) ≈ x + x³ * P(x²)
169
+ __m256 result = _mm256_fmadd_ps(values_cubed, polynomials, values);
170
+
171
+ // Adjust for reciprocal: result = π/2 - result (applied only to lanes where the reciprocal was taken)
172
+ __m256 adjusted = _mm256_sub_ps(_mm256_set1_ps(1.5707963267948966f), result);
173
+ result = _mm256_blendv_ps(result, adjusted, reciprocal_mask);
174
+
175
+ // Adjust for negative: result = -result (atan is odd, so negative inputs get the negated result)
176
+ __m256 negated = _mm256_sub_ps(_mm256_setzero_ps(), result);
177
+ result = _mm256_blendv_ps(result, negated, negative_mask);
178
+ return result;
179
+ }
180
+
181
+ NK_INTERNAL __m256 nk_atan2_f32x8_haswell_(__m256 const ys_inputs, __m256 const xs_inputs) {
182
+ // atan2(y, x) for each of the 8 packed f32 lanes. Polynomial coefficients (same as atan)
183
+ __m256 const coeff_8 = _mm256_set1_ps(-0.333331018686294555664062f);
184
+ __m256 const coeff_7 = _mm256_set1_ps(+0.199926957488059997558594f);
185
+ __m256 const coeff_6 = _mm256_set1_ps(-0.142027363181114196777344f);
186
+ __m256 const coeff_5 = _mm256_set1_ps(+0.106347933411598205566406f);
187
+ __m256 const coeff_4 = _mm256_set1_ps(-0.0748900920152664184570312f);
188
+ __m256 const coeff_3 = _mm256_set1_ps(+0.0425049886107444763183594f);
189
+ __m256 const coeff_2 = _mm256_set1_ps(-0.0159569028764963150024414f);
190
+ __m256 const coeff_1 = _mm256_set1_ps(+0.00282363896258175373077393f);
191
+ __m256 const sign_mask = _mm256_set1_ps(-0.0f);
192
+
193
+ // Quadrant adjustments normalizing to absolute values of x and y
194
+ __m256 xs_negative_mask = _mm256_cmp_ps(xs_inputs, _mm256_setzero_ps(), _CMP_LT_OS);
195
+ __m256 xs = _mm256_andnot_ps(sign_mask, xs_inputs); // abs(xs_inputs)
196
+ __m256 ys = _mm256_andnot_ps(sign_mask, ys_inputs); // abs(ys_inputs)
197
+
198
+ // Ensure proper fraction where the numerator is smaller than the denominator: lanes with |y| > |x| use the ratio -|x| / |y| instead of |y| / |x|
199
+ __m256 swap_mask = _mm256_cmp_ps(ys, xs, _CMP_GT_OS);
200
+ __m256 temps = xs;
201
+ xs = _mm256_blendv_ps(xs, ys, swap_mask);
202
+ __m256 neg_temps = _mm256_sub_ps(_mm256_setzero_ps(), temps);
203
+ ys = _mm256_blendv_ps(ys, neg_temps, swap_mask);
204
+
205
+ // Compute ratio and powers
206
+ __m256 const ratio = _mm256_div_ps(ys, xs);
207
+ __m256 const ratio_squared = _mm256_mul_ps(ratio, ratio);
208
+ __m256 const ratio_cubed = _mm256_mul_ps(ratio, ratio_squared);
209
+
210
+ // Polynomial evaluation using Horner's method
211
+ __m256 polynomials = coeff_1;
212
+ polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_2);
213
+ polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_3);
214
+ polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_4);
215
+ polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_5);
216
+ polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_6);
217
+ polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_7);
218
+ polynomials = _mm256_fmadd_ps(polynomials, ratio_squared, coeff_8);
219
+
220
+ // Approximate atan(ratio) ≈ ratio + ratio³ × P(ratio²); the quadrant and sign corrections follow
221
+ __m256 results = _mm256_fmadd_ps(ratio_cubed, polynomials, ratio);
222
+
223
+ // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
224
+ // -2 for x<0 && !swap, -1 for x<0 && swap
225
+ __m256 quadrant = _mm256_setzero_ps();
226
+ __m256 neg_two = _mm256_set1_ps(-2.0f);
227
+ quadrant = _mm256_blendv_ps(quadrant, neg_two, xs_negative_mask);
228
+ __m256 one = _mm256_set1_ps(1.0f);
229
+ __m256 quadrant_incremented = _mm256_add_ps(quadrant, one);
230
+ quadrant = _mm256_blendv_ps(quadrant, quadrant_incremented, swap_mask);
231
+
232
+ // Adjust for quadrant: result += quadrant * π/2
233
+ __m256 pi_half = _mm256_set1_ps(1.5707963267948966f);
234
+ results = _mm256_fmadd_ps(quadrant, pi_half, results);
235
+
236
+ // Transfer sign from x (XOR with sign bit of x_input)
237
+ __m256 xs_sign_bits = _mm256_and_ps(xs_inputs, sign_mask);
238
+ results = _mm256_xor_ps(results, xs_sign_bits);
239
+
240
+ // Transfer sign from y (XOR with sign bit of y_input)
241
+ __m256 ys_sign_bits = _mm256_and_ps(ys_inputs, sign_mask);
242
+ results = _mm256_xor_ps(results, ys_sign_bits);
243
+
244
+ return results;
245
+ }
246
+
247
+ NK_INTERNAL __m256d nk_sin_f64x4_haswell_(__m256d const angles_radians) {
248
+ // sin(x) for each of the 4 packed f64 lanes. Constants for argument reduction
249
+ __m256d const pi_high = _mm256_set1_pd(3.141592653589793116); // High-digits part of π
250
+ __m256d const pi_low = _mm256_set1_pd(1.2246467991473532072e-16); // Low-digits part of π
251
+ __m256d const pi_reciprocal = _mm256_set1_pd(0.31830988618379067154); // 1/π
252
+
253
+ // Polynomial coefficients for sine approximation (minimax polynomial): coeff_8 multiplies r³, while coeff_0..coeff_7 multiply r⁵..r¹⁹
254
+ __m256d const coeff_0 = _mm256_set1_pd(+0.00833333333333332974823815);
255
+ __m256d const coeff_1 = _mm256_set1_pd(-0.000198412698412696162806809);
256
+ __m256d const coeff_2 = _mm256_set1_pd(+2.75573192239198747630416e-06);
257
+ __m256d const coeff_3 = _mm256_set1_pd(-2.50521083763502045810755e-08);
258
+ __m256d const coeff_4 = _mm256_set1_pd(+1.60590430605664501629054e-10);
259
+ __m256d const coeff_5 = _mm256_set1_pd(-7.64712219118158833288484e-13);
260
+ __m256d const coeff_6 = _mm256_set1_pd(+2.81009972710863200091251e-15);
261
+ __m256d const coeff_7 = _mm256_set1_pd(-7.97255955009037868891952e-18);
262
+ __m256d const coeff_8 = _mm256_set1_pd(-0.166666666666666657414808);
263
+
264
+ // Compute (rounded_quotients) = round(angle / π)
265
+ __m256d const quotients = _mm256_mul_pd(angles_radians, pi_reciprocal);
266
+ __m256d const rounded_quotients = _mm256_round_pd(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
267
+
268
+ // Reduce the angle: angle - (rounded_quotients * π_high + rounded_quotients * π_low)
269
+ __m256d angles = angles_radians;
270
+ angles = _mm256_fnmadd_pd(rounded_quotients, pi_high, angles);
271
+ angles = _mm256_fnmadd_pd(rounded_quotients, pi_low, angles);
272
+
273
+ // If rounded_quotients is odd (bit 0 set), negate the angle
274
+ // Convert to 32-bit int (returns __m128i with 4 x 32-bit ints)
275
+ // Use truncation (MXCSR-independent) since rounded_quotients is already integer-valued
276
+ __m128i quotients_i32 = _mm256_cvttpd_epi32(rounded_quotients);
277
+ __m128i parity = _mm_and_si128(quotients_i32, _mm_set1_epi32(1));
278
+ __m128i odd_mask_i32 = _mm_cmpeq_epi32(parity, _mm_set1_epi32(1));
279
+ // Expand 32-bit mask to 64-bit by sign-extending each lane (all-ones stays all-ones, zero stays zero)
280
+ __m256i odd_mask_i64 = _mm256_cvtepi32_epi64(odd_mask_i32);
281
+ __m256d float_mask = _mm256_castsi256_pd(odd_mask_i64);
282
+ __m256d negated_angles = _mm256_sub_pd(_mm256_setzero_pd(), angles);
283
+ angles = _mm256_blendv_pd(angles, negated_angles, float_mask);
284
+
285
+ __m256d const angles_squared = _mm256_mul_pd(angles, angles);
286
+ __m256d const angles_cubed = _mm256_mul_pd(angles, angles_squared);
287
+ __m256d const angles_quadratic = _mm256_mul_pd(angles_squared, angles_squared);
288
+ __m256d const angles_octic = _mm256_mul_pd(angles_quadratic, angles_quadratic);
289
+
290
+ // Compute higher-degree polynomial terms; coefficients are combined pairwise, with angles_quadratic holding r⁴ and angles_octic holding r⁸
291
+ __m256d const poly_67 = _mm256_fmadd_pd(angles_squared, coeff_7, coeff_6);
292
+ __m256d const poly_45 = _mm256_fmadd_pd(angles_squared, coeff_5, coeff_4);
293
+ __m256d const poly_4567 = _mm256_fmadd_pd(angles_quadratic, poly_67, poly_45);
294
+
295
+ // Compute lower-degree polynomial terms
296
+ __m256d const poly_23 = _mm256_fmadd_pd(angles_squared, coeff_3, coeff_2);
297
+ __m256d const poly_01 = _mm256_fmadd_pd(angles_squared, coeff_1, coeff_0);
298
+ __m256d const poly_0123 = _mm256_fmadd_pd(angles_quadratic, poly_23, poly_01);
299
+
300
+ // Combine polynomial terms: sin(r) ≈ r + r³ × (coeff_8 + r² × (poly_0123 + r⁸ × poly_4567))
301
+ __m256d results = _mm256_fmadd_pd(angles_octic, poly_4567, poly_0123);
302
+ results = _mm256_fmadd_pd(results, angles_squared, coeff_8);
303
+ results = _mm256_fmadd_pd(results, angles_cubed, angles);
304
+
305
+ // Handle the special case of negative zero input: lanes whose input compares equal to zero (+0.0 or -0.0) get an all-zero mask, so the AND below forces their result to +0.0
306
+ __m256d const non_zero_mask = _mm256_cmp_pd(angles_radians, _mm256_setzero_pd(), _CMP_NEQ_UQ);
307
+ results = _mm256_and_pd(results, non_zero_mask);
308
+ return results;
309
+ }
310
+
311
+ NK_INTERNAL __m256d nk_cos_f64x4_haswell_(__m256d const angles_radians) {
312
+ // Constants for argument reduction
313
+ __m256d const pi_high_half = _mm256_set1_pd(3.141592653589793116 * 0.5); // High-digits part of π/2
314
+ __m256d const pi_low_half = _mm256_set1_pd(1.2246467991473532072e-16 * 0.5); // Low-digits part of π/2
315
+ __m256d const pi_reciprocal = _mm256_set1_pd(0.31830988618379067154); // 1/π
316
+
317
+ // Polynomial coefficients for cosine approximation
318
+ __m256d const coeff_0 = _mm256_set1_pd(+0.00833333333333332974823815);
319
+ __m256d const coeff_1 = _mm256_set1_pd(-0.000198412698412696162806809);
320
+ __m256d const coeff_2 = _mm256_set1_pd(+2.75573192239198747630416e-06);
321
+ __m256d const coeff_3 = _mm256_set1_pd(-2.50521083763502045810755e-08);
322
+ __m256d const coeff_4 = _mm256_set1_pd(+1.60590430605664501629054e-10);
323
+ __m256d const coeff_5 = _mm256_set1_pd(-7.64712219118158833288484e-13);
324
+ __m256d const coeff_6 = _mm256_set1_pd(+2.81009972710863200091251e-15);
325
+ __m256d const coeff_7 = _mm256_set1_pd(-7.97255955009037868891952e-18);
326
+ __m256d const coeff_8 = _mm256_set1_pd(-0.166666666666666657414808);
327
+
328
+ // Compute (rounded_quotients) = 2 * round(angle / π - 0.5) + 1
329
+ // Use fmsub: a*b - c = angles * (1/π) - 0.5
330
+ __m256d const quotients = _mm256_fmsub_pd(angles_radians, pi_reciprocal, _mm256_set1_pd(0.5));
331
+ __m256d const rounded_quotients = _mm256_fmadd_pd( //
332
+ _mm256_set1_pd(2.0), //
333
+ _mm256_round_pd(quotients, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), //
334
+ _mm256_set1_pd(1.0));
335
+
336
+ // Reduce the angle: angle - (rounded_quotients * π_high_half + rounded_quotients * π_low_half)
337
+ __m256d angles = angles_radians;
338
+ angles = _mm256_fnmadd_pd(rounded_quotients, pi_high_half, angles);
339
+ angles = _mm256_fnmadd_pd(rounded_quotients, pi_low_half, angles);
340
+
341
+ // If (rounded_quotients & 2) == 0, negate the angle
342
+ // Use truncation (MXCSR-independent) since rounded_quotients is already integer-valued
343
+ __m128i quotients_i32 = _mm256_cvttpd_epi32(rounded_quotients);
344
+ __m128i bit2 = _mm_and_si128(quotients_i32, _mm_set1_epi32(2));
345
+ __m128i flip_mask_i32 = _mm_cmpeq_epi32(bit2, _mm_setzero_si128());
346
+ __m256i flip_mask_i64 = _mm256_cvtepi32_epi64(flip_mask_i32);
347
+ __m256d float_mask = _mm256_castsi256_pd(flip_mask_i64);
348
+ __m256d negated_angles = _mm256_sub_pd(_mm256_setzero_pd(), angles);
349
+ angles = _mm256_blendv_pd(angles, negated_angles, float_mask);
350
+
351
+ __m256d const angles_squared = _mm256_mul_pd(angles, angles);
352
+ __m256d const angles_cubed = _mm256_mul_pd(angles, angles_squared);
353
+ __m256d const angles_quadratic = _mm256_mul_pd(angles_squared, angles_squared);
354
+ __m256d const angles_octic = _mm256_mul_pd(angles_quadratic, angles_quadratic);
355
+
356
+ // Compute higher-degree polynomial terms
357
+ __m256d const poly_67 = _mm256_fmadd_pd(angles_squared, coeff_7, coeff_6);
358
+ __m256d const poly_45 = _mm256_fmadd_pd(angles_squared, coeff_5, coeff_4);
359
+ __m256d const poly_4567 = _mm256_fmadd_pd(angles_quadratic, poly_67, poly_45);
360
+
361
+ // Compute lower-degree polynomial terms
362
+ __m256d const poly_23 = _mm256_fmadd_pd(angles_squared, coeff_3, coeff_2);
363
+ __m256d const poly_01 = _mm256_fmadd_pd(angles_squared, coeff_1, coeff_0);
364
+ __m256d const poly_0123 = _mm256_fmadd_pd(angles_quadratic, poly_23, poly_01);
365
+
366
+ // Combine polynomial terms
367
+ __m256d results = _mm256_fmadd_pd(angles_octic, poly_4567, poly_0123);
368
+ results = _mm256_fmadd_pd(results, angles_squared, coeff_8);
369
+ results = _mm256_fmadd_pd(results, angles_cubed, angles);
370
+ return results;
371
+ }
372
+
373
+ NK_INTERNAL __m256d nk_atan_f64x4_haswell_(__m256d const inputs) {
374
+ // Polynomial coefficients for atan approximation (19 coefficients)
375
+ // The polynomial approximates: atan(x) ≈ x + x³ * P(x²) where P has 19 terms
376
+ __m256d const coeff_19 = _mm256_set1_pd(-1.88796008463073496563746e-05);
377
+ __m256d const coeff_18 = _mm256_set1_pd(+0.000209850076645816976906797);
378
+ __m256d const coeff_17 = _mm256_set1_pd(-0.00110611831486672482563471);
379
+ __m256d const coeff_16 = _mm256_set1_pd(+0.00370026744188713119232403);
380
+ __m256d const coeff_15 = _mm256_set1_pd(-0.00889896195887655491740809);
381
+ __m256d const coeff_14 = _mm256_set1_pd(+0.016599329773529201970117);
382
+ __m256d const coeff_13 = _mm256_set1_pd(-0.0254517624932312641616861);
383
+ __m256d const coeff_12 = _mm256_set1_pd(+0.0337852580001353069993897);
384
+ __m256d const coeff_11 = _mm256_set1_pd(-0.0407629191276836500001934);
385
+ __m256d const coeff_10 = _mm256_set1_pd(+0.0466667150077840625632675);
386
+ __m256d const coeff_9 = _mm256_set1_pd(-0.0523674852303482457616113);
387
+ __m256d const coeff_8 = _mm256_set1_pd(+0.0587666392926673580854313);
388
+ __m256d const coeff_7 = _mm256_set1_pd(-0.0666573579361080525984562);
389
+ __m256d const coeff_6 = _mm256_set1_pd(+0.0769219538311769618355029);
390
+ __m256d const coeff_5 = _mm256_set1_pd(-0.090908995008245008229153);
391
+ __m256d const coeff_4 = _mm256_set1_pd(+0.111111105648261418443745);
392
+ __m256d const coeff_3 = _mm256_set1_pd(-0.14285714266771329383765);
393
+ __m256d const coeff_2 = _mm256_set1_pd(+0.199999999996591265594148);
394
+ __m256d const coeff_1 = _mm256_set1_pd(-0.333333333333311110369124);
395
+ __m256d const sign_mask = _mm256_set1_pd(-0.0);
396
+
397
+ // Adjust for quadrant - detect negative values
398
+ __m256d values = inputs;
399
+ __m256d negative_mask = _mm256_cmp_pd(values, _mm256_setzero_pd(), _CMP_LT_OS);
400
+ values = _mm256_andnot_pd(sign_mask, values); // abs(values)
401
+
402
+ // Check if values > 1 (need reciprocal)
403
+ // Note: For f64, we keep VDIVPD since RCPPD doesn't exist and Newton-Raphson
404
+ // would need 2 iterations for sufficient precision (~44 bits needed for f64)
405
+ __m256d reciprocal_mask = _mm256_cmp_pd(values, _mm256_set1_pd(1.0), _CMP_GT_OS);
406
+ __m256d reciprocal_values = _mm256_div_pd(_mm256_set1_pd(1.0), values);
407
+ values = _mm256_blendv_pd(values, reciprocal_values, reciprocal_mask);
408
+
409
+ // Argument reduction
410
+ __m256d const values_squared = _mm256_mul_pd(values, values);
411
+ __m256d const values_cubed = _mm256_mul_pd(values, values_squared);
412
+
413
+ // Polynomial evaluation using Horner's method.
414
+ // For large arrays, out-of-order execution across loop iterations already hides
415
+ // FMA latency. Estrin's scheme was tested but showed minimal improvement (~1%)
416
+ // while adding complexity. Keeping Horner for maintainability.
417
+ __m256d polynomials = coeff_19;
418
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_18);
419
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_17);
420
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_16);
421
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_15);
422
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_14);
423
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_13);
424
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_12);
425
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_11);
426
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_10);
427
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_9);
428
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_8);
429
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_7);
430
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_6);
431
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_5);
432
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_4);
433
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_3);
434
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_2);
435
+ polynomials = _mm256_fmadd_pd(polynomials, values_squared, coeff_1);
436
+
437
+ // Compute result
438
+ __m256d result = _mm256_fmadd_pd(values_cubed, polynomials, values);
439
+
440
+ // Adjust for reciprocal: result = π/2 - result
441
+ __m256d adjusted = _mm256_sub_pd(_mm256_set1_pd(1.5707963267948966), result);
442
+ result = _mm256_blendv_pd(result, adjusted, reciprocal_mask);
443
+
444
+ // Adjust for negative: result = -result
445
+ __m256d negated = _mm256_sub_pd(_mm256_setzero_pd(), result);
446
+ result = _mm256_blendv_pd(result, negated, negative_mask);
447
+ return result;
448
+ }
449
+
450
+ NK_INTERNAL __m256d nk_atan2_f64x4_haswell_(__m256d const ys_inputs, __m256d const xs_inputs) {
451
+ // Polynomial coefficients for atan approximation (19 coefficients, same as atan)
452
+ __m256d const coeff_19 = _mm256_set1_pd(-1.88796008463073496563746e-05);
453
+ __m256d const coeff_18 = _mm256_set1_pd(+0.000209850076645816976906797);
454
+ __m256d const coeff_17 = _mm256_set1_pd(-0.00110611831486672482563471);
455
+ __m256d const coeff_16 = _mm256_set1_pd(+0.00370026744188713119232403);
456
+ __m256d const coeff_15 = _mm256_set1_pd(-0.00889896195887655491740809);
457
+ __m256d const coeff_14 = _mm256_set1_pd(+0.016599329773529201970117);
458
+ __m256d const coeff_13 = _mm256_set1_pd(-0.0254517624932312641616861);
459
+ __m256d const coeff_12 = _mm256_set1_pd(+0.0337852580001353069993897);
460
+ __m256d const coeff_11 = _mm256_set1_pd(-0.0407629191276836500001934);
461
+ __m256d const coeff_10 = _mm256_set1_pd(+0.0466667150077840625632675);
462
+ __m256d const coeff_9 = _mm256_set1_pd(-0.0523674852303482457616113);
463
+ __m256d const coeff_8 = _mm256_set1_pd(+0.0587666392926673580854313);
464
+ __m256d const coeff_7 = _mm256_set1_pd(-0.0666573579361080525984562);
465
+ __m256d const coeff_6 = _mm256_set1_pd(+0.0769219538311769618355029);
466
+ __m256d const coeff_5 = _mm256_set1_pd(-0.090908995008245008229153);
467
+ __m256d const coeff_4 = _mm256_set1_pd(+0.111111105648261418443745);
468
+ __m256d const coeff_3 = _mm256_set1_pd(-0.14285714266771329383765);
469
+ __m256d const coeff_2 = _mm256_set1_pd(+0.199999999996591265594148);
470
+ __m256d const coeff_1 = _mm256_set1_pd(-0.333333333333311110369124);
471
+ __m256d const sign_mask = _mm256_set1_pd(-0.0);
472
+
473
+ // Quadrant adjustments normalizing to absolute values of x and y
474
+ __m256d xs_negative_mask = _mm256_cmp_pd(xs_inputs, _mm256_setzero_pd(), _CMP_LT_OS);
475
+ __m256d xs = _mm256_andnot_pd(sign_mask, xs_inputs); // abs(xs_inputs)
476
+ __m256d ys = _mm256_andnot_pd(sign_mask, ys_inputs); // abs(ys_inputs)
477
+
478
+ // Ensure proper fraction where the numerator is smaller than the denominator
479
+ __m256d swap_mask = _mm256_cmp_pd(ys, xs, _CMP_GT_OS);
480
+ __m256d temps = xs;
481
+ xs = _mm256_blendv_pd(xs, ys, swap_mask);
482
+ __m256d neg_temps = _mm256_sub_pd(_mm256_setzero_pd(), temps);
483
+ ys = _mm256_blendv_pd(ys, neg_temps, swap_mask);
484
+
485
+ // Compute ratio and powers
486
+ __m256d const ratio = _mm256_div_pd(ys, xs);
487
+ __m256d const ratio_squared = _mm256_mul_pd(ratio, ratio);
488
+ __m256d const ratio_cubed = _mm256_mul_pd(ratio, ratio_squared);
489
+
490
+ // Polynomial evaluation using Horner's method
491
+ __m256d polynomials = coeff_19;
492
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_18);
493
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_17);
494
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_16);
495
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_15);
496
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_14);
497
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_13);
498
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_12);
499
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_11);
500
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_10);
501
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_9);
502
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_8);
503
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_7);
504
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_6);
505
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_5);
506
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_4);
507
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_3);
508
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_2);
509
+ polynomials = _mm256_fmadd_pd(polynomials, ratio_squared, coeff_1);
510
+
511
+ // Compute the result using masks for quadrant adjustments
512
+ __m256d results = _mm256_fmadd_pd(ratio_cubed, polynomials, ratio);
513
+
514
+ // Compute quadrant value: 0 for x>=0 && !swap, 1 for x>=0 && swap,
515
+ // -2 for x<0 && !swap, -1 for x<0 && swap
516
+ __m256d quadrant = _mm256_setzero_pd();
517
+ __m256d neg_two = _mm256_set1_pd(-2.0);
518
+ quadrant = _mm256_blendv_pd(quadrant, neg_two, xs_negative_mask);
519
+ __m256d one = _mm256_set1_pd(1.0);
520
+ __m256d quadrant_incremented = _mm256_add_pd(quadrant, one);
521
+ quadrant = _mm256_blendv_pd(quadrant, quadrant_incremented, swap_mask);
522
+
523
+ // Adjust for quadrant: result += quadrant * π/2
524
+ __m256d pi_half = _mm256_set1_pd(1.5707963267948966);
525
+ results = _mm256_fmadd_pd(quadrant, pi_half, results);
526
+
527
+ // Transfer sign from x (XOR with sign bit of x_input)
528
+ __m256d xs_sign_bits = _mm256_and_pd(xs_inputs, sign_mask);
529
+ results = _mm256_xor_pd(results, xs_sign_bits);
530
+
531
+ // Transfer sign from y (XOR with sign bit of y_input)
532
+ __m256d ys_sign_bits = _mm256_and_pd(ys_inputs, sign_mask);
533
+ results = _mm256_xor_pd(results, ys_sign_bits);
534
+
535
+ return results;
536
+ }
537
+
538
+ NK_PUBLIC void nk_each_sin_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
539
+ nk_size_t i = 0;
540
+ for (; i + 8 <= n; i += 8) {
541
+ __m256 angles = _mm256_loadu_ps(ins + i);
542
+ __m256 results = nk_sin_f32x8_haswell_(angles);
543
+ _mm256_storeu_ps(outs + i, results);
544
+ }
545
+ if (i < n) {
546
+ nk_size_t remaining = n - i;
547
+ nk_b256_vec_t angles_vec;
548
+ nk_partial_load_b32x8_serial_(ins + i, &angles_vec, remaining);
549
+ nk_b256_vec_t results_vec;
550
+ results_vec.ymm_ps = nk_sin_f32x8_haswell_(angles_vec.ymm_ps);
551
+ nk_partial_store_b32x8_serial_(&results_vec, outs + i, remaining);
552
+ }
553
+ }
554
+
555
+ NK_PUBLIC void nk_each_cos_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
556
+ nk_size_t i = 0;
557
+ for (; i + 8 <= n; i += 8) {
558
+ __m256 angles = _mm256_loadu_ps(ins + i);
559
+ __m256 results = nk_cos_f32x8_haswell_(angles);
560
+ _mm256_storeu_ps(outs + i, results);
561
+ }
562
+ if (i < n) {
563
+ nk_size_t remaining = n - i;
564
+ nk_b256_vec_t angles_vec;
565
+ nk_partial_load_b32x8_serial_(ins + i, &angles_vec, remaining);
566
+ nk_b256_vec_t results_vec;
567
+ results_vec.ymm_ps = nk_cos_f32x8_haswell_(angles_vec.ymm_ps);
568
+ nk_partial_store_b32x8_serial_(&results_vec, outs + i, remaining);
569
+ }
570
+ }
571
+
572
+ NK_PUBLIC void nk_each_atan_f32_haswell(nk_f32_t const *ins, nk_size_t n, nk_f32_t *outs) {
573
+ nk_size_t i = 0;
574
+ for (; i + 8 <= n; i += 8) {
575
+ __m256 values = _mm256_loadu_ps(ins + i);
576
+ __m256 results = nk_atan_f32x8_haswell_(values);
577
+ _mm256_storeu_ps(outs + i, results);
578
+ }
579
+ if (i < n) {
580
+ nk_size_t remaining = n - i;
581
+ nk_b256_vec_t values_vec;
582
+ nk_partial_load_b32x8_serial_(ins + i, &values_vec, remaining);
583
+ nk_b256_vec_t results_vec;
584
+ results_vec.ymm_ps = nk_atan_f32x8_haswell_(values_vec.ymm_ps);
585
+ nk_partial_store_b32x8_serial_(&results_vec, outs + i, remaining);
586
+ }
587
+ }
588
+
589
+ NK_PUBLIC void nk_each_sin_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
590
+ nk_size_t i = 0;
591
+ for (; i + 4 <= n; i += 4) {
592
+ __m256d angles = _mm256_loadu_pd(ins + i);
593
+ __m256d results = nk_sin_f64x4_haswell_(angles);
594
+ _mm256_storeu_pd(outs + i, results);
595
+ }
596
+ if (i < n) {
597
+ nk_size_t remaining = n - i;
598
+ nk_b256_vec_t angles_vec;
599
+ nk_partial_load_b64x4_haswell_(ins + i, &angles_vec, remaining);
600
+ nk_b256_vec_t results_vec;
601
+ results_vec.ymm_pd = nk_sin_f64x4_haswell_(angles_vec.ymm_pd);
602
+ nk_partial_store_b64x4_haswell_(&results_vec, outs + i, remaining);
603
+ }
604
+ }
605
+
606
+ NK_PUBLIC void nk_each_cos_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
607
+ nk_size_t i = 0;
608
+ for (; i + 4 <= n; i += 4) {
609
+ __m256d angles = _mm256_loadu_pd(ins + i);
610
+ __m256d results = nk_cos_f64x4_haswell_(angles);
611
+ _mm256_storeu_pd(outs + i, results);
612
+ }
613
+ if (i < n) {
614
+ nk_size_t remaining = n - i;
615
+ nk_b256_vec_t angles_vec;
616
+ nk_partial_load_b64x4_haswell_(ins + i, &angles_vec, remaining);
617
+ nk_b256_vec_t results_vec;
618
+ results_vec.ymm_pd = nk_cos_f64x4_haswell_(angles_vec.ymm_pd);
619
+ nk_partial_store_b64x4_haswell_(&results_vec, outs + i, remaining);
620
+ }
621
+ }
622
+
623
+ NK_PUBLIC void nk_each_atan_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64_t *outs) {
624
+ nk_size_t i = 0;
625
+ for (; i + 4 <= n; i += 4) {
626
+ __m256d values = _mm256_loadu_pd(ins + i);
627
+ __m256d results = nk_atan_f64x4_haswell_(values);
628
+ _mm256_storeu_pd(outs + i, results);
629
+ }
630
+ if (i < n) {
631
+ nk_size_t remaining = n - i;
632
+ nk_b256_vec_t values_vec;
633
+ nk_partial_load_b64x4_haswell_(ins + i, &values_vec, remaining);
634
+ nk_b256_vec_t results_vec;
635
+ results_vec.ymm_pd = nk_atan_f64x4_haswell_(values_vec.ymm_pd);
636
+ nk_partial_store_b64x4_haswell_(&results_vec, outs + i, remaining);
637
+ }
638
+ }
639
+
640
+ #if defined(__clang__)
641
+ #pragma clang attribute pop
642
+ #elif defined(__GNUC__)
643
+ #pragma GCC pop_options
644
+ #endif
645
+
646
+ #if defined(__cplusplus)
647
+ } // extern "C"
648
+ #endif
649
+
650
+ #endif // NK_TARGET_HASWELL
651
+ #endif // NK_TARGET_X86_
652
+ #endif // NK_TRIGONOMETRY_HASWELL_H