numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -47,18 +47,18 @@ extern "C" {
 
  /** @brief Horizontal sum of 4 doubles in a YMM register. */
  NK_INTERNAL nk_f64_t nk_reduce_add_f64x4_haswell_(__m256d sum_f64x4) {
- __m128d lo_f64x2 = _mm256_castpd256_pd128(sum_f64x4);
- __m128d hi_f64x2 = _mm256_extractf128_pd(sum_f64x4, 1);
- __m128d sum_f64x2 = _mm_add_pd(lo_f64x2, hi_f64x2);
+ __m128d low_f64x2 = _mm256_castpd256_pd128(sum_f64x4);
+ __m128d high_f64x2 = _mm256_extractf128_pd(sum_f64x4, 1);
+ __m128d sum_f64x2 = _mm_add_pd(low_f64x2, high_f64x2);
  sum_f64x2 = _mm_hadd_pd(sum_f64x2, sum_f64x2);
  return _mm_cvtsd_f64(sum_f64x2);
  }
 
  /** @brief Horizontal sum of 8 floats in a YMM register (native f32 precision). */
  NK_INTERNAL nk_f32_t nk_reduce_add_f32x8_haswell_(__m256 sum_f32x8) {
- __m128 lo_f32x4 = _mm256_castps256_ps128(sum_f32x8);
- __m128 hi_f32x4 = _mm256_extractf128_ps(sum_f32x8, 1);
- __m128 sum_f32x4 = _mm_add_ps(lo_f32x4, hi_f32x4);
+ __m128 low_f32x4 = _mm256_castps256_ps128(sum_f32x8);
+ __m128 high_f32x4 = _mm256_extractf128_ps(sum_f32x8, 1);
+ __m128 sum_f32x4 = _mm_add_ps(low_f32x4, high_f32x4);
  sum_f32x4 = _mm_hadd_ps(sum_f32x4, sum_f32x4);
  sum_f32x4 = _mm_hadd_ps(sum_f32x4, sum_f32x4);
  return _mm_cvtss_f32(sum_f32x4);
@@ -66,9 +66,9 @@ NK_INTERNAL nk_f32_t nk_reduce_add_f32x8_haswell_(__m256 sum_f32x8) {
 
  /** @brief Horizontal sum of 8 i32s in a YMM register. */
  NK_INTERNAL nk_i32_t nk_reduce_add_i32x8_haswell_(__m256i sum_i32x8) {
- __m128i lo_i32x4 = _mm256_castsi256_si128(sum_i32x8);
- __m128i hi_i32x4 = _mm256_extracti128_si256(sum_i32x8, 1);
- __m128i sum_i32x4 = _mm_add_epi32(lo_i32x4, hi_i32x4);
+ __m128i low_i32x4 = _mm256_castsi256_si128(sum_i32x8);
+ __m128i high_i32x4 = _mm256_extracti128_si256(sum_i32x8, 1);
+ __m128i sum_i32x4 = _mm_add_epi32(low_i32x4, high_i32x4);
  sum_i32x4 = _mm_hadd_epi32(sum_i32x4, sum_i32x4);
  sum_i32x4 = _mm_hadd_epi32(sum_i32x4, sum_i32x4);
  return _mm_cvtsi128_si32(sum_i32x4);
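Note: the changes in these hunks (and in most of the hunks below) are mechanical renames, lo_*/hi_* becoming low_*/high_*; the reduction pattern itself is untouched: fold the upper 128-bit lane onto the lower one with a vertical add, then collapse the remaining XMM register. A minimal sketch of how such a helper is typically consumed — all the work in the loop is vertical, and the 256→128→scalar collapse runs exactly once, after the loop. The wrapper function and its name are illustrative, not part of the package:

#include <immintrin.h>
#include <stddef.h>

// Illustrative caller of a horizontal-sum helper like the ones above.
static float sum_f32_avx2(float const *data, size_t count) {
    __m256 acc_f32x8 = _mm256_setzero_ps();
    size_t idx = 0;
    for (; idx + 8 <= count; idx += 8)
        acc_f32x8 = _mm256_add_ps(acc_f32x8, _mm256_loadu_ps(data + idx));
    __m128 low_f32x4 = _mm256_castps256_ps128(acc_f32x8);    // lower lane, free
    __m128 high_f32x4 = _mm256_extractf128_ps(acc_f32x8, 1); // upper lane
    __m128 sum_f32x4 = _mm_add_ps(low_f32x4, high_f32x4);
    sum_f32x4 = _mm_hadd_ps(sum_f32x4, sum_f32x4); // 4 -> 2
    sum_f32x4 = _mm_hadd_ps(sum_f32x4, sum_f32x4); // 2 -> 1
    float total = _mm_cvtss_f32(sum_f32x4);
    for (; idx < count; ++idx) total += data[idx]; // scalar tail
    return total;
}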
@@ -76,19 +76,19 @@ NK_INTERNAL nk_i32_t nk_reduce_add_i32x8_haswell_(__m256i sum_i32x8) {
 
  /** @brief Horizontal sum of 4 i64s in a YMM register. */
  NK_INTERNAL nk_i64_t nk_reduce_add_i64x4_haswell_(__m256i sum_i64x4) {
- __m128i lo_i64x2 = _mm256_castsi256_si128(sum_i64x4);
- __m128i hi_i64x2 = _mm256_extracti128_si256(sum_i64x4, 1);
- __m128i sum_i64x2 = _mm_add_epi64(lo_i64x2, hi_i64x2);
- __m128i hi_lane_i64 = _mm_unpackhi_epi64(sum_i64x2, sum_i64x2);
- __m128i final_i64 = _mm_add_epi64(sum_i64x2, hi_lane_i64);
- return _mm_cvtsi128_si64(final_i64);
+ __m128i low_i64x2 = _mm256_castsi256_si128(sum_i64x4);
+ __m128i high_i64x2 = _mm256_extracti128_si256(sum_i64x4, 1);
+ __m128i sum_i64x2 = _mm_add_epi64(low_i64x2, high_i64x2);
+ __m128i high_lane_i64x2 = _mm_unpackhi_epi64(sum_i64x2, sum_i64x2);
+ __m128i final_i64x2 = _mm_add_epi64(sum_i64x2, high_lane_i64x2);
+ return _mm_cvtsi128_si64(final_i64x2);
  }
 
  /** @brief Horizontal min of 32 signed i8s in a YMM register. */
  NK_INTERNAL nk_i8_t nk_reduce_min_i8x32_haswell_(__m256i min_i8x32) {
- __m128i lo_i8x16 = _mm256_castsi256_si128(min_i8x32);
- __m128i hi_i8x16 = _mm256_extracti128_si256(min_i8x32, 1);
- __m128i min_i8x16 = _mm_min_epi8(lo_i8x16, hi_i8x16);
+ __m128i low_i8x16 = _mm256_castsi256_si128(min_i8x32);
+ __m128i high_i8x16 = _mm256_extracti128_si256(min_i8x32, 1);
+ __m128i min_i8x16 = _mm_min_epi8(low_i8x16, high_i8x16);
  min_i8x16 = _mm_min_epi8(min_i8x16, _mm_shuffle_epi32(min_i8x16, _MM_SHUFFLE(2, 3, 0, 1)));
  min_i8x16 = _mm_min_epi8(min_i8x16, _mm_shuffle_epi32(min_i8x16, _MM_SHUFFLE(1, 0, 3, 2)));
  min_i8x16 = _mm_min_epi8(min_i8x16, _mm_srli_si128(min_i8x16, 2));
@@ -98,9 +98,9 @@ NK_INTERNAL nk_i8_t nk_reduce_min_i8x32_haswell_(__m256i min_i8x32) {
 
  /** @brief Horizontal max of 32 signed i8s in a YMM register. */
  NK_INTERNAL nk_i8_t nk_reduce_max_i8x32_haswell_(__m256i max_i8x32) {
- __m128i lo_i8x16 = _mm256_castsi256_si128(max_i8x32);
- __m128i hi_i8x16 = _mm256_extracti128_si256(max_i8x32, 1);
- __m128i max_i8x16 = _mm_max_epi8(lo_i8x16, hi_i8x16);
+ __m128i low_i8x16 = _mm256_castsi256_si128(max_i8x32);
+ __m128i high_i8x16 = _mm256_extracti128_si256(max_i8x32, 1);
+ __m128i max_i8x16 = _mm_max_epi8(low_i8x16, high_i8x16);
  max_i8x16 = _mm_max_epi8(max_i8x16, _mm_shuffle_epi32(max_i8x16, _MM_SHUFFLE(2, 3, 0, 1)));
  max_i8x16 = _mm_max_epi8(max_i8x16, _mm_shuffle_epi32(max_i8x16, _MM_SHUFFLE(1, 0, 3, 2)));
  max_i8x16 = _mm_max_epi8(max_i8x16, _mm_srli_si128(max_i8x16, 2));
@@ -110,9 +110,9 @@ NK_INTERNAL nk_i8_t nk_reduce_max_i8x32_haswell_(__m256i max_i8x32) {
 
  /** @brief Horizontal min of 32 unsigned u8s in a YMM register. */
  NK_INTERNAL nk_u8_t nk_reduce_min_u8x32_haswell_(__m256i min_u8x32) {
- __m128i lo_u8x16 = _mm256_castsi256_si128(min_u8x32);
- __m128i hi_u8x16 = _mm256_extracti128_si256(min_u8x32, 1);
- __m128i min_u8x16 = _mm_min_epu8(lo_u8x16, hi_u8x16);
+ __m128i low_u8x16 = _mm256_castsi256_si128(min_u8x32);
+ __m128i high_u8x16 = _mm256_extracti128_si256(min_u8x32, 1);
+ __m128i min_u8x16 = _mm_min_epu8(low_u8x16, high_u8x16);
  min_u8x16 = _mm_min_epu8(min_u8x16, _mm_shuffle_epi32(min_u8x16, _MM_SHUFFLE(2, 3, 0, 1)));
  min_u8x16 = _mm_min_epu8(min_u8x16, _mm_shuffle_epi32(min_u8x16, _MM_SHUFFLE(1, 0, 3, 2)));
  min_u8x16 = _mm_min_epu8(min_u8x16, _mm_srli_si128(min_u8x16, 2));
@@ -122,9 +122,9 @@ NK_INTERNAL nk_u8_t nk_reduce_min_u8x32_haswell_(__m256i min_u8x32) {
 
  /** @brief Horizontal max of 32 unsigned u8s in a YMM register. */
  NK_INTERNAL nk_u8_t nk_reduce_max_u8x32_haswell_(__m256i max_u8x32) {
- __m128i lo_u8x16 = _mm256_castsi256_si128(max_u8x32);
- __m128i hi_u8x16 = _mm256_extracti128_si256(max_u8x32, 1);
- __m128i max_u8x16 = _mm_max_epu8(lo_u8x16, hi_u8x16);
+ __m128i low_u8x16 = _mm256_castsi256_si128(max_u8x32);
+ __m128i high_u8x16 = _mm256_extracti128_si256(max_u8x32, 1);
+ __m128i max_u8x16 = _mm_max_epu8(low_u8x16, high_u8x16);
  max_u8x16 = _mm_max_epu8(max_u8x16, _mm_shuffle_epi32(max_u8x16, _MM_SHUFFLE(2, 3, 0, 1)));
  max_u8x16 = _mm_max_epu8(max_u8x16, _mm_shuffle_epi32(max_u8x16, _MM_SHUFFLE(1, 0, 3, 2)));
  max_u8x16 = _mm_max_epu8(max_u8x16, _mm_srli_si128(max_u8x16, 2));
@@ -134,9 +134,9 @@ NK_INTERNAL nk_u8_t nk_reduce_max_u8x32_haswell_(__m256i max_u8x32) {
 
  /** @brief Horizontal min of 16 signed i16s in a YMM register. */
  NK_INTERNAL nk_i16_t nk_reduce_min_i16x16_haswell_(__m256i min_i16x16) {
- __m128i lo_i16x8 = _mm256_castsi256_si128(min_i16x16);
- __m128i hi_i16x8 = _mm256_extracti128_si256(min_i16x16, 1);
- __m128i min_i16x8 = _mm_min_epi16(lo_i16x8, hi_i16x8);
+ __m128i low_i16x8 = _mm256_castsi256_si128(min_i16x16);
+ __m128i high_i16x8 = _mm256_extracti128_si256(min_i16x16, 1);
+ __m128i min_i16x8 = _mm_min_epi16(low_i16x8, high_i16x8);
  min_i16x8 = _mm_min_epi16(min_i16x8, _mm_shuffle_epi32(min_i16x8, _MM_SHUFFLE(2, 3, 0, 1)));
  min_i16x8 = _mm_min_epi16(min_i16x8, _mm_shuffle_epi32(min_i16x8, _MM_SHUFFLE(1, 0, 3, 2)));
  min_i16x8 = _mm_min_epi16(min_i16x8, _mm_srli_si128(min_i16x8, 2));
@@ -145,9 +145,9 @@ NK_INTERNAL nk_i16_t nk_reduce_min_i16x16_haswell_(__m256i min_i16x16) {
 
  /** @brief Horizontal max of 16 signed i16s in a YMM register. */
  NK_INTERNAL nk_i16_t nk_reduce_max_i16x16_haswell_(__m256i max_i16x16) {
- __m128i lo_i16x8 = _mm256_castsi256_si128(max_i16x16);
- __m128i hi_i16x8 = _mm256_extracti128_si256(max_i16x16, 1);
- __m128i max_i16x8 = _mm_max_epi16(lo_i16x8, hi_i16x8);
+ __m128i low_i16x8 = _mm256_castsi256_si128(max_i16x16);
+ __m128i high_i16x8 = _mm256_extracti128_si256(max_i16x16, 1);
+ __m128i max_i16x8 = _mm_max_epi16(low_i16x8, high_i16x8);
  max_i16x8 = _mm_max_epi16(max_i16x8, _mm_shuffle_epi32(max_i16x8, _MM_SHUFFLE(2, 3, 0, 1)));
  max_i16x8 = _mm_max_epi16(max_i16x8, _mm_shuffle_epi32(max_i16x8, _MM_SHUFFLE(1, 0, 3, 2)));
  max_i16x8 = _mm_max_epi16(max_i16x8, _mm_srli_si128(max_i16x8, 2));
@@ -156,9 +156,9 @@ NK_INTERNAL nk_i16_t nk_reduce_max_i16x16_haswell_(__m256i max_i16x16) {
 
  /** @brief Horizontal min of 16 unsigned u16s in a YMM register. */
  NK_INTERNAL nk_u16_t nk_reduce_min_u16x16_haswell_(__m256i min_u16x16) {
- __m128i lo_u16x8 = _mm256_castsi256_si128(min_u16x16);
- __m128i hi_u16x8 = _mm256_extracti128_si256(min_u16x16, 1);
- __m128i min_u16x8 = _mm_min_epu16(lo_u16x8, hi_u16x8);
+ __m128i low_u16x8 = _mm256_castsi256_si128(min_u16x16);
+ __m128i high_u16x8 = _mm256_extracti128_si256(min_u16x16, 1);
+ __m128i min_u16x8 = _mm_min_epu16(low_u16x8, high_u16x8);
  min_u16x8 = _mm_min_epu16(min_u16x8, _mm_shuffle_epi32(min_u16x8, _MM_SHUFFLE(2, 3, 0, 1)));
  min_u16x8 = _mm_min_epu16(min_u16x8, _mm_shuffle_epi32(min_u16x8, _MM_SHUFFLE(1, 0, 3, 2)));
  min_u16x8 = _mm_min_epu16(min_u16x8, _mm_srli_si128(min_u16x8, 2));
@@ -167,9 +167,9 @@ NK_INTERNAL nk_u16_t nk_reduce_min_u16x16_haswell_(__m256i min_u16x16) {
 
  /** @brief Horizontal max of 16 unsigned u16s in a YMM register. */
  NK_INTERNAL nk_u16_t nk_reduce_max_u16x16_haswell_(__m256i max_u16x16) {
- __m128i lo_u16x8 = _mm256_castsi256_si128(max_u16x16);
- __m128i hi_u16x8 = _mm256_extracti128_si256(max_u16x16, 1);
- __m128i max_u16x8 = _mm_max_epu16(lo_u16x8, hi_u16x8);
+ __m128i low_u16x8 = _mm256_castsi256_si128(max_u16x16);
+ __m128i high_u16x8 = _mm256_extracti128_si256(max_u16x16, 1);
+ __m128i max_u16x8 = _mm_max_epu16(low_u16x8, high_u16x8);
  max_u16x8 = _mm_max_epu16(max_u16x8, _mm_shuffle_epi32(max_u16x8, _MM_SHUFFLE(2, 3, 0, 1)));
  max_u16x8 = _mm_max_epu16(max_u16x8, _mm_shuffle_epi32(max_u16x8, _MM_SHUFFLE(1, 0, 3, 2)));
  max_u16x8 = _mm_max_epu16(max_u16x8, _mm_srli_si128(max_u16x8, 2));
@@ -178,9 +178,9 @@ NK_INTERNAL nk_u16_t nk_reduce_max_u16x16_haswell_(__m256i max_u16x16) {
 
  /** @brief Horizontal min of 8 signed i32s in a YMM register. */
  NK_INTERNAL nk_i32_t nk_reduce_min_i32x8_haswell_(__m256i min_i32x8) {
- __m128i lo_i32x4 = _mm256_castsi256_si128(min_i32x8);
- __m128i hi_i32x4 = _mm256_extracti128_si256(min_i32x8, 1);
- __m128i min_i32x4 = _mm_min_epi32(lo_i32x4, hi_i32x4);
+ __m128i low_i32x4 = _mm256_castsi256_si128(min_i32x8);
+ __m128i high_i32x4 = _mm256_extracti128_si256(min_i32x8, 1);
+ __m128i min_i32x4 = _mm_min_epi32(low_i32x4, high_i32x4);
  min_i32x4 = _mm_min_epi32(min_i32x4, _mm_shuffle_epi32(min_i32x4, _MM_SHUFFLE(2, 3, 0, 1)));
  min_i32x4 = _mm_min_epi32(min_i32x4, _mm_shuffle_epi32(min_i32x4, _MM_SHUFFLE(1, 0, 3, 2)));
  return _mm_cvtsi128_si32(min_i32x4);
@@ -188,9 +188,9 @@ NK_INTERNAL nk_i32_t nk_reduce_min_i32x8_haswell_(__m256i min_i32x8) {
 
  /** @brief Horizontal max of 8 signed i32s in a YMM register. */
  NK_INTERNAL nk_i32_t nk_reduce_max_i32x8_haswell_(__m256i max_i32x8) {
- __m128i lo_i32x4 = _mm256_castsi256_si128(max_i32x8);
- __m128i hi_i32x4 = _mm256_extracti128_si256(max_i32x8, 1);
- __m128i max_i32x4 = _mm_max_epi32(lo_i32x4, hi_i32x4);
+ __m128i low_i32x4 = _mm256_castsi256_si128(max_i32x8);
+ __m128i high_i32x4 = _mm256_extracti128_si256(max_i32x8, 1);
+ __m128i max_i32x4 = _mm_max_epi32(low_i32x4, high_i32x4);
  max_i32x4 = _mm_max_epi32(max_i32x4, _mm_shuffle_epi32(max_i32x4, _MM_SHUFFLE(2, 3, 0, 1)));
  max_i32x4 = _mm_max_epi32(max_i32x4, _mm_shuffle_epi32(max_i32x4, _MM_SHUFFLE(1, 0, 3, 2)));
  return _mm_cvtsi128_si32(max_i32x4);
@@ -198,9 +198,9 @@ NK_INTERNAL nk_i32_t nk_reduce_max_i32x8_haswell_(__m256i max_i32x8) {
 
  /** @brief Horizontal min of 8 unsigned u32s in a YMM register. */
  NK_INTERNAL nk_u32_t nk_reduce_min_u32x8_haswell_(__m256i min_u32x8) {
- __m128i lo_u32x4 = _mm256_castsi256_si128(min_u32x8);
- __m128i hi_u32x4 = _mm256_extracti128_si256(min_u32x8, 1);
- __m128i min_u32x4 = _mm_min_epu32(lo_u32x4, hi_u32x4);
+ __m128i low_u32x4 = _mm256_castsi256_si128(min_u32x8);
+ __m128i high_u32x4 = _mm256_extracti128_si256(min_u32x8, 1);
+ __m128i min_u32x4 = _mm_min_epu32(low_u32x4, high_u32x4);
  min_u32x4 = _mm_min_epu32(min_u32x4, _mm_shuffle_epi32(min_u32x4, _MM_SHUFFLE(2, 3, 0, 1)));
  min_u32x4 = _mm_min_epu32(min_u32x4, _mm_shuffle_epi32(min_u32x4, _MM_SHUFFLE(1, 0, 3, 2)));
  return (nk_u32_t)_mm_cvtsi128_si32(min_u32x4);
@@ -208,9 +208,9 @@ NK_INTERNAL nk_u32_t nk_reduce_min_u32x8_haswell_(__m256i min_u32x8) {
 
  /** @brief Horizontal max of 8 unsigned u32s in a YMM register. */
  NK_INTERNAL nk_u32_t nk_reduce_max_u32x8_haswell_(__m256i max_u32x8) {
- __m128i lo_u32x4 = _mm256_castsi256_si128(max_u32x8);
- __m128i hi_u32x4 = _mm256_extracti128_si256(max_u32x8, 1);
- __m128i max_u32x4 = _mm_max_epu32(lo_u32x4, hi_u32x4);
+ __m128i low_u32x4 = _mm256_castsi256_si128(max_u32x8);
+ __m128i high_u32x4 = _mm256_extracti128_si256(max_u32x8, 1);
+ __m128i max_u32x4 = _mm_max_epu32(low_u32x4, high_u32x4);
  max_u32x4 = _mm_max_epu32(max_u32x4, _mm_shuffle_epi32(max_u32x4, _MM_SHUFFLE(2, 3, 0, 1)));
  max_u32x4 = _mm_max_epu32(max_u32x4, _mm_shuffle_epi32(max_u32x4, _MM_SHUFFLE(1, 0, 3, 2)));
  return (nk_u32_t)_mm_cvtsi128_si32(max_u32x4);
@@ -218,61 +218,63 @@ NK_INTERNAL nk_u32_t nk_reduce_max_u32x8_haswell_(__m256i max_u32x8) {
 
  /** @brief Horizontal min of 4 signed i64s in a YMM register using comparison+blend. */
  NK_INTERNAL nk_i64_t nk_reduce_min_i64x4_haswell_(__m256i min_i64x4) {
- __m128i lo_i64x2 = _mm256_castsi256_si128(min_i64x4);
- __m128i hi_i64x2 = _mm256_extracti128_si256(min_i64x4, 1);
- __m128i cmp_i64x2 = _mm_cmpgt_epi64(lo_i64x2, hi_i64x2);
- __m128i min_i64x2 = _mm_blendv_epi8(lo_i64x2, hi_i64x2, cmp_i64x2);
- __m128i hi_lane_i64 = _mm_unpackhi_epi64(min_i64x2, min_i64x2);
- __m128i cmp_final = _mm_cmpgt_epi64(min_i64x2, hi_lane_i64);
- __m128i result_i64 = _mm_blendv_epi8(min_i64x2, hi_lane_i64, cmp_final);
- return _mm_cvtsi128_si64(result_i64);
+ __m128i low_i64x2 = _mm256_castsi256_si128(min_i64x4);
+ __m128i high_i64x2 = _mm256_extracti128_si256(min_i64x4, 1);
+ __m128i cmp_i64x2 = _mm_cmpgt_epi64(low_i64x2, high_i64x2);
+ __m128i min_i64x2 = _mm_blendv_epi8(low_i64x2, high_i64x2, cmp_i64x2);
+ __m128i high_lane_i64x2 = _mm_unpackhi_epi64(min_i64x2, min_i64x2);
+ __m128i cmp_final_i64x2 = _mm_cmpgt_epi64(min_i64x2, high_lane_i64x2);
+ __m128i result_i64x2 = _mm_blendv_epi8(min_i64x2, high_lane_i64x2, cmp_final_i64x2);
+ return _mm_cvtsi128_si64(result_i64x2);
  }
 
  /** @brief Horizontal max of 4 signed i64s in a YMM register using comparison+blend. */
  NK_INTERNAL nk_i64_t nk_reduce_max_i64x4_haswell_(__m256i max_i64x4) {
- __m128i lo_i64x2 = _mm256_castsi256_si128(max_i64x4);
- __m128i hi_i64x2 = _mm256_extracti128_si256(max_i64x4, 1);
- __m128i cmp_i64x2 = _mm_cmpgt_epi64(lo_i64x2, hi_i64x2);
- __m128i max_i64x2 = _mm_blendv_epi8(hi_i64x2, lo_i64x2, cmp_i64x2);
- __m128i hi_lane_i64 = _mm_unpackhi_epi64(max_i64x2, max_i64x2);
- __m128i cmp_final = _mm_cmpgt_epi64(max_i64x2, hi_lane_i64);
- __m128i result_i64 = _mm_blendv_epi8(hi_lane_i64, max_i64x2, cmp_final);
- return _mm_cvtsi128_si64(result_i64);
+ __m128i low_i64x2 = _mm256_castsi256_si128(max_i64x4);
+ __m128i high_i64x2 = _mm256_extracti128_si256(max_i64x4, 1);
+ __m128i cmp_i64x2 = _mm_cmpgt_epi64(low_i64x2, high_i64x2);
+ __m128i max_i64x2 = _mm_blendv_epi8(high_i64x2, low_i64x2, cmp_i64x2);
+ __m128i high_lane_i64x2 = _mm_unpackhi_epi64(max_i64x2, max_i64x2);
+ __m128i cmp_final_i64x2 = _mm_cmpgt_epi64(max_i64x2, high_lane_i64x2);
+ __m128i result_i64x2 = _mm_blendv_epi8(high_lane_i64x2, max_i64x2, cmp_final_i64x2);
+ return _mm_cvtsi128_si64(result_i64x2);
  }
 
  /** @brief Horizontal min of 4 unsigned u64s in a YMM register using XOR trick for unsigned comparison. */
  NK_INTERNAL nk_u64_t nk_reduce_min_u64x4_haswell_(__m256i min_u64x4) {
- __m128i sign_bit_i64 = _mm_set1_epi64x((nk_i64_t)0x8000000000000000ull);
- __m128i lo_u64x2 = _mm256_castsi256_si128(min_u64x4);
- __m128i hi_u64x2 = _mm256_extracti128_si256(min_u64x4, 1);
- __m128i cmp_i64x2 = _mm_cmpgt_epi64(_mm_xor_si128(lo_u64x2, sign_bit_i64), _mm_xor_si128(hi_u64x2, sign_bit_i64));
- __m128i min_u64x2 = _mm_blendv_epi8(lo_u64x2, hi_u64x2, cmp_i64x2);
- __m128i hi_lane_u64 = _mm_unpackhi_epi64(min_u64x2, min_u64x2);
- __m128i cmp_final = _mm_cmpgt_epi64(_mm_xor_si128(min_u64x2, sign_bit_i64),
- _mm_xor_si128(hi_lane_u64, sign_bit_i64));
- __m128i result_u64 = _mm_blendv_epi8(min_u64x2, hi_lane_u64, cmp_final);
- return (nk_u64_t)_mm_cvtsi128_si64(result_u64);
+ __m128i sign_bit_i64x2 = _mm_set1_epi64x((nk_i64_t)0x8000000000000000ull);
+ __m128i low_u64x2 = _mm256_castsi256_si128(min_u64x4);
+ __m128i high_u64x2 = _mm256_extracti128_si256(min_u64x4, 1);
+ __m128i cmp_i64x2 = _mm_cmpgt_epi64(_mm_xor_si128(low_u64x2, sign_bit_i64x2),
+ _mm_xor_si128(high_u64x2, sign_bit_i64x2));
+ __m128i min_u64x2 = _mm_blendv_epi8(low_u64x2, high_u64x2, cmp_i64x2);
+ __m128i high_lane_u64x2 = _mm_unpackhi_epi64(min_u64x2, min_u64x2);
+ __m128i cmp_final_i64x2 = _mm_cmpgt_epi64(_mm_xor_si128(min_u64x2, sign_bit_i64x2),
+ _mm_xor_si128(high_lane_u64x2, sign_bit_i64x2));
+ __m128i result_u64x2 = _mm_blendv_epi8(min_u64x2, high_lane_u64x2, cmp_final_i64x2);
+ return (nk_u64_t)_mm_cvtsi128_si64(result_u64x2);
  }
 
  /** @brief Horizontal max of 4 unsigned u64s in a YMM register using XOR trick for unsigned comparison. */
  NK_INTERNAL nk_u64_t nk_reduce_max_u64x4_haswell_(__m256i max_u64x4) {
- __m128i sign_bit_i64 = _mm_set1_epi64x((nk_i64_t)0x8000000000000000ull);
- __m128i lo_u64x2 = _mm256_castsi256_si128(max_u64x4);
- __m128i hi_u64x2 = _mm256_extracti128_si256(max_u64x4, 1);
- __m128i cmp_i64x2 = _mm_cmpgt_epi64(_mm_xor_si128(lo_u64x2, sign_bit_i64), _mm_xor_si128(hi_u64x2, sign_bit_i64));
- __m128i max_u64x2 = _mm_blendv_epi8(hi_u64x2, lo_u64x2, cmp_i64x2);
- __m128i hi_lane_u64 = _mm_unpackhi_epi64(max_u64x2, max_u64x2);
- __m128i cmp_final = _mm_cmpgt_epi64(_mm_xor_si128(max_u64x2, sign_bit_i64),
- _mm_xor_si128(hi_lane_u64, sign_bit_i64));
- __m128i result_u64 = _mm_blendv_epi8(hi_lane_u64, max_u64x2, cmp_final);
- return (nk_u64_t)_mm_cvtsi128_si64(result_u64);
+ __m128i sign_bit_i64x2 = _mm_set1_epi64x((nk_i64_t)0x8000000000000000ull);
+ __m128i low_u64x2 = _mm256_castsi256_si128(max_u64x4);
+ __m128i high_u64x2 = _mm256_extracti128_si256(max_u64x4, 1);
+ __m128i cmp_i64x2 = _mm_cmpgt_epi64(_mm_xor_si128(low_u64x2, sign_bit_i64x2),
+ _mm_xor_si128(high_u64x2, sign_bit_i64x2));
+ __m128i max_u64x2 = _mm_blendv_epi8(high_u64x2, low_u64x2, cmp_i64x2);
+ __m128i high_lane_u64x2 = _mm_unpackhi_epi64(max_u64x2, max_u64x2);
+ __m128i cmp_final_i64x2 = _mm_cmpgt_epi64(_mm_xor_si128(max_u64x2, sign_bit_i64x2),
+ _mm_xor_si128(high_lane_u64x2, sign_bit_i64x2));
+ __m128i result_u64x2 = _mm_blendv_epi8(high_lane_u64x2, max_u64x2, cmp_final_i64x2);
+ return (nk_u64_t)_mm_cvtsi128_si64(result_u64x2);
  }
 
  /** @brief Horizontal min of 8 floats in a YMM register. */
  NK_INTERNAL nk_f32_t nk_reduce_min_f32x8_haswell_(__m256 min_f32x8) {
- __m128 lo_f32x4 = _mm256_castps256_ps128(min_f32x8);
- __m128 hi_f32x4 = _mm256_extractf128_ps(min_f32x8, 1);
- __m128 min_f32x4 = _mm_min_ps(lo_f32x4, hi_f32x4);
+ __m128 low_f32x4 = _mm256_castps256_ps128(min_f32x8);
+ __m128 high_f32x4 = _mm256_extractf128_ps(min_f32x8, 1);
+ __m128 min_f32x4 = _mm_min_ps(low_f32x4, high_f32x4);
  min_f32x4 = _mm_min_ps(min_f32x4, _mm_shuffle_ps(min_f32x4, min_f32x4, _MM_SHUFFLE(2, 3, 0, 1)));
  min_f32x4 = _mm_min_ps(min_f32x4, _mm_shuffle_ps(min_f32x4, min_f32x4, _MM_SHUFFLE(1, 0, 3, 2)));
  return _mm_cvtss_f32(min_f32x4);
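The u64 helpers above deserve a note: AVX2 offers neither 64-bit min/max instructions nor an unsigned 64-bit compare, so the kernel builds min/max from the signed _mm_cmpgt_epi64 plus _mm_blendv_epi8, and converts unsigned order into signed order by XOR-ing both operands with the sign bit. A scalar sketch of the identity being vectorized (the function name is illustrative):

#include <stdint.h>

// XOR-ing with 0x8000000000000000 maps [0, 2^64) order-preservingly
// onto signed [-2^63, 2^63), so a signed greater-than can answer an
// unsigned comparison -- the same trick used lane-wise above.
static int u64_greater_via_signed(uint64_t a, uint64_t b) {
    int64_t a_biased = (int64_t)(a ^ 0x8000000000000000ull);
    int64_t b_biased = (int64_t)(b ^ 0x8000000000000000ull);
    return a_biased > b_biased; // equal to (a > b) for all inputs
}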
@@ -280,9 +282,9 @@ NK_INTERNAL nk_f32_t nk_reduce_min_f32x8_haswell_(__m256 min_f32x8) {
 
  /** @brief Horizontal max of 8 floats in a YMM register. */
  NK_INTERNAL nk_f32_t nk_reduce_max_f32x8_haswell_(__m256 max_f32x8) {
- __m128 lo_f32x4 = _mm256_castps256_ps128(max_f32x8);
- __m128 hi_f32x4 = _mm256_extractf128_ps(max_f32x8, 1);
- __m128 max_f32x4 = _mm_max_ps(lo_f32x4, hi_f32x4);
+ __m128 low_f32x4 = _mm256_castps256_ps128(max_f32x8);
+ __m128 high_f32x4 = _mm256_extractf128_ps(max_f32x8, 1);
+ __m128 max_f32x4 = _mm_max_ps(low_f32x4, high_f32x4);
  max_f32x4 = _mm_max_ps(max_f32x4, _mm_shuffle_ps(max_f32x4, max_f32x4, _MM_SHUFFLE(2, 3, 0, 1)));
  max_f32x4 = _mm_max_ps(max_f32x4, _mm_shuffle_ps(max_f32x4, max_f32x4, _MM_SHUFFLE(1, 0, 3, 2)));
  return _mm_cvtss_f32(max_f32x4);
@@ -290,18 +292,18 @@ NK_INTERNAL nk_f32_t nk_reduce_max_f32x8_haswell_(__m256 max_f32x8) {
 
  /** @brief Horizontal min of 4 doubles in a YMM register. */
  NK_INTERNAL nk_f64_t nk_reduce_min_f64x4_haswell_(__m256d min_f64x4) {
- __m128d lo_f64x2 = _mm256_castpd256_pd128(min_f64x4);
- __m128d hi_f64x2 = _mm256_extractf128_pd(min_f64x4, 1);
- __m128d min_f64x2 = _mm_min_pd(lo_f64x2, hi_f64x2);
+ __m128d low_f64x2 = _mm256_castpd256_pd128(min_f64x4);
+ __m128d high_f64x2 = _mm256_extractf128_pd(min_f64x4, 1);
+ __m128d min_f64x2 = _mm_min_pd(low_f64x2, high_f64x2);
  min_f64x2 = _mm_min_pd(min_f64x2, _mm_shuffle_pd(min_f64x2, min_f64x2, 1));
  return _mm_cvtsd_f64(min_f64x2);
  }
 
  /** @brief Horizontal max of 4 doubles in a YMM register. */
  NK_INTERNAL nk_f64_t nk_reduce_max_f64x4_haswell_(__m256d max_f64x4) {
- __m128d lo_f64x2 = _mm256_castpd256_pd128(max_f64x4);
- __m128d hi_f64x2 = _mm256_extractf128_pd(max_f64x4, 1);
- __m128d max_f64x2 = _mm_max_pd(lo_f64x2, hi_f64x2);
+ __m128d low_f64x2 = _mm256_castpd256_pd128(max_f64x4);
+ __m128d high_f64x2 = _mm256_extractf128_pd(max_f64x4, 1);
+ __m128d max_f64x2 = _mm_max_pd(low_f64x2, high_f64x2);
  max_f64x2 = _mm_max_pd(max_f64x2, _mm_shuffle_pd(max_f64x2, max_f64x2, 1));
  return _mm_cvtsd_f64(max_f64x2);
  }
@@ -529,7 +531,7 @@ NK_INTERNAL void nk_reduce_moments_f32_haswell_strided_( //
  __m256d sumsq_low_f64x4 = _mm256_setzero_pd(), sumsq_high_f64x4 = _mm256_setzero_pd();
  nk_size_t idx = 0, total = count * stride_elements;
  nk_size_t step = nk_size_round_up_to_multiple_(8, stride_elements);
- for (; idx + step <= total; idx += step) {
+ for (; idx + stride_elements + 7 <= total; idx += step) {
  __m128 low_f32x4 = _mm_blendv_ps(zero_f32x4, _mm_loadu_ps(data_ptr + idx), blend_low_f32x4);
  __m128 high_f32x4 = _mm_blendv_ps(zero_f32x4, _mm_loadu_ps(data_ptr + idx + 4), blend_high_f32x4);
  __m256d low_f64x4 = _mm256_cvtps_pd(low_f32x4);
@@ -767,7 +769,7 @@ NK_INTERNAL void nk_reduce_moments_f64_haswell_strided_( //
  __m256d sumsq_comp_f64x4 = _mm256_setzero_pd();
  nk_size_t idx = 0, total = count * stride_elements;
  nk_size_t step = nk_size_round_up_to_multiple_(4, stride_elements);
- for (; idx + step <= total; idx += step) {
+ for (; idx + stride_elements + 3 <= total; idx += step) {
  __m256d val_f64x4 = _mm256_blendv_pd(zero_f64x4, _mm256_loadu_pd(data_ptr + idx), blend_f64x4);
  __m256d tentative_f64x4 = _mm256_add_pd(sum_f64x4, val_f64x4);
  __m256d round_f64x4 = _mm256_sub_pd(tentative_f64x4, sum_f64x4);
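The context lines of the f64 hunk show what this loop does besides the bound fix: tentative and round are the first half of a compensated (Kahan-style) accumulation feeding the sumsq_comp_f64x4 term. A scalar model of the same recurrence — a sketch only, the kernel runs it per SIMD lane:

#include <stddef.h>

// Compensated summation: `round` is the part of `val` that actually
// landed in the running sum; the remainder is carried forward so
// rounding error does not accumulate across iterations.
static double reduce_add_f64_compensated(double const *data, size_t count) {
    double sum = 0.0, compensation = 0.0;
    for (size_t idx = 0; idx < count; ++idx) {
        double val = data[idx] + compensation;
        double tentative = sum + val;
        double round = tentative - sum;
        compensation = val - round;
        sum = tentative;
    }
    return sum;
}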
@@ -979,7 +981,7 @@ NK_INTERNAL void nk_reduce_moments_i8_haswell_strided_( //
  nk_size_t total_scalars = count * stride_elements;
  nk_size_t vector_element_count = 0;
  nk_size_t step = elements_per_vector * stride_elements;
- for (; idx_scalars + step <= total_scalars; idx_scalars += step) {
+ for (; idx_scalars + stride_elements + 31 <= total_scalars; idx_scalars += step) {
  __m256i data_i8x32 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx_scalars));
  data_i8x32 = _mm256_and_si256(data_i8x32, stride_mask_i8x32);
  __m256i unsigned_u8x32 = _mm256_xor_si256(data_i8x32, masked_bias_i8x32);
@@ -1179,7 +1181,7 @@ NK_INTERNAL void nk_reduce_moments_u8_haswell_strided_( //
  nk_size_t idx_scalars = 0;
  nk_size_t total_scalars = count * stride_elements;
  nk_size_t step = nk_size_round_up_to_multiple_(32, stride_elements);
- for (; idx_scalars + step <= total_scalars; idx_scalars += step) {
+ for (; idx_scalars + stride_elements + 31 <= total_scalars; idx_scalars += step) {
  __m256i data_u8x32 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx_scalars));
  data_u8x32 = _mm256_and_si256(data_u8x32, stride_mask_u8x32);
  sum_u64x4 = _mm256_add_epi64(sum_u64x4, _mm256_sad_epu8(data_u8x32, zero_u8x32));
@@ -1375,7 +1377,7 @@ NK_INTERNAL void nk_reduce_moments_i16_haswell_strided_( //
  nk_size_t idx_scalars = 0;
  nk_size_t total_scalars = count * stride_elements;
  nk_size_t step = nk_size_round_up_to_multiple_(16, stride_elements);
- for (; idx_scalars + step <= total_scalars; idx_scalars += step) {
+ for (; idx_scalars + stride_elements + 15 <= total_scalars; idx_scalars += step) {
  __m256i data_i16x16 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx_scalars));
  data_i16x16 = _mm256_and_si256(data_i16x16, stride_mask_i16x16);
  sum_i32x8 = _mm256_add_epi32(sum_i32x8, _mm256_madd_epi16(data_i16x16, ones_i16x16));
@@ -1566,19 +1568,19 @@ NK_INTERNAL void nk_reduce_moments_u16_haswell_strided_( //
  nk_size_t idx_scalars = 0;
  nk_size_t total_scalars = count * stride_elements;
  nk_size_t step = nk_size_round_up_to_multiple_(16, stride_elements);
- for (; idx_scalars + step <= total_scalars; idx_scalars += step) {
+ for (; idx_scalars + stride_elements + 15 <= total_scalars; idx_scalars += step) {
  __m256i data_u16x16 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx_scalars));
  data_u16x16 = _mm256_and_si256(data_u16x16, stride_mask_i16x16);
- __m256i lo_u32x8 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(data_u16x16));
- __m256i hi_u32x8 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(data_u16x16, 1));
- sum_u32x8 = _mm256_add_epi32(sum_u32x8, lo_u32x8);
- sum_u32x8 = _mm256_add_epi32(sum_u32x8, hi_u32x8);
- __m256i lo_sq_u32x8 = _mm256_mullo_epi32(lo_u32x8, lo_u32x8);
- __m256i hi_sq_u32x8 = _mm256_mullo_epi32(hi_u32x8, hi_u32x8);
- sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_castsi256_si128(lo_sq_u32x8)));
- sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_extracti128_si256(lo_sq_u32x8, 1)));
- sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_castsi256_si128(hi_sq_u32x8)));
- sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_extracti128_si256(hi_sq_u32x8, 1)));
+ __m256i low_u32x8 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(data_u16x16));
+ __m256i high_u32x8 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(data_u16x16, 1));
+ sum_u32x8 = _mm256_add_epi32(sum_u32x8, low_u32x8);
+ sum_u32x8 = _mm256_add_epi32(sum_u32x8, high_u32x8);
+ __m256i low_sq_u32x8 = _mm256_mullo_epi32(low_u32x8, low_u32x8);
+ __m256i high_sq_u32x8 = _mm256_mullo_epi32(high_u32x8, high_u32x8);
+ sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_castsi256_si128(low_sq_u32x8)));
+ sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_extracti128_si256(low_sq_u32x8, 1)));
+ sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_castsi256_si128(high_sq_u32x8)));
+ sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_extracti128_si256(high_sq_u32x8, 1)));
  }
  __m256i sum_u64x4 = _mm256_add_epi64( //
  _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum_u32x8)), //
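This is the same loop-bound fix in all six strided moments kernels above (f32, f64, i8, u8, i16, u16): the old guard `idx + step <= total` tied termination to the stride-rounded step, while what the body actually needs is for every element group that begins inside the current vector window to end in bounds. The new guard `idx + stride_elements + (lanes - 1) <= total` expresses that directly, with `lanes` the vector width in scalars (8, 16, or 32 above). A scalar sketch of the invariant — the rounding of `step` below is an assumption; the package computes it with its own `nk_size_round_up_to_multiple_` helper:

#include <stddef.h>

/* Model of the new guard. A group starting at the window's last lane,
 * idx + lanes - 1, occupies scalars up to idx + lanes + stride - 2, so
 * requiring idx + stride + (lanes - 1) <= total keeps every group
 * touched by the iteration fully inside the buffer, independently of
 * how far `step` advances the index. */
static size_t vectorizable_scalars(size_t count, size_t stride, size_t lanes) {
    size_t total = count * stride;
    size_t step = (lanes + stride - 1) / stride * stride; /* assumed rounding */
    size_t idx = 0;
    while (idx + stride + (lanes - 1) <= total) idx += step;
    return idx; /* scalars covered by the vector loop; the rest is tail */
}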
@@ -1730,8 +1732,8 @@ NK_INTERNAL void nk_reduce_moments_i32_haswell_contiguous_( //
  nk_i32_t const *data_ptr, nk_size_t count, //
  nk_i64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
 
- __m256i sum_lower_i64x4 = _mm256_setzero_si256();
- __m256i sum_upper_i64x4 = _mm256_setzero_si256();
+ __m256i sum_low_i64x4 = _mm256_setzero_si256();
+ __m256i sum_high_i64x4 = _mm256_setzero_si256();
  __m256i sumsq_u64x4 = _mm256_setzero_si256();
  int sumsq_overflow_mask = 0;
  __m256i sign_bit_i64x4 = _mm256_set1_epi64x((nk_i64_t)0x8000000000000000ULL);
@@ -1739,25 +1741,25 @@ NK_INTERNAL void nk_reduce_moments_i32_haswell_contiguous_( //
  for (; idx + 8 <= count; idx += 8) {
  __m256i data_i32x8 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx));
  // 128-bit sum: lo half
- __m256i widened_lo_i64x4 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(data_i32x8));
- __m256i sum_before_i64x4 = sum_lower_i64x4;
- sum_lower_i64x4 = _mm256_add_epi64(sum_lower_i64x4, widened_lo_i64x4);
- __m256i result_biased_i64x4 = _mm256_xor_si256(sum_lower_i64x4, sign_bit_i64x4);
+ __m256i widened_low_i64x4 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(data_i32x8));
+ __m256i sum_before_i64x4 = sum_low_i64x4;
+ sum_low_i64x4 = _mm256_add_epi64(sum_low_i64x4, widened_low_i64x4);
+ __m256i result_biased_i64x4 = _mm256_xor_si256(sum_low_i64x4, sign_bit_i64x4);
  __m256i before_biased_i64x4 = _mm256_xor_si256(sum_before_i64x4, sign_bit_i64x4);
  __m256i carry_mask_i64x4 = _mm256_cmpgt_epi64(before_biased_i64x4, result_biased_i64x4);
- sum_upper_i64x4 = _mm256_sub_epi64(sum_upper_i64x4, carry_mask_i64x4);
- __m256i sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), widened_lo_i64x4);
- sum_upper_i64x4 = _mm256_add_epi64(sum_upper_i64x4, sign_ext_i64x4);
+ sum_high_i64x4 = _mm256_sub_epi64(sum_high_i64x4, carry_mask_i64x4);
+ __m256i sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), widened_low_i64x4);
+ sum_high_i64x4 = _mm256_add_epi64(sum_high_i64x4, sign_ext_i64x4);
  // 128-bit sum: hi half
- __m256i widened_hi_i64x4 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(data_i32x8, 1));
- sum_before_i64x4 = sum_lower_i64x4;
- sum_lower_i64x4 = _mm256_add_epi64(sum_lower_i64x4, widened_hi_i64x4);
- result_biased_i64x4 = _mm256_xor_si256(sum_lower_i64x4, sign_bit_i64x4);
+ __m256i widened_high_i64x4 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(data_i32x8, 1));
+ sum_before_i64x4 = sum_low_i64x4;
+ sum_low_i64x4 = _mm256_add_epi64(sum_low_i64x4, widened_high_i64x4);
+ result_biased_i64x4 = _mm256_xor_si256(sum_low_i64x4, sign_bit_i64x4);
  before_biased_i64x4 = _mm256_xor_si256(sum_before_i64x4, sign_bit_i64x4);
  carry_mask_i64x4 = _mm256_cmpgt_epi64(before_biased_i64x4, result_biased_i64x4);
- sum_upper_i64x4 = _mm256_sub_epi64(sum_upper_i64x4, carry_mask_i64x4);
- sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), widened_hi_i64x4);
- sum_upper_i64x4 = _mm256_add_epi64(sum_upper_i64x4, sign_ext_i64x4);
+ sum_high_i64x4 = _mm256_sub_epi64(sum_high_i64x4, carry_mask_i64x4);
+ sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), widened_high_i64x4);
+ sum_high_i64x4 = _mm256_add_epi64(sum_high_i64x4, sign_ext_i64x4);
  // Sumsq: running mask + wrapping add with unsigned carry detection
  __m256i even_sq_u64x4 = _mm256_mul_epi32(data_i32x8, data_i32x8);
  __m256i odd_i32x8 = _mm256_srli_epi64(data_i32x8, 32);
@@ -1780,24 +1782,24 @@ NK_INTERNAL void nk_reduce_moments_i32_haswell_contiguous_( //
  nk_b256_vec_t tail_vec;
  nk_partial_load_b32x8_serial_(data_ptr + idx, &tail_vec, remaining);
  __m256i data_i32x8 = tail_vec.ymm;
- __m256i widened_lo_i64x4 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(data_i32x8));
- __m256i sum_before_i64x4 = sum_lower_i64x4;
- sum_lower_i64x4 = _mm256_add_epi64(sum_lower_i64x4, widened_lo_i64x4);
- __m256i result_biased_i64x4 = _mm256_xor_si256(sum_lower_i64x4, sign_bit_i64x4);
+ __m256i widened_low_i64x4 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(data_i32x8));
+ __m256i sum_before_i64x4 = sum_low_i64x4;
+ sum_low_i64x4 = _mm256_add_epi64(sum_low_i64x4, widened_low_i64x4);
+ __m256i result_biased_i64x4 = _mm256_xor_si256(sum_low_i64x4, sign_bit_i64x4);
  __m256i before_biased_i64x4 = _mm256_xor_si256(sum_before_i64x4, sign_bit_i64x4);
  __m256i carry_mask_i64x4 = _mm256_cmpgt_epi64(before_biased_i64x4, result_biased_i64x4);
- sum_upper_i64x4 = _mm256_sub_epi64(sum_upper_i64x4, carry_mask_i64x4);
- __m256i sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), widened_lo_i64x4);
- sum_upper_i64x4 = _mm256_add_epi64(sum_upper_i64x4, sign_ext_i64x4);
- __m256i widened_hi_i64x4 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(data_i32x8, 1));
- sum_before_i64x4 = sum_lower_i64x4;
- sum_lower_i64x4 = _mm256_add_epi64(sum_lower_i64x4, widened_hi_i64x4);
- result_biased_i64x4 = _mm256_xor_si256(sum_lower_i64x4, sign_bit_i64x4);
+ sum_high_i64x4 = _mm256_sub_epi64(sum_high_i64x4, carry_mask_i64x4);
+ __m256i sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), widened_low_i64x4);
+ sum_high_i64x4 = _mm256_add_epi64(sum_high_i64x4, sign_ext_i64x4);
+ __m256i widened_high_i64x4 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(data_i32x8, 1));
+ sum_before_i64x4 = sum_low_i64x4;
+ sum_low_i64x4 = _mm256_add_epi64(sum_low_i64x4, widened_high_i64x4);
+ result_biased_i64x4 = _mm256_xor_si256(sum_low_i64x4, sign_bit_i64x4);
  before_biased_i64x4 = _mm256_xor_si256(sum_before_i64x4, sign_bit_i64x4);
  carry_mask_i64x4 = _mm256_cmpgt_epi64(before_biased_i64x4, result_biased_i64x4);
- sum_upper_i64x4 = _mm256_sub_epi64(sum_upper_i64x4, carry_mask_i64x4);
- sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), widened_hi_i64x4);
- sum_upper_i64x4 = _mm256_add_epi64(sum_upper_i64x4, sign_ext_i64x4);
+ sum_high_i64x4 = _mm256_sub_epi64(sum_high_i64x4, carry_mask_i64x4);
+ sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), widened_high_i64x4);
+ sum_high_i64x4 = _mm256_add_epi64(sum_high_i64x4, sign_ext_i64x4);
  __m256i even_sq_u64x4 = _mm256_mul_epi32(data_i32x8, data_i32x8);
  __m256i odd_i32x8 = _mm256_srli_epi64(data_i32x8, 32);
  __m256i odd_sq_u64x4 = _mm256_mul_epi32(odd_i32x8, odd_i32x8);
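The tail path above goes through nk_partial_load_b32x8_serial_ so the SIMD epilogue never reads past the end of the input. That helper is not shown in this diff; one common way to implement such a partial load (an assumption, not necessarily numkong's implementation) is a zeroed stack buffer plus memcpy:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Hypothetical partial load of `remaining` (< 8) i32s: copy the valid
// scalars into a zero-initialized buffer, then load the full vector
// from the buffer instead of the source array.
static __m256i partial_load_i32x8(int32_t const *data, size_t remaining) {
    int32_t buffer[8] = {0};
    memcpy(buffer, data, remaining * sizeof(int32_t));
    return _mm256_loadu_si256((__m256i const *)buffer);
}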
@@ -1820,20 +1822,20 @@ NK_INTERNAL void nk_reduce_moments_i32_haswell_contiguous_( //
  else sumsq = nk_reduce_sadd_u64x4_haswell_(sumsq_u64x4);
  // Sum: horizontal 128-bit reduction (4 lanes → scalar)
  nk_b256_vec_t lower_vec, upper_vec;
- lower_vec.ymm = sum_lower_i64x4;
- upper_vec.ymm = sum_upper_i64x4;
- nk_u64_t sum_lower = 0;
- nk_i64_t sum_upper = 0;
+ lower_vec.ymm = sum_low_i64x4;
+ upper_vec.ymm = sum_high_i64x4;
+ nk_u64_t sum_low = 0;
+ nk_i64_t sum_high = 0;
  for (int i = 0; i < 4; i++) {
- nk_u64_t sum_before = sum_lower;
- sum_lower += lower_vec.u64s[i];
- if (sum_lower < sum_before) sum_upper++;
- sum_upper += upper_vec.i64s[i];
+ nk_u64_t sum_before = sum_low;
+ sum_low += lower_vec.u64s[i];
+ if (sum_low < sum_before) sum_high++;
+ sum_high += upper_vec.i64s[i];
  }
  *sumsq_ptr = sumsq;
- nk_i64_t sum_lower_signed = (nk_i64_t)sum_lower;
- if (sum_upper == (sum_lower_signed >> 63)) *sum_ptr = sum_lower_signed;
- else if (sum_upper >= 0) *sum_ptr = NK_I64_MAX;
+ nk_i64_t sum_low_signed = (nk_i64_t)sum_low;
+ if (sum_high == (sum_low_signed >> 63)) *sum_ptr = sum_low_signed;
+ else if (sum_high >= 0) *sum_ptr = NK_I64_MAX;
  else *sum_ptr = NK_I64_MIN;
  }
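The epilogue above collapses the renamed accumulator: the running sum is a 128-bit integer held as (sum_high:sum_low). In scalar form, the add-with-carry step and the final saturating narrowing look like this — a model of the logic above, using standard C types in place of the nk_* aliases:

#include <stdint.h>

// 128-bit accumulate: wrap the low word, bump the high word on carry,
// then add the value's sign extension (0 or -1) to the high word.
static void i128_add(uint64_t *sum_low, int64_t *sum_high, int64_t val) {
    uint64_t before = *sum_low;
    *sum_low += (uint64_t)val;
    if (*sum_low < before) (*sum_high)++;
    *sum_high += (val >> 63);
}

// Saturating narrowing: the 128-bit sum fits in i64 exactly when the
// high word equals the sign extension of the low word.
static int64_t i128_to_i64_saturating(uint64_t sum_low, int64_t sum_high) {
    int64_t low_signed = (int64_t)sum_low;
    if (sum_high == (low_signed >> 63)) return low_signed;
    return sum_high >= 0 ? INT64_MAX : INT64_MIN;
}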
 
@@ -2114,8 +2116,8 @@ NK_INTERNAL void nk_reduce_moments_i64_haswell_contiguous_( //
2114
2116
  nk_i64_t const *data_ptr, nk_size_t count, //
2115
2117
  nk_i64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
2116
2118
 
2117
- __m256i sum_lower_u64x4 = _mm256_setzero_si256();
2118
- __m256i sum_upper_i64x4 = _mm256_setzero_si256();
2119
+ __m256i sum_low_u64x4 = _mm256_setzero_si256();
2120
+ __m256i sum_high_i64x4 = _mm256_setzero_si256();
2119
2121
  __m256i sumsq_u64x4 = _mm256_setzero_si256();
2120
2122
  int sumsq_overflow_mask = 0;
2121
2123
  __m256i sign_bit_i64x4 = _mm256_set1_epi64x((nk_i64_t)0x8000000000000000ULL);
@@ -2130,26 +2132,26 @@ NK_INTERNAL void nk_reduce_moments_i64_haswell_contiguous_( //
2130
2132
  sumsq_overflow_mask |= _mm256_movemask_pd(
2131
2133
  _mm256_castsi256_pd(_mm256_cmpgt_epi64(sq_before_biased_u64x4, sq_result_biased_u64x4)));
2132
2134
  // Vectorized 128-bit carry-propagating sum
2133
- __m256i sum_before_u64x4 = sum_lower_u64x4;
2134
- sum_lower_u64x4 = _mm256_add_epi64(sum_lower_u64x4, data_i64x4);
2135
+ __m256i sum_before_u64x4 = sum_low_u64x4;
2136
+ sum_low_u64x4 = _mm256_add_epi64(sum_low_u64x4, data_i64x4);
2135
2137
  __m256i before_biased_u64x4 = _mm256_xor_si256(sum_before_u64x4, sign_bit_i64x4);
2136
- __m256i result_biased_u64x4 = _mm256_xor_si256(sum_lower_u64x4, sign_bit_i64x4);
2138
+ __m256i result_biased_u64x4 = _mm256_xor_si256(sum_low_u64x4, sign_bit_i64x4);
2137
2139
  __m256i carry_u64x4 = _mm256_cmpgt_epi64(before_biased_u64x4, result_biased_u64x4);
2138
- sum_upper_i64x4 = _mm256_sub_epi64(sum_upper_i64x4, carry_u64x4);
2140
+ sum_high_i64x4 = _mm256_sub_epi64(sum_high_i64x4, carry_u64x4);
2139
2141
  __m256i sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), data_i64x4);
2140
- sum_upper_i64x4 = _mm256_add_epi64(sum_upper_i64x4, sign_ext_i64x4);
2142
+ sum_high_i64x4 = _mm256_add_epi64(sum_high_i64x4, sign_ext_i64x4);
2141
2143
  }
2142
- // Horizontal reduction of 4 lanes to scalar (sum_lower, sum_upper)
2144
+ // Horizontal reduction of 4 lanes to scalar (sum_low, sum_high)
2143
2145
  nk_b256_vec_t lower_vec, upper_vec;
2144
- lower_vec.ymm = sum_lower_u64x4;
2145
- upper_vec.ymm = sum_upper_i64x4;
2146
- nk_u64_t sum_lower = 0;
2147
- nk_i64_t sum_upper = 0;
2146
+ lower_vec.ymm = sum_low_u64x4;
2147
+ upper_vec.ymm = sum_high_i64x4;
2148
+ nk_u64_t sum_low = 0;
2149
+ nk_i64_t sum_high = 0;
2148
2150
  for (int i = 0; i < 4; i++) {
2149
- nk_u64_t before = sum_lower;
2150
- sum_lower += lower_vec.u64s[i];
2151
- if (sum_lower < before) sum_upper++;
2152
- sum_upper += upper_vec.i64s[i];
2151
+ nk_u64_t before = sum_low;
2152
+ sum_low += lower_vec.u64s[i];
2153
+ if (sum_low < before) sum_high++;
2154
+ sum_high += upper_vec.i64s[i];
2153
2155
  }
2154
2156
  nk_u64_t sumsq;
2155
2157
  if (sumsq_overflow_mask) sumsq = NK_U64_MAX;
@@ -2159,15 +2161,15 @@ NK_INTERNAL void nk_reduce_moments_i64_haswell_contiguous_( //
2159
2161
  nk_i64_t product = nk_i64_saturating_mul_serial(val, val);
2160
2162
  nk_u64_t unsigned_product = (nk_u64_t)product;
2161
2163
  sumsq = nk_u64_saturating_add_serial(sumsq, unsigned_product);
2162
- nk_u64_t before = sum_lower;
2163
- sum_lower += (nk_u64_t)val;
2164
- if (sum_lower < before) sum_upper++;
2165
- sum_upper += (val >> 63);
2164
+ nk_u64_t before = sum_low;
2165
+ sum_low += (nk_u64_t)val;
2166
+ if (sum_low < before) sum_high++;
2167
+ sum_high += (val >> 63);
2166
2168
  }
2167
2169
  *sumsq_ptr = sumsq;
2168
- nk_i64_t sum_lower_signed = (nk_i64_t)sum_lower;
2169
- if (sum_upper == (sum_lower_signed >> 63)) *sum_ptr = sum_lower_signed;
2170
- else if (sum_upper >= 0) *sum_ptr = NK_I64_MAX;
2170
+ nk_i64_t sum_low_signed = (nk_i64_t)sum_low;
2171
+ if (sum_high == (sum_low_signed >> 63)) *sum_ptr = sum_low_signed;
2172
+ else if (sum_high >= 0) *sum_ptr = NK_I64_MAX;
2171
2173
  else *sum_ptr = NK_I64_MIN;
2172
2174
  }
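
The hunk above deserves a note beyond the `sum_lower` → `sum_low` rename: AVX2 has no unsigned 64-bit compare, so the loop detects a carry out of the low word by XOR-ing both the old and new partial sums with the sign bit (`sign_bit_i64x4`) and comparing with the signed `_mm256_cmpgt_epi64`; the resulting all-ones mask is then subtracted from the high word, which adds exactly 1 in each carrying lane. A minimal scalar sketch of the same (high:low) bookkeeping, using standard `<stdint.h>` types in place of the `nk_*` typedefs (the function names here are illustrative, not the library's):

```c
#include <stdint.h>

/* Accumulate one signed 64-bit addend into a 128-bit (high:low) sum.
 * The addend's low 64 bits are added with wrap-around; a carry out of
 * the low word and the addend's sign extension (0 or -1) both adjust
 * the signed high word. */
static void i128_accumulate(uint64_t *low, int64_t *high, int64_t value) {
    uint64_t before = *low;
    *low += (uint64_t)value;      /* wrapping add of the low word */
    if (*low < before) (*high)++; /* carry out of the low word */
    *high += (value >> 63);       /* sign extension of the addend */
}

/* Collapse (high:low) to a saturated int64_t, mirroring the epilogue in
 * the diff: the sum fits exactly when the high word equals the sign
 * extension of the low word; otherwise the high word's sign picks the
 * saturation bound. */
static int64_t i128_to_i64_saturated(uint64_t low, int64_t high) {
    int64_t low_signed = (int64_t)low;
    if (high == (low_signed >> 63)) return low_signed;
    return high >= 0 ? INT64_MAX : INT64_MIN;
}
```
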
@@ -2925,9 +2927,9 @@ NK_PUBLIC void nk_reduce_moments_e2m3_haswell( //
 
 NK_INTERNAL __m256i nk_fp6x32_to_u8x32_comparable_haswell_(__m256i raw_i8x32) {
   raw_i8x32 = _mm256_and_si256(raw_i8x32, _mm256_set1_epi8(0x3F)); // mask to 6 valid bits
-  __m256i sign_mask = _mm256_set1_epi8(0x20);
-  __m256i neg_i8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(raw_i8x32, sign_mask), sign_mask);
-  __m256i pos_xor_i8x32 = sign_mask; // flip sign bit only
+  __m256i sign_mask_i8x32 = _mm256_set1_epi8(0x20);
+  __m256i neg_i8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(raw_i8x32, sign_mask_i8x32), sign_mask_i8x32);
+  __m256i pos_xor_i8x32 = sign_mask_i8x32; // flip sign bit only
   __m256i neg_xor_i8x32 = _mm256_set1_epi8(0x3F); // flip all 6 bits
   __m256i xor_i8x32 = _mm256_blendv_epi8(pos_xor_i8x32, neg_xor_i8x32, neg_i8x32);
   return _mm256_xor_si256(raw_i8x32, xor_i8x32);
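
The renamed `sign_mask_i8x32` helper is the classic monotone-key trick for sign-magnitude encodings, the same one used when radix-sorting IEEE floats: flip only the sign bit of non-negative values (lifting them above all negatives) and flip all bits of negative values (so larger magnitudes sort lower). The resulting unsigned keys order exactly like the fp6 values, which is what lets the loops below use plain `_mm256_min_epu8` / `_mm256_max_epu8`. A scalar sketch, assuming one 6-bit sign-magnitude value per byte:

```c
#include <stdint.h>

/* Map a 6-bit sign-magnitude float (bit 5 = sign) to an unsigned key
 * whose integer order matches the numeric order of the fp6 value. */
static uint8_t fp6_to_comparable_u8(uint8_t raw) {
    raw &= 0x3F; /* keep the 6 valid bits */
    if (raw & 0x20) return (uint8_t)(raw ^ 0x3F); /* negative: flip all 6 bits */
    return (uint8_t)(raw ^ 0x20);                 /* non-negative: flip sign bit */
}
```

Non-negative inputs land in [0x20, 0x3F] and negatives in [0x00, 0x1F], with more-negative values mapping to smaller keys.
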
@@ -2960,15 +2962,15 @@ NK_INTERNAL void nk_reduce_minmax_e2m3_haswell_contiguous_( //
   for (; idx + 32 <= count; idx += 32) {
     __m256i data_i8x32 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx));
     __m256i data_cmp_u8x32 = nk_fp6x32_to_u8x32_comparable_haswell_(data_i8x32);
-    __m256i new_min = _mm256_min_epu8(min_vec.ymm, data_cmp_u8x32);
-    __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_min, min_vec.ymm),
+    __m256i new_min_u8x32 = _mm256_min_epu8(min_vec.ymm, data_cmp_u8x32);
+    __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_min_u8x32, min_vec.ymm),
                                                  _mm256_set1_epi8((char)0xFF));
-    min_vec.ymm = new_min;
+    min_vec.ymm = new_min_u8x32;
     min_loop_cycle_u8x32 = _mm256_blendv_epi8(min_loop_cycle_u8x32, current_loop_cycle_u8x32, min_changed_i8x32);
-    __m256i new_max = _mm256_max_epu8(max_vec.ymm, data_cmp_u8x32);
-    __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_max, max_vec.ymm),
+    __m256i new_max_u8x32 = _mm256_max_epu8(max_vec.ymm, data_cmp_u8x32);
+    __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_max_u8x32, max_vec.ymm),
                                                  _mm256_set1_epi8((char)0xFF));
-    max_vec.ymm = new_max;
+    max_vec.ymm = new_max_u8x32;
     max_loop_cycle_u8x32 = _mm256_blendv_epi8(max_loop_cycle_u8x32, current_loop_cycle_u8x32, max_changed_i8x32);
     current_loop_cycle_u8x32 = _mm256_add_epi8(current_loop_cycle_u8x32, one_u8x32);
   }
@@ -2984,15 +2986,15 @@ NK_INTERNAL void nk_reduce_minmax_e2m3_haswell_contiguous_( //
     __m256i valid_b8x32 = _mm256_cmpgt_epi8(_mm256_set1_epi8((char)remaining), lane_indices_u8x32);
     __m256i data_min_u8x32 = _mm256_blendv_epi8(_mm256_set1_epi8(0x3F), data_cmp_u8x32, valid_b8x32);
     __m256i data_max_u8x32 = _mm256_blendv_epi8(_mm256_setzero_si256(), data_cmp_u8x32, valid_b8x32);
-    __m256i new_min = _mm256_min_epu8(min_vec.ymm, data_min_u8x32);
-    __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_min, min_vec.ymm),
+    __m256i new_min_u8x32 = _mm256_min_epu8(min_vec.ymm, data_min_u8x32);
+    __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_min_u8x32, min_vec.ymm),
                                                  _mm256_set1_epi8((char)0xFF));
-    min_vec.ymm = new_min;
+    min_vec.ymm = new_min_u8x32;
     min_loop_cycle_u8x32 = _mm256_blendv_epi8(min_loop_cycle_u8x32, current_loop_cycle_u8x32, min_changed_i8x32);
-    __m256i new_max = _mm256_max_epu8(max_vec.ymm, data_max_u8x32);
-    __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_max, max_vec.ymm),
+    __m256i new_max_u8x32 = _mm256_max_epu8(max_vec.ymm, data_max_u8x32);
+    __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_max_u8x32, max_vec.ymm),
                                                  _mm256_set1_epi8((char)0xFF));
-    max_vec.ymm = new_max;
+    max_vec.ymm = new_max_u8x32;
     max_loop_cycle_u8x32 = _mm256_blendv_epi8(max_loop_cycle_u8x32, current_loop_cycle_u8x32, max_changed_i8x32);
   }
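
One detail worth spelling out in these loops: `min_changed_i8x32` is NOT(`new_min == old_min`) per lane, and the `blendv` refreshes that lane's loop-cycle stamp wherever the minimum improved; presumably this is what lets the epilogue recover where each winner was seen, not just its value. A scalar sketch of one lane's bookkeeping (type and function names are illustrative; note that a u8 cycle stamp distinguishes only 256 iterations, so the full routine would have to re-synchronize before `current_loop_cycle` wraps, which happens outside these hunks):

```c
#include <stdint.h>

/* Per-lane state: the running minimum (in comparable-u8 key space) and
 * the loop cycle on which it last changed. */
typedef struct {
    uint8_t min_key;
    uint8_t min_cycle;
} lane_state_t;

static void lane_step(lane_state_t *lane, uint8_t key, uint8_t cycle) {
    uint8_t new_min = key < lane->min_key ? key : lane->min_key;
    int changed = new_min != lane->min_key; /* vector code: NOT(cmpeq) */
    lane->min_key = new_min;
    if (changed) lane->min_cycle = cycle;   /* vector code: blendv by mask */
}
```
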
@@ -3149,15 +3151,15 @@ NK_INTERNAL void nk_reduce_minmax_e3m2_haswell_contiguous_( //
   for (; idx + 32 <= count; idx += 32) {
     __m256i data_i8x32 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx));
     __m256i data_cmp_u8x32 = nk_fp6x32_to_u8x32_comparable_haswell_(data_i8x32);
-    __m256i new_min = _mm256_min_epu8(min_vec.ymm, data_cmp_u8x32);
-    __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_min, min_vec.ymm),
+    __m256i new_min_u8x32 = _mm256_min_epu8(min_vec.ymm, data_cmp_u8x32);
+    __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_min_u8x32, min_vec.ymm),
                                                  _mm256_set1_epi8((char)0xFF));
-    min_vec.ymm = new_min;
+    min_vec.ymm = new_min_u8x32;
     min_loop_cycle_u8x32 = _mm256_blendv_epi8(min_loop_cycle_u8x32, current_loop_cycle_u8x32, min_changed_i8x32);
-    __m256i new_max = _mm256_max_epu8(max_vec.ymm, data_cmp_u8x32);
-    __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_max, max_vec.ymm),
+    __m256i new_max_u8x32 = _mm256_max_epu8(max_vec.ymm, data_cmp_u8x32);
+    __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_max_u8x32, max_vec.ymm),
                                                  _mm256_set1_epi8((char)0xFF));
-    max_vec.ymm = new_max;
+    max_vec.ymm = new_max_u8x32;
     max_loop_cycle_u8x32 = _mm256_blendv_epi8(max_loop_cycle_u8x32, current_loop_cycle_u8x32, max_changed_i8x32);
     current_loop_cycle_u8x32 = _mm256_add_epi8(current_loop_cycle_u8x32, one_u8x32);
   }
@@ -3172,15 +3174,15 @@ NK_INTERNAL void nk_reduce_minmax_e3m2_haswell_contiguous_( //
     __m256i valid_b8x32 = _mm256_cmpgt_epi8(_mm256_set1_epi8((char)remaining), lane_indices_u8x32);
     __m256i data_min_u8x32 = _mm256_blendv_epi8(_mm256_set1_epi8(0x3F), data_cmp_u8x32, valid_b8x32);
     __m256i data_max_u8x32 = _mm256_blendv_epi8(_mm256_setzero_si256(), data_cmp_u8x32, valid_b8x32);
-    __m256i new_min = _mm256_min_epu8(min_vec.ymm, data_min_u8x32);
-    __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_min, min_vec.ymm),
+    __m256i new_min_u8x32 = _mm256_min_epu8(min_vec.ymm, data_min_u8x32);
+    __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_min_u8x32, min_vec.ymm),
                                                  _mm256_set1_epi8((char)0xFF));
-    min_vec.ymm = new_min;
+    min_vec.ymm = new_min_u8x32;
     min_loop_cycle_u8x32 = _mm256_blendv_epi8(min_loop_cycle_u8x32, current_loop_cycle_u8x32, min_changed_i8x32);
-    __m256i new_max = _mm256_max_epu8(max_vec.ymm, data_max_u8x32);
-    __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_max, max_vec.ymm),
+    __m256i new_max_u8x32 = _mm256_max_epu8(max_vec.ymm, data_max_u8x32);
+    __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_max_u8x32, max_vec.ymm),
                                                  _mm256_set1_epi8((char)0xFF));
-    max_vec.ymm = new_max;
+    max_vec.ymm = new_max_u8x32;
     max_loop_cycle_u8x32 = _mm256_blendv_epi8(max_loop_cycle_u8x32, current_loop_cycle_u8x32, max_changed_i8x32);
   }
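
The e3m2 hunks repeat the e2m3 pattern byte-for-byte (both formats share the comparable-u8 key space), so the only piece left to spell out is the tail handling: lanes at or past `remaining` are blended to each reduction's identity element, 0x3F (the largest key) for the min pass and 0x00 (the smallest) for the max pass, so padding lanes can never win. A scalar sketch of that masking (names are illustrative):

```c
#include <stddef.h>
#include <stdint.h>

/* For a partial final vector, substitute the identity element of each
 * reduction into out-of-range lanes so they cannot affect the result. */
static void tail_lane_keys(uint8_t const *keys, size_t remaining, size_t lane,
                           uint8_t *min_key, uint8_t *max_key) {
    int valid = lane < remaining;         /* vector code: cmpgt validity mask */
    *min_key = valid ? keys[lane] : 0x3F; /* identity for unsigned min */
    *max_key = valid ? keys[lane] : 0x00; /* identity for unsigned max */
}
```
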
@@ -3645,14 +3647,14 @@ NK_INTERNAL void nk_reduce_moments_i4_haswell_contiguous_( //
       ptr += 32, count_bytes -= 32;
     }
     __m256i raw_i8x32 = raw_vec.ymm;
-    __m256i low_u4x32 = _mm256_and_si256(raw_i8x32, mask_0f_i8x32);
-    __m256i high_u4x32 = _mm256_and_si256(_mm256_srli_epi16(raw_i8x32, 4), mask_0f_i8x32);
-    __m256i low_biased_u4x32 = _mm256_xor_si256(low_u4x32, eight_i8x32);
-    __m256i high_biased_u4x32 = _mm256_xor_si256(high_u4x32, eight_i8x32);
-    __m256i pair_sum = _mm256_add_epi8(low_biased_u4x32, high_biased_u4x32);
-    sum_u64x4 = _mm256_add_epi64(sum_u64x4, _mm256_sad_epu8(pair_sum, zero_i8x32));
-    __m256i low_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, low_u4x32);
-    __m256i high_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, high_u4x32);
+    __m256i low_u4_u8x32 = _mm256_and_si256(raw_i8x32, mask_0f_i8x32);
+    __m256i high_u4_u8x32 = _mm256_and_si256(_mm256_srli_epi16(raw_i8x32, 4), mask_0f_i8x32);
+    __m256i low_biased_u4_u8x32 = _mm256_xor_si256(low_u4_u8x32, eight_i8x32);
+    __m256i high_biased_u4_u8x32 = _mm256_xor_si256(high_u4_u8x32, eight_i8x32);
+    __m256i pair_sum_u8x32 = _mm256_add_epi8(low_biased_u4_u8x32, high_biased_u4_u8x32);
+    sum_u64x4 = _mm256_add_epi64(sum_u64x4, _mm256_sad_epu8(pair_sum_u8x32, zero_i8x32));
+    __m256i low_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, low_u4_u8x32);
+    __m256i high_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, high_u4_u8x32);
     sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_sad_epu8(low_sq_u8x32, zero_i8x32));
     sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_sad_epu8(high_sq_u8x32, zero_i8x32));
   }
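
The renames in this hunk make the packing explicit: each byte holds two 4-bit values, and the XOR with `eight_i8x32` converts two's-complement i4 in [-8, 7] into biased u4 in [0, 15] (for a 4-bit value, `x ^ 8 == x + 8`), so the pair sums stay unsigned and `_mm256_sad_epu8` against zero can horizontally add eight bytes at a time into u64 lanes. A scalar sketch of the biased sum; undoing the bias by subtracting 8 per element would happen in an epilogue outside this hunk, so that step is an assumption about the surrounding code:

```c
#include <stddef.h>
#include <stdint.h>

/* Sum of (i4 value + 8) over all nibbles of a packed buffer. */
static uint64_t biased_i4_sum(uint8_t const *packed, size_t count_bytes) {
    uint64_t sum = 0;
    for (size_t i = 0; i < count_bytes; i++) {
        uint8_t low = packed[i] & 0x0F;
        uint8_t high = (packed[i] >> 4) & 0x0F;
        sum += (low ^ 8) + (high ^ 8); /* vector code: _mm256_sad_epu8 */
    }
    return sum;
}
```
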
@@ -3702,12 +3704,12 @@ NK_INTERNAL void nk_reduce_moments_u4_haswell_contiguous_( //
       ptr += 32, count_bytes -= 32;
     }
     __m256i raw_i8x32 = raw_vec.ymm;
-    __m256i low_u4x32 = _mm256_and_si256(raw_i8x32, mask_0f_i8x32);
-    __m256i high_u4x32 = _mm256_and_si256(_mm256_srli_epi16(raw_i8x32, 4), mask_0f_i8x32);
-    __m256i pair_sum = _mm256_add_epi8(low_u4x32, high_u4x32);
-    sum_u64x4 = _mm256_add_epi64(sum_u64x4, _mm256_sad_epu8(pair_sum, zero_i8x32));
-    __m256i low_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, low_u4x32);
-    __m256i high_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, high_u4x32);
+    __m256i low_u4_u8x32 = _mm256_and_si256(raw_i8x32, mask_0f_i8x32);
+    __m256i high_u4_u8x32 = _mm256_and_si256(_mm256_srli_epi16(raw_i8x32, 4), mask_0f_i8x32);
+    __m256i pair_sum_u8x32 = _mm256_add_epi8(low_u4_u8x32, high_u4_u8x32);
+    sum_u64x4 = _mm256_add_epi64(sum_u64x4, _mm256_sad_epu8(pair_sum_u8x32, zero_i8x32));
+    __m256i low_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, low_u4_u8x32);
+    __m256i high_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, high_u4_u8x32);
     sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_sad_epu8(low_sq_u8x32, zero_i8x32));
     sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_sad_epu8(high_sq_u8x32, zero_i8x32));
   }
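
Both nibble kernels square through a table lookup: a nibble takes only 16 values, so its square fits in a 16-entry byte LUT that the vector code indexes with `_mm256_shuffle_epi8` (for u4 the largest square is 15 * 15 = 225, still a byte; for i4 it is (-8) * (-8) = 64). A scalar sketch of the u4 case; the i4 table would instead hold squares of the sign-extended nibble values:

```c
#include <stddef.h>
#include <stdint.h>

/* Sum of squares over all u4 nibbles of a packed buffer, via a
 * 16-entry square table (the scalar analogue of sq_lut_u8x32). */
static uint64_t u4_sum_of_squares(uint8_t const *packed, size_t count_bytes) {
    static const uint8_t sq_lut[16] = {0,  1,  4,   9,   16,  25,  36,  49,
                                       64, 81, 100, 121, 144, 169, 196, 225};
    uint64_t sumsq = 0;
    for (size_t i = 0; i < count_bytes; i++) {
        sumsq += sq_lut[packed[i] & 0x0F];        /* low nibble */
        sumsq += sq_lut[(packed[i] >> 4) & 0x0F]; /* high nibble */
    }
    return sumsq;
}
```
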