numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,1597 @@
1
+ /**
2
+ * @brief SIMD-accelerated Vector Reductions.
3
+ * @file include/numkong/reduce.h
4
+ * @author Ash Vardanian
5
+ * @date December 27, 2024
6
+ *
7
+ * Provides horizontal reduction operations over vectors with:
8
+ * - `nk_reduce_moments_*` — sum + sum-of-squares in one pass
9
+ * - `nk_reduce_minmax_*` — min + max with argmin/argmax in one pass
10
+ * - Dynamic dispatch for runtime ISA selection
11
+ *
12
+ * For dtypes:
13
+ *
14
+ * - f64: 64-bit IEEE floating point numbers
15
+ * - f32: 32-bit IEEE floating point numbers
16
+ * - f16: 16-bit IEEE floating point numbers
17
+ * - bf16: 16-bit brain floating point numbers
18
+ * - e4m3: 8-bit e4m3 floating point numbers
19
+ * - e5m2: 8-bit e5m2 floating point numbers
20
+ * - e2m3: 8-bit e2m3 floating point numbers (MX)
21
+ * - e3m2: 8-bit e3m2 floating point numbers (MX)
22
+ * - i8: 8-bit signed integers
23
+ * - u8: 8-bit unsigned integers
24
+ * - i16: 16-bit signed integers
25
+ * - u16: 16-bit unsigned integers
26
+ * - i32: 32-bit signed integers
27
+ * - u32: 32-bit unsigned integers
28
+ * - i64: 64-bit signed integers
29
+ * - u64: 64-bit unsigned integers
30
+ * - i4: 4-bit signed integers (packed pairs)
31
+ * - u4: 4-bit unsigned integers (packed pairs)
32
+ * - u1: 1-bit binary (packed octets)
33
+ *
34
+ * For hardware architectures:
35
+ *
36
+ * - Arm: NEON, NEON+F16, NEON+FHM, NEON+BF16, NEON+SDOT
37
+ * - x86: Haswell, Skylake, Ice Lake, Genoa, Sierra Forest
38
+ * - RISC-V: RVV
39
+ * - WASM: V128Relaxed
40
+ *
41
+ * @section numerical_stability Numerical stability
42
+ *
43
+ * All accumulations are performed with stable techniques and @b saturation in mind.
44
+ * Single-precision inputs are aggregated in double-precision. Double-precision
45
+ * inputs are handled with @b Neumaier-like compensated summation schemes. Mini-floats
46
+ * are propagated to more hardware-friendly types. And integers are handled with
47
+ * proper saturation logic, as opposed to simple pairwise saturation, meaning that
48
+ * if several extremely large values are followed by equal negative values, the
49
+ * sum will be zero.
50
+ *
51
+ * @code{.c}
52
+ *
53
+ * @endcode
54
+ *
55
+ *
56
+ * All MinMax scans handle NaN values explicitly instead of relying on a simple total ordering.
57
+ * All positive and negative NaN values are masked out on the fly and can never be included
58
+ * in the output. For empty or NaN-only inputs, the returned argmin/argmax positions will
59
+ * be set to the sentinel value @b `NK_SIZE_MAX`.
60
+ *
61
+ * @section reduction_strategy Reduction Strategy
62
+ *
63
+ * The key insight is that `_mm512_reduce_add_ps()` and similar intrinsics are
64
+ * actually serial operations - they don't parallelize the reduction across lanes.
65
+ * The correct approach is:
66
+ *
67
+ * 1. Accumulate vertically in SIMD registers throughout the entire loop
68
+ * 2. Perform a single horizontal reduction at the very end, reconstructing the lane positions
69
+ *
70
+ * @code{.c}
71
+ * __m512 sum_f32x16 = _mm512_setzero_ps();
72
+ * for (...) {
73
+ * __m512 data_f32x16 = _mm512_loadu_ps(ptr);
74
+ * sum_f32x16 = _mm512_add_ps(sum_f32x16, data_f32x16);
75
+ * }
76
+ * // Single horizontal reduce at the END only
77
+ * nk_f32_t result = nk_reduce_add_f32x16_skylake_(sum_f32x16);
78
+ * @endcode
79
+ *
80
+ * @section stride_handling Stride Handling Strategies
81
+ *
82
+ * - stride == sizeof(scalar): Contiguous SIMD loads with masked tail
83
+ * - Large stride with gather support: Use gather instructions (32/64-bit types)
84
+ * - Otherwise: Serial fallback
85
+ *
86
+ * @section argminmax Argmin/Argmax Strategy
87
+ *
88
+ * Single-pass algorithm tracking both value and index in SIMD registers:
89
+ * @code{.c}
90
+ * __m512 min_f32x16 = _mm512_set1_ps(FLT_MAX);
91
+ * __m512i min_idx_i32x16 = _mm512_setzero_si512();
92
+ * __m512i current_idx_i32x16 = _mm512_setr_epi32(0,1,2,3,...,15);
93
+ * __m512i step_i32x16 = _mm512_set1_epi32(16);
94
+ * for (...) {
95
+ * __m512 data_f32x16 = _mm512_loadu_ps(ptr);
96
+ * __mmask16 lt_mask = _mm512_cmp_ps_mask(data_f32x16, min_f32x16, _CMP_LT_OQ);
97
+ * min_f32x16 = _mm512_mask_mov_ps(min_f32x16, lt_mask, data_f32x16);
98
+ * min_idx_i32x16 = _mm512_mask_mov_epi32(min_idx_i32x16, lt_mask, current_idx_i32x16);
99
+ * current_idx_i32x16 = _mm512_add_epi32(current_idx_i32x16, step_i32x16);
100
+ * }
101
+ * @endcode
102
+ */
103
+ #ifndef NK_REDUCE_H
104
+ #define NK_REDUCE_H
105
+
106
+ #include "numkong/types.h"
107
+
108
+ #ifdef __cplusplus
109
+ extern "C" {
110
+ #endif
111
+
112
+ /**
113
+ * @brief Horizontal moments reduction (sum + sum-of-squares) over a strided array.
114
+ * @param[in] data Pointer to the input data.
115
+ * @param[in] count Number of elements to reduce.
116
+ * @param[in] stride_bytes Stride between elements in bytes, equal to `sizeof(*data)` for contiguous arrays.
117
+ * @param[out] sum_ptr Output sum.
118
+ * @param[out] sumsq_ptr Output sum of squares.
119
+ */
120
+ NK_DYNAMIC void nk_reduce_moments_f64(nk_f64_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_f64_t *sum_ptr,
121
+ nk_f64_t *sumsq_ptr);
122
+
123
+ /**
124
+ * @brief Horizontal min+max reduction with argmin/argmax over a strided array.
125
+ * @param[in] data Pointer to the input data.
126
+ * @param[in] count Number of elements to reduce.
127
+ * @param[in] stride_bytes Stride between elements in bytes, equal to `sizeof(*data)` for contiguous arrays.
128
+ * @param[out] min_value_ptr Output minimum value.
129
+ * @param[out] min_index_ptr Output index of the minimum value.
130
+ * @param[out] max_value_ptr Output maximum value.
131
+ * @param[out] max_index_ptr Output index of the maximum value.
132
+ */
133
+ NK_DYNAMIC void nk_reduce_minmax_f64(nk_f64_t const *data, nk_size_t count, nk_size_t stride_bytes,
134
+ nk_f64_t *min_value_ptr, nk_size_t *min_index_ptr, nk_f64_t *max_value_ptr,
135
+ nk_size_t *max_index_ptr);
136
+
137
+ /** @copydoc nk_reduce_moments_f64 */
138
+ NK_DYNAMIC void nk_reduce_moments_f32(nk_f32_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_f64_t *sum_ptr,
139
+ nk_f64_t *sumsq_ptr);
140
+ /** @copydoc nk_reduce_minmax_f64 */
141
+ NK_DYNAMIC void nk_reduce_minmax_f32(nk_f32_t const *data, nk_size_t count, nk_size_t stride_bytes,
142
+ nk_f32_t *min_value_ptr, nk_size_t *min_index_ptr, nk_f32_t *max_value_ptr,
143
+ nk_size_t *max_index_ptr);
144
+
145
+ /** @copydoc nk_reduce_moments_f64 */
146
+ NK_DYNAMIC void nk_reduce_moments_i8(nk_i8_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_i64_t *sum_ptr,
147
+ nk_u64_t *sumsq_ptr);
148
+ /** @copydoc nk_reduce_minmax_f64 */
149
+ NK_DYNAMIC void nk_reduce_minmax_i8(nk_i8_t const *data, nk_size_t count, nk_size_t stride_bytes,
150
+ nk_i8_t *min_value_ptr, nk_size_t *min_index_ptr, nk_i8_t *max_value_ptr,
151
+ nk_size_t *max_index_ptr);
152
+
153
+ /** @copydoc nk_reduce_moments_f64 */
154
+ NK_DYNAMIC void nk_reduce_moments_u8(nk_u8_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_u64_t *sum_ptr,
155
+ nk_u64_t *sumsq_ptr);
156
+ /** @copydoc nk_reduce_minmax_f64 */
157
+ NK_DYNAMIC void nk_reduce_minmax_u8(nk_u8_t const *data, nk_size_t count, nk_size_t stride_bytes,
158
+ nk_u8_t *min_value_ptr, nk_size_t *min_index_ptr, nk_u8_t *max_value_ptr,
159
+ nk_size_t *max_index_ptr);
160
+
161
+ /** @copydoc nk_reduce_moments_f64 */
162
+ NK_DYNAMIC void nk_reduce_moments_i16(nk_i16_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_i64_t *sum_ptr,
163
+ nk_u64_t *sumsq_ptr);
164
+ /** @copydoc nk_reduce_minmax_f64 */
165
+ NK_DYNAMIC void nk_reduce_minmax_i16(nk_i16_t const *data, nk_size_t count, nk_size_t stride_bytes,
166
+ nk_i16_t *min_value_ptr, nk_size_t *min_index_ptr, nk_i16_t *max_value_ptr,
167
+ nk_size_t *max_index_ptr);
168
+
169
+ /** @copydoc nk_reduce_moments_f64 */
170
+ NK_DYNAMIC void nk_reduce_moments_u16(nk_u16_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_u64_t *sum_ptr,
171
+ nk_u64_t *sumsq_ptr);
172
+ /** @copydoc nk_reduce_minmax_f64 */
173
+ NK_DYNAMIC void nk_reduce_minmax_u16(nk_u16_t const *data, nk_size_t count, nk_size_t stride_bytes,
174
+ nk_u16_t *min_value_ptr, nk_size_t *min_index_ptr, nk_u16_t *max_value_ptr,
175
+ nk_size_t *max_index_ptr);
176
+
177
+ /** @copydoc nk_reduce_moments_f64 */
178
+ NK_DYNAMIC void nk_reduce_moments_i32(nk_i32_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_i64_t *sum_ptr,
179
+ nk_u64_t *sumsq_ptr);
180
+ /** @copydoc nk_reduce_minmax_f64 */
181
+ NK_DYNAMIC void nk_reduce_minmax_i32(nk_i32_t const *data, nk_size_t count, nk_size_t stride_bytes,
182
+ nk_i32_t *min_value_ptr, nk_size_t *min_index_ptr, nk_i32_t *max_value_ptr,
183
+ nk_size_t *max_index_ptr);
184
+
185
+ /** @copydoc nk_reduce_moments_f64 */
186
+ NK_DYNAMIC void nk_reduce_moments_u32(nk_u32_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_u64_t *sum_ptr,
187
+ nk_u64_t *sumsq_ptr);
188
+ /** @copydoc nk_reduce_minmax_f64 */
189
+ NK_DYNAMIC void nk_reduce_minmax_u32(nk_u32_t const *data, nk_size_t count, nk_size_t stride_bytes,
190
+ nk_u32_t *min_value_ptr, nk_size_t *min_index_ptr, nk_u32_t *max_value_ptr,
191
+ nk_size_t *max_index_ptr);
192
+
193
+ /** @copydoc nk_reduce_moments_f64 */
194
+ NK_DYNAMIC void nk_reduce_moments_i64(nk_i64_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_i64_t *sum_ptr,
195
+ nk_u64_t *sumsq_ptr);
196
+ /** @copydoc nk_reduce_minmax_f64 */
197
+ NK_DYNAMIC void nk_reduce_minmax_i64(nk_i64_t const *data, nk_size_t count, nk_size_t stride_bytes,
198
+ nk_i64_t *min_value_ptr, nk_size_t *min_index_ptr, nk_i64_t *max_value_ptr,
199
+ nk_size_t *max_index_ptr);
200
+
201
+ /** @copydoc nk_reduce_moments_f64 */
202
+ NK_DYNAMIC void nk_reduce_moments_u64(nk_u64_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_u64_t *sum_ptr,
203
+ nk_u64_t *sumsq_ptr);
204
+ /** @copydoc nk_reduce_minmax_f64 */
205
+ NK_DYNAMIC void nk_reduce_minmax_u64(nk_u64_t const *data, nk_size_t count, nk_size_t stride_bytes,
206
+ nk_u64_t *min_value_ptr, nk_size_t *min_index_ptr, nk_u64_t *max_value_ptr,
207
+ nk_size_t *max_index_ptr);
208
+
209
+ /** @copydoc nk_reduce_moments_f64 */
210
+ NK_DYNAMIC void nk_reduce_moments_f16(nk_f16_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_f32_t *sum_ptr,
211
+ nk_f32_t *sumsq_ptr);
212
+ /** @copydoc nk_reduce_minmax_f64 */
213
+ NK_DYNAMIC void nk_reduce_minmax_f16(nk_f16_t const *data, nk_size_t count, nk_size_t stride_bytes,
214
+ nk_f16_t *min_value_ptr, nk_size_t *min_index_ptr, nk_f16_t *max_value_ptr,
215
+ nk_size_t *max_index_ptr);
216
+
217
+ /** @copydoc nk_reduce_moments_f64 */
218
+ NK_DYNAMIC void nk_reduce_moments_bf16(nk_bf16_t const *data, nk_size_t count, nk_size_t stride_bytes,
219
+ nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr);
220
+ /** @copydoc nk_reduce_minmax_f64 */
221
+ NK_DYNAMIC void nk_reduce_minmax_bf16(nk_bf16_t const *data, nk_size_t count, nk_size_t stride_bytes,
222
+ nk_bf16_t *min_value_ptr, nk_size_t *min_index_ptr, nk_bf16_t *max_value_ptr,
223
+ nk_size_t *max_index_ptr);
224
+
225
+ /** @copydoc nk_reduce_moments_f64 */
226
+ NK_DYNAMIC void nk_reduce_moments_e4m3(nk_e4m3_t const *data, nk_size_t count, nk_size_t stride_bytes,
227
+ nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr);
228
+ /** @copydoc nk_reduce_minmax_f64 */
229
+ NK_DYNAMIC void nk_reduce_minmax_e4m3(nk_e4m3_t const *data, nk_size_t count, nk_size_t stride_bytes,
230
+ nk_e4m3_t *min_value_ptr, nk_size_t *min_index_ptr, nk_e4m3_t *max_value_ptr,
231
+ nk_size_t *max_index_ptr);
232
+
233
+ /** @copydoc nk_reduce_moments_f64 */
234
+ NK_DYNAMIC void nk_reduce_moments_e5m2(nk_e5m2_t const *data, nk_size_t count, nk_size_t stride_bytes,
235
+ nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr);
236
+ /** @copydoc nk_reduce_minmax_f64 */
237
+ NK_DYNAMIC void nk_reduce_minmax_e5m2(nk_e5m2_t const *data, nk_size_t count, nk_size_t stride_bytes,
238
+ nk_e5m2_t *min_value_ptr, nk_size_t *min_index_ptr, nk_e5m2_t *max_value_ptr,
239
+ nk_size_t *max_index_ptr);
240
+
241
+ /** @copydoc nk_reduce_moments_f64 */
242
+ NK_DYNAMIC void nk_reduce_moments_e2m3(nk_e2m3_t const *data, nk_size_t count, nk_size_t stride_bytes,
243
+ nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr);
244
+ /** @copydoc nk_reduce_minmax_f64 */
245
+ NK_DYNAMIC void nk_reduce_minmax_e2m3(nk_e2m3_t const *data, nk_size_t count, nk_size_t stride_bytes,
246
+ nk_e2m3_t *min_value_ptr, nk_size_t *min_index_ptr, nk_e2m3_t *max_value_ptr,
247
+ nk_size_t *max_index_ptr);
248
+
249
+ /** @copydoc nk_reduce_moments_f64 */
250
+ NK_DYNAMIC void nk_reduce_moments_e3m2(nk_e3m2_t const *data, nk_size_t count, nk_size_t stride_bytes,
251
+ nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr);
252
+ /** @copydoc nk_reduce_minmax_f64 */
253
+ NK_DYNAMIC void nk_reduce_minmax_e3m2(nk_e3m2_t const *data, nk_size_t count, nk_size_t stride_bytes,
254
+ nk_e3m2_t *min_value_ptr, nk_size_t *min_index_ptr, nk_e3m2_t *max_value_ptr,
255
+ nk_size_t *max_index_ptr);
256
+
257
+ /** @copydoc nk_reduce_moments_f64 */
258
+ NK_DYNAMIC void nk_reduce_moments_i4(nk_i4x2_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_i64_t *sum_ptr,
259
+ nk_u64_t *sumsq_ptr);
260
+ /** @copydoc nk_reduce_minmax_f64 */
261
+ NK_DYNAMIC void nk_reduce_minmax_i4(nk_i4x2_t const *data, nk_size_t count, nk_size_t stride_bytes,
262
+ nk_i8_t *min_value_ptr, nk_size_t *min_index_ptr, nk_i8_t *max_value_ptr,
263
+ nk_size_t *max_index_ptr);
264
+
265
+ /** @copydoc nk_reduce_moments_f64 */
266
+ NK_DYNAMIC void nk_reduce_moments_u4(nk_u4x2_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_u64_t *sum_ptr,
267
+ nk_u64_t *sumsq_ptr);
268
+ /** @copydoc nk_reduce_minmax_f64 */
269
+ NK_DYNAMIC void nk_reduce_minmax_u4(nk_u4x2_t const *data, nk_size_t count, nk_size_t stride_bytes,
270
+ nk_u8_t *min_value_ptr, nk_size_t *min_index_ptr, nk_u8_t *max_value_ptr,
271
+ nk_size_t *max_index_ptr);
272
+
273
+ /** @copydoc nk_reduce_moments_f64 */
274
+ NK_DYNAMIC void nk_reduce_moments_u1(nk_u1x8_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_u64_t *sum_ptr,
275
+ nk_u64_t *sumsq_ptr);
276
+ /** @copydoc nk_reduce_minmax_f64 */
277
+ NK_DYNAMIC void nk_reduce_minmax_u1(nk_u1x8_t const *data, nk_size_t count, nk_size_t stride_bytes,
278
+ nk_u8_t *min_value_ptr, nk_size_t *min_index_ptr, nk_u8_t *max_value_ptr,
279
+ nk_size_t *max_index_ptr);
280
+
281
+ /** @copydoc nk_reduce_moments_f64 */ // f32 input, but both outputs are nk_f64_t
282
+ NK_PUBLIC void nk_reduce_moments_f32_serial(nk_f32_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
283
+ /** @copydoc nk_reduce_moments_f64 */
284
+ NK_PUBLIC void nk_reduce_moments_f64_serial(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
285
+ /** @copydoc nk_reduce_moments_f64 */ // signed ints (i8..i64, i4): nk_i64_t output, then nk_u64_t output
286
+ NK_PUBLIC void nk_reduce_moments_i8_serial(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
287
+ /** @copydoc nk_reduce_moments_f64 */ // unsigned ints (u8..u64, u4, u1): both outputs are nk_u64_t
288
+ NK_PUBLIC void nk_reduce_moments_u8_serial(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
289
+ /** @copydoc nk_reduce_moments_f64 */
290
+ NK_PUBLIC void nk_reduce_moments_i16_serial(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
291
+ /** @copydoc nk_reduce_moments_f64 */
292
+ NK_PUBLIC void nk_reduce_moments_u16_serial(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
293
+ /** @copydoc nk_reduce_moments_f64 */
294
+ NK_PUBLIC void nk_reduce_moments_i32_serial(nk_i32_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
295
+ /** @copydoc nk_reduce_moments_f64 */
296
+ NK_PUBLIC void nk_reduce_moments_u32_serial(nk_u32_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
297
+ /** @copydoc nk_reduce_moments_f64 */
298
+ NK_PUBLIC void nk_reduce_moments_i64_serial(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
299
+ /** @copydoc nk_reduce_moments_f64 */
300
+ NK_PUBLIC void nk_reduce_moments_u64_serial(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
301
+ /** @copydoc nk_reduce_moments_f64 */ // f16/bf16/e4m3/e5m2/e2m3/e3m2: both outputs are nk_f32_t
302
+ NK_PUBLIC void nk_reduce_moments_f16_serial(nk_f16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
303
+ /** @copydoc nk_reduce_moments_f64 */
304
+ NK_PUBLIC void nk_reduce_moments_bf16_serial(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
305
+ /** @copydoc nk_reduce_moments_f64 */
306
+ NK_PUBLIC void nk_reduce_moments_e4m3_serial(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
307
+ /** @copydoc nk_reduce_moments_f64 */
308
+ NK_PUBLIC void nk_reduce_moments_e5m2_serial(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
309
+ /** @copydoc nk_reduce_moments_f64 */
310
+ NK_PUBLIC void nk_reduce_moments_e2m3_serial(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
311
+ /** @copydoc nk_reduce_moments_f64 */
312
+ NK_PUBLIC void nk_reduce_moments_e3m2_serial(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
313
+ /** @copydoc nk_reduce_moments_f64 */ // input pointer type is nk_i4x2_t, not a scalar i4 type
314
+ NK_PUBLIC void nk_reduce_moments_i4_serial(nk_i4x2_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
315
+ /** @copydoc nk_reduce_moments_f64 */ // input pointer type is nk_u4x2_t
316
+ NK_PUBLIC void nk_reduce_moments_u4_serial(nk_u4x2_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
317
+ /** @copydoc nk_reduce_moments_f64 */ // input pointer type is nk_u1x8_t
318
+ NK_PUBLIC void nk_reduce_moments_u1_serial(nk_u1x8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
319
+
320
+ /** @copydoc nk_reduce_minmax_f64 */ // NOTE(review): outputs look like min value, min index, max value, max index - confirm
321
+ NK_PUBLIC void nk_reduce_minmax_f32_serial(nk_f32_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_size_t *, nk_f32_t *,
322
+ nk_size_t *);
323
+ /** @copydoc nk_reduce_minmax_f64 */
324
+ NK_PUBLIC void nk_reduce_minmax_f64_serial(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_size_t *, nk_f64_t *,
325
+ nk_size_t *);
326
+ /** @copydoc nk_reduce_minmax_f64 */
327
+ NK_PUBLIC void nk_reduce_minmax_i8_serial(nk_i8_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
328
+ nk_size_t *);
329
+ /** @copydoc nk_reduce_minmax_f64 */
330
+ NK_PUBLIC void nk_reduce_minmax_u8_serial(nk_u8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
331
+ nk_size_t *);
332
+ /** @copydoc nk_reduce_minmax_f64 */
333
+ NK_PUBLIC void nk_reduce_minmax_i16_serial(nk_i16_t const *, nk_size_t, nk_size_t, nk_i16_t *, nk_size_t *, nk_i16_t *,
334
+ nk_size_t *);
335
+ /** @copydoc nk_reduce_minmax_f64 */
336
+ NK_PUBLIC void nk_reduce_minmax_u16_serial(nk_u16_t const *, nk_size_t, nk_size_t, nk_u16_t *, nk_size_t *, nk_u16_t *,
337
+ nk_size_t *);
338
+ /** @copydoc nk_reduce_minmax_f64 */
339
+ NK_PUBLIC void nk_reduce_minmax_i32_serial(nk_i32_t const *, nk_size_t, nk_size_t, nk_i32_t *, nk_size_t *, nk_i32_t *,
340
+ nk_size_t *);
341
+ /** @copydoc nk_reduce_minmax_f64 */
342
+ NK_PUBLIC void nk_reduce_minmax_u32_serial(nk_u32_t const *, nk_size_t, nk_size_t, nk_u32_t *, nk_size_t *, nk_u32_t *,
343
+ nk_size_t *);
344
+ /** @copydoc nk_reduce_minmax_f64 */
345
+ NK_PUBLIC void nk_reduce_minmax_i64_serial(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_size_t *, nk_i64_t *,
346
+ nk_size_t *);
347
+ /** @copydoc nk_reduce_minmax_f64 */
348
+ NK_PUBLIC void nk_reduce_minmax_u64_serial(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_size_t *, nk_u64_t *,
349
+ nk_size_t *);
350
+ /** @copydoc nk_reduce_minmax_f64 */
351
+ NK_PUBLIC void nk_reduce_minmax_f16_serial(nk_f16_t const *, nk_size_t, nk_size_t, nk_f16_t *, nk_size_t *, nk_f16_t *,
352
+ nk_size_t *);
353
+ /** @copydoc nk_reduce_minmax_f64 */
354
+ NK_PUBLIC void nk_reduce_minmax_bf16_serial(nk_bf16_t const *, nk_size_t, nk_size_t, nk_bf16_t *, nk_size_t *,
355
+ nk_bf16_t *, nk_size_t *);
356
+ /** @copydoc nk_reduce_minmax_f64 */
357
+ NK_PUBLIC void nk_reduce_minmax_e4m3_serial(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *,
358
+ nk_e4m3_t *, nk_size_t *);
359
+ /** @copydoc nk_reduce_minmax_f64 */
360
+ NK_PUBLIC void nk_reduce_minmax_e5m2_serial(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *,
361
+ nk_e5m2_t *, nk_size_t *);
362
+ /** @copydoc nk_reduce_minmax_f64 */
363
+ NK_PUBLIC void nk_reduce_minmax_e2m3_serial(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_e2m3_t *, nk_size_t *,
364
+ nk_e2m3_t *, nk_size_t *);
365
+ /** @copydoc nk_reduce_minmax_f64 */
366
+ NK_PUBLIC void nk_reduce_minmax_e3m2_serial(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_e3m2_t *, nk_size_t *,
367
+ nk_e3m2_t *, nk_size_t *);
368
+ /** @copydoc nk_reduce_minmax_f64 */ // nk_i4x2_t input; min/max are written through nk_i8_t pointers
369
+ NK_PUBLIC void nk_reduce_minmax_i4_serial(nk_i4x2_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
370
+ nk_size_t *);
371
+ /** @copydoc nk_reduce_minmax_f64 */ // nk_u4x2_t input; min/max are written through nk_u8_t pointers
372
+ NK_PUBLIC void nk_reduce_minmax_u4_serial(nk_u4x2_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
373
+ nk_size_t *);
374
+ /** @copydoc nk_reduce_minmax_f64 */ // nk_u1x8_t input; min/max are written through nk_u8_t pointers
375
+ NK_PUBLIC void nk_reduce_minmax_u1_serial(nk_u1x8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
376
+ nk_size_t *);
377
+
378
+ #if NK_TARGET_NEON // _neon: moments + minmax for f32/f64, i8..u64 and e2m3/e3m2/e4m3/e5m2 only
379
+ /** @copydoc nk_reduce_moments_f64 */
380
+ NK_PUBLIC void nk_reduce_moments_f32_neon(nk_f32_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
381
+ /** @copydoc nk_reduce_moments_f64 */
382
+ NK_PUBLIC void nk_reduce_moments_f64_neon(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
383
+ /** @copydoc nk_reduce_moments_f64 */
384
+ NK_PUBLIC void nk_reduce_moments_i8_neon(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
385
+ /** @copydoc nk_reduce_moments_f64 */
386
+ NK_PUBLIC void nk_reduce_moments_u8_neon(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
387
+ /** @copydoc nk_reduce_moments_f64 */
388
+ NK_PUBLIC void nk_reduce_moments_i16_neon(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
389
+ /** @copydoc nk_reduce_moments_f64 */
390
+ NK_PUBLIC void nk_reduce_moments_u16_neon(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
391
+ /** @copydoc nk_reduce_moments_f64 */
392
+ NK_PUBLIC void nk_reduce_moments_i32_neon(nk_i32_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
393
+ /** @copydoc nk_reduce_moments_f64 */
394
+ NK_PUBLIC void nk_reduce_moments_u32_neon(nk_u32_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
395
+ /** @copydoc nk_reduce_moments_f64 */
396
+ NK_PUBLIC void nk_reduce_moments_i64_neon(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
397
+ /** @copydoc nk_reduce_moments_f64 */
398
+ NK_PUBLIC void nk_reduce_moments_u64_neon(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
399
+ /** @copydoc nk_reduce_moments_f64 */
400
+ NK_PUBLIC void nk_reduce_moments_e2m3_neon(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
401
+ /** @copydoc nk_reduce_moments_f64 */
402
+ NK_PUBLIC void nk_reduce_moments_e3m2_neon(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
403
+ /** @copydoc nk_reduce_moments_f64 */
404
+ NK_PUBLIC void nk_reduce_moments_e4m3_neon(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
405
+ /** @copydoc nk_reduce_moments_f64 */
406
+ NK_PUBLIC void nk_reduce_moments_e5m2_neon(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
407
+ /** @copydoc nk_reduce_minmax_f64 */
408
+ NK_PUBLIC void nk_reduce_minmax_f32_neon(nk_f32_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_size_t *, nk_f32_t *,
409
+ nk_size_t *);
410
+ /** @copydoc nk_reduce_minmax_f64 */
411
+ NK_PUBLIC void nk_reduce_minmax_f64_neon(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_size_t *, nk_f64_t *,
412
+ nk_size_t *);
413
+ /** @copydoc nk_reduce_minmax_f64 */
414
+ NK_PUBLIC void nk_reduce_minmax_i8_neon(nk_i8_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
415
+ nk_size_t *);
416
+ /** @copydoc nk_reduce_minmax_f64 */
417
+ NK_PUBLIC void nk_reduce_minmax_u8_neon(nk_u8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
418
+ nk_size_t *);
419
+ /** @copydoc nk_reduce_minmax_f64 */
420
+ NK_PUBLIC void nk_reduce_minmax_i16_neon(nk_i16_t const *, nk_size_t, nk_size_t, nk_i16_t *, nk_size_t *, nk_i16_t *,
421
+ nk_size_t *);
422
+ /** @copydoc nk_reduce_minmax_f64 */
423
+ NK_PUBLIC void nk_reduce_minmax_u16_neon(nk_u16_t const *, nk_size_t, nk_size_t, nk_u16_t *, nk_size_t *, nk_u16_t *,
424
+ nk_size_t *);
425
+ /** @copydoc nk_reduce_minmax_f64 */
426
+ NK_PUBLIC void nk_reduce_minmax_i32_neon(nk_i32_t const *, nk_size_t, nk_size_t, nk_i32_t *, nk_size_t *, nk_i32_t *,
427
+ nk_size_t *);
428
+ /** @copydoc nk_reduce_minmax_f64 */
429
+ NK_PUBLIC void nk_reduce_minmax_u32_neon(nk_u32_t const *, nk_size_t, nk_size_t, nk_u32_t *, nk_size_t *, nk_u32_t *,
430
+ nk_size_t *);
431
+ /** @copydoc nk_reduce_minmax_f64 */
432
+ NK_PUBLIC void nk_reduce_minmax_i64_neon(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_size_t *, nk_i64_t *,
433
+ nk_size_t *);
434
+ /** @copydoc nk_reduce_minmax_f64 */
435
+ NK_PUBLIC void nk_reduce_minmax_u64_neon(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_size_t *, nk_u64_t *,
436
+ nk_size_t *);
437
+ /** @copydoc nk_reduce_minmax_f64 */
438
+ NK_PUBLIC void nk_reduce_minmax_e2m3_neon(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_e2m3_t *, nk_size_t *,
439
+ nk_e2m3_t *, nk_size_t *);
440
+ /** @copydoc nk_reduce_minmax_f64 */
441
+ NK_PUBLIC void nk_reduce_minmax_e3m2_neon(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_e3m2_t *, nk_size_t *,
442
+ nk_e3m2_t *, nk_size_t *);
443
+ /** @copydoc nk_reduce_minmax_f64 */
444
+ NK_PUBLIC void nk_reduce_minmax_e4m3_neon(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *,
445
+ nk_e4m3_t *, nk_size_t *);
446
+ /** @copydoc nk_reduce_minmax_f64 */
447
+ NK_PUBLIC void nk_reduce_minmax_e5m2_neon(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *,
448
+ nk_e5m2_t *, nk_size_t *);
449
+ #endif // NK_TARGET_NEON
450
+
451
+ #if NK_TARGET_NEONHALF // _neonhalf: a single prototype, f16 moments (no minmax in this group)
452
+ /** @copydoc nk_reduce_moments_f64 */
453
+ NK_PUBLIC void nk_reduce_moments_f16_neonhalf(nk_f16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
454
+ #endif // NK_TARGET_NEONHALF
455
+
456
+ #if NK_TARGET_NEONBFDOT // _neonbfdot: bf16 moments and bf16 minmax
457
+ /** @copydoc nk_reduce_moments_f64 */
458
+ NK_PUBLIC void nk_reduce_moments_bf16_neonbfdot(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
459
+ /** @copydoc nk_reduce_minmax_f64 */
460
+ NK_PUBLIC void nk_reduce_minmax_bf16_neonbfdot(nk_bf16_t const *, nk_size_t, nk_size_t, nk_bf16_t *, nk_size_t *,
461
+ nk_bf16_t *, nk_size_t *);
462
+ #endif // NK_TARGET_NEONBFDOT
463
+
464
+ #if NK_TARGET_NEONSDOT // _neonsdot: moments only, for i8, u8 and e2m3
465
+ /** @copydoc nk_reduce_moments_f64 */
466
+ NK_PUBLIC void nk_reduce_moments_i8_neonsdot(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
467
+ /** @copydoc nk_reduce_moments_f64 */
468
+ NK_PUBLIC void nk_reduce_moments_u8_neonsdot(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
469
+ /** @copydoc nk_reduce_moments_f64 */
470
+ NK_PUBLIC void nk_reduce_moments_e2m3_neonsdot(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
471
+ #endif // NK_TARGET_NEONSDOT
472
+
473
+ #if NK_TARGET_NEONFHM // _neonfhm: moments + minmax for e4m3 and e5m2
474
+ /** @copydoc nk_reduce_moments_f64 */
475
+ NK_PUBLIC void nk_reduce_moments_e4m3_neonfhm(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
476
+ /** @copydoc nk_reduce_moments_f64 */
477
+ NK_PUBLIC void nk_reduce_moments_e5m2_neonfhm(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
478
+ /** @copydoc nk_reduce_minmax_f64 */
479
+ NK_PUBLIC void nk_reduce_minmax_e4m3_neonfhm(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *,
480
+ nk_e4m3_t *, nk_size_t *);
481
+ /** @copydoc nk_reduce_minmax_f64 */
482
+ NK_PUBLIC void nk_reduce_minmax_e5m2_neonfhm(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *,
483
+ nk_e5m2_t *, nk_size_t *);
484
+ #endif // NK_TARGET_NEONFHM
485
+
486
+ #if NK_TARGET_HASWELL // _haswell: moments incl. i4/u4/u1; minmax stops at e3m2 (no i4/u4/u1 minmax)
487
+ /** @copydoc nk_reduce_moments_f64 */
488
+ NK_PUBLIC void nk_reduce_moments_f32_haswell(nk_f32_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
489
+ /** @copydoc nk_reduce_moments_f64 */
490
+ NK_PUBLIC void nk_reduce_moments_f64_haswell(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
491
+ /** @copydoc nk_reduce_moments_f64 */
492
+ NK_PUBLIC void nk_reduce_moments_i8_haswell(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
493
+ /** @copydoc nk_reduce_moments_f64 */
494
+ NK_PUBLIC void nk_reduce_moments_u8_haswell(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
495
+ /** @copydoc nk_reduce_moments_f64 */
496
+ NK_PUBLIC void nk_reduce_moments_i16_haswell(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
497
+ /** @copydoc nk_reduce_moments_f64 */
498
+ NK_PUBLIC void nk_reduce_moments_u16_haswell(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
499
+ /** @copydoc nk_reduce_moments_f64 */
500
+ NK_PUBLIC void nk_reduce_moments_i32_haswell(nk_i32_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
501
+ /** @copydoc nk_reduce_moments_f64 */
502
+ NK_PUBLIC void nk_reduce_moments_u32_haswell(nk_u32_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
503
+ /** @copydoc nk_reduce_moments_f64 */
504
+ NK_PUBLIC void nk_reduce_moments_i64_haswell(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
505
+ /** @copydoc nk_reduce_moments_f64 */
506
+ NK_PUBLIC void nk_reduce_moments_u64_haswell(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
507
+ /** @copydoc nk_reduce_moments_f64 */
508
+ NK_PUBLIC void nk_reduce_moments_f16_haswell(nk_f16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
509
+ /** @copydoc nk_reduce_moments_f64 */
510
+ NK_PUBLIC void nk_reduce_moments_bf16_haswell(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
511
+ /** @copydoc nk_reduce_moments_f64 */
512
+ NK_PUBLIC void nk_reduce_moments_e4m3_haswell(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
513
+ /** @copydoc nk_reduce_moments_f64 */
514
+ NK_PUBLIC void nk_reduce_moments_e5m2_haswell(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
515
+ /** @copydoc nk_reduce_moments_f64 */
516
+ NK_PUBLIC void nk_reduce_moments_e2m3_haswell(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
517
+ /** @copydoc nk_reduce_moments_f64 */
518
+ NK_PUBLIC void nk_reduce_moments_e3m2_haswell(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
519
+ /** @copydoc nk_reduce_moments_f64 */
520
+ NK_PUBLIC void nk_reduce_moments_i4_haswell(nk_i4x2_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
521
+ /** @copydoc nk_reduce_moments_f64 */
522
+ NK_PUBLIC void nk_reduce_moments_u4_haswell(nk_u4x2_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
523
+ /** @copydoc nk_reduce_moments_f64 */
524
+ NK_PUBLIC void nk_reduce_moments_u1_haswell(nk_u1x8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
525
+ /** @copydoc nk_reduce_minmax_f64 */
526
+ NK_PUBLIC void nk_reduce_minmax_f32_haswell(nk_f32_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_size_t *, nk_f32_t *,
527
+ nk_size_t *);
528
+ /** @copydoc nk_reduce_minmax_f64 */
529
+ NK_PUBLIC void nk_reduce_minmax_f64_haswell(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_size_t *, nk_f64_t *,
530
+ nk_size_t *);
531
+ /** @copydoc nk_reduce_minmax_f64 */
532
+ NK_PUBLIC void nk_reduce_minmax_i8_haswell(nk_i8_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
533
+ nk_size_t *);
534
+ /** @copydoc nk_reduce_minmax_f64 */
535
+ NK_PUBLIC void nk_reduce_minmax_u8_haswell(nk_u8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
536
+ nk_size_t *);
537
+ /** @copydoc nk_reduce_minmax_f64 */
538
+ NK_PUBLIC void nk_reduce_minmax_i16_haswell(nk_i16_t const *, nk_size_t, nk_size_t, nk_i16_t *, nk_size_t *, nk_i16_t *,
539
+ nk_size_t *);
540
+ /** @copydoc nk_reduce_minmax_f64 */
541
+ NK_PUBLIC void nk_reduce_minmax_u16_haswell(nk_u16_t const *, nk_size_t, nk_size_t, nk_u16_t *, nk_size_t *, nk_u16_t *,
542
+ nk_size_t *);
543
+ /** @copydoc nk_reduce_minmax_f64 */
544
+ NK_PUBLIC void nk_reduce_minmax_i32_haswell(nk_i32_t const *, nk_size_t, nk_size_t, nk_i32_t *, nk_size_t *, nk_i32_t *,
545
+ nk_size_t *);
546
+ /** @copydoc nk_reduce_minmax_f64 */
547
+ NK_PUBLIC void nk_reduce_minmax_u32_haswell(nk_u32_t const *, nk_size_t, nk_size_t, nk_u32_t *, nk_size_t *, nk_u32_t *,
548
+ nk_size_t *);
549
+ /** @copydoc nk_reduce_minmax_f64 */
550
+ NK_PUBLIC void nk_reduce_minmax_i64_haswell(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_size_t *, nk_i64_t *,
551
+ nk_size_t *);
552
+ /** @copydoc nk_reduce_minmax_f64 */
553
+ NK_PUBLIC void nk_reduce_minmax_u64_haswell(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_size_t *, nk_u64_t *,
554
+ nk_size_t *);
555
+ /** @copydoc nk_reduce_minmax_f64 */
556
+ NK_PUBLIC void nk_reduce_minmax_f16_haswell(nk_f16_t const *, nk_size_t, nk_size_t, nk_f16_t *, nk_size_t *, nk_f16_t *,
557
+ nk_size_t *);
558
+ /** @copydoc nk_reduce_minmax_f64 */
559
+ NK_PUBLIC void nk_reduce_minmax_bf16_haswell(nk_bf16_t const *, nk_size_t, nk_size_t, nk_bf16_t *, nk_size_t *,
560
+ nk_bf16_t *, nk_size_t *);
561
+ /** @copydoc nk_reduce_minmax_f64 */
562
+ NK_PUBLIC void nk_reduce_minmax_e4m3_haswell(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *,
563
+ nk_e4m3_t *, nk_size_t *);
564
+ /** @copydoc nk_reduce_minmax_f64 */
565
+ NK_PUBLIC void nk_reduce_minmax_e5m2_haswell(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *,
566
+ nk_e5m2_t *, nk_size_t *);
567
+ /** @copydoc nk_reduce_minmax_f64 */
568
+ NK_PUBLIC void nk_reduce_minmax_e2m3_haswell(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_e2m3_t *, nk_size_t *,
569
+ nk_e2m3_t *, nk_size_t *);
570
+ /** @copydoc nk_reduce_minmax_f64 */
571
+ NK_PUBLIC void nk_reduce_minmax_e3m2_haswell(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_e3m2_t *, nk_size_t *,
572
+ nk_e3m2_t *, nk_size_t *);
573
+ #endif // NK_TARGET_HASWELL
574
+
575
+ #if NK_TARGET_SKYLAKE // _skylake: moments incl. i4/u4/u1; minmax stops at e3m2 (no i4/u4/u1 minmax)
576
+ /** @copydoc nk_reduce_moments_f64 */
577
+ NK_PUBLIC void nk_reduce_moments_f32_skylake(nk_f32_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
578
+ /** @copydoc nk_reduce_moments_f64 */
579
+ NK_PUBLIC void nk_reduce_moments_f64_skylake(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
580
+ /** @copydoc nk_reduce_moments_f64 */
581
+ NK_PUBLIC void nk_reduce_moments_i8_skylake(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
582
+ /** @copydoc nk_reduce_moments_f64 */
583
+ NK_PUBLIC void nk_reduce_moments_u8_skylake(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
584
+ /** @copydoc nk_reduce_moments_f64 */
585
+ NK_PUBLIC void nk_reduce_moments_i16_skylake(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
586
+ /** @copydoc nk_reduce_moments_f64 */
587
+ NK_PUBLIC void nk_reduce_moments_u16_skylake(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
588
+ /** @copydoc nk_reduce_moments_f64 */
589
+ NK_PUBLIC void nk_reduce_moments_i32_skylake(nk_i32_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
590
+ /** @copydoc nk_reduce_moments_f64 */
591
+ NK_PUBLIC void nk_reduce_moments_u32_skylake(nk_u32_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
592
+ /** @copydoc nk_reduce_moments_f64 */
593
+ NK_PUBLIC void nk_reduce_moments_i64_skylake(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
594
+ /** @copydoc nk_reduce_moments_f64 */
595
+ NK_PUBLIC void nk_reduce_moments_u64_skylake(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
596
+ /** @copydoc nk_reduce_moments_f64 */
597
+ NK_PUBLIC void nk_reduce_moments_f16_skylake(nk_f16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
598
+ /** @copydoc nk_reduce_moments_f64 */
599
+ NK_PUBLIC void nk_reduce_moments_bf16_skylake(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
600
+ /** @copydoc nk_reduce_moments_f64 */
601
+ NK_PUBLIC void nk_reduce_moments_e4m3_skylake(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
602
+ /** @copydoc nk_reduce_moments_f64 */
603
+ NK_PUBLIC void nk_reduce_moments_e5m2_skylake(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
604
+ /** @copydoc nk_reduce_moments_f64 */
605
+ NK_PUBLIC void nk_reduce_moments_e2m3_skylake(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
606
+ /** @copydoc nk_reduce_moments_f64 */
607
+ NK_PUBLIC void nk_reduce_moments_e3m2_skylake(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
608
+ /** @copydoc nk_reduce_moments_f64 */
609
+ NK_PUBLIC void nk_reduce_moments_i4_skylake(nk_i4x2_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
610
+ /** @copydoc nk_reduce_moments_f64 */
611
+ NK_PUBLIC void nk_reduce_moments_u4_skylake(nk_u4x2_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
612
+ /** @copydoc nk_reduce_moments_f64 */
613
+ NK_PUBLIC void nk_reduce_moments_u1_skylake(nk_u1x8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
614
+ /** @copydoc nk_reduce_minmax_f64 */
615
+ NK_PUBLIC void nk_reduce_minmax_f32_skylake(nk_f32_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_size_t *, nk_f32_t *,
616
+ nk_size_t *);
617
+ /** @copydoc nk_reduce_minmax_f64 */
618
+ NK_PUBLIC void nk_reduce_minmax_f64_skylake(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_size_t *, nk_f64_t *,
619
+ nk_size_t *);
620
+ /** @copydoc nk_reduce_minmax_f64 */
621
+ NK_PUBLIC void nk_reduce_minmax_i8_skylake(nk_i8_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
622
+ nk_size_t *);
623
+ /** @copydoc nk_reduce_minmax_f64 */
624
+ NK_PUBLIC void nk_reduce_minmax_u8_skylake(nk_u8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
625
+ nk_size_t *);
626
+ /** @copydoc nk_reduce_minmax_f64 */
627
+ NK_PUBLIC void nk_reduce_minmax_i16_skylake(nk_i16_t const *, nk_size_t, nk_size_t, nk_i16_t *, nk_size_t *, nk_i16_t *,
628
+ nk_size_t *);
629
+ /** @copydoc nk_reduce_minmax_f64 */
630
+ NK_PUBLIC void nk_reduce_minmax_u16_skylake(nk_u16_t const *, nk_size_t, nk_size_t, nk_u16_t *, nk_size_t *, nk_u16_t *,
631
+ nk_size_t *);
632
+ /** @copydoc nk_reduce_minmax_f64 */
633
+ NK_PUBLIC void nk_reduce_minmax_i32_skylake(nk_i32_t const *, nk_size_t, nk_size_t, nk_i32_t *, nk_size_t *, nk_i32_t *,
634
+ nk_size_t *);
635
+ /** @copydoc nk_reduce_minmax_f64 */
636
+ NK_PUBLIC void nk_reduce_minmax_u32_skylake(nk_u32_t const *, nk_size_t, nk_size_t, nk_u32_t *, nk_size_t *, nk_u32_t *,
637
+ nk_size_t *);
638
+ /** @copydoc nk_reduce_minmax_f64 */
639
+ NK_PUBLIC void nk_reduce_minmax_i64_skylake(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_size_t *, nk_i64_t *,
640
+ nk_size_t *);
641
+ /** @copydoc nk_reduce_minmax_f64 */
642
+ NK_PUBLIC void nk_reduce_minmax_u64_skylake(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_size_t *, nk_u64_t *,
643
+ nk_size_t *);
644
+ /** @copydoc nk_reduce_minmax_f64 */
645
+ NK_PUBLIC void nk_reduce_minmax_f16_skylake(nk_f16_t const *, nk_size_t, nk_size_t, nk_f16_t *, nk_size_t *, nk_f16_t *,
646
+ nk_size_t *);
647
+ /** @copydoc nk_reduce_minmax_f64 */
648
+ NK_PUBLIC void nk_reduce_minmax_bf16_skylake(nk_bf16_t const *, nk_size_t, nk_size_t, nk_bf16_t *, nk_size_t *,
649
+ nk_bf16_t *, nk_size_t *);
650
+ /** @copydoc nk_reduce_minmax_f64 */
651
+ NK_PUBLIC void nk_reduce_minmax_e4m3_skylake(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *,
652
+ nk_e4m3_t *, nk_size_t *);
653
+ /** @copydoc nk_reduce_minmax_f64 */
654
+ NK_PUBLIC void nk_reduce_minmax_e5m2_skylake(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *,
655
+ nk_e5m2_t *, nk_size_t *);
656
+ /** @copydoc nk_reduce_minmax_f64 */
657
+ NK_PUBLIC void nk_reduce_minmax_e2m3_skylake(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_e2m3_t *, nk_size_t *,
658
+ nk_e2m3_t *, nk_size_t *);
659
+ /** @copydoc nk_reduce_minmax_f64 */
660
+ NK_PUBLIC void nk_reduce_minmax_e3m2_skylake(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_e3m2_t *, nk_size_t *,
661
+ nk_e3m2_t *, nk_size_t *);
662
+ #endif // NK_TARGET_SKYLAKE
663
+
664
+ #if NK_TARGET_ICELAKE // _icelake: moments only, for i8, u8, i16, e2m3 and e3m2
665
+ /** @copydoc nk_reduce_moments_f64 */
666
+ NK_PUBLIC void nk_reduce_moments_i8_icelake(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
667
+ /** @copydoc nk_reduce_moments_f64 */
668
+ NK_PUBLIC void nk_reduce_moments_u8_icelake(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
669
+ /** @copydoc nk_reduce_moments_f64 */
670
+ NK_PUBLIC void nk_reduce_moments_i16_icelake(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
671
+ /** @copydoc nk_reduce_moments_f64 */
672
+ NK_PUBLIC void nk_reduce_moments_e2m3_icelake(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
673
+ /** @copydoc nk_reduce_moments_f64 */
674
+ NK_PUBLIC void nk_reduce_moments_e3m2_icelake(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
675
+ #endif // NK_TARGET_ICELAKE
676
+
677
+ #if NK_TARGET_GENOA // _genoa: moments only, for bf16, e4m3 and e5m2
678
+ /** @copydoc nk_reduce_moments_f64 */
679
+ NK_PUBLIC void nk_reduce_moments_bf16_genoa(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
680
+ /** @copydoc nk_reduce_moments_f64 */
681
+ NK_PUBLIC void nk_reduce_moments_e4m3_genoa(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
682
+ /** @copydoc nk_reduce_moments_f64 */
683
+ NK_PUBLIC void nk_reduce_moments_e5m2_genoa(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
684
+ #endif // NK_TARGET_GENOA
685
+
686
+ #if NK_TARGET_ALDER // _alder: moments only, for u8, i16, u16, e3m2 and e2m3
687
+ /** @copydoc nk_reduce_moments_f64 */
688
+ NK_PUBLIC void nk_reduce_moments_u8_alder(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
689
+ /** @copydoc nk_reduce_moments_f64 */
690
+ NK_PUBLIC void nk_reduce_moments_i16_alder(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
691
+ /** @copydoc nk_reduce_moments_f64 */
692
+ NK_PUBLIC void nk_reduce_moments_u16_alder(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
693
+ /** @copydoc nk_reduce_moments_f64 */
694
+ NK_PUBLIC void nk_reduce_moments_e3m2_alder(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
695
+ /** @copydoc nk_reduce_moments_f64 */
696
+ NK_PUBLIC void nk_reduce_moments_e2m3_alder(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
697
+ #endif // NK_TARGET_ALDER
698
+ #if NK_TARGET_SIERRA // _sierra: moments only, for i8, u8 and e2m3
699
+ /** @copydoc nk_reduce_moments_f64 */
700
+ NK_PUBLIC void nk_reduce_moments_i8_sierra(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
701
+ /** @copydoc nk_reduce_moments_f64 */
702
+ NK_PUBLIC void nk_reduce_moments_u8_sierra(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
703
+ /** @copydoc nk_reduce_moments_f64 */
704
+ NK_PUBLIC void nk_reduce_moments_e2m3_sierra(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
705
+ #endif // NK_TARGET_SIERRA
706
+
707
+ #if NK_TARGET_RVV // _rvv: moments + minmax for f32/f64, i8..u64, f16/bf16, e4m3/e5m2/e2m3/e3m2; no i4/u4/u1
708
+ /** @copydoc nk_reduce_moments_f64 */
709
+ NK_PUBLIC void nk_reduce_moments_f32_rvv(nk_f32_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
710
+ /** @copydoc nk_reduce_moments_f64 */
711
+ NK_PUBLIC void nk_reduce_moments_f64_rvv(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
712
+ /** @copydoc nk_reduce_moments_f64 */
713
+ NK_PUBLIC void nk_reduce_moments_i8_rvv(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
714
+ /** @copydoc nk_reduce_moments_f64 */
715
+ NK_PUBLIC void nk_reduce_moments_u8_rvv(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
716
+ /** @copydoc nk_reduce_moments_f64 */
717
+ NK_PUBLIC void nk_reduce_moments_i16_rvv(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
718
+ /** @copydoc nk_reduce_moments_f64 */
719
+ NK_PUBLIC void nk_reduce_moments_u16_rvv(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
720
+ /** @copydoc nk_reduce_moments_f64 */
721
+ NK_PUBLIC void nk_reduce_moments_i32_rvv(nk_i32_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
722
+ /** @copydoc nk_reduce_moments_f64 */
723
+ NK_PUBLIC void nk_reduce_moments_u32_rvv(nk_u32_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
724
+ /** @copydoc nk_reduce_moments_f64 */
725
+ NK_PUBLIC void nk_reduce_moments_i64_rvv(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
726
+ /** @copydoc nk_reduce_moments_f64 */
727
+ NK_PUBLIC void nk_reduce_moments_u64_rvv(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
728
+ /** @copydoc nk_reduce_moments_f64 */
729
+ NK_PUBLIC void nk_reduce_moments_f16_rvv(nk_f16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
730
+ /** @copydoc nk_reduce_moments_f64 */
731
+ NK_PUBLIC void nk_reduce_moments_bf16_rvv(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
732
+ /** @copydoc nk_reduce_moments_f64 */
733
+ NK_PUBLIC void nk_reduce_moments_e4m3_rvv(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
734
+ /** @copydoc nk_reduce_moments_f64 */
735
+ NK_PUBLIC void nk_reduce_moments_e5m2_rvv(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
736
+ /** @copydoc nk_reduce_moments_f64 */
737
+ NK_PUBLIC void nk_reduce_moments_e2m3_rvv(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
738
+ /** @copydoc nk_reduce_moments_f64 */
739
+ NK_PUBLIC void nk_reduce_moments_e3m2_rvv(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
740
+ /** @copydoc nk_reduce_minmax_f64 */
741
+ NK_PUBLIC void nk_reduce_minmax_f32_rvv(nk_f32_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_size_t *, nk_f32_t *,
742
+ nk_size_t *);
743
+ /** @copydoc nk_reduce_minmax_f64 */
744
+ NK_PUBLIC void nk_reduce_minmax_f64_rvv(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_size_t *, nk_f64_t *,
745
+ nk_size_t *);
746
+ /** @copydoc nk_reduce_minmax_f64 */
747
+ NK_PUBLIC void nk_reduce_minmax_i8_rvv(nk_i8_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
748
+ nk_size_t *);
749
+ /** @copydoc nk_reduce_minmax_f64 */
750
+ NK_PUBLIC void nk_reduce_minmax_u8_rvv(nk_u8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
751
+ nk_size_t *);
752
+ /** @copydoc nk_reduce_minmax_f64 */
753
+ NK_PUBLIC void nk_reduce_minmax_i16_rvv(nk_i16_t const *, nk_size_t, nk_size_t, nk_i16_t *, nk_size_t *, nk_i16_t *,
754
+ nk_size_t *);
755
+ /** @copydoc nk_reduce_minmax_f64 */
756
+ NK_PUBLIC void nk_reduce_minmax_u16_rvv(nk_u16_t const *, nk_size_t, nk_size_t, nk_u16_t *, nk_size_t *, nk_u16_t *,
757
+ nk_size_t *);
758
+ /** @copydoc nk_reduce_minmax_f64 */
759
+ NK_PUBLIC void nk_reduce_minmax_i32_rvv(nk_i32_t const *, nk_size_t, nk_size_t, nk_i32_t *, nk_size_t *, nk_i32_t *,
760
+ nk_size_t *);
761
+ /** @copydoc nk_reduce_minmax_f64 */
762
+ NK_PUBLIC void nk_reduce_minmax_u32_rvv(nk_u32_t const *, nk_size_t, nk_size_t, nk_u32_t *, nk_size_t *, nk_u32_t *,
763
+ nk_size_t *);
764
+ /** @copydoc nk_reduce_minmax_f64 */
765
+ NK_PUBLIC void nk_reduce_minmax_i64_rvv(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_size_t *, nk_i64_t *,
766
+ nk_size_t *);
767
+ /** @copydoc nk_reduce_minmax_f64 */
768
+ NK_PUBLIC void nk_reduce_minmax_u64_rvv(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_size_t *, nk_u64_t *,
769
+ nk_size_t *);
770
+ /** @copydoc nk_reduce_minmax_f64 */
771
+ NK_PUBLIC void nk_reduce_minmax_f16_rvv(nk_f16_t const *, nk_size_t, nk_size_t, nk_f16_t *, nk_size_t *, nk_f16_t *,
772
+ nk_size_t *);
773
+ /** @copydoc nk_reduce_minmax_f64 */
774
+ NK_PUBLIC void nk_reduce_minmax_bf16_rvv(nk_bf16_t const *, nk_size_t, nk_size_t, nk_bf16_t *, nk_size_t *, nk_bf16_t *,
775
+ nk_size_t *);
776
+ /** @copydoc nk_reduce_minmax_f64 */
777
+ NK_PUBLIC void nk_reduce_minmax_e4m3_rvv(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *, nk_e4m3_t *,
778
+ nk_size_t *);
779
+ /** @copydoc nk_reduce_minmax_f64 */
780
+ NK_PUBLIC void nk_reduce_minmax_e5m2_rvv(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *, nk_e5m2_t *,
781
+ nk_size_t *);
782
+ /** @copydoc nk_reduce_minmax_f64 */
783
+ NK_PUBLIC void nk_reduce_minmax_e2m3_rvv(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_e2m3_t *, nk_size_t *, nk_e2m3_t *,
784
+ nk_size_t *);
785
+ /** @copydoc nk_reduce_minmax_f64 */
786
+ NK_PUBLIC void nk_reduce_minmax_e3m2_rvv(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_e3m2_t *, nk_size_t *, nk_e3m2_t *,
787
+ nk_size_t *);
788
+ #endif // NK_TARGET_RVV
789
+
790
/*
 * Prototypes of the `_v128relaxed` reduction kernels, declared only when NK_TARGET_V128RELAXED is non-zero.
 *
 * Parameter names are omitted in this group. The moments kernels take (input, size, size, out, out) and the
 * minmax kernels take (input, size, size, value out, size out, value out, size out); the dispatchers defined
 * later in this file pass them as (d, n, s, sum, sumsq) and (d, n, s, mn, mi, mx, xi) respectively.
 * Moments output types by input: f32/f64 write nk_f64_t; signed ints write nk_i64_t + nk_u64_t;
 * unsigned ints write nk_u64_t + nk_u64_t; f16/bf16/e4m3/e5m2/e2m3/e3m2 write nk_f32_t.
 */
+ #if NK_TARGET_V128RELAXED
791
+ /** @copydoc nk_reduce_moments_f64 */
792
+ NK_PUBLIC void nk_reduce_moments_f32_v128relaxed(nk_f32_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
793
+ /** @copydoc nk_reduce_moments_f64 */
794
+ NK_PUBLIC void nk_reduce_moments_f64_v128relaxed(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
795
+ /** @copydoc nk_reduce_moments_f64 */
796
+ NK_PUBLIC void nk_reduce_moments_i8_v128relaxed(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
797
+ /** @copydoc nk_reduce_moments_f64 */
798
+ NK_PUBLIC void nk_reduce_moments_u8_v128relaxed(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
799
+ /** @copydoc nk_reduce_moments_f64 */
800
+ NK_PUBLIC void nk_reduce_moments_i16_v128relaxed(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
801
+ /** @copydoc nk_reduce_moments_f64 */
802
+ NK_PUBLIC void nk_reduce_moments_u16_v128relaxed(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
803
+ /** @copydoc nk_reduce_moments_f64 */
804
+ NK_PUBLIC void nk_reduce_moments_i32_v128relaxed(nk_i32_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
805
+ /** @copydoc nk_reduce_moments_f64 */
806
+ NK_PUBLIC void nk_reduce_moments_u32_v128relaxed(nk_u32_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
807
+ /** @copydoc nk_reduce_moments_f64 */
808
+ NK_PUBLIC void nk_reduce_moments_i64_v128relaxed(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
809
+ /** @copydoc nk_reduce_moments_f64 */
810
+ NK_PUBLIC void nk_reduce_moments_u64_v128relaxed(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
811
+ /** @copydoc nk_reduce_moments_f64 */
812
+ NK_PUBLIC void nk_reduce_moments_f16_v128relaxed(nk_f16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
813
+ /** @copydoc nk_reduce_moments_f64 */
814
+ NK_PUBLIC void nk_reduce_moments_bf16_v128relaxed(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
815
+ /** @copydoc nk_reduce_moments_f64 */
816
+ NK_PUBLIC void nk_reduce_moments_e4m3_v128relaxed(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
817
+ /** @copydoc nk_reduce_moments_f64 */
818
+ NK_PUBLIC void nk_reduce_moments_e5m2_v128relaxed(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
819
+ /** @copydoc nk_reduce_moments_f64 */
820
+ NK_PUBLIC void nk_reduce_moments_e2m3_v128relaxed(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
821
+ /** @copydoc nk_reduce_moments_f64 */
822
+ NK_PUBLIC void nk_reduce_moments_e3m2_v128relaxed(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
823
+ /** @copydoc nk_reduce_minmax_f64 */
824
+ NK_PUBLIC void nk_reduce_minmax_f32_v128relaxed(nk_f32_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_size_t *,
825
+ nk_f32_t *, nk_size_t *);
826
+ /** @copydoc nk_reduce_minmax_f64 */
827
+ NK_PUBLIC void nk_reduce_minmax_f64_v128relaxed(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_size_t *,
828
+ nk_f64_t *, nk_size_t *);
829
+ /** @copydoc nk_reduce_minmax_f64 */
830
+ NK_PUBLIC void nk_reduce_minmax_i8_v128relaxed(nk_i8_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
831
+ nk_size_t *);
832
+ /** @copydoc nk_reduce_minmax_f64 */
833
+ NK_PUBLIC void nk_reduce_minmax_u8_v128relaxed(nk_u8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
834
+ nk_size_t *);
835
+ /** @copydoc nk_reduce_minmax_f64 */
836
+ NK_PUBLIC void nk_reduce_minmax_i16_v128relaxed(nk_i16_t const *, nk_size_t, nk_size_t, nk_i16_t *, nk_size_t *,
837
+ nk_i16_t *, nk_size_t *);
838
+ /** @copydoc nk_reduce_minmax_f64 */
839
+ NK_PUBLIC void nk_reduce_minmax_u16_v128relaxed(nk_u16_t const *, nk_size_t, nk_size_t, nk_u16_t *, nk_size_t *,
840
+ nk_u16_t *, nk_size_t *);
841
+ /** @copydoc nk_reduce_minmax_f64 */
842
+ NK_PUBLIC void nk_reduce_minmax_i32_v128relaxed(nk_i32_t const *, nk_size_t, nk_size_t, nk_i32_t *, nk_size_t *,
843
+ nk_i32_t *, nk_size_t *);
844
+ /** @copydoc nk_reduce_minmax_f64 */
845
+ NK_PUBLIC void nk_reduce_minmax_u32_v128relaxed(nk_u32_t const *, nk_size_t, nk_size_t, nk_u32_t *, nk_size_t *,
846
+ nk_u32_t *, nk_size_t *);
847
+ /** @copydoc nk_reduce_minmax_f64 */
848
+ NK_PUBLIC void nk_reduce_minmax_i64_v128relaxed(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_size_t *,
849
+ nk_i64_t *, nk_size_t *);
850
+ /** @copydoc nk_reduce_minmax_f64 */
851
+ NK_PUBLIC void nk_reduce_minmax_u64_v128relaxed(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_size_t *,
852
+ nk_u64_t *, nk_size_t *);
853
+ /** @copydoc nk_reduce_minmax_f64 */
854
+ NK_PUBLIC void nk_reduce_minmax_f16_v128relaxed(nk_f16_t const *, nk_size_t, nk_size_t, nk_f16_t *, nk_size_t *,
855
+ nk_f16_t *, nk_size_t *);
856
+ /** @copydoc nk_reduce_minmax_f64 */
857
+ NK_PUBLIC void nk_reduce_minmax_bf16_v128relaxed(nk_bf16_t const *, nk_size_t, nk_size_t, nk_bf16_t *, nk_size_t *,
858
+ nk_bf16_t *, nk_size_t *);
859
+ /** @copydoc nk_reduce_minmax_f64 */
860
+ NK_PUBLIC void nk_reduce_minmax_e4m3_v128relaxed(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *,
861
+ nk_e4m3_t *, nk_size_t *);
862
+ /** @copydoc nk_reduce_minmax_f64 */
863
+ NK_PUBLIC void nk_reduce_minmax_e5m2_v128relaxed(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *,
864
+ nk_e5m2_t *, nk_size_t *);
865
+ /** @copydoc nk_reduce_minmax_f64 */
866
+ NK_PUBLIC void nk_reduce_minmax_e2m3_v128relaxed(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_e2m3_t *, nk_size_t *,
867
+ nk_e2m3_t *, nk_size_t *);
868
+ /** @copydoc nk_reduce_minmax_f64 */
869
+ NK_PUBLIC void nk_reduce_minmax_e3m2_v128relaxed(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_e3m2_t *, nk_size_t *,
870
+ nk_e3m2_t *, nk_size_t *);
871
+ #endif // NK_TARGET_V128RELAXED
872
+
873
+ /**
874
+ * @brief Returns the accumulator dtype for the `sum` output of reduce_moments.
875
+ *
876
+ * Float types accumulate into wider floats; signed ints into i64; unsigned ints into u64.
+ * Concretely: f32 -> f64; f16, bf16, e4m3, e5m2, e2m3, e3m2 -> f32; f64 is not widened and stays f64.
+ * The sub-byte i4 follows the signed ints (i64); u4 and u1 follow the unsigned ints (u64).
+ *
+ * @param dtype Dtype tag to look up.
+ * @return The accumulator dtype, or nk_dtype_unknown_k for any dtype not listed in the switch.
877
+ */
878
+ NK_INTERNAL nk_dtype_t nk_reduce_moments_sum_dtype(nk_dtype_t dtype) {
879
+ switch (dtype) {
880
+ case nk_f64_k: return nk_f64_k;
881
+ case nk_f32_k: return nk_f64_k;
882
+ case nk_f16_k: return nk_f32_k;
883
+ case nk_bf16_k: return nk_f32_k;
884
+ case nk_e4m3_k: return nk_f32_k;
885
+ case nk_e5m2_k: return nk_f32_k;
886
+ case nk_e2m3_k: return nk_f32_k;
887
+ case nk_e3m2_k: return nk_f32_k;
888
+ case nk_i8_k: return nk_i64_k;
889
+ case nk_i16_k: return nk_i64_k;
890
+ case nk_i32_k: return nk_i64_k;
891
+ case nk_i64_k: return nk_i64_k;
892
+ case nk_i4_k: return nk_i64_k;
893
+ case nk_u8_k: return nk_u64_k;
894
+ case nk_u16_k: return nk_u64_k;
895
+ case nk_u32_k: return nk_u64_k;
896
+ case nk_u64_k: return nk_u64_k;
897
+ case nk_u4_k: return nk_u64_k;
898
+ case nk_u1_k: return nk_u64_k;
899
+ default: return nk_dtype_unknown_k;
900
+ }
901
+ }
902
+
903
+ /**
904
+ * @brief Returns the accumulator dtype for the `sumsq` output of reduce_moments.
905
+ *
906
+ * Same as sum except all integers (signed and unsigned) accumulate into u64.
+ * Float mapping: f64 -> f64; f32 -> f64; f16, bf16, e4m3, e5m2, e2m3, e3m2 -> f32.
+ * Integer mapping: i8, i16, i32, i64, i4, u8, u16, u32, u64, u4 and u1 all -> u64.
+ *
+ * @param dtype Dtype tag to look up.
+ * @return The accumulator dtype, or nk_dtype_unknown_k for any dtype not listed in the switch.
907
+ */
908
+ NK_INTERNAL nk_dtype_t nk_reduce_moments_sumsq_dtype(nk_dtype_t dtype) {
909
+ switch (dtype) {
910
+ case nk_f64_k: return nk_f64_k;
911
+ case nk_f32_k: return nk_f64_k;
912
+ case nk_f16_k: return nk_f32_k;
913
+ case nk_bf16_k: return nk_f32_k;
914
+ case nk_e4m3_k: return nk_f32_k;
915
+ case nk_e5m2_k: return nk_f32_k;
916
+ case nk_e2m3_k: return nk_f32_k;
917
+ case nk_e3m2_k: return nk_f32_k;
918
+ case nk_i8_k: return nk_u64_k;
919
+ case nk_i16_k: return nk_u64_k;
920
+ case nk_i32_k: return nk_u64_k;
921
+ case nk_i64_k: return nk_u64_k;
922
+ case nk_i4_k: return nk_u64_k;
923
+ case nk_u8_k: return nk_u64_k;
924
+ case nk_u16_k: return nk_u64_k;
925
+ case nk_u32_k: return nk_u64_k;
926
+ case nk_u64_k: return nk_u64_k;
927
+ case nk_u4_k: return nk_u64_k;
928
+ case nk_u1_k: return nk_u64_k;
929
+ default: return nk_dtype_unknown_k;
930
+ }
931
+ }
932
+
933
+ /**
934
+ * @brief Returns the value dtype for reduce_minmax outputs.
935
+ *
936
+ * Standard types return themselves. Sub-byte types widen: i4->i8, u4->u8, u1->u8.
+ *
+ * Unlike nk_reduce_moments_sum_dtype and nk_reduce_moments_sumsq_dtype, the default branch does not
+ * return nk_dtype_unknown_k: any dtype other than the three sub-byte ones is returned unchanged.
+ *
+ * @param dtype Dtype tag to look up.
+ * @return nk_i8_k for i4, nk_u8_k for u4 and u1, otherwise `dtype` itself.
937
+ */
938
+ NK_INTERNAL nk_dtype_t nk_reduce_minmax_value_dtype(nk_dtype_t dtype) {
939
+ switch (dtype) {
940
+ case nk_i4_k: return nk_i8_k;
941
+ case nk_u4_k: return nk_u8_k;
942
+ case nk_u1_k: return nk_u8_k;
943
+ default: return dtype;
944
+ }
945
+ }
946
+
947
+ #ifdef __cplusplus
948
+ } // extern "C"
949
+ #endif
950
+
951
+ #include "numkong/reduce/serial.h"
952
+ #include "numkong/reduce/neon.h"
953
+ #include "numkong/reduce/neonhalf.h"
954
+ #include "numkong/reduce/neonbfdot.h"
955
+ #include "numkong/reduce/neonsdot.h"
956
+ #include "numkong/reduce/neonfhm.h"
957
+ #include "numkong/reduce/haswell.h"
958
+ #include "numkong/reduce/skylake.h"
959
+ #include "numkong/reduce/icelake.h"
960
+ #include "numkong/reduce/genoa.h"
961
+ #include "numkong/reduce/alder.h"
962
+ #include "numkong/reduce/sierra.h"
963
+ #include "numkong/reduce/rvv.h"
964
+ #include "numkong/reduce/v128relaxed.h"
965
+
966
+ #ifdef __cplusplus
967
+ extern "C" {
968
+ #endif
969
+
970
+ #if !NK_DYNAMIC_DISPATCH
971
+
972
/**
 * @brief Static entry point for the f32 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `sum` and `sumsq` both point to nk_f64_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_f32(nk_f32_t const *d, nk_size_t n, nk_size_t s, nk_f64_t *sum, nk_f64_t *sumsq) {
973
+ #if NK_TARGET_SKYLAKE
974
+ nk_reduce_moments_f32_skylake(d, n, s, sum, sumsq);
975
+ #elif NK_TARGET_HASWELL
976
+ nk_reduce_moments_f32_haswell(d, n, s, sum, sumsq);
977
+ #elif NK_TARGET_NEON
978
+ nk_reduce_moments_f32_neon(d, n, s, sum, sumsq);
979
+ #elif NK_TARGET_RVV
980
+ nk_reduce_moments_f32_rvv(d, n, s, sum, sumsq);
981
+ #elif NK_TARGET_V128RELAXED
982
+ nk_reduce_moments_f32_v128relaxed(d, n, s, sum, sumsq);
983
+ #else
984
+ nk_reduce_moments_f32_serial(d, n, s, sum, sumsq);
985
+ #endif
986
+ }
987
+
988
/**
 * @brief Static entry point for the f32 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_f32_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_f32(nk_f32_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *mn, nk_size_t *mi,
989
+ nk_f32_t *mx, nk_size_t *xi) {
990
+ #if NK_TARGET_SKYLAKE
991
+ nk_reduce_minmax_f32_skylake(d, n, s, mn, mi, mx, xi);
992
+ #elif NK_TARGET_HASWELL
993
+ nk_reduce_minmax_f32_haswell(d, n, s, mn, mi, mx, xi);
994
+ #elif NK_TARGET_NEON
995
+ nk_reduce_minmax_f32_neon(d, n, s, mn, mi, mx, xi);
996
+ #elif NK_TARGET_RVV
997
+ nk_reduce_minmax_f32_rvv(d, n, s, mn, mi, mx, xi);
998
+ #elif NK_TARGET_V128RELAXED
999
+ nk_reduce_minmax_f32_v128relaxed(d, n, s, mn, mi, mx, xi);
1000
+ #else
1001
+ nk_reduce_minmax_f32_serial(d, n, s, mn, mi, mx, xi);
1002
+ #endif
1003
+ }
1004
+
1005
/**
 * @brief Static entry point for the f64 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `sum` and `sumsq` both point to nk_f64_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_f64(nk_f64_t const *d, nk_size_t n, nk_size_t s, nk_f64_t *sum, nk_f64_t *sumsq) {
1006
+ #if NK_TARGET_SKYLAKE
1007
+ nk_reduce_moments_f64_skylake(d, n, s, sum, sumsq);
1008
+ #elif NK_TARGET_HASWELL
1009
+ nk_reduce_moments_f64_haswell(d, n, s, sum, sumsq);
1010
+ #elif NK_TARGET_NEON
1011
+ nk_reduce_moments_f64_neon(d, n, s, sum, sumsq);
1012
+ #elif NK_TARGET_RVV
1013
+ nk_reduce_moments_f64_rvv(d, n, s, sum, sumsq);
1014
+ #elif NK_TARGET_V128RELAXED
1015
+ nk_reduce_moments_f64_v128relaxed(d, n, s, sum, sumsq);
1016
+ #else
1017
+ nk_reduce_moments_f64_serial(d, n, s, sum, sumsq);
1018
+ #endif
1019
+ }
1020
+
1021
/**
 * @brief Static entry point for the f64 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_f64_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_f64(nk_f64_t const *d, nk_size_t n, nk_size_t s, nk_f64_t *mn, nk_size_t *mi,
1022
+ nk_f64_t *mx, nk_size_t *xi) {
1023
+ #if NK_TARGET_SKYLAKE
1024
+ nk_reduce_minmax_f64_skylake(d, n, s, mn, mi, mx, xi);
1025
+ #elif NK_TARGET_HASWELL
1026
+ nk_reduce_minmax_f64_haswell(d, n, s, mn, mi, mx, xi);
1027
+ #elif NK_TARGET_NEON
1028
+ nk_reduce_minmax_f64_neon(d, n, s, mn, mi, mx, xi);
1029
+ #elif NK_TARGET_RVV
1030
+ nk_reduce_minmax_f64_rvv(d, n, s, mn, mi, mx, xi);
1031
+ #elif NK_TARGET_V128RELAXED
1032
+ nk_reduce_minmax_f64_v128relaxed(d, n, s, mn, mi, mx, xi);
1033
+ #else
1034
+ nk_reduce_minmax_f64_serial(d, n, s, mn, mi, mx, xi);
1035
+ #endif
1036
+ }
1037
+
1038
/**
 * @brief Static entry point for the i8 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * ICELAKE, SKYLAKE, SIERRA, HASWELL, NEONSDOT, NEON, RVV, V128RELAXED; calls the serial kernel otherwise.
 * `sum` points to nk_i64_t and `sumsq` to nk_u64_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_i8(nk_i8_t const *d, nk_size_t n, nk_size_t s, nk_i64_t *sum, nk_u64_t *sumsq) {
1039
+ #if NK_TARGET_ICELAKE
1040
+ nk_reduce_moments_i8_icelake(d, n, s, sum, sumsq);
1041
+ #elif NK_TARGET_SKYLAKE
1042
+ nk_reduce_moments_i8_skylake(d, n, s, sum, sumsq);
1043
+ #elif NK_TARGET_SIERRA
1044
+ nk_reduce_moments_i8_sierra(d, n, s, sum, sumsq);
1045
+ #elif NK_TARGET_HASWELL
1046
+ nk_reduce_moments_i8_haswell(d, n, s, sum, sumsq);
1047
+ #elif NK_TARGET_NEONSDOT
1048
+ nk_reduce_moments_i8_neonsdot(d, n, s, sum, sumsq);
1049
+ #elif NK_TARGET_NEON
1050
+ nk_reduce_moments_i8_neon(d, n, s, sum, sumsq);
1051
+ #elif NK_TARGET_RVV
1052
+ nk_reduce_moments_i8_rvv(d, n, s, sum, sumsq);
1053
+ #elif NK_TARGET_V128RELAXED
1054
+ nk_reduce_moments_i8_v128relaxed(d, n, s, sum, sumsq);
1055
+ #else
1056
+ nk_reduce_moments_i8_serial(d, n, s, sum, sumsq);
1057
+ #endif
1058
+ }
1059
+
1060
/**
 * @brief Static entry point for the i8 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_i8_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_i8(nk_i8_t const *d, nk_size_t n, nk_size_t s, nk_i8_t *mn, nk_size_t *mi, nk_i8_t *mx,
1061
+ nk_size_t *xi) {
1062
+ #if NK_TARGET_SKYLAKE
1063
+ nk_reduce_minmax_i8_skylake(d, n, s, mn, mi, mx, xi);
1064
+ #elif NK_TARGET_HASWELL
1065
+ nk_reduce_minmax_i8_haswell(d, n, s, mn, mi, mx, xi);
1066
+ #elif NK_TARGET_NEON
1067
+ nk_reduce_minmax_i8_neon(d, n, s, mn, mi, mx, xi);
1068
+ #elif NK_TARGET_RVV
1069
+ nk_reduce_minmax_i8_rvv(d, n, s, mn, mi, mx, xi);
1070
+ #elif NK_TARGET_V128RELAXED
1071
+ nk_reduce_minmax_i8_v128relaxed(d, n, s, mn, mi, mx, xi);
1072
+ #else
1073
+ nk_reduce_minmax_i8_serial(d, n, s, mn, mi, mx, xi);
1074
+ #endif
1075
+ }
1076
+
1077
/**
 * @brief Static entry point for the u8 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * ICELAKE, SKYLAKE, SIERRA, ALDER, HASWELL, NEONSDOT, NEON, RVV, V128RELAXED; calls the serial kernel
 * otherwise. `sum` and `sumsq` both point to nk_u64_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_u8(nk_u8_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *sum, nk_u64_t *sumsq) {
1078
+ #if NK_TARGET_ICELAKE
1079
+ nk_reduce_moments_u8_icelake(d, n, s, sum, sumsq);
1080
+ #elif NK_TARGET_SKYLAKE
1081
+ nk_reduce_moments_u8_skylake(d, n, s, sum, sumsq);
1082
+ #elif NK_TARGET_SIERRA
1083
+ nk_reduce_moments_u8_sierra(d, n, s, sum, sumsq);
1084
+ #elif NK_TARGET_ALDER
1085
+ nk_reduce_moments_u8_alder(d, n, s, sum, sumsq);
1086
+ #elif NK_TARGET_HASWELL
1087
+ nk_reduce_moments_u8_haswell(d, n, s, sum, sumsq);
1088
+ #elif NK_TARGET_NEONSDOT
1089
+ nk_reduce_moments_u8_neonsdot(d, n, s, sum, sumsq);
1090
+ #elif NK_TARGET_NEON
1091
+ nk_reduce_moments_u8_neon(d, n, s, sum, sumsq);
1092
+ #elif NK_TARGET_RVV
1093
+ nk_reduce_moments_u8_rvv(d, n, s, sum, sumsq);
1094
+ #elif NK_TARGET_V128RELAXED
1095
+ nk_reduce_moments_u8_v128relaxed(d, n, s, sum, sumsq);
1096
+ #else
1097
+ nk_reduce_moments_u8_serial(d, n, s, sum, sumsq);
1098
+ #endif
1099
+ }
1100
+
1101
/**
 * @brief Static entry point for the u8 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_u8_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_u8(nk_u8_t const *d, nk_size_t n, nk_size_t s, nk_u8_t *mn, nk_size_t *mi, nk_u8_t *mx,
1102
+ nk_size_t *xi) {
1103
+ #if NK_TARGET_SKYLAKE
1104
+ nk_reduce_minmax_u8_skylake(d, n, s, mn, mi, mx, xi);
1105
+ #elif NK_TARGET_HASWELL
1106
+ nk_reduce_minmax_u8_haswell(d, n, s, mn, mi, mx, xi);
1107
+ #elif NK_TARGET_NEON
1108
+ nk_reduce_minmax_u8_neon(d, n, s, mn, mi, mx, xi);
1109
+ #elif NK_TARGET_RVV
1110
+ nk_reduce_minmax_u8_rvv(d, n, s, mn, mi, mx, xi);
1111
+ #elif NK_TARGET_V128RELAXED
1112
+ nk_reduce_minmax_u8_v128relaxed(d, n, s, mn, mi, mx, xi);
1113
+ #else
1114
+ nk_reduce_minmax_u8_serial(d, n, s, mn, mi, mx, xi);
1115
+ #endif
1116
+ }
1117
+
1118
/**
 * @brief Static entry point for the i16 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * ICELAKE, SKYLAKE, ALDER, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `sum` points to nk_i64_t and `sumsq` to nk_u64_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_i16(nk_i16_t const *d, nk_size_t n, nk_size_t s, nk_i64_t *sum, nk_u64_t *sumsq) {
1119
+ #if NK_TARGET_ICELAKE
1120
+ nk_reduce_moments_i16_icelake(d, n, s, sum, sumsq);
1121
+ #elif NK_TARGET_SKYLAKE
1122
+ nk_reduce_moments_i16_skylake(d, n, s, sum, sumsq);
1123
+ #elif NK_TARGET_ALDER
1124
+ nk_reduce_moments_i16_alder(d, n, s, sum, sumsq);
1125
+ #elif NK_TARGET_HASWELL
1126
+ nk_reduce_moments_i16_haswell(d, n, s, sum, sumsq);
1127
+ #elif NK_TARGET_NEON
1128
+ nk_reduce_moments_i16_neon(d, n, s, sum, sumsq);
1129
+ #elif NK_TARGET_RVV
1130
+ nk_reduce_moments_i16_rvv(d, n, s, sum, sumsq);
1131
+ #elif NK_TARGET_V128RELAXED
1132
+ nk_reduce_moments_i16_v128relaxed(d, n, s, sum, sumsq);
1133
+ #else
1134
+ nk_reduce_moments_i16_serial(d, n, s, sum, sumsq);
1135
+ #endif
1136
+ }
1137
+
1138
/**
 * @brief Static entry point for the i16 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_i16_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_i16(nk_i16_t const *d, nk_size_t n, nk_size_t s, nk_i16_t *mn, nk_size_t *mi,
1139
+ nk_i16_t *mx, nk_size_t *xi) {
1140
+ #if NK_TARGET_SKYLAKE
1141
+ nk_reduce_minmax_i16_skylake(d, n, s, mn, mi, mx, xi);
1142
+ #elif NK_TARGET_HASWELL
1143
+ nk_reduce_minmax_i16_haswell(d, n, s, mn, mi, mx, xi);
1144
+ #elif NK_TARGET_NEON
1145
+ nk_reduce_minmax_i16_neon(d, n, s, mn, mi, mx, xi);
1146
+ #elif NK_TARGET_RVV
1147
+ nk_reduce_minmax_i16_rvv(d, n, s, mn, mi, mx, xi);
1148
+ #elif NK_TARGET_V128RELAXED
1149
+ nk_reduce_minmax_i16_v128relaxed(d, n, s, mn, mi, mx, xi);
1150
+ #else
1151
+ nk_reduce_minmax_i16_serial(d, n, s, mn, mi, mx, xi);
1152
+ #endif
1153
+ }
1154
+
1155
/**
 * @brief Static entry point for the u16 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, ALDER, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `sum` and `sumsq` both point to nk_u64_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_u16(nk_u16_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *sum, nk_u64_t *sumsq) {
1156
+ #if NK_TARGET_SKYLAKE
1157
+ nk_reduce_moments_u16_skylake(d, n, s, sum, sumsq);
1158
+ #elif NK_TARGET_ALDER
1159
+ nk_reduce_moments_u16_alder(d, n, s, sum, sumsq);
1160
+ #elif NK_TARGET_HASWELL
1161
+ nk_reduce_moments_u16_haswell(d, n, s, sum, sumsq);
1162
+ #elif NK_TARGET_NEON
1163
+ nk_reduce_moments_u16_neon(d, n, s, sum, sumsq);
1164
+ #elif NK_TARGET_RVV
1165
+ nk_reduce_moments_u16_rvv(d, n, s, sum, sumsq);
1166
+ #elif NK_TARGET_V128RELAXED
1167
+ nk_reduce_moments_u16_v128relaxed(d, n, s, sum, sumsq);
1168
+ #else
1169
+ nk_reduce_moments_u16_serial(d, n, s, sum, sumsq);
1170
+ #endif
1171
+ }
1172
+
1173
/**
 * @brief Static entry point for the u16 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_u16_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_u16(nk_u16_t const *d, nk_size_t n, nk_size_t s, nk_u16_t *mn, nk_size_t *mi,
1174
+ nk_u16_t *mx, nk_size_t *xi) {
1175
+ #if NK_TARGET_SKYLAKE
1176
+ nk_reduce_minmax_u16_skylake(d, n, s, mn, mi, mx, xi);
1177
+ #elif NK_TARGET_HASWELL
1178
+ nk_reduce_minmax_u16_haswell(d, n, s, mn, mi, mx, xi);
1179
+ #elif NK_TARGET_NEON
1180
+ nk_reduce_minmax_u16_neon(d, n, s, mn, mi, mx, xi);
1181
+ #elif NK_TARGET_RVV
1182
+ nk_reduce_minmax_u16_rvv(d, n, s, mn, mi, mx, xi);
1183
+ #elif NK_TARGET_V128RELAXED
1184
+ nk_reduce_minmax_u16_v128relaxed(d, n, s, mn, mi, mx, xi);
1185
+ #else
1186
+ nk_reduce_minmax_u16_serial(d, n, s, mn, mi, mx, xi);
1187
+ #endif
1188
+ }
1189
+
1190
/**
 * @brief Static entry point for the i32 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `sum` points to nk_i64_t and `sumsq` to nk_u64_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_i32(nk_i32_t const *d, nk_size_t n, nk_size_t s, nk_i64_t *sum, nk_u64_t *sumsq) {
1191
+ #if NK_TARGET_SKYLAKE
1192
+ nk_reduce_moments_i32_skylake(d, n, s, sum, sumsq);
1193
+ #elif NK_TARGET_HASWELL
1194
+ nk_reduce_moments_i32_haswell(d, n, s, sum, sumsq);
1195
+ #elif NK_TARGET_NEON
1196
+ nk_reduce_moments_i32_neon(d, n, s, sum, sumsq);
1197
+ #elif NK_TARGET_RVV
1198
+ nk_reduce_moments_i32_rvv(d, n, s, sum, sumsq);
1199
+ #elif NK_TARGET_V128RELAXED
1200
+ nk_reduce_moments_i32_v128relaxed(d, n, s, sum, sumsq);
1201
+ #else
1202
+ nk_reduce_moments_i32_serial(d, n, s, sum, sumsq);
1203
+ #endif
1204
+ }
1205
+
1206
/**
 * @brief Static entry point for the i32 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_i32_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_i32(nk_i32_t const *d, nk_size_t n, nk_size_t s, nk_i32_t *mn, nk_size_t *mi,
1207
+ nk_i32_t *mx, nk_size_t *xi) {
1208
+ #if NK_TARGET_SKYLAKE
1209
+ nk_reduce_minmax_i32_skylake(d, n, s, mn, mi, mx, xi);
1210
+ #elif NK_TARGET_HASWELL
1211
+ nk_reduce_minmax_i32_haswell(d, n, s, mn, mi, mx, xi);
1212
+ #elif NK_TARGET_NEON
1213
+ nk_reduce_minmax_i32_neon(d, n, s, mn, mi, mx, xi);
1214
+ #elif NK_TARGET_RVV
1215
+ nk_reduce_minmax_i32_rvv(d, n, s, mn, mi, mx, xi);
1216
+ #elif NK_TARGET_V128RELAXED
1217
+ nk_reduce_minmax_i32_v128relaxed(d, n, s, mn, mi, mx, xi);
1218
+ #else
1219
+ nk_reduce_minmax_i32_serial(d, n, s, mn, mi, mx, xi);
1220
+ #endif
1221
+ }
1222
+
1223
/**
 * @brief Static entry point for the u32 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `sum` and `sumsq` both point to nk_u64_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_u32(nk_u32_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *sum, nk_u64_t *sumsq) {
1224
+ #if NK_TARGET_SKYLAKE
1225
+ nk_reduce_moments_u32_skylake(d, n, s, sum, sumsq);
1226
+ #elif NK_TARGET_HASWELL
1227
+ nk_reduce_moments_u32_haswell(d, n, s, sum, sumsq);
1228
+ #elif NK_TARGET_NEON
1229
+ nk_reduce_moments_u32_neon(d, n, s, sum, sumsq);
1230
+ #elif NK_TARGET_RVV
1231
+ nk_reduce_moments_u32_rvv(d, n, s, sum, sumsq);
1232
+ #elif NK_TARGET_V128RELAXED
1233
+ nk_reduce_moments_u32_v128relaxed(d, n, s, sum, sumsq);
1234
+ #else
1235
+ nk_reduce_moments_u32_serial(d, n, s, sum, sumsq);
1236
+ #endif
1237
+ }
1238
+
1239
/**
 * @brief Static entry point for the u32 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_u32_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_u32(nk_u32_t const *d, nk_size_t n, nk_size_t s, nk_u32_t *mn, nk_size_t *mi,
1240
+ nk_u32_t *mx, nk_size_t *xi) {
1241
+ #if NK_TARGET_SKYLAKE
1242
+ nk_reduce_minmax_u32_skylake(d, n, s, mn, mi, mx, xi);
1243
+ #elif NK_TARGET_HASWELL
1244
+ nk_reduce_minmax_u32_haswell(d, n, s, mn, mi, mx, xi);
1245
+ #elif NK_TARGET_NEON
1246
+ nk_reduce_minmax_u32_neon(d, n, s, mn, mi, mx, xi);
1247
+ #elif NK_TARGET_RVV
1248
+ nk_reduce_minmax_u32_rvv(d, n, s, mn, mi, mx, xi);
1249
+ #elif NK_TARGET_V128RELAXED
1250
+ nk_reduce_minmax_u32_v128relaxed(d, n, s, mn, mi, mx, xi);
1251
+ #else
1252
+ nk_reduce_minmax_u32_serial(d, n, s, mn, mi, mx, xi);
1253
+ #endif
1254
+ }
1255
+
1256
/**
 * @brief Static entry point for the i64 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `sum` points to nk_i64_t and `sumsq` to nk_u64_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_i64(nk_i64_t const *d, nk_size_t n, nk_size_t s, nk_i64_t *sum, nk_u64_t *sumsq) {
1257
+ #if NK_TARGET_SKYLAKE
1258
+ nk_reduce_moments_i64_skylake(d, n, s, sum, sumsq);
1259
+ #elif NK_TARGET_HASWELL
1260
+ nk_reduce_moments_i64_haswell(d, n, s, sum, sumsq);
1261
+ #elif NK_TARGET_NEON
1262
+ nk_reduce_moments_i64_neon(d, n, s, sum, sumsq);
1263
+ #elif NK_TARGET_RVV
1264
+ nk_reduce_moments_i64_rvv(d, n, s, sum, sumsq);
1265
+ #elif NK_TARGET_V128RELAXED
1266
+ nk_reduce_moments_i64_v128relaxed(d, n, s, sum, sumsq);
1267
+ #else
1268
+ nk_reduce_moments_i64_serial(d, n, s, sum, sumsq);
1269
+ #endif
1270
+ }
1271
+
1272
/**
 * @brief Static entry point for the i64 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_i64_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_i64(nk_i64_t const *d, nk_size_t n, nk_size_t s, nk_i64_t *mn, nk_size_t *mi,
1273
+ nk_i64_t *mx, nk_size_t *xi) {
1274
+ #if NK_TARGET_SKYLAKE
1275
+ nk_reduce_minmax_i64_skylake(d, n, s, mn, mi, mx, xi);
1276
+ #elif NK_TARGET_HASWELL
1277
+ nk_reduce_minmax_i64_haswell(d, n, s, mn, mi, mx, xi);
1278
+ #elif NK_TARGET_NEON
1279
+ nk_reduce_minmax_i64_neon(d, n, s, mn, mi, mx, xi);
1280
+ #elif NK_TARGET_RVV
1281
+ nk_reduce_minmax_i64_rvv(d, n, s, mn, mi, mx, xi);
1282
+ #elif NK_TARGET_V128RELAXED
1283
+ nk_reduce_minmax_i64_v128relaxed(d, n, s, mn, mi, mx, xi);
1284
+ #else
1285
+ nk_reduce_minmax_i64_serial(d, n, s, mn, mi, mx, xi);
1286
+ #endif
1287
+ }
1288
+
1289
/**
 * @brief Static entry point for the u64 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `sum` and `sumsq` both point to nk_u64_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_u64(nk_u64_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *sum, nk_u64_t *sumsq) {
1290
+ #if NK_TARGET_SKYLAKE
1291
+ nk_reduce_moments_u64_skylake(d, n, s, sum, sumsq);
1292
+ #elif NK_TARGET_HASWELL
1293
+ nk_reduce_moments_u64_haswell(d, n, s, sum, sumsq);
1294
+ #elif NK_TARGET_NEON
1295
+ nk_reduce_moments_u64_neon(d, n, s, sum, sumsq);
1296
+ #elif NK_TARGET_RVV
1297
+ nk_reduce_moments_u64_rvv(d, n, s, sum, sumsq);
1298
+ #elif NK_TARGET_V128RELAXED
1299
+ nk_reduce_moments_u64_v128relaxed(d, n, s, sum, sumsq);
1300
+ #else
1301
+ nk_reduce_moments_u64_serial(d, n, s, sum, sumsq);
1302
+ #endif
1303
+ }
1304
+
1305
/**
 * @brief Static entry point for the u64 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_u64_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_u64(nk_u64_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *mn, nk_size_t *mi,
1306
+ nk_u64_t *mx, nk_size_t *xi) {
1307
+ #if NK_TARGET_SKYLAKE
1308
+ nk_reduce_minmax_u64_skylake(d, n, s, mn, mi, mx, xi);
1309
+ #elif NK_TARGET_HASWELL
1310
+ nk_reduce_minmax_u64_haswell(d, n, s, mn, mi, mx, xi);
1311
+ #elif NK_TARGET_NEON
1312
+ nk_reduce_minmax_u64_neon(d, n, s, mn, mi, mx, xi);
1313
+ #elif NK_TARGET_RVV
1314
+ nk_reduce_minmax_u64_rvv(d, n, s, mn, mi, mx, xi);
1315
+ #elif NK_TARGET_V128RELAXED
1316
+ nk_reduce_minmax_u64_v128relaxed(d, n, s, mn, mi, mx, xi);
1317
+ #else
1318
+ nk_reduce_minmax_u64_serial(d, n, s, mn, mi, mx, xi);
1319
+ #endif
1320
+ }
1321
+
1322
/**
 * @brief Static entry point for the f16 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEONHALF, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `sum` and `sumsq` both point to nk_f32_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_f16(nk_f16_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *sum, nk_f32_t *sumsq) {
1323
+ #if NK_TARGET_SKYLAKE
1324
+ nk_reduce_moments_f16_skylake(d, n, s, sum, sumsq);
1325
+ #elif NK_TARGET_HASWELL
1326
+ nk_reduce_moments_f16_haswell(d, n, s, sum, sumsq);
1327
+ #elif NK_TARGET_NEONHALF
1328
+ nk_reduce_moments_f16_neonhalf(d, n, s, sum, sumsq);
1329
+ #elif NK_TARGET_RVV
1330
+ nk_reduce_moments_f16_rvv(d, n, s, sum, sumsq);
1331
+ #elif NK_TARGET_V128RELAXED
1332
+ nk_reduce_moments_f16_v128relaxed(d, n, s, sum, sumsq);
1333
+ #else
1334
+ nk_reduce_moments_f16_serial(d, n, s, sum, sumsq);
1335
+ #endif
1336
+ }
1337
+
1338
/**
 * @brief Static entry point for the f16 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEONHALF, RVV, V128RELAXED; calls the serial kernel when none is set.
 * The NEONHALF branch also calls the serial kernel, so it only differs from `#else` by taking priority
 * over the RVV and V128RELAXED branches.
 * `mn`/`mx` point to nk_f16_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_f16(nk_f16_t const *d, nk_size_t n, nk_size_t s, nk_f16_t *mn, nk_size_t *mi,
1339
+ nk_f16_t *mx, nk_size_t *xi) {
1340
+ #if NK_TARGET_SKYLAKE
1341
+ nk_reduce_minmax_f16_skylake(d, n, s, mn, mi, mx, xi);
1342
+ #elif NK_TARGET_HASWELL
1343
+ nk_reduce_minmax_f16_haswell(d, n, s, mn, mi, mx, xi);
1344
+ #elif NK_TARGET_NEONHALF
1345
+ nk_reduce_minmax_f16_serial(d, n, s, mn, mi, mx, xi); // NOTE(review): serial kernel under NEONHALF, whereas nk_reduce_moments_f16 calls its _neonhalf kernel — confirm this is intentional
1346
+ #elif NK_TARGET_RVV
1347
+ nk_reduce_minmax_f16_rvv(d, n, s, mn, mi, mx, xi);
1348
+ #elif NK_TARGET_V128RELAXED
1349
+ nk_reduce_minmax_f16_v128relaxed(d, n, s, mn, mi, mx, xi);
1350
+ #else
1351
+ nk_reduce_minmax_f16_serial(d, n, s, mn, mi, mx, xi);
1352
+ #endif
1353
+ }
1354
+
1355
/**
 * @brief Static entry point for the bf16 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * GENOA, SKYLAKE, HASWELL, NEONBFDOT, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `sum` and `sumsq` both point to nk_f32_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_bf16(nk_bf16_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *sum, nk_f32_t *sumsq) {
1356
+ #if NK_TARGET_GENOA
1357
+ nk_reduce_moments_bf16_genoa(d, n, s, sum, sumsq);
1358
+ #elif NK_TARGET_SKYLAKE
1359
+ nk_reduce_moments_bf16_skylake(d, n, s, sum, sumsq);
1360
+ #elif NK_TARGET_HASWELL
1361
+ nk_reduce_moments_bf16_haswell(d, n, s, sum, sumsq);
1362
+ #elif NK_TARGET_NEONBFDOT
1363
+ nk_reduce_moments_bf16_neonbfdot(d, n, s, sum, sumsq);
1364
+ #elif NK_TARGET_RVV
1365
+ nk_reduce_moments_bf16_rvv(d, n, s, sum, sumsq);
1366
+ #elif NK_TARGET_V128RELAXED
1367
+ nk_reduce_moments_bf16_v128relaxed(d, n, s, sum, sumsq);
1368
+ #else
1369
+ nk_reduce_moments_bf16_serial(d, n, s, sum, sumsq);
1370
+ #endif
1371
+ }
1372
+
1373
/**
 * @brief Static entry point for the bf16 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEONBFDOT, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_bf16_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_bf16(nk_bf16_t const *d, nk_size_t n, nk_size_t s, nk_bf16_t *mn, nk_size_t *mi,
1374
+ nk_bf16_t *mx, nk_size_t *xi) {
1375
+ #if NK_TARGET_SKYLAKE
1376
+ nk_reduce_minmax_bf16_skylake(d, n, s, mn, mi, mx, xi);
1377
+ #elif NK_TARGET_HASWELL
1378
+ nk_reduce_minmax_bf16_haswell(d, n, s, mn, mi, mx, xi);
1379
+ #elif NK_TARGET_NEONBFDOT
1380
+ nk_reduce_minmax_bf16_neonbfdot(d, n, s, mn, mi, mx, xi);
1381
+ #elif NK_TARGET_RVV
1382
+ nk_reduce_minmax_bf16_rvv(d, n, s, mn, mi, mx, xi);
1383
+ #elif NK_TARGET_V128RELAXED
1384
+ nk_reduce_minmax_bf16_v128relaxed(d, n, s, mn, mi, mx, xi);
1385
+ #else
1386
+ nk_reduce_minmax_bf16_serial(d, n, s, mn, mi, mx, xi);
1387
+ #endif
1388
+ }
1389
+
1390
/**
 * @brief Static entry point for the e4m3 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * GENOA, SKYLAKE, HASWELL, NEONFHM, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `sum` and `sumsq` both point to nk_f32_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_e4m3(nk_e4m3_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *sum, nk_f32_t *sumsq) {
1391
+ #if NK_TARGET_GENOA
1392
+ nk_reduce_moments_e4m3_genoa(d, n, s, sum, sumsq);
1393
+ #elif NK_TARGET_SKYLAKE
1394
+ nk_reduce_moments_e4m3_skylake(d, n, s, sum, sumsq);
1395
+ #elif NK_TARGET_HASWELL
1396
+ nk_reduce_moments_e4m3_haswell(d, n, s, sum, sumsq);
1397
+ #elif NK_TARGET_NEONFHM
1398
+ nk_reduce_moments_e4m3_neonfhm(d, n, s, sum, sumsq);
1399
+ #elif NK_TARGET_NEON
1400
+ nk_reduce_moments_e4m3_neon(d, n, s, sum, sumsq);
1401
+ #elif NK_TARGET_RVV
1402
+ nk_reduce_moments_e4m3_rvv(d, n, s, sum, sumsq);
1403
+ #elif NK_TARGET_V128RELAXED
1404
+ nk_reduce_moments_e4m3_v128relaxed(d, n, s, sum, sumsq);
1405
+ #else
1406
+ nk_reduce_moments_e4m3_serial(d, n, s, sum, sumsq);
1407
+ #endif
1408
+ }
1409
+
1410
/**
 * @brief Static entry point for the e4m3 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEONFHM, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_e4m3_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_e4m3(nk_e4m3_t const *d, nk_size_t n, nk_size_t s, nk_e4m3_t *mn, nk_size_t *mi,
1411
+ nk_e4m3_t *mx, nk_size_t *xi) {
1412
+ #if NK_TARGET_SKYLAKE
1413
+ nk_reduce_minmax_e4m3_skylake(d, n, s, mn, mi, mx, xi);
1414
+ #elif NK_TARGET_HASWELL
1415
+ nk_reduce_minmax_e4m3_haswell(d, n, s, mn, mi, mx, xi);
1416
+ #elif NK_TARGET_NEONFHM
1417
+ nk_reduce_minmax_e4m3_neonfhm(d, n, s, mn, mi, mx, xi);
1418
+ #elif NK_TARGET_NEON
1419
+ nk_reduce_minmax_e4m3_neon(d, n, s, mn, mi, mx, xi);
1420
+ #elif NK_TARGET_RVV
1421
+ nk_reduce_minmax_e4m3_rvv(d, n, s, mn, mi, mx, xi);
1422
+ #elif NK_TARGET_V128RELAXED
1423
+ nk_reduce_minmax_e4m3_v128relaxed(d, n, s, mn, mi, mx, xi);
1424
+ #else
1425
+ nk_reduce_minmax_e4m3_serial(d, n, s, mn, mi, mx, xi);
1426
+ #endif
1427
+ }
1428
+
1429
/**
 * @brief Static entry point for the e5m2 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * GENOA, SKYLAKE, HASWELL, NEONFHM, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `sum` and `sumsq` both point to nk_f32_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_e5m2(nk_e5m2_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *sum, nk_f32_t *sumsq) {
1430
+ #if NK_TARGET_GENOA
1431
+ nk_reduce_moments_e5m2_genoa(d, n, s, sum, sumsq);
1432
+ #elif NK_TARGET_SKYLAKE
1433
+ nk_reduce_moments_e5m2_skylake(d, n, s, sum, sumsq);
1434
+ #elif NK_TARGET_HASWELL
1435
+ nk_reduce_moments_e5m2_haswell(d, n, s, sum, sumsq);
1436
+ #elif NK_TARGET_NEONFHM
1437
+ nk_reduce_moments_e5m2_neonfhm(d, n, s, sum, sumsq);
1438
+ #elif NK_TARGET_NEON
1439
+ nk_reduce_moments_e5m2_neon(d, n, s, sum, sumsq);
1440
+ #elif NK_TARGET_RVV
1441
+ nk_reduce_moments_e5m2_rvv(d, n, s, sum, sumsq);
1442
+ #elif NK_TARGET_V128RELAXED
1443
+ nk_reduce_moments_e5m2_v128relaxed(d, n, s, sum, sumsq);
1444
+ #else
1445
+ nk_reduce_moments_e5m2_serial(d, n, s, sum, sumsq);
1446
+ #endif
1447
+ }
1448
+
1449
/**
 * @brief Static entry point for the e5m2 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEONFHM, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_e5m2_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_e5m2(nk_e5m2_t const *d, nk_size_t n, nk_size_t s, nk_e5m2_t *mn, nk_size_t *mi,
1450
+ nk_e5m2_t *mx, nk_size_t *xi) {
1451
+ #if NK_TARGET_SKYLAKE
1452
+ nk_reduce_minmax_e5m2_skylake(d, n, s, mn, mi, mx, xi);
1453
+ #elif NK_TARGET_HASWELL
1454
+ nk_reduce_minmax_e5m2_haswell(d, n, s, mn, mi, mx, xi);
1455
+ #elif NK_TARGET_NEONFHM
1456
+ nk_reduce_minmax_e5m2_neonfhm(d, n, s, mn, mi, mx, xi);
1457
+ #elif NK_TARGET_NEON
1458
+ nk_reduce_minmax_e5m2_neon(d, n, s, mn, mi, mx, xi);
1459
+ #elif NK_TARGET_RVV
1460
+ nk_reduce_minmax_e5m2_rvv(d, n, s, mn, mi, mx, xi);
1461
+ #elif NK_TARGET_V128RELAXED
1462
+ nk_reduce_minmax_e5m2_v128relaxed(d, n, s, mn, mi, mx, xi);
1463
+ #else
1464
+ nk_reduce_minmax_e5m2_serial(d, n, s, mn, mi, mx, xi);
1465
+ #endif
1466
+ }
1467
+
1468
/**
 * @brief Static entry point for the e2m3 moments reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * ICELAKE, SKYLAKE, SIERRA, ALDER, HASWELL, NEONSDOT, NEON, RVV, V128RELAXED; calls the serial kernel
 * otherwise. `sum` and `sumsq` both point to nk_f32_t. Presumably d = input, n = count, s = stride — TODO confirm.
 */
+ NK_PUBLIC void nk_reduce_moments_e2m3(nk_e2m3_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *sum, nk_f32_t *sumsq) {
1469
+ #if NK_TARGET_ICELAKE
1470
+ nk_reduce_moments_e2m3_icelake(d, n, s, sum, sumsq);
1471
+ #elif NK_TARGET_SKYLAKE
1472
+ nk_reduce_moments_e2m3_skylake(d, n, s, sum, sumsq);
1473
+ #elif NK_TARGET_SIERRA
1474
+ nk_reduce_moments_e2m3_sierra(d, n, s, sum, sumsq);
1475
+ #elif NK_TARGET_ALDER
1476
+ nk_reduce_moments_e2m3_alder(d, n, s, sum, sumsq);
1477
+ #elif NK_TARGET_HASWELL
1478
+ nk_reduce_moments_e2m3_haswell(d, n, s, sum, sumsq);
1479
+ #elif NK_TARGET_NEONSDOT
1480
+ nk_reduce_moments_e2m3_neonsdot(d, n, s, sum, sumsq);
1481
+ #elif NK_TARGET_NEON
1482
+ nk_reduce_moments_e2m3_neon(d, n, s, sum, sumsq);
1483
+ #elif NK_TARGET_RVV
1484
+ nk_reduce_moments_e2m3_rvv(d, n, s, sum, sumsq);
1485
+ #elif NK_TARGET_V128RELAXED
1486
+ nk_reduce_moments_e2m3_v128relaxed(d, n, s, sum, sumsq);
1487
+ #else
1488
+ nk_reduce_moments_e2m3_serial(d, n, s, sum, sumsq);
1489
+ #endif
1490
+ }
1491
+
1492
/**
 * @brief Static entry point for the e2m3 min/max reduction (inside the `#if !NK_DYNAMIC_DISPATCH` region).
 *
 * Passes all arguments through to the first backend whose target macro is non-zero, checked in the order
 * SKYLAKE, HASWELL, NEON, RVV, V128RELAXED; calls the serial kernel when none is set.
 * `mn`/`mx` point to nk_e2m3_t and `mi`/`xi` to nk_size_t (presumably min/max values and their
 * positions — TODO confirm).
 */
+ NK_PUBLIC void nk_reduce_minmax_e2m3(nk_e2m3_t const *d, nk_size_t n, nk_size_t s, nk_e2m3_t *mn, nk_size_t *mi,
1493
+ nk_e2m3_t *mx, nk_size_t *xi) {
1494
+ #if NK_TARGET_SKYLAKE
1495
+ nk_reduce_minmax_e2m3_skylake(d, n, s, mn, mi, mx, xi);
1496
+ #elif NK_TARGET_HASWELL
1497
+ nk_reduce_minmax_e2m3_haswell(d, n, s, mn, mi, mx, xi);
1498
+ #elif NK_TARGET_NEON
1499
+ nk_reduce_minmax_e2m3_neon(d, n, s, mn, mi, mx, xi);
1500
+ #elif NK_TARGET_RVV
1501
+ nk_reduce_minmax_e2m3_rvv(d, n, s, mn, mi, mx, xi);
1502
+ #elif NK_TARGET_V128RELAXED
1503
+ nk_reduce_minmax_e2m3_v128relaxed(d, n, s, mn, mi, mx, xi);
1504
+ #else
1505
+ nk_reduce_minmax_e2m3_serial(d, n, s, mn, mi, mx, xi);
1506
+ #endif
1507
+ }
1508
+
1509
/**
 *  @brief Compile-time dispatcher for the `e3m2` moments reduction.
 *
 *  Forwards `d, n, s, sum, sumsq` unchanged to the first backend whose `NK_TARGET_*` macro
 *  evaluates non-zero, tested in this order - ICELAKE, SKYLAKE, ALDER, HASWELL, NEON, RVV,
 *  V128RELAXED - and otherwise to the `_serial` variant. The chain order is the selection
 *  priority when more than one target is enabled.
 *
 *  NOTE(review): unlike `nk_reduce_moments_e2m3`, this chain has no SIERRA or NEONSDOT
 *  branch - confirm that is intentional (i.e. no such `e3m2` backends exist).
 *  NOTE(review): presumably `n` is the element count, `s` the stride, and `sum` / `sumsq`
 *  receive the sum and sum of squares as `nk_f32_t`; verify against the backends.
 */
+ NK_PUBLIC void nk_reduce_moments_e3m2(nk_e3m2_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *sum, nk_f32_t *sumsq) {
1510
+ #if NK_TARGET_ICELAKE
1511
+ nk_reduce_moments_e3m2_icelake(d, n, s, sum, sumsq);
1512
+ #elif NK_TARGET_SKYLAKE
1513
+ nk_reduce_moments_e3m2_skylake(d, n, s, sum, sumsq);
1514
+ #elif NK_TARGET_ALDER
1515
+ nk_reduce_moments_e3m2_alder(d, n, s, sum, sumsq);
1516
+ #elif NK_TARGET_HASWELL
1517
+ nk_reduce_moments_e3m2_haswell(d, n, s, sum, sumsq);
1518
+ #elif NK_TARGET_NEON
1519
+ nk_reduce_moments_e3m2_neon(d, n, s, sum, sumsq);
1520
+ #elif NK_TARGET_RVV
1521
+ nk_reduce_moments_e3m2_rvv(d, n, s, sum, sumsq);
1522
+ #elif NK_TARGET_V128RELAXED
1523
+ nk_reduce_moments_e3m2_v128relaxed(d, n, s, sum, sumsq);
1524
+ #else
1525
+ nk_reduce_moments_e3m2_serial(d, n, s, sum, sumsq);
1526
+ #endif
1527
+ }
1528
+
1529
/**
 *  @brief Compile-time dispatcher for the `e3m2` min/max reduction.
 *
 *  Forwards `d, n, s, mn, mi, mx, xi` unchanged to the first backend whose `NK_TARGET_*`
 *  macro evaluates non-zero, tested in this order - SKYLAKE, HASWELL, NEON, RVV,
 *  V128RELAXED - and otherwise to the `_serial` variant.
 *
 *  NOTE(review): presumably `mn` / `mx` receive the extreme values and `mi` / `xi` their
 *  positions, with `n` the element count and `s` the stride - confirm against the
 *  backend declarations.
 */
+ NK_PUBLIC void nk_reduce_minmax_e3m2(nk_e3m2_t const *d, nk_size_t n, nk_size_t s, nk_e3m2_t *mn, nk_size_t *mi,
1530
+ nk_e3m2_t *mx, nk_size_t *xi) {
1531
+ #if NK_TARGET_SKYLAKE
1532
+ nk_reduce_minmax_e3m2_skylake(d, n, s, mn, mi, mx, xi);
1533
+ #elif NK_TARGET_HASWELL
1534
+ nk_reduce_minmax_e3m2_haswell(d, n, s, mn, mi, mx, xi);
1535
+ #elif NK_TARGET_NEON
1536
+ nk_reduce_minmax_e3m2_neon(d, n, s, mn, mi, mx, xi);
1537
+ #elif NK_TARGET_RVV
1538
+ nk_reduce_minmax_e3m2_rvv(d, n, s, mn, mi, mx, xi);
1539
+ #elif NK_TARGET_V128RELAXED
1540
+ nk_reduce_minmax_e3m2_v128relaxed(d, n, s, mn, mi, mx, xi);
1541
+ #else
1542
+ nk_reduce_minmax_e3m2_serial(d, n, s, mn, mi, mx, xi);
1543
+ #endif
1544
+ }
1545
+
1546
/**
 *  @brief Compile-time dispatcher for the `i4` moments reduction.
 *
 *  Forwards `d, n, s, sum, sumsq` unchanged to the SKYLAKE backend if `NK_TARGET_SKYLAKE`
 *  is non-zero, else to the HASWELL backend if `NK_TARGET_HASWELL` is non-zero, else to
 *  the `_serial` variant. Results are written through `nk_i64_t *sum` and
 *  `nk_u64_t *sumsq`.
 *
 *  NOTE(review): the input type is `nk_i4x2_t`, which by its name presumably packs two
 *  4-bit values per element; whether `n` counts nibbles or packed elements, and the unit
 *  of the stride `s`, are not visible here - confirm against the backends.
 */
+ NK_PUBLIC void nk_reduce_moments_i4(nk_i4x2_t const *d, nk_size_t n, nk_size_t s, nk_i64_t *sum, nk_u64_t *sumsq) {
1547
+ #if NK_TARGET_SKYLAKE
1548
+ nk_reduce_moments_i4_skylake(d, n, s, sum, sumsq);
1549
+ #elif NK_TARGET_HASWELL
1550
+ nk_reduce_moments_i4_haswell(d, n, s, sum, sumsq);
1551
+ #else
1552
+ nk_reduce_moments_i4_serial(d, n, s, sum, sumsq);
1553
+ #endif
1554
+ }
1555
+
1556
/**
 *  @brief Entry point for the `i4` min/max reduction; always uses the serial backend.
 *
 *  There is no target-specific branch here: every argument is passed straight through to
 *  `nk_reduce_minmax_i4_serial`, regardless of which `NK_TARGET_*` macros are set.
 *  The extreme values are written through `nk_i8_t *` pointers (`mn`, `mx`) while the
 *  input is `nk_i4x2_t`.
 *
 *  NOTE(review): presumably `mi` / `xi` receive the positions of the minimum and maximum
 *  and `n` / `s` are count and stride - confirm against the serial implementation.
 */
+ NK_PUBLIC void nk_reduce_minmax_i4(nk_i4x2_t const *d, nk_size_t n, nk_size_t s, nk_i8_t *mn, nk_size_t *mi,
1557
+ nk_i8_t *mx, nk_size_t *xi) {
1558
+ nk_reduce_minmax_i4_serial(d, n, s, mn, mi, mx, xi);
1559
+ }
1560
+
1561
/**
 *  @brief Compile-time dispatcher for the `u4` moments reduction.
 *
 *  Forwards `d, n, s, sum, sumsq` unchanged to the SKYLAKE backend if `NK_TARGET_SKYLAKE`
 *  is non-zero, else to the HASWELL backend if `NK_TARGET_HASWELL` is non-zero, else to
 *  the `_serial` variant. Both results are written through `nk_u64_t *` pointers.
 *
 *  NOTE(review): the input type is `nk_u4x2_t`, presumably two packed 4-bit values per
 *  element; the exact meaning of `n` and `s` for packed data is not visible here - confirm
 *  against the backends.
 */
+ NK_PUBLIC void nk_reduce_moments_u4(nk_u4x2_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *sum, nk_u64_t *sumsq) {
1562
+ #if NK_TARGET_SKYLAKE
1563
+ nk_reduce_moments_u4_skylake(d, n, s, sum, sumsq);
1564
+ #elif NK_TARGET_HASWELL
1565
+ nk_reduce_moments_u4_haswell(d, n, s, sum, sumsq);
1566
+ #else
1567
+ nk_reduce_moments_u4_serial(d, n, s, sum, sumsq);
1568
+ #endif
1569
+ }
1570
+
1571
/**
 *  @brief Entry point for the `u4` min/max reduction; always uses the serial backend.
 *
 *  No target-specific branch exists here: every argument is passed straight through to
 *  `nk_reduce_minmax_u4_serial`, regardless of which `NK_TARGET_*` macros are set.
 *  The extreme values are written through `nk_u8_t *` pointers (`mn`, `mx`) while the
 *  input is `nk_u4x2_t`.
 *
 *  NOTE(review): presumably `mi` / `xi` receive the positions of the minimum and maximum
 *  and `n` / `s` are count and stride - confirm against the serial implementation.
 */
+ NK_PUBLIC void nk_reduce_minmax_u4(nk_u4x2_t const *d, nk_size_t n, nk_size_t s, nk_u8_t *mn, nk_size_t *mi,
1572
+ nk_u8_t *mx, nk_size_t *xi) {
1573
+ nk_reduce_minmax_u4_serial(d, n, s, mn, mi, mx, xi);
1574
+ }
1575
+
1576
/**
 *  @brief Compile-time dispatcher for the `u1` moments reduction.
 *
 *  Forwards `d, n, s, sum, sumsq` unchanged to the SKYLAKE backend if `NK_TARGET_SKYLAKE`
 *  is non-zero, else to the HASWELL backend if `NK_TARGET_HASWELL` is non-zero, else to
 *  the `_serial` variant. Both results are written through `nk_u64_t *` pointers.
 *
 *  NOTE(review): the input type is `nk_u1x8_t`, presumably eight packed 1-bit values per
 *  element; whether `n` counts bits or packed bytes, and the unit of `s`, are not visible
 *  here - confirm against the backends.
 */
+ NK_PUBLIC void nk_reduce_moments_u1(nk_u1x8_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *sum, nk_u64_t *sumsq) {
1577
+ #if NK_TARGET_SKYLAKE
1578
+ nk_reduce_moments_u1_skylake(d, n, s, sum, sumsq);
1579
+ #elif NK_TARGET_HASWELL
1580
+ nk_reduce_moments_u1_haswell(d, n, s, sum, sumsq);
1581
+ #else
1582
+ nk_reduce_moments_u1_serial(d, n, s, sum, sumsq);
1583
+ #endif
1584
+ }
1585
+
1586
/**
 *  @brief Entry point for the `u1` min/max reduction; always uses the serial backend.
 *
 *  No target-specific branch exists here: every argument is passed straight through to
 *  `nk_reduce_minmax_u1_serial`, regardless of which `NK_TARGET_*` macros are set.
 *  The extreme values are written through `nk_u8_t *` pointers (`mn`, `mx`) while the
 *  input is `nk_u1x8_t`.
 *
 *  NOTE(review): presumably `mi` / `xi` receive the positions of the minimum and maximum
 *  and `n` / `s` are count and stride - confirm against the serial implementation.
 */
+ NK_PUBLIC void nk_reduce_minmax_u1(nk_u1x8_t const *d, nk_size_t n, nk_size_t s, nk_u8_t *mn, nk_size_t *mi,
1587
+ nk_u8_t *mx, nk_size_t *xi) {
1588
+ nk_reduce_minmax_u1_serial(d, n, s, mn, mi, mx, xi);
1589
+ }
1590
+
1591
+ #endif // !NK_DYNAMIC_DISPATCH
1592
+
1593
+ #ifdef __cplusplus
1594
+ } // extern "C"
1595
+ #endif
1596
+
1597
+ #endif // NK_REDUCE_H