numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,1597 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Vector Reductions.
|
|
3
|
+
* @file include/numkong/reduce.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date December 27, 2024
|
|
6
|
+
*
|
|
7
|
+
* Provides horizontal reduction operations over vectors with:
|
|
8
|
+
* - `nk_reduce_moments_*` — sum + sum-of-squares in one pass
|
|
9
|
+
* - `nk_reduce_minmax_*` — min + max with argmin/argmax in one pass
|
|
10
|
+
* - Dynamic dispatch for runtime ISA selection
|
|
11
|
+
*
|
|
12
|
+
* For dtypes:
|
|
13
|
+
*
|
|
14
|
+
* - f64: 64-bit IEEE floating point numbers
|
|
15
|
+
* - f32: 32-bit IEEE floating point numbers
|
|
16
|
+
* - f16: 16-bit IEEE floating point numbers
|
|
17
|
+
* - bf16: 16-bit brain floating point numbers
|
|
18
|
+
* - e4m3: 8-bit e4m3 floating point numbers
|
|
19
|
+
* - e5m2: 8-bit e5m2 floating point numbers
|
|
20
|
+
* - e2m3: 8-bit e2m3 floating point numbers (MX)
|
|
21
|
+
* - e3m2: 8-bit e3m2 floating point numbers (MX)
|
|
22
|
+
* - i8: 8-bit signed integers
|
|
23
|
+
* - u8: 8-bit unsigned integers
|
|
24
|
+
* - i16: 16-bit signed integers
|
|
25
|
+
* - u16: 16-bit unsigned integers
|
|
26
|
+
* - i32: 32-bit signed integers
|
|
27
|
+
* - u32: 32-bit unsigned integers
|
|
28
|
+
* - i64: 64-bit signed integers
|
|
29
|
+
* - u64: 64-bit unsigned integers
|
|
30
|
+
* - i4: 4-bit signed integers (packed pairs)
|
|
31
|
+
* - u4: 4-bit unsigned integers (packed pairs)
|
|
32
|
+
* - u1: 1-bit binary (packed octets)
|
|
33
|
+
*
|
|
34
|
+
* For hardware architectures:
|
|
35
|
+
*
|
|
36
|
+
* - Arm: NEON, NEON+F16, NEON+FHM, NEON+BF16, NEON+SDOT
|
|
37
|
+
* - x86: Haswell, Skylake, Ice Lake, Genoa, Sierra Forest
|
|
38
|
+
* - RISC-V: RVV
|
|
39
|
+
* - WASM: V128Relaxed
|
|
40
|
+
*
|
|
41
|
+
* @section numerical_stability Numerical stability
|
|
42
|
+
*
|
|
43
|
+
* All accumulations are performed with stable techniques and @b saturation in mind.
|
|
44
|
+
* Single-precision inputs are aggregated in double-precision. Double-precision
|
|
45
|
+
* inputs are handled with @b Neumaier-like compensated summation schemes. Mini-floats
|
|
46
|
+
* are propagated to more hardware-friendly types. Integers are handled with
|
|
47
|
+
* proper saturation logic, as opposed to simple pairwise saturation, meaning that
|
|
48
|
+
* if several extremely large values are followed by equal negative values, the
|
|
49
|
+
* sum will be zero.
|
|
50
|
+
*
|
|
51
|
+
* @code{.c}
|
|
52
|
+
*
|
|
53
|
+
* @endcode
|
|
54
|
+
*
|
|
55
|
+
*
|
|
56
|
+
* All MinMax scans treat NaN values explicitly instead of relying on a simple total ordering.
|
|
57
|
+
* All positive and negative NaN values are masked out on the fly and can never be included
|
|
58
|
+
* in the output. For empty or NaN-only inputs, the returned argmin/argmax positions will
|
|
59
|
+
* be set to the sentinel value @b `NK_SIZE_MAX`.
|
|
60
|
+
*
|
|
61
|
+
* @section reduction_strategy Reduction Strategy
|
|
62
|
+
*
|
|
63
|
+
* The key insight is that `_mm512_reduce_add_ps()` and similar intrinsics are
|
|
64
|
+
* actually serial operations - they don't parallelize the reduction across lanes.
|
|
65
|
+
* The correct approach is:
|
|
66
|
+
*
|
|
67
|
+
* 1. Accumulate vertically in SIMD registers throughout the entire loop
|
|
68
|
+
* 2. Perform a single horizontal reduction at the very end, reconstructing the lane positions
|
|
69
|
+
*
|
|
70
|
+
* @code{.c}
|
|
71
|
+
* __m512 sum_f32x16 = _mm512_setzero_ps();
|
|
72
|
+
* for (...) {
|
|
73
|
+
* __m512 data_f32x16 = _mm512_loadu_ps(ptr);
|
|
74
|
+
* sum_f32x16 = _mm512_add_ps(sum_f32x16, data_f32x16);
|
|
75
|
+
* }
|
|
76
|
+
* // Single horizontal reduce at the END only
|
|
77
|
+
* nk_f32_t result = nk_reduce_add_f32x16_skylake_(sum_f32x16);
|
|
78
|
+
* @endcode
|
|
79
|
+
*
|
|
80
|
+
* @section stride_handling Stride Handling Strategies
|
|
81
|
+
*
|
|
82
|
+
* - stride == sizeof(scalar): Contiguous SIMD loads with masked tail
|
|
83
|
+
* - Large stride with gather support: Use gather instructions (32/64-bit types)
|
|
84
|
+
* - Otherwise: Serial fallback
|
|
85
|
+
*
|
|
86
|
+
* @section argminmax Argmin/Argmax Strategy
|
|
87
|
+
*
|
|
88
|
+
* Single-pass algorithm tracking both value and index in SIMD registers:
|
|
89
|
+
* @code{.c}
|
|
90
|
+
* __m512 min_f32x16 = _mm512_set1_ps(FLT_MAX);
|
|
91
|
+
* __m512i min_idx_i32x16 = _mm512_setzero_si512();
|
|
92
|
+
* __m512i current_idx_i32x16 = _mm512_setr_epi32(0,1,2,3,...,15);
|
|
93
|
+
* __m512i step_i32x16 = _mm512_set1_epi32(16);
|
|
94
|
+
* for (...) {
|
|
95
|
+
* __m512 data_f32x16 = _mm512_loadu_ps(ptr);
|
|
96
|
+
* __mmask16 lt_mask = _mm512_cmp_ps_mask(data_f32x16, min_f32x16, _CMP_LT_OQ);
|
|
97
|
+
* min_f32x16 = _mm512_mask_mov_ps(min_f32x16, lt_mask, data_f32x16);
|
|
98
|
+
* min_idx_i32x16 = _mm512_mask_mov_epi32(min_idx_i32x16, lt_mask, current_idx_i32x16);
|
|
99
|
+
* current_idx_i32x16 = _mm512_add_epi32(current_idx_i32x16, step_i32x16);
|
|
100
|
+
* }
|
|
101
|
+
* @endcode
|
|
102
|
+
*/
|
|
103
|
+
#ifndef NK_REDUCE_H
|
|
104
|
+
#define NK_REDUCE_H
|
|
105
|
+
|
|
106
|
+
#include "numkong/types.h"
|
|
107
|
+
|
|
108
|
+
#ifdef __cplusplus
|
|
109
|
+
extern "C" {
|
|
110
|
+
#endif
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* @brief Horizontal moments reduction (sum + sum-of-squares) over a strided array.
|
|
114
|
+
* @param[in] data Pointer to the input data.
|
|
115
|
+
* @param[in] count Number of elements to reduce.
|
|
116
|
+
* @param[in] stride_bytes Stride between elements in bytes, equal to `sizeof(*data)` for contiguous arrays.
|
|
117
|
+
* @param[out] sum_ptr Output sum.
|
|
118
|
+
* @param[out] sumsq_ptr Output sum of squares.
|
|
119
|
+
*/
|
|
120
|
+
NK_DYNAMIC void nk_reduce_moments_f64(nk_f64_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_f64_t *sum_ptr,
|
|
121
|
+
nk_f64_t *sumsq_ptr);
|
|
122
|
+
|
|
123
|
+
/**
|
|
124
|
+
* @brief Horizontal min+max reduction with argmin/argmax over a strided array.
|
|
125
|
+
* @param[in] data Pointer to the input data.
|
|
126
|
+
* @param[in] count Number of elements to reduce.
|
|
127
|
+
* @param[in] stride_bytes Stride between elements in bytes, equal to `sizeof(*data)` for contiguous arrays.
|
|
128
|
+
* @param[out] min_value_ptr Output minimum value.
|
|
129
|
+
* @param[out] min_index_ptr Output index of the minimum value.
|
|
130
|
+
* @param[out] max_value_ptr Output maximum value.
|
|
131
|
+
* @param[out] max_index_ptr Output index of the maximum value.
|
|
132
|
+
*/
|
|
133
|
+
NK_DYNAMIC void nk_reduce_minmax_f64(nk_f64_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
134
|
+
nk_f64_t *min_value_ptr, nk_size_t *min_index_ptr, nk_f64_t *max_value_ptr,
|
|
135
|
+
nk_size_t *max_index_ptr);
|
|
136
|
+
|
|
137
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
138
|
+
NK_DYNAMIC void nk_reduce_moments_f32(nk_f32_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_f64_t *sum_ptr,
|
|
139
|
+
nk_f64_t *sumsq_ptr);
|
|
140
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
141
|
+
NK_DYNAMIC void nk_reduce_minmax_f32(nk_f32_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
142
|
+
nk_f32_t *min_value_ptr, nk_size_t *min_index_ptr, nk_f32_t *max_value_ptr,
|
|
143
|
+
nk_size_t *max_index_ptr);
|
|
144
|
+
|
|
145
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
146
|
+
NK_DYNAMIC void nk_reduce_moments_i8(nk_i8_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_i64_t *sum_ptr,
|
|
147
|
+
nk_u64_t *sumsq_ptr);
|
|
148
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
149
|
+
NK_DYNAMIC void nk_reduce_minmax_i8(nk_i8_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
150
|
+
nk_i8_t *min_value_ptr, nk_size_t *min_index_ptr, nk_i8_t *max_value_ptr,
|
|
151
|
+
nk_size_t *max_index_ptr);
|
|
152
|
+
|
|
153
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
154
|
+
NK_DYNAMIC void nk_reduce_moments_u8(nk_u8_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_u64_t *sum_ptr,
|
|
155
|
+
nk_u64_t *sumsq_ptr);
|
|
156
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
157
|
+
NK_DYNAMIC void nk_reduce_minmax_u8(nk_u8_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
158
|
+
nk_u8_t *min_value_ptr, nk_size_t *min_index_ptr, nk_u8_t *max_value_ptr,
|
|
159
|
+
nk_size_t *max_index_ptr);
|
|
160
|
+
|
|
161
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
162
|
+
NK_DYNAMIC void nk_reduce_moments_i16(nk_i16_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_i64_t *sum_ptr,
|
|
163
|
+
nk_u64_t *sumsq_ptr);
|
|
164
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
165
|
+
NK_DYNAMIC void nk_reduce_minmax_i16(nk_i16_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
166
|
+
nk_i16_t *min_value_ptr, nk_size_t *min_index_ptr, nk_i16_t *max_value_ptr,
|
|
167
|
+
nk_size_t *max_index_ptr);
|
|
168
|
+
|
|
169
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
170
|
+
NK_DYNAMIC void nk_reduce_moments_u16(nk_u16_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_u64_t *sum_ptr,
|
|
171
|
+
nk_u64_t *sumsq_ptr);
|
|
172
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
173
|
+
NK_DYNAMIC void nk_reduce_minmax_u16(nk_u16_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
174
|
+
nk_u16_t *min_value_ptr, nk_size_t *min_index_ptr, nk_u16_t *max_value_ptr,
|
|
175
|
+
nk_size_t *max_index_ptr);
|
|
176
|
+
|
|
177
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
178
|
+
NK_DYNAMIC void nk_reduce_moments_i32(nk_i32_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_i64_t *sum_ptr,
|
|
179
|
+
nk_u64_t *sumsq_ptr);
|
|
180
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
181
|
+
NK_DYNAMIC void nk_reduce_minmax_i32(nk_i32_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
182
|
+
nk_i32_t *min_value_ptr, nk_size_t *min_index_ptr, nk_i32_t *max_value_ptr,
|
|
183
|
+
nk_size_t *max_index_ptr);
|
|
184
|
+
|
|
185
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
186
|
+
NK_DYNAMIC void nk_reduce_moments_u32(nk_u32_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_u64_t *sum_ptr,
|
|
187
|
+
nk_u64_t *sumsq_ptr);
|
|
188
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
189
|
+
NK_DYNAMIC void nk_reduce_minmax_u32(nk_u32_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
190
|
+
nk_u32_t *min_value_ptr, nk_size_t *min_index_ptr, nk_u32_t *max_value_ptr,
|
|
191
|
+
nk_size_t *max_index_ptr);
|
|
192
|
+
|
|
193
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
194
|
+
NK_DYNAMIC void nk_reduce_moments_i64(nk_i64_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_i64_t *sum_ptr,
|
|
195
|
+
nk_u64_t *sumsq_ptr);
|
|
196
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
197
|
+
NK_DYNAMIC void nk_reduce_minmax_i64(nk_i64_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
198
|
+
nk_i64_t *min_value_ptr, nk_size_t *min_index_ptr, nk_i64_t *max_value_ptr,
|
|
199
|
+
nk_size_t *max_index_ptr);
|
|
200
|
+
|
|
201
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
202
|
+
NK_DYNAMIC void nk_reduce_moments_u64(nk_u64_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_u64_t *sum_ptr,
|
|
203
|
+
nk_u64_t *sumsq_ptr);
|
|
204
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
205
|
+
NK_DYNAMIC void nk_reduce_minmax_u64(nk_u64_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
206
|
+
nk_u64_t *min_value_ptr, nk_size_t *min_index_ptr, nk_u64_t *max_value_ptr,
|
|
207
|
+
nk_size_t *max_index_ptr);
|
|
208
|
+
|
|
209
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
210
|
+
NK_DYNAMIC void nk_reduce_moments_f16(nk_f16_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_f32_t *sum_ptr,
|
|
211
|
+
nk_f32_t *sumsq_ptr);
|
|
212
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
213
|
+
NK_DYNAMIC void nk_reduce_minmax_f16(nk_f16_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
214
|
+
nk_f16_t *min_value_ptr, nk_size_t *min_index_ptr, nk_f16_t *max_value_ptr,
|
|
215
|
+
nk_size_t *max_index_ptr);
|
|
216
|
+
|
|
217
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
218
|
+
NK_DYNAMIC void nk_reduce_moments_bf16(nk_bf16_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
219
|
+
nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr);
|
|
220
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
221
|
+
NK_DYNAMIC void nk_reduce_minmax_bf16(nk_bf16_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
222
|
+
nk_bf16_t *min_value_ptr, nk_size_t *min_index_ptr, nk_bf16_t *max_value_ptr,
|
|
223
|
+
nk_size_t *max_index_ptr);
|
|
224
|
+
|
|
225
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
226
|
+
NK_DYNAMIC void nk_reduce_moments_e4m3(nk_e4m3_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
227
|
+
nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr);
|
|
228
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
229
|
+
NK_DYNAMIC void nk_reduce_minmax_e4m3(nk_e4m3_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
230
|
+
nk_e4m3_t *min_value_ptr, nk_size_t *min_index_ptr, nk_e4m3_t *max_value_ptr,
|
|
231
|
+
nk_size_t *max_index_ptr);
|
|
232
|
+
|
|
233
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
234
|
+
NK_DYNAMIC void nk_reduce_moments_e5m2(nk_e5m2_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
235
|
+
nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr);
|
|
236
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
237
|
+
NK_DYNAMIC void nk_reduce_minmax_e5m2(nk_e5m2_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
238
|
+
nk_e5m2_t *min_value_ptr, nk_size_t *min_index_ptr, nk_e5m2_t *max_value_ptr,
|
|
239
|
+
nk_size_t *max_index_ptr);
|
|
240
|
+
|
|
241
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
242
|
+
NK_DYNAMIC void nk_reduce_moments_e2m3(nk_e2m3_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
243
|
+
nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr);
|
|
244
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
245
|
+
NK_DYNAMIC void nk_reduce_minmax_e2m3(nk_e2m3_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
246
|
+
nk_e2m3_t *min_value_ptr, nk_size_t *min_index_ptr, nk_e2m3_t *max_value_ptr,
|
|
247
|
+
nk_size_t *max_index_ptr);
|
|
248
|
+
|
|
249
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
250
|
+
NK_DYNAMIC void nk_reduce_moments_e3m2(nk_e3m2_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
251
|
+
nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr);
|
|
252
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
253
|
+
NK_DYNAMIC void nk_reduce_minmax_e3m2(nk_e3m2_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
254
|
+
nk_e3m2_t *min_value_ptr, nk_size_t *min_index_ptr, nk_e3m2_t *max_value_ptr,
|
|
255
|
+
nk_size_t *max_index_ptr);
|
|
256
|
+
|
|
257
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
258
|
+
NK_DYNAMIC void nk_reduce_moments_i4(nk_i4x2_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_i64_t *sum_ptr,
|
|
259
|
+
nk_u64_t *sumsq_ptr);
|
|
260
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
261
|
+
NK_DYNAMIC void nk_reduce_minmax_i4(nk_i4x2_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
262
|
+
nk_i8_t *min_value_ptr, nk_size_t *min_index_ptr, nk_i8_t *max_value_ptr,
|
|
263
|
+
nk_size_t *max_index_ptr);
|
|
264
|
+
|
|
265
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
266
|
+
NK_DYNAMIC void nk_reduce_moments_u4(nk_u4x2_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_u64_t *sum_ptr,
|
|
267
|
+
nk_u64_t *sumsq_ptr);
|
|
268
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
269
|
+
NK_DYNAMIC void nk_reduce_minmax_u4(nk_u4x2_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
270
|
+
nk_u8_t *min_value_ptr, nk_size_t *min_index_ptr, nk_u8_t *max_value_ptr,
|
|
271
|
+
nk_size_t *max_index_ptr);
|
|
272
|
+
|
|
273
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
274
|
+
NK_DYNAMIC void nk_reduce_moments_u1(nk_u1x8_t const *data, nk_size_t count, nk_size_t stride_bytes, nk_u64_t *sum_ptr,
|
|
275
|
+
nk_u64_t *sumsq_ptr);
|
|
276
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
277
|
+
NK_DYNAMIC void nk_reduce_minmax_u1(nk_u1x8_t const *data, nk_size_t count, nk_size_t stride_bytes,
|
|
278
|
+
nk_u8_t *min_value_ptr, nk_size_t *min_index_ptr, nk_u8_t *max_value_ptr,
|
|
279
|
+
nk_size_t *max_index_ptr);
|
|
280
|
+
|
|
281
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
282
|
+
NK_PUBLIC void nk_reduce_moments_f32_serial(nk_f32_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
|
|
283
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
284
|
+
NK_PUBLIC void nk_reduce_moments_f64_serial(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
|
|
285
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
286
|
+
NK_PUBLIC void nk_reduce_moments_i8_serial(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
287
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
288
|
+
NK_PUBLIC void nk_reduce_moments_u8_serial(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
289
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
290
|
+
NK_PUBLIC void nk_reduce_moments_i16_serial(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
291
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
292
|
+
NK_PUBLIC void nk_reduce_moments_u16_serial(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
293
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
294
|
+
NK_PUBLIC void nk_reduce_moments_i32_serial(nk_i32_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
295
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
296
|
+
NK_PUBLIC void nk_reduce_moments_u32_serial(nk_u32_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
297
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
298
|
+
NK_PUBLIC void nk_reduce_moments_i64_serial(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
299
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
300
|
+
NK_PUBLIC void nk_reduce_moments_u64_serial(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
301
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
302
|
+
NK_PUBLIC void nk_reduce_moments_f16_serial(nk_f16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
303
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
304
|
+
NK_PUBLIC void nk_reduce_moments_bf16_serial(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
305
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
306
|
+
NK_PUBLIC void nk_reduce_moments_e4m3_serial(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
307
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
308
|
+
NK_PUBLIC void nk_reduce_moments_e5m2_serial(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
309
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
310
|
+
NK_PUBLIC void nk_reduce_moments_e2m3_serial(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
311
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
312
|
+
NK_PUBLIC void nk_reduce_moments_e3m2_serial(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
313
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
314
|
+
NK_PUBLIC void nk_reduce_moments_i4_serial(nk_i4x2_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
315
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
316
|
+
NK_PUBLIC void nk_reduce_moments_u4_serial(nk_u4x2_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
317
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
318
|
+
NK_PUBLIC void nk_reduce_moments_u1_serial(nk_u1x8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
319
|
+
|
|
320
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
321
|
+
NK_PUBLIC void nk_reduce_minmax_f32_serial(nk_f32_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_size_t *, nk_f32_t *,
|
|
322
|
+
nk_size_t *);
|
|
323
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
324
|
+
NK_PUBLIC void nk_reduce_minmax_f64_serial(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_size_t *, nk_f64_t *,
|
|
325
|
+
nk_size_t *);
|
|
326
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
327
|
+
NK_PUBLIC void nk_reduce_minmax_i8_serial(nk_i8_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
|
|
328
|
+
nk_size_t *);
|
|
329
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
330
|
+
NK_PUBLIC void nk_reduce_minmax_u8_serial(nk_u8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
|
|
331
|
+
nk_size_t *);
|
|
332
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
333
|
+
NK_PUBLIC void nk_reduce_minmax_i16_serial(nk_i16_t const *, nk_size_t, nk_size_t, nk_i16_t *, nk_size_t *, nk_i16_t *,
|
|
334
|
+
nk_size_t *);
|
|
335
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
336
|
+
NK_PUBLIC void nk_reduce_minmax_u16_serial(nk_u16_t const *, nk_size_t, nk_size_t, nk_u16_t *, nk_size_t *, nk_u16_t *,
|
|
337
|
+
nk_size_t *);
|
|
338
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
339
|
+
NK_PUBLIC void nk_reduce_minmax_i32_serial(nk_i32_t const *, nk_size_t, nk_size_t, nk_i32_t *, nk_size_t *, nk_i32_t *,
|
|
340
|
+
nk_size_t *);
|
|
341
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
342
|
+
NK_PUBLIC void nk_reduce_minmax_u32_serial(nk_u32_t const *, nk_size_t, nk_size_t, nk_u32_t *, nk_size_t *, nk_u32_t *,
|
|
343
|
+
nk_size_t *);
|
|
344
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
345
|
+
NK_PUBLIC void nk_reduce_minmax_i64_serial(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_size_t *, nk_i64_t *,
|
|
346
|
+
nk_size_t *);
|
|
347
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
348
|
+
NK_PUBLIC void nk_reduce_minmax_u64_serial(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_size_t *, nk_u64_t *,
|
|
349
|
+
nk_size_t *);
|
|
350
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
351
|
+
NK_PUBLIC void nk_reduce_minmax_f16_serial(nk_f16_t const *, nk_size_t, nk_size_t, nk_f16_t *, nk_size_t *, nk_f16_t *,
|
|
352
|
+
nk_size_t *);
|
|
353
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
354
|
+
NK_PUBLIC void nk_reduce_minmax_bf16_serial(nk_bf16_t const *, nk_size_t, nk_size_t, nk_bf16_t *, nk_size_t *,
|
|
355
|
+
nk_bf16_t *, nk_size_t *);
|
|
356
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
357
|
+
NK_PUBLIC void nk_reduce_minmax_e4m3_serial(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *,
|
|
358
|
+
nk_e4m3_t *, nk_size_t *);
|
|
359
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
360
|
+
NK_PUBLIC void nk_reduce_minmax_e5m2_serial(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *,
|
|
361
|
+
nk_e5m2_t *, nk_size_t *);
|
|
362
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
363
|
+
NK_PUBLIC void nk_reduce_minmax_e2m3_serial(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_e2m3_t *, nk_size_t *,
|
|
364
|
+
nk_e2m3_t *, nk_size_t *);
|
|
365
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
366
|
+
NK_PUBLIC void nk_reduce_minmax_e3m2_serial(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_e3m2_t *, nk_size_t *,
|
|
367
|
+
nk_e3m2_t *, nk_size_t *);
|
|
368
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
369
|
+
NK_PUBLIC void nk_reduce_minmax_i4_serial(nk_i4x2_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
|
|
370
|
+
nk_size_t *);
|
|
371
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
372
|
+
NK_PUBLIC void nk_reduce_minmax_u4_serial(nk_u4x2_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
|
|
373
|
+
nk_size_t *);
|
|
374
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
375
|
+
NK_PUBLIC void nk_reduce_minmax_u1_serial(nk_u1x8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
|
|
376
|
+
nk_size_t *);
|
|
377
|
+
|
|
378
|
+
#if NK_TARGET_NEON
|
|
379
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
380
|
+
NK_PUBLIC void nk_reduce_moments_f32_neon(nk_f32_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
|
|
381
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
382
|
+
NK_PUBLIC void nk_reduce_moments_f64_neon(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
|
|
383
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
384
|
+
NK_PUBLIC void nk_reduce_moments_i8_neon(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
385
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
386
|
+
NK_PUBLIC void nk_reduce_moments_u8_neon(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
387
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
388
|
+
NK_PUBLIC void nk_reduce_moments_i16_neon(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
389
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
390
|
+
NK_PUBLIC void nk_reduce_moments_u16_neon(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
391
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
392
|
+
NK_PUBLIC void nk_reduce_moments_i32_neon(nk_i32_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
393
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
394
|
+
NK_PUBLIC void nk_reduce_moments_u32_neon(nk_u32_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
395
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
396
|
+
NK_PUBLIC void nk_reduce_moments_i64_neon(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
397
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
398
|
+
NK_PUBLIC void nk_reduce_moments_u64_neon(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
399
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
400
|
+
NK_PUBLIC void nk_reduce_moments_e2m3_neon(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
401
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
402
|
+
NK_PUBLIC void nk_reduce_moments_e3m2_neon(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
403
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
404
|
+
NK_PUBLIC void nk_reduce_moments_e4m3_neon(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
405
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
406
|
+
NK_PUBLIC void nk_reduce_moments_e5m2_neon(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
407
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
408
|
+
NK_PUBLIC void nk_reduce_minmax_f32_neon(nk_f32_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_size_t *, nk_f32_t *,
|
|
409
|
+
nk_size_t *);
|
|
410
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
411
|
+
NK_PUBLIC void nk_reduce_minmax_f64_neon(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_size_t *, nk_f64_t *,
|
|
412
|
+
nk_size_t *);
|
|
413
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
414
|
+
NK_PUBLIC void nk_reduce_minmax_i8_neon(nk_i8_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
|
|
415
|
+
nk_size_t *);
|
|
416
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
417
|
+
NK_PUBLIC void nk_reduce_minmax_u8_neon(nk_u8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
|
|
418
|
+
nk_size_t *);
|
|
419
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
420
|
+
NK_PUBLIC void nk_reduce_minmax_i16_neon(nk_i16_t const *, nk_size_t, nk_size_t, nk_i16_t *, nk_size_t *, nk_i16_t *,
|
|
421
|
+
nk_size_t *);
|
|
422
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
423
|
+
NK_PUBLIC void nk_reduce_minmax_u16_neon(nk_u16_t const *, nk_size_t, nk_size_t, nk_u16_t *, nk_size_t *, nk_u16_t *,
|
|
424
|
+
nk_size_t *);
|
|
425
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
426
|
+
NK_PUBLIC void nk_reduce_minmax_i32_neon(nk_i32_t const *, nk_size_t, nk_size_t, nk_i32_t *, nk_size_t *, nk_i32_t *,
|
|
427
|
+
nk_size_t *);
|
|
428
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
429
|
+
NK_PUBLIC void nk_reduce_minmax_u32_neon(nk_u32_t const *, nk_size_t, nk_size_t, nk_u32_t *, nk_size_t *, nk_u32_t *,
|
|
430
|
+
nk_size_t *);
|
|
431
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
432
|
+
NK_PUBLIC void nk_reduce_minmax_i64_neon(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_size_t *, nk_i64_t *,
|
|
433
|
+
nk_size_t *);
|
|
434
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
435
|
+
NK_PUBLIC void nk_reduce_minmax_u64_neon(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_size_t *, nk_u64_t *,
|
|
436
|
+
nk_size_t *);
|
|
437
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
438
|
+
NK_PUBLIC void nk_reduce_minmax_e2m3_neon(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_e2m3_t *, nk_size_t *,
|
|
439
|
+
nk_e2m3_t *, nk_size_t *);
|
|
440
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
441
|
+
NK_PUBLIC void nk_reduce_minmax_e3m2_neon(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_e3m2_t *, nk_size_t *,
|
|
442
|
+
nk_e3m2_t *, nk_size_t *);
|
|
443
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
444
|
+
NK_PUBLIC void nk_reduce_minmax_e4m3_neon(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *,
|
|
445
|
+
nk_e4m3_t *, nk_size_t *);
|
|
446
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
447
|
+
NK_PUBLIC void nk_reduce_minmax_e5m2_neon(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *,
|
|
448
|
+
nk_e5m2_t *, nk_size_t *);
|
|
449
|
+
#endif // NK_TARGET_NEON
|
|
450
|
+
|
|
451
|
+
#if NK_TARGET_NEONHALF
|
|
452
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
453
|
+
NK_PUBLIC void nk_reduce_moments_f16_neonhalf(nk_f16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
454
|
+
#endif // NK_TARGET_NEONHALF
|
|
455
|
+
|
|
456
|
+
#if NK_TARGET_NEONBFDOT
|
|
457
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
458
|
+
NK_PUBLIC void nk_reduce_moments_bf16_neonbfdot(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
459
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
460
|
+
NK_PUBLIC void nk_reduce_minmax_bf16_neonbfdot(nk_bf16_t const *, nk_size_t, nk_size_t, nk_bf16_t *, nk_size_t *,
|
|
461
|
+
nk_bf16_t *, nk_size_t *);
|
|
462
|
+
#endif // NK_TARGET_NEONBFDOT
|
|
463
|
+
|
|
464
|
+
#if NK_TARGET_NEONSDOT
|
|
465
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
466
|
+
NK_PUBLIC void nk_reduce_moments_i8_neonsdot(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
467
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
468
|
+
NK_PUBLIC void nk_reduce_moments_u8_neonsdot(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
469
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
470
|
+
NK_PUBLIC void nk_reduce_moments_e2m3_neonsdot(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
471
|
+
#endif // NK_TARGET_NEONSDOT
|
|
472
|
+
|
|
473
|
+
#if NK_TARGET_NEONFHM
|
|
474
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
475
|
+
NK_PUBLIC void nk_reduce_moments_e4m3_neonfhm(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
476
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
477
|
+
NK_PUBLIC void nk_reduce_moments_e5m2_neonfhm(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
478
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
479
|
+
NK_PUBLIC void nk_reduce_minmax_e4m3_neonfhm(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *,
|
|
480
|
+
nk_e4m3_t *, nk_size_t *);
|
|
481
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
482
|
+
NK_PUBLIC void nk_reduce_minmax_e5m2_neonfhm(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *,
|
|
483
|
+
nk_e5m2_t *, nk_size_t *);
|
|
484
|
+
#endif // NK_TARGET_NEONFHM
|
|
485
|
+
|
|
486
|
+
#if NK_TARGET_HASWELL
|
|
487
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
488
|
+
NK_PUBLIC void nk_reduce_moments_f32_haswell(nk_f32_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
|
|
489
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
490
|
+
NK_PUBLIC void nk_reduce_moments_f64_haswell(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
|
|
491
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
492
|
+
NK_PUBLIC void nk_reduce_moments_i8_haswell(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
493
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
494
|
+
NK_PUBLIC void nk_reduce_moments_u8_haswell(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
495
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
496
|
+
NK_PUBLIC void nk_reduce_moments_i16_haswell(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
497
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
498
|
+
NK_PUBLIC void nk_reduce_moments_u16_haswell(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
499
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
500
|
+
NK_PUBLIC void nk_reduce_moments_i32_haswell(nk_i32_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
501
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
502
|
+
NK_PUBLIC void nk_reduce_moments_u32_haswell(nk_u32_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
503
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
504
|
+
NK_PUBLIC void nk_reduce_moments_i64_haswell(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
505
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
506
|
+
NK_PUBLIC void nk_reduce_moments_u64_haswell(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
507
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
508
|
+
NK_PUBLIC void nk_reduce_moments_f16_haswell(nk_f16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
509
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
510
|
+
NK_PUBLIC void nk_reduce_moments_bf16_haswell(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
511
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
512
|
+
NK_PUBLIC void nk_reduce_moments_e4m3_haswell(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
513
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
514
|
+
NK_PUBLIC void nk_reduce_moments_e5m2_haswell(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
515
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
516
|
+
NK_PUBLIC void nk_reduce_moments_e2m3_haswell(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
517
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
518
|
+
NK_PUBLIC void nk_reduce_moments_e3m2_haswell(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
519
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
520
|
+
NK_PUBLIC void nk_reduce_moments_i4_haswell(nk_i4x2_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
521
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
522
|
+
NK_PUBLIC void nk_reduce_moments_u4_haswell(nk_u4x2_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
523
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
524
|
+
NK_PUBLIC void nk_reduce_moments_u1_haswell(nk_u1x8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
525
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
526
|
+
NK_PUBLIC void nk_reduce_minmax_f32_haswell(nk_f32_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_size_t *, nk_f32_t *,
|
|
527
|
+
nk_size_t *);
|
|
528
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
529
|
+
NK_PUBLIC void nk_reduce_minmax_f64_haswell(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_size_t *, nk_f64_t *,
|
|
530
|
+
nk_size_t *);
|
|
531
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
532
|
+
NK_PUBLIC void nk_reduce_minmax_i8_haswell(nk_i8_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
|
|
533
|
+
nk_size_t *);
|
|
534
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
535
|
+
NK_PUBLIC void nk_reduce_minmax_u8_haswell(nk_u8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
|
|
536
|
+
nk_size_t *);
|
|
537
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
538
|
+
NK_PUBLIC void nk_reduce_minmax_i16_haswell(nk_i16_t const *, nk_size_t, nk_size_t, nk_i16_t *, nk_size_t *, nk_i16_t *,
|
|
539
|
+
nk_size_t *);
|
|
540
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
541
|
+
NK_PUBLIC void nk_reduce_minmax_u16_haswell(nk_u16_t const *, nk_size_t, nk_size_t, nk_u16_t *, nk_size_t *, nk_u16_t *,
|
|
542
|
+
nk_size_t *);
|
|
543
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
544
|
+
NK_PUBLIC void nk_reduce_minmax_i32_haswell(nk_i32_t const *, nk_size_t, nk_size_t, nk_i32_t *, nk_size_t *, nk_i32_t *,
|
|
545
|
+
nk_size_t *);
|
|
546
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
547
|
+
NK_PUBLIC void nk_reduce_minmax_u32_haswell(nk_u32_t const *, nk_size_t, nk_size_t, nk_u32_t *, nk_size_t *, nk_u32_t *,
|
|
548
|
+
nk_size_t *);
|
|
549
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
550
|
+
NK_PUBLIC void nk_reduce_minmax_i64_haswell(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_size_t *, nk_i64_t *,
|
|
551
|
+
nk_size_t *);
|
|
552
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
553
|
+
NK_PUBLIC void nk_reduce_minmax_u64_haswell(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_size_t *, nk_u64_t *,
|
|
554
|
+
nk_size_t *);
|
|
555
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
556
|
+
NK_PUBLIC void nk_reduce_minmax_f16_haswell(nk_f16_t const *, nk_size_t, nk_size_t, nk_f16_t *, nk_size_t *, nk_f16_t *,
|
|
557
|
+
nk_size_t *);
|
|
558
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
559
|
+
NK_PUBLIC void nk_reduce_minmax_bf16_haswell(nk_bf16_t const *, nk_size_t, nk_size_t, nk_bf16_t *, nk_size_t *,
|
|
560
|
+
nk_bf16_t *, nk_size_t *);
|
|
561
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
562
|
+
NK_PUBLIC void nk_reduce_minmax_e4m3_haswell(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *,
|
|
563
|
+
nk_e4m3_t *, nk_size_t *);
|
|
564
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
565
|
+
NK_PUBLIC void nk_reduce_minmax_e5m2_haswell(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *,
|
|
566
|
+
nk_e5m2_t *, nk_size_t *);
|
|
567
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
568
|
+
NK_PUBLIC void nk_reduce_minmax_e2m3_haswell(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_e2m3_t *, nk_size_t *,
|
|
569
|
+
nk_e2m3_t *, nk_size_t *);
|
|
570
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
571
|
+
NK_PUBLIC void nk_reduce_minmax_e3m2_haswell(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_e3m2_t *, nk_size_t *,
|
|
572
|
+
nk_e3m2_t *, nk_size_t *);
|
|
573
|
+
#endif // NK_TARGET_HASWELL
|
|
574
|
+
|
|
575
|
+
#if NK_TARGET_SKYLAKE
|
|
576
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
577
|
+
NK_PUBLIC void nk_reduce_moments_f32_skylake(nk_f32_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
|
|
578
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
579
|
+
NK_PUBLIC void nk_reduce_moments_f64_skylake(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
|
|
580
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
581
|
+
NK_PUBLIC void nk_reduce_moments_i8_skylake(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
582
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
583
|
+
NK_PUBLIC void nk_reduce_moments_u8_skylake(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
584
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
585
|
+
NK_PUBLIC void nk_reduce_moments_i16_skylake(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
586
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
587
|
+
NK_PUBLIC void nk_reduce_moments_u16_skylake(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
588
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
589
|
+
NK_PUBLIC void nk_reduce_moments_i32_skylake(nk_i32_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
590
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
591
|
+
NK_PUBLIC void nk_reduce_moments_u32_skylake(nk_u32_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
592
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
593
|
+
NK_PUBLIC void nk_reduce_moments_i64_skylake(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
594
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
595
|
+
NK_PUBLIC void nk_reduce_moments_u64_skylake(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
596
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
597
|
+
NK_PUBLIC void nk_reduce_moments_f16_skylake(nk_f16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
598
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
599
|
+
NK_PUBLIC void nk_reduce_moments_bf16_skylake(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
600
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
601
|
+
NK_PUBLIC void nk_reduce_moments_e4m3_skylake(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
602
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
603
|
+
NK_PUBLIC void nk_reduce_moments_e5m2_skylake(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
604
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
605
|
+
NK_PUBLIC void nk_reduce_moments_e2m3_skylake(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
606
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
607
|
+
NK_PUBLIC void nk_reduce_moments_e3m2_skylake(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
608
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
609
|
+
NK_PUBLIC void nk_reduce_moments_i4_skylake(nk_i4x2_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
610
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
611
|
+
NK_PUBLIC void nk_reduce_moments_u4_skylake(nk_u4x2_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
612
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
613
|
+
NK_PUBLIC void nk_reduce_moments_u1_skylake(nk_u1x8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
614
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
615
|
+
NK_PUBLIC void nk_reduce_minmax_f32_skylake(nk_f32_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_size_t *, nk_f32_t *,
|
|
616
|
+
nk_size_t *);
|
|
617
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
618
|
+
NK_PUBLIC void nk_reduce_minmax_f64_skylake(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_size_t *, nk_f64_t *,
|
|
619
|
+
nk_size_t *);
|
|
620
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
621
|
+
NK_PUBLIC void nk_reduce_minmax_i8_skylake(nk_i8_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
|
|
622
|
+
nk_size_t *);
|
|
623
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
624
|
+
NK_PUBLIC void nk_reduce_minmax_u8_skylake(nk_u8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
|
|
625
|
+
nk_size_t *);
|
|
626
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
627
|
+
NK_PUBLIC void nk_reduce_minmax_i16_skylake(nk_i16_t const *, nk_size_t, nk_size_t, nk_i16_t *, nk_size_t *, nk_i16_t *,
|
|
628
|
+
nk_size_t *);
|
|
629
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
630
|
+
NK_PUBLIC void nk_reduce_minmax_u16_skylake(nk_u16_t const *, nk_size_t, nk_size_t, nk_u16_t *, nk_size_t *, nk_u16_t *,
|
|
631
|
+
nk_size_t *);
|
|
632
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
633
|
+
NK_PUBLIC void nk_reduce_minmax_i32_skylake(nk_i32_t const *, nk_size_t, nk_size_t, nk_i32_t *, nk_size_t *, nk_i32_t *,
|
|
634
|
+
nk_size_t *);
|
|
635
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
636
|
+
NK_PUBLIC void nk_reduce_minmax_u32_skylake(nk_u32_t const *, nk_size_t, nk_size_t, nk_u32_t *, nk_size_t *, nk_u32_t *,
|
|
637
|
+
nk_size_t *);
|
|
638
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
639
|
+
NK_PUBLIC void nk_reduce_minmax_i64_skylake(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_size_t *, nk_i64_t *,
|
|
640
|
+
nk_size_t *);
|
|
641
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
642
|
+
NK_PUBLIC void nk_reduce_minmax_u64_skylake(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_size_t *, nk_u64_t *,
|
|
643
|
+
nk_size_t *);
|
|
644
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
645
|
+
NK_PUBLIC void nk_reduce_minmax_f16_skylake(nk_f16_t const *, nk_size_t, nk_size_t, nk_f16_t *, nk_size_t *, nk_f16_t *,
|
|
646
|
+
nk_size_t *);
|
|
647
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
648
|
+
NK_PUBLIC void nk_reduce_minmax_bf16_skylake(nk_bf16_t const *, nk_size_t, nk_size_t, nk_bf16_t *, nk_size_t *,
|
|
649
|
+
nk_bf16_t *, nk_size_t *);
|
|
650
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
651
|
+
NK_PUBLIC void nk_reduce_minmax_e4m3_skylake(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *,
|
|
652
|
+
nk_e4m3_t *, nk_size_t *);
|
|
653
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
654
|
+
NK_PUBLIC void nk_reduce_minmax_e5m2_skylake(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *,
|
|
655
|
+
nk_e5m2_t *, nk_size_t *);
|
|
656
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
657
|
+
NK_PUBLIC void nk_reduce_minmax_e2m3_skylake(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_e2m3_t *, nk_size_t *,
|
|
658
|
+
nk_e2m3_t *, nk_size_t *);
|
|
659
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
660
|
+
NK_PUBLIC void nk_reduce_minmax_e3m2_skylake(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_e3m2_t *, nk_size_t *,
|
|
661
|
+
nk_e3m2_t *, nk_size_t *);
|
|
662
|
+
#endif // NK_TARGET_SKYLAKE
|
|
663
|
+
|
|
664
|
+
#if NK_TARGET_ICELAKE
|
|
665
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
666
|
+
NK_PUBLIC void nk_reduce_moments_i8_icelake(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
667
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
668
|
+
NK_PUBLIC void nk_reduce_moments_u8_icelake(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
669
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
670
|
+
NK_PUBLIC void nk_reduce_moments_i16_icelake(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
671
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
672
|
+
NK_PUBLIC void nk_reduce_moments_e2m3_icelake(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
673
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
674
|
+
NK_PUBLIC void nk_reduce_moments_e3m2_icelake(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
675
|
+
#endif // NK_TARGET_ICELAKE
|
|
676
|
+
|
|
677
|
+
#if NK_TARGET_GENOA
|
|
678
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
679
|
+
NK_PUBLIC void nk_reduce_moments_bf16_genoa(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
680
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
681
|
+
NK_PUBLIC void nk_reduce_moments_e4m3_genoa(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
682
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
683
|
+
NK_PUBLIC void nk_reduce_moments_e5m2_genoa(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
684
|
+
#endif // NK_TARGET_GENOA
|
|
685
|
+
|
|
686
|
+
#if NK_TARGET_ALDER
|
|
687
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
688
|
+
NK_PUBLIC void nk_reduce_moments_u8_alder(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
689
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
690
|
+
NK_PUBLIC void nk_reduce_moments_i16_alder(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
691
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
692
|
+
NK_PUBLIC void nk_reduce_moments_u16_alder(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
693
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
694
|
+
NK_PUBLIC void nk_reduce_moments_e3m2_alder(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
695
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
696
|
+
NK_PUBLIC void nk_reduce_moments_e2m3_alder(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
697
|
+
#endif // NK_TARGET_ALDER
|
|
698
|
+
#if NK_TARGET_SIERRA
|
|
699
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
700
|
+
NK_PUBLIC void nk_reduce_moments_i8_sierra(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
701
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
702
|
+
NK_PUBLIC void nk_reduce_moments_u8_sierra(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
703
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
704
|
+
NK_PUBLIC void nk_reduce_moments_e2m3_sierra(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
705
|
+
#endif // NK_TARGET_SIERRA
|
|
706
|
+
|
|
707
|
+
#if NK_TARGET_RVV
|
|
708
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
709
|
+
NK_PUBLIC void nk_reduce_moments_f32_rvv(nk_f32_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
|
|
710
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
711
|
+
NK_PUBLIC void nk_reduce_moments_f64_rvv(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
|
|
712
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
713
|
+
NK_PUBLIC void nk_reduce_moments_i8_rvv(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
714
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
715
|
+
NK_PUBLIC void nk_reduce_moments_u8_rvv(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
716
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
717
|
+
NK_PUBLIC void nk_reduce_moments_i16_rvv(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
718
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
719
|
+
NK_PUBLIC void nk_reduce_moments_u16_rvv(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
720
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
721
|
+
NK_PUBLIC void nk_reduce_moments_i32_rvv(nk_i32_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
722
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
723
|
+
NK_PUBLIC void nk_reduce_moments_u32_rvv(nk_u32_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
724
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
725
|
+
NK_PUBLIC void nk_reduce_moments_i64_rvv(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
726
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
727
|
+
NK_PUBLIC void nk_reduce_moments_u64_rvv(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
728
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
729
|
+
NK_PUBLIC void nk_reduce_moments_f16_rvv(nk_f16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
730
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
731
|
+
NK_PUBLIC void nk_reduce_moments_bf16_rvv(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
732
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
733
|
+
NK_PUBLIC void nk_reduce_moments_e4m3_rvv(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
734
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
735
|
+
NK_PUBLIC void nk_reduce_moments_e5m2_rvv(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
736
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
737
|
+
NK_PUBLIC void nk_reduce_moments_e2m3_rvv(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
738
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
739
|
+
NK_PUBLIC void nk_reduce_moments_e3m2_rvv(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
740
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
741
|
+
NK_PUBLIC void nk_reduce_minmax_f32_rvv(nk_f32_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_size_t *, nk_f32_t *,
|
|
742
|
+
nk_size_t *);
|
|
743
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
744
|
+
NK_PUBLIC void nk_reduce_minmax_f64_rvv(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_size_t *, nk_f64_t *,
|
|
745
|
+
nk_size_t *);
|
|
746
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
747
|
+
NK_PUBLIC void nk_reduce_minmax_i8_rvv(nk_i8_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
|
|
748
|
+
nk_size_t *);
|
|
749
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
750
|
+
NK_PUBLIC void nk_reduce_minmax_u8_rvv(nk_u8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
|
|
751
|
+
nk_size_t *);
|
|
752
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
753
|
+
NK_PUBLIC void nk_reduce_minmax_i16_rvv(nk_i16_t const *, nk_size_t, nk_size_t, nk_i16_t *, nk_size_t *, nk_i16_t *,
|
|
754
|
+
nk_size_t *);
|
|
755
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
756
|
+
NK_PUBLIC void nk_reduce_minmax_u16_rvv(nk_u16_t const *, nk_size_t, nk_size_t, nk_u16_t *, nk_size_t *, nk_u16_t *,
|
|
757
|
+
nk_size_t *);
|
|
758
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
759
|
+
NK_PUBLIC void nk_reduce_minmax_i32_rvv(nk_i32_t const *, nk_size_t, nk_size_t, nk_i32_t *, nk_size_t *, nk_i32_t *,
|
|
760
|
+
nk_size_t *);
|
|
761
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
762
|
+
NK_PUBLIC void nk_reduce_minmax_u32_rvv(nk_u32_t const *, nk_size_t, nk_size_t, nk_u32_t *, nk_size_t *, nk_u32_t *,
|
|
763
|
+
nk_size_t *);
|
|
764
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
765
|
+
NK_PUBLIC void nk_reduce_minmax_i64_rvv(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_size_t *, nk_i64_t *,
|
|
766
|
+
nk_size_t *);
|
|
767
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
768
|
+
NK_PUBLIC void nk_reduce_minmax_u64_rvv(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_size_t *, nk_u64_t *,
|
|
769
|
+
nk_size_t *);
|
|
770
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
771
|
+
NK_PUBLIC void nk_reduce_minmax_f16_rvv(nk_f16_t const *, nk_size_t, nk_size_t, nk_f16_t *, nk_size_t *, nk_f16_t *,
|
|
772
|
+
nk_size_t *);
|
|
773
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
774
|
+
NK_PUBLIC void nk_reduce_minmax_bf16_rvv(nk_bf16_t const *, nk_size_t, nk_size_t, nk_bf16_t *, nk_size_t *, nk_bf16_t *,
|
|
775
|
+
nk_size_t *);
|
|
776
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
777
|
+
NK_PUBLIC void nk_reduce_minmax_e4m3_rvv(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *, nk_e4m3_t *,
|
|
778
|
+
nk_size_t *);
|
|
779
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
780
|
+
NK_PUBLIC void nk_reduce_minmax_e5m2_rvv(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *, nk_e5m2_t *,
|
|
781
|
+
nk_size_t *);
|
|
782
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
783
|
+
NK_PUBLIC void nk_reduce_minmax_e2m3_rvv(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_e2m3_t *, nk_size_t *, nk_e2m3_t *,
|
|
784
|
+
nk_size_t *);
|
|
785
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
786
|
+
NK_PUBLIC void nk_reduce_minmax_e3m2_rvv(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_e3m2_t *, nk_size_t *, nk_e3m2_t *,
|
|
787
|
+
nk_size_t *);
|
|
788
|
+
#endif // NK_TARGET_RVV
|
|
789
|
+
|
|
790
|
+
#if NK_TARGET_V128RELAXED
|
|
791
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
792
|
+
NK_PUBLIC void nk_reduce_moments_f32_v128relaxed(nk_f32_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
|
|
793
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
794
|
+
NK_PUBLIC void nk_reduce_moments_f64_v128relaxed(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_f64_t *);
|
|
795
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
796
|
+
NK_PUBLIC void nk_reduce_moments_i8_v128relaxed(nk_i8_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
797
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
798
|
+
NK_PUBLIC void nk_reduce_moments_u8_v128relaxed(nk_u8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
799
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
800
|
+
NK_PUBLIC void nk_reduce_moments_i16_v128relaxed(nk_i16_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
801
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
802
|
+
NK_PUBLIC void nk_reduce_moments_u16_v128relaxed(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
803
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
804
|
+
NK_PUBLIC void nk_reduce_moments_i32_v128relaxed(nk_i32_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
805
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
806
|
+
NK_PUBLIC void nk_reduce_moments_u32_v128relaxed(nk_u32_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
807
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
808
|
+
NK_PUBLIC void nk_reduce_moments_i64_v128relaxed(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
|
|
809
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
810
|
+
NK_PUBLIC void nk_reduce_moments_u64_v128relaxed(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
|
|
811
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
812
|
+
NK_PUBLIC void nk_reduce_moments_f16_v128relaxed(nk_f16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
813
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
814
|
+
NK_PUBLIC void nk_reduce_moments_bf16_v128relaxed(nk_bf16_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
815
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
816
|
+
NK_PUBLIC void nk_reduce_moments_e4m3_v128relaxed(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
817
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
818
|
+
NK_PUBLIC void nk_reduce_moments_e5m2_v128relaxed(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
819
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
820
|
+
NK_PUBLIC void nk_reduce_moments_e2m3_v128relaxed(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
821
|
+
/** @copydoc nk_reduce_moments_f64 */
|
|
822
|
+
NK_PUBLIC void nk_reduce_moments_e3m2_v128relaxed(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_f32_t *);
|
|
823
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
824
|
+
NK_PUBLIC void nk_reduce_minmax_f32_v128relaxed(nk_f32_t const *, nk_size_t, nk_size_t, nk_f32_t *, nk_size_t *,
|
|
825
|
+
nk_f32_t *, nk_size_t *);
|
|
826
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
827
|
+
NK_PUBLIC void nk_reduce_minmax_f64_v128relaxed(nk_f64_t const *, nk_size_t, nk_size_t, nk_f64_t *, nk_size_t *,
|
|
828
|
+
nk_f64_t *, nk_size_t *);
|
|
829
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
830
|
+
NK_PUBLIC void nk_reduce_minmax_i8_v128relaxed(nk_i8_t const *, nk_size_t, nk_size_t, nk_i8_t *, nk_size_t *, nk_i8_t *,
|
|
831
|
+
nk_size_t *);
|
|
832
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
833
|
+
NK_PUBLIC void nk_reduce_minmax_u8_v128relaxed(nk_u8_t const *, nk_size_t, nk_size_t, nk_u8_t *, nk_size_t *, nk_u8_t *,
|
|
834
|
+
nk_size_t *);
|
|
835
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
836
|
+
NK_PUBLIC void nk_reduce_minmax_i16_v128relaxed(nk_i16_t const *, nk_size_t, nk_size_t, nk_i16_t *, nk_size_t *,
|
|
837
|
+
nk_i16_t *, nk_size_t *);
|
|
838
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
839
|
+
NK_PUBLIC void nk_reduce_minmax_u16_v128relaxed(nk_u16_t const *, nk_size_t, nk_size_t, nk_u16_t *, nk_size_t *,
|
|
840
|
+
nk_u16_t *, nk_size_t *);
|
|
841
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
842
|
+
NK_PUBLIC void nk_reduce_minmax_i32_v128relaxed(nk_i32_t const *, nk_size_t, nk_size_t, nk_i32_t *, nk_size_t *,
|
|
843
|
+
nk_i32_t *, nk_size_t *);
|
|
844
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
845
|
+
NK_PUBLIC void nk_reduce_minmax_u32_v128relaxed(nk_u32_t const *, nk_size_t, nk_size_t, nk_u32_t *, nk_size_t *,
|
|
846
|
+
nk_u32_t *, nk_size_t *);
|
|
847
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
848
|
+
NK_PUBLIC void nk_reduce_minmax_i64_v128relaxed(nk_i64_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_size_t *,
|
|
849
|
+
nk_i64_t *, nk_size_t *);
|
|
850
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
851
|
+
NK_PUBLIC void nk_reduce_minmax_u64_v128relaxed(nk_u64_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_size_t *,
|
|
852
|
+
nk_u64_t *, nk_size_t *);
|
|
853
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
854
|
+
NK_PUBLIC void nk_reduce_minmax_f16_v128relaxed(nk_f16_t const *, nk_size_t, nk_size_t, nk_f16_t *, nk_size_t *,
|
|
855
|
+
nk_f16_t *, nk_size_t *);
|
|
856
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
857
|
+
NK_PUBLIC void nk_reduce_minmax_bf16_v128relaxed(nk_bf16_t const *, nk_size_t, nk_size_t, nk_bf16_t *, nk_size_t *,
|
|
858
|
+
nk_bf16_t *, nk_size_t *);
|
|
859
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
860
|
+
NK_PUBLIC void nk_reduce_minmax_e4m3_v128relaxed(nk_e4m3_t const *, nk_size_t, nk_size_t, nk_e4m3_t *, nk_size_t *,
|
|
861
|
+
nk_e4m3_t *, nk_size_t *);
|
|
862
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
863
|
+
NK_PUBLIC void nk_reduce_minmax_e5m2_v128relaxed(nk_e5m2_t const *, nk_size_t, nk_size_t, nk_e5m2_t *, nk_size_t *,
|
|
864
|
+
nk_e5m2_t *, nk_size_t *);
|
|
865
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
866
|
+
NK_PUBLIC void nk_reduce_minmax_e2m3_v128relaxed(nk_e2m3_t const *, nk_size_t, nk_size_t, nk_e2m3_t *, nk_size_t *,
|
|
867
|
+
nk_e2m3_t *, nk_size_t *);
|
|
868
|
+
/** @copydoc nk_reduce_minmax_f64 */
|
|
869
|
+
NK_PUBLIC void nk_reduce_minmax_e3m2_v128relaxed(nk_e3m2_t const *, nk_size_t, nk_size_t, nk_e3m2_t *, nk_size_t *,
|
|
870
|
+
nk_e3m2_t *, nk_size_t *);
|
|
871
|
+
#endif // NK_TARGET_V128RELAXED
|
|
872
|
+
|
|
873
|
+
/**
|
|
874
|
+
* @brief Returns the accumulator dtype for the `sum` output of reduce_moments.
|
|
875
|
+
*
|
|
876
|
+
* Float types accumulate into wider floats; signed ints into i64; unsigned ints into u64.
|
|
877
|
+
*/
|
|
878
|
+
NK_INTERNAL nk_dtype_t nk_reduce_moments_sum_dtype(nk_dtype_t dtype) {
|
|
879
|
+
switch (dtype) {
|
|
880
|
+
case nk_f64_k: return nk_f64_k;
|
|
881
|
+
case nk_f32_k: return nk_f64_k;
|
|
882
|
+
case nk_f16_k: return nk_f32_k;
|
|
883
|
+
case nk_bf16_k: return nk_f32_k;
|
|
884
|
+
case nk_e4m3_k: return nk_f32_k;
|
|
885
|
+
case nk_e5m2_k: return nk_f32_k;
|
|
886
|
+
case nk_e2m3_k: return nk_f32_k;
|
|
887
|
+
case nk_e3m2_k: return nk_f32_k;
|
|
888
|
+
case nk_i8_k: return nk_i64_k;
|
|
889
|
+
case nk_i16_k: return nk_i64_k;
|
|
890
|
+
case nk_i32_k: return nk_i64_k;
|
|
891
|
+
case nk_i64_k: return nk_i64_k;
|
|
892
|
+
case nk_i4_k: return nk_i64_k;
|
|
893
|
+
case nk_u8_k: return nk_u64_k;
|
|
894
|
+
case nk_u16_k: return nk_u64_k;
|
|
895
|
+
case nk_u32_k: return nk_u64_k;
|
|
896
|
+
case nk_u64_k: return nk_u64_k;
|
|
897
|
+
case nk_u4_k: return nk_u64_k;
|
|
898
|
+
case nk_u1_k: return nk_u64_k;
|
|
899
|
+
default: return nk_dtype_unknown_k;
|
|
900
|
+
}
|
|
901
|
+
}
|
|
902
|
+
|
|
903
|
+
/**
|
|
904
|
+
* @brief Returns the accumulator dtype for the `sumsq` output of reduce_moments.
|
|
905
|
+
*
|
|
906
|
+
* Same as sum except all integers (signed and unsigned) accumulate into u64.
|
|
907
|
+
*/
|
|
908
|
+
NK_INTERNAL nk_dtype_t nk_reduce_moments_sumsq_dtype(nk_dtype_t dtype) {
|
|
909
|
+
switch (dtype) {
|
|
910
|
+
case nk_f64_k: return nk_f64_k;
|
|
911
|
+
case nk_f32_k: return nk_f64_k;
|
|
912
|
+
case nk_f16_k: return nk_f32_k;
|
|
913
|
+
case nk_bf16_k: return nk_f32_k;
|
|
914
|
+
case nk_e4m3_k: return nk_f32_k;
|
|
915
|
+
case nk_e5m2_k: return nk_f32_k;
|
|
916
|
+
case nk_e2m3_k: return nk_f32_k;
|
|
917
|
+
case nk_e3m2_k: return nk_f32_k;
|
|
918
|
+
case nk_i8_k: return nk_u64_k;
|
|
919
|
+
case nk_i16_k: return nk_u64_k;
|
|
920
|
+
case nk_i32_k: return nk_u64_k;
|
|
921
|
+
case nk_i64_k: return nk_u64_k;
|
|
922
|
+
case nk_i4_k: return nk_u64_k;
|
|
923
|
+
case nk_u8_k: return nk_u64_k;
|
|
924
|
+
case nk_u16_k: return nk_u64_k;
|
|
925
|
+
case nk_u32_k: return nk_u64_k;
|
|
926
|
+
case nk_u64_k: return nk_u64_k;
|
|
927
|
+
case nk_u4_k: return nk_u64_k;
|
|
928
|
+
case nk_u1_k: return nk_u64_k;
|
|
929
|
+
default: return nk_dtype_unknown_k;
|
|
930
|
+
}
|
|
931
|
+
}
|
|
932
|
+
|
|
933
|
+
/**
|
|
934
|
+
* @brief Returns the value dtype for reduce_minmax outputs.
|
|
935
|
+
*
|
|
936
|
+
* Standard types return themselves. Sub-byte types widen: i4->i8, u4->u8, u1->u8.
|
|
937
|
+
*/
|
|
938
|
+
NK_INTERNAL nk_dtype_t nk_reduce_minmax_value_dtype(nk_dtype_t dtype) {
|
|
939
|
+
switch (dtype) {
|
|
940
|
+
case nk_i4_k: return nk_i8_k;
|
|
941
|
+
case nk_u4_k: return nk_u8_k;
|
|
942
|
+
case nk_u1_k: return nk_u8_k;
|
|
943
|
+
default: return dtype;
|
|
944
|
+
}
|
|
945
|
+
}
|
|
946
|
+
|
|
947
|
+
#ifdef __cplusplus
|
|
948
|
+
} // extern "C"
|
|
949
|
+
#endif
|
|
950
|
+
|
|
951
|
+
#include "numkong/reduce/serial.h"
|
|
952
|
+
#include "numkong/reduce/neon.h"
|
|
953
|
+
#include "numkong/reduce/neonhalf.h"
|
|
954
|
+
#include "numkong/reduce/neonbfdot.h"
|
|
955
|
+
#include "numkong/reduce/neonsdot.h"
|
|
956
|
+
#include "numkong/reduce/neonfhm.h"
|
|
957
|
+
#include "numkong/reduce/haswell.h"
|
|
958
|
+
#include "numkong/reduce/skylake.h"
|
|
959
|
+
#include "numkong/reduce/icelake.h"
|
|
960
|
+
#include "numkong/reduce/genoa.h"
|
|
961
|
+
#include "numkong/reduce/alder.h"
|
|
962
|
+
#include "numkong/reduce/sierra.h"
|
|
963
|
+
#include "numkong/reduce/rvv.h"
|
|
964
|
+
#include "numkong/reduce/v128relaxed.h"
|
|
965
|
+
|
|
966
|
+
#ifdef __cplusplus
|
|
967
|
+
extern "C" {
|
|
968
|
+
#endif
|
|
969
|
+
|
|
970
|
+
#if !NK_DYNAMIC_DISPATCH
|
|
971
|
+
|
|
972
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_moments_f32`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_moments_f32(nk_f32_t const *d, nk_size_t n, nk_size_t s, nk_f64_t *sum, nk_f64_t *sumsq) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MOMENTS_F32_BACKEND_ nk_reduce_moments_f32_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MOMENTS_F32_BACKEND_ nk_reduce_moments_f32_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MOMENTS_F32_BACKEND_ nk_reduce_moments_f32_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MOMENTS_F32_BACKEND_ nk_reduce_moments_f32_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MOMENTS_F32_BACKEND_ nk_reduce_moments_f32_v128relaxed
#else
#define NK_REDUCE_MOMENTS_F32_BACKEND_ nk_reduce_moments_f32_serial
#endif
    NK_REDUCE_MOMENTS_F32_BACKEND_(d, n, s, sum, sumsq);
#undef NK_REDUCE_MOMENTS_F32_BACKEND_
}
|
|
987
|
+
|
|
988
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_minmax_f32`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_minmax_f32(nk_f32_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *mn, nk_size_t *mi,
                                    nk_f32_t *mx, nk_size_t *xi) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MINMAX_F32_BACKEND_ nk_reduce_minmax_f32_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MINMAX_F32_BACKEND_ nk_reduce_minmax_f32_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MINMAX_F32_BACKEND_ nk_reduce_minmax_f32_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MINMAX_F32_BACKEND_ nk_reduce_minmax_f32_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MINMAX_F32_BACKEND_ nk_reduce_minmax_f32_v128relaxed
#else
#define NK_REDUCE_MINMAX_F32_BACKEND_ nk_reduce_minmax_f32_serial
#endif
    NK_REDUCE_MINMAX_F32_BACKEND_(d, n, s, mn, mi, mx, xi);
#undef NK_REDUCE_MINMAX_F32_BACKEND_
}
|
|
1004
|
+
|
|
1005
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_moments_f64`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_moments_f64(nk_f64_t const *d, nk_size_t n, nk_size_t s, nk_f64_t *sum, nk_f64_t *sumsq) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MOMENTS_F64_BACKEND_ nk_reduce_moments_f64_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MOMENTS_F64_BACKEND_ nk_reduce_moments_f64_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MOMENTS_F64_BACKEND_ nk_reduce_moments_f64_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MOMENTS_F64_BACKEND_ nk_reduce_moments_f64_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MOMENTS_F64_BACKEND_ nk_reduce_moments_f64_v128relaxed
#else
#define NK_REDUCE_MOMENTS_F64_BACKEND_ nk_reduce_moments_f64_serial
#endif
    NK_REDUCE_MOMENTS_F64_BACKEND_(d, n, s, sum, sumsq);
#undef NK_REDUCE_MOMENTS_F64_BACKEND_
}
|
|
1020
|
+
|
|
1021
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_minmax_f64`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_minmax_f64(nk_f64_t const *d, nk_size_t n, nk_size_t s, nk_f64_t *mn, nk_size_t *mi,
                                    nk_f64_t *mx, nk_size_t *xi) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MINMAX_F64_BACKEND_ nk_reduce_minmax_f64_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MINMAX_F64_BACKEND_ nk_reduce_minmax_f64_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MINMAX_F64_BACKEND_ nk_reduce_minmax_f64_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MINMAX_F64_BACKEND_ nk_reduce_minmax_f64_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MINMAX_F64_BACKEND_ nk_reduce_minmax_f64_v128relaxed
#else
#define NK_REDUCE_MINMAX_F64_BACKEND_ nk_reduce_minmax_f64_serial
#endif
    NK_REDUCE_MINMAX_F64_BACKEND_(d, n, s, mn, mi, mx, xi);
#undef NK_REDUCE_MINMAX_F64_BACKEND_
}
|
|
1037
|
+
|
|
1038
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_moments_i8`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Ice Lake, Skylake, Sierra, Haswell, NEON SDOT, NEON, RVV, V128 relaxed;
 * otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_moments_i8(nk_i8_t const *d, nk_size_t n, nk_size_t s, nk_i64_t *sum, nk_u64_t *sumsq) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_ICELAKE
#define NK_REDUCE_MOMENTS_I8_BACKEND_ nk_reduce_moments_i8_icelake
#elif NK_TARGET_SKYLAKE
#define NK_REDUCE_MOMENTS_I8_BACKEND_ nk_reduce_moments_i8_skylake
#elif NK_TARGET_SIERRA
#define NK_REDUCE_MOMENTS_I8_BACKEND_ nk_reduce_moments_i8_sierra
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MOMENTS_I8_BACKEND_ nk_reduce_moments_i8_haswell
#elif NK_TARGET_NEONSDOT
#define NK_REDUCE_MOMENTS_I8_BACKEND_ nk_reduce_moments_i8_neonsdot
#elif NK_TARGET_NEON
#define NK_REDUCE_MOMENTS_I8_BACKEND_ nk_reduce_moments_i8_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MOMENTS_I8_BACKEND_ nk_reduce_moments_i8_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MOMENTS_I8_BACKEND_ nk_reduce_moments_i8_v128relaxed
#else
#define NK_REDUCE_MOMENTS_I8_BACKEND_ nk_reduce_moments_i8_serial
#endif
    NK_REDUCE_MOMENTS_I8_BACKEND_(d, n, s, sum, sumsq);
#undef NK_REDUCE_MOMENTS_I8_BACKEND_
}
|
|
1059
|
+
|
|
1060
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_minmax_i8`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_minmax_i8(nk_i8_t const *d, nk_size_t n, nk_size_t s, nk_i8_t *mn, nk_size_t *mi,
                                   nk_i8_t *mx, nk_size_t *xi) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MINMAX_I8_BACKEND_ nk_reduce_minmax_i8_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MINMAX_I8_BACKEND_ nk_reduce_minmax_i8_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MINMAX_I8_BACKEND_ nk_reduce_minmax_i8_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MINMAX_I8_BACKEND_ nk_reduce_minmax_i8_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MINMAX_I8_BACKEND_ nk_reduce_minmax_i8_v128relaxed
#else
#define NK_REDUCE_MINMAX_I8_BACKEND_ nk_reduce_minmax_i8_serial
#endif
    NK_REDUCE_MINMAX_I8_BACKEND_(d, n, s, mn, mi, mx, xi);
#undef NK_REDUCE_MINMAX_I8_BACKEND_
}
|
|
1076
|
+
|
|
1077
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_moments_u8`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Ice Lake, Skylake, Sierra, Alder, Haswell, NEON SDOT, NEON, RVV, V128 relaxed;
 * otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_moments_u8(nk_u8_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *sum, nk_u64_t *sumsq) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_ICELAKE
#define NK_REDUCE_MOMENTS_U8_BACKEND_ nk_reduce_moments_u8_icelake
#elif NK_TARGET_SKYLAKE
#define NK_REDUCE_MOMENTS_U8_BACKEND_ nk_reduce_moments_u8_skylake
#elif NK_TARGET_SIERRA
#define NK_REDUCE_MOMENTS_U8_BACKEND_ nk_reduce_moments_u8_sierra
#elif NK_TARGET_ALDER
#define NK_REDUCE_MOMENTS_U8_BACKEND_ nk_reduce_moments_u8_alder
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MOMENTS_U8_BACKEND_ nk_reduce_moments_u8_haswell
#elif NK_TARGET_NEONSDOT
#define NK_REDUCE_MOMENTS_U8_BACKEND_ nk_reduce_moments_u8_neonsdot
#elif NK_TARGET_NEON
#define NK_REDUCE_MOMENTS_U8_BACKEND_ nk_reduce_moments_u8_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MOMENTS_U8_BACKEND_ nk_reduce_moments_u8_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MOMENTS_U8_BACKEND_ nk_reduce_moments_u8_v128relaxed
#else
#define NK_REDUCE_MOMENTS_U8_BACKEND_ nk_reduce_moments_u8_serial
#endif
    NK_REDUCE_MOMENTS_U8_BACKEND_(d, n, s, sum, sumsq);
#undef NK_REDUCE_MOMENTS_U8_BACKEND_
}
|
|
1100
|
+
|
|
1101
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_minmax_u8`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_minmax_u8(nk_u8_t const *d, nk_size_t n, nk_size_t s, nk_u8_t *mn, nk_size_t *mi,
                                   nk_u8_t *mx, nk_size_t *xi) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MINMAX_U8_BACKEND_ nk_reduce_minmax_u8_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MINMAX_U8_BACKEND_ nk_reduce_minmax_u8_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MINMAX_U8_BACKEND_ nk_reduce_minmax_u8_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MINMAX_U8_BACKEND_ nk_reduce_minmax_u8_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MINMAX_U8_BACKEND_ nk_reduce_minmax_u8_v128relaxed
#else
#define NK_REDUCE_MINMAX_U8_BACKEND_ nk_reduce_minmax_u8_serial
#endif
    NK_REDUCE_MINMAX_U8_BACKEND_(d, n, s, mn, mi, mx, xi);
#undef NK_REDUCE_MINMAX_U8_BACKEND_
}
|
|
1117
|
+
|
|
1118
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_moments_i16`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Ice Lake, Skylake, Alder, Haswell, NEON, RVV, V128 relaxed; otherwise the
 * serial implementation.
 */
NK_PUBLIC void nk_reduce_moments_i16(nk_i16_t const *d, nk_size_t n, nk_size_t s, nk_i64_t *sum, nk_u64_t *sumsq) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_ICELAKE
#define NK_REDUCE_MOMENTS_I16_BACKEND_ nk_reduce_moments_i16_icelake
#elif NK_TARGET_SKYLAKE
#define NK_REDUCE_MOMENTS_I16_BACKEND_ nk_reduce_moments_i16_skylake
#elif NK_TARGET_ALDER
#define NK_REDUCE_MOMENTS_I16_BACKEND_ nk_reduce_moments_i16_alder
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MOMENTS_I16_BACKEND_ nk_reduce_moments_i16_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MOMENTS_I16_BACKEND_ nk_reduce_moments_i16_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MOMENTS_I16_BACKEND_ nk_reduce_moments_i16_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MOMENTS_I16_BACKEND_ nk_reduce_moments_i16_v128relaxed
#else
#define NK_REDUCE_MOMENTS_I16_BACKEND_ nk_reduce_moments_i16_serial
#endif
    NK_REDUCE_MOMENTS_I16_BACKEND_(d, n, s, sum, sumsq);
#undef NK_REDUCE_MOMENTS_I16_BACKEND_
}
|
|
1137
|
+
|
|
1138
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_minmax_i16`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_minmax_i16(nk_i16_t const *d, nk_size_t n, nk_size_t s, nk_i16_t *mn, nk_size_t *mi,
                                    nk_i16_t *mx, nk_size_t *xi) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MINMAX_I16_BACKEND_ nk_reduce_minmax_i16_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MINMAX_I16_BACKEND_ nk_reduce_minmax_i16_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MINMAX_I16_BACKEND_ nk_reduce_minmax_i16_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MINMAX_I16_BACKEND_ nk_reduce_minmax_i16_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MINMAX_I16_BACKEND_ nk_reduce_minmax_i16_v128relaxed
#else
#define NK_REDUCE_MINMAX_I16_BACKEND_ nk_reduce_minmax_i16_serial
#endif
    NK_REDUCE_MINMAX_I16_BACKEND_(d, n, s, mn, mi, mx, xi);
#undef NK_REDUCE_MINMAX_I16_BACKEND_
}
|
|
1154
|
+
|
|
1155
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_moments_u16`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Alder, Haswell, NEON, RVV, V128 relaxed; otherwise the serial
 * implementation.
 */
NK_PUBLIC void nk_reduce_moments_u16(nk_u16_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *sum, nk_u64_t *sumsq) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MOMENTS_U16_BACKEND_ nk_reduce_moments_u16_skylake
#elif NK_TARGET_ALDER
#define NK_REDUCE_MOMENTS_U16_BACKEND_ nk_reduce_moments_u16_alder
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MOMENTS_U16_BACKEND_ nk_reduce_moments_u16_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MOMENTS_U16_BACKEND_ nk_reduce_moments_u16_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MOMENTS_U16_BACKEND_ nk_reduce_moments_u16_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MOMENTS_U16_BACKEND_ nk_reduce_moments_u16_v128relaxed
#else
#define NK_REDUCE_MOMENTS_U16_BACKEND_ nk_reduce_moments_u16_serial
#endif
    NK_REDUCE_MOMENTS_U16_BACKEND_(d, n, s, sum, sumsq);
#undef NK_REDUCE_MOMENTS_U16_BACKEND_
}
|
|
1172
|
+
|
|
1173
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_minmax_u16`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_minmax_u16(nk_u16_t const *d, nk_size_t n, nk_size_t s, nk_u16_t *mn, nk_size_t *mi,
                                    nk_u16_t *mx, nk_size_t *xi) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MINMAX_U16_BACKEND_ nk_reduce_minmax_u16_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MINMAX_U16_BACKEND_ nk_reduce_minmax_u16_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MINMAX_U16_BACKEND_ nk_reduce_minmax_u16_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MINMAX_U16_BACKEND_ nk_reduce_minmax_u16_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MINMAX_U16_BACKEND_ nk_reduce_minmax_u16_v128relaxed
#else
#define NK_REDUCE_MINMAX_U16_BACKEND_ nk_reduce_minmax_u16_serial
#endif
    NK_REDUCE_MINMAX_U16_BACKEND_(d, n, s, mn, mi, mx, xi);
#undef NK_REDUCE_MINMAX_U16_BACKEND_
}
|
|
1189
|
+
|
|
1190
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_moments_i32`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_moments_i32(nk_i32_t const *d, nk_size_t n, nk_size_t s, nk_i64_t *sum, nk_u64_t *sumsq) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MOMENTS_I32_BACKEND_ nk_reduce_moments_i32_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MOMENTS_I32_BACKEND_ nk_reduce_moments_i32_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MOMENTS_I32_BACKEND_ nk_reduce_moments_i32_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MOMENTS_I32_BACKEND_ nk_reduce_moments_i32_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MOMENTS_I32_BACKEND_ nk_reduce_moments_i32_v128relaxed
#else
#define NK_REDUCE_MOMENTS_I32_BACKEND_ nk_reduce_moments_i32_serial
#endif
    NK_REDUCE_MOMENTS_I32_BACKEND_(d, n, s, sum, sumsq);
#undef NK_REDUCE_MOMENTS_I32_BACKEND_
}
|
|
1205
|
+
|
|
1206
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_minmax_i32`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_minmax_i32(nk_i32_t const *d, nk_size_t n, nk_size_t s, nk_i32_t *mn, nk_size_t *mi,
                                    nk_i32_t *mx, nk_size_t *xi) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MINMAX_I32_BACKEND_ nk_reduce_minmax_i32_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MINMAX_I32_BACKEND_ nk_reduce_minmax_i32_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MINMAX_I32_BACKEND_ nk_reduce_minmax_i32_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MINMAX_I32_BACKEND_ nk_reduce_minmax_i32_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MINMAX_I32_BACKEND_ nk_reduce_minmax_i32_v128relaxed
#else
#define NK_REDUCE_MINMAX_I32_BACKEND_ nk_reduce_minmax_i32_serial
#endif
    NK_REDUCE_MINMAX_I32_BACKEND_(d, n, s, mn, mi, mx, xi);
#undef NK_REDUCE_MINMAX_I32_BACKEND_
}
|
|
1222
|
+
|
|
1223
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_moments_u32`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_moments_u32(nk_u32_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *sum, nk_u64_t *sumsq) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MOMENTS_U32_BACKEND_ nk_reduce_moments_u32_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MOMENTS_U32_BACKEND_ nk_reduce_moments_u32_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MOMENTS_U32_BACKEND_ nk_reduce_moments_u32_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MOMENTS_U32_BACKEND_ nk_reduce_moments_u32_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MOMENTS_U32_BACKEND_ nk_reduce_moments_u32_v128relaxed
#else
#define NK_REDUCE_MOMENTS_U32_BACKEND_ nk_reduce_moments_u32_serial
#endif
    NK_REDUCE_MOMENTS_U32_BACKEND_(d, n, s, sum, sumsq);
#undef NK_REDUCE_MOMENTS_U32_BACKEND_
}
|
|
1238
|
+
|
|
1239
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_minmax_u32`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_minmax_u32(nk_u32_t const *d, nk_size_t n, nk_size_t s, nk_u32_t *mn, nk_size_t *mi,
                                    nk_u32_t *mx, nk_size_t *xi) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MINMAX_U32_BACKEND_ nk_reduce_minmax_u32_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MINMAX_U32_BACKEND_ nk_reduce_minmax_u32_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MINMAX_U32_BACKEND_ nk_reduce_minmax_u32_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MINMAX_U32_BACKEND_ nk_reduce_minmax_u32_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MINMAX_U32_BACKEND_ nk_reduce_minmax_u32_v128relaxed
#else
#define NK_REDUCE_MINMAX_U32_BACKEND_ nk_reduce_minmax_u32_serial
#endif
    NK_REDUCE_MINMAX_U32_BACKEND_(d, n, s, mn, mi, mx, xi);
#undef NK_REDUCE_MINMAX_U32_BACKEND_
}
|
|
1255
|
+
|
|
1256
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_moments_i64`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_moments_i64(nk_i64_t const *d, nk_size_t n, nk_size_t s, nk_i64_t *sum, nk_u64_t *sumsq) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MOMENTS_I64_BACKEND_ nk_reduce_moments_i64_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MOMENTS_I64_BACKEND_ nk_reduce_moments_i64_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MOMENTS_I64_BACKEND_ nk_reduce_moments_i64_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MOMENTS_I64_BACKEND_ nk_reduce_moments_i64_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MOMENTS_I64_BACKEND_ nk_reduce_moments_i64_v128relaxed
#else
#define NK_REDUCE_MOMENTS_I64_BACKEND_ nk_reduce_moments_i64_serial
#endif
    NK_REDUCE_MOMENTS_I64_BACKEND_(d, n, s, sum, sumsq);
#undef NK_REDUCE_MOMENTS_I64_BACKEND_
}
|
|
1271
|
+
|
|
1272
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_minmax_i64`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_minmax_i64(nk_i64_t const *d, nk_size_t n, nk_size_t s, nk_i64_t *mn, nk_size_t *mi,
                                    nk_i64_t *mx, nk_size_t *xi) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MINMAX_I64_BACKEND_ nk_reduce_minmax_i64_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MINMAX_I64_BACKEND_ nk_reduce_minmax_i64_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MINMAX_I64_BACKEND_ nk_reduce_minmax_i64_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MINMAX_I64_BACKEND_ nk_reduce_minmax_i64_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MINMAX_I64_BACKEND_ nk_reduce_minmax_i64_v128relaxed
#else
#define NK_REDUCE_MINMAX_I64_BACKEND_ nk_reduce_minmax_i64_serial
#endif
    NK_REDUCE_MINMAX_I64_BACKEND_(d, n, s, mn, mi, mx, xi);
#undef NK_REDUCE_MINMAX_I64_BACKEND_
}
|
|
1288
|
+
|
|
1289
|
+
/**
 * @brief Compile-time dispatcher for `nk_reduce_moments_u64`.
 *
 * Forwards every argument unchanged to the first backend enabled at build time, checked in
 * this order: Skylake, Haswell, NEON, RVV, V128 relaxed; otherwise the serial implementation.
 */
NK_PUBLIC void nk_reduce_moments_u64(nk_u64_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *sum, nk_u64_t *sumsq) {
    // Pick the backend symbol first, then issue a single call.
#if NK_TARGET_SKYLAKE
#define NK_REDUCE_MOMENTS_U64_BACKEND_ nk_reduce_moments_u64_skylake
#elif NK_TARGET_HASWELL
#define NK_REDUCE_MOMENTS_U64_BACKEND_ nk_reduce_moments_u64_haswell
#elif NK_TARGET_NEON
#define NK_REDUCE_MOMENTS_U64_BACKEND_ nk_reduce_moments_u64_neon
#elif NK_TARGET_RVV
#define NK_REDUCE_MOMENTS_U64_BACKEND_ nk_reduce_moments_u64_rvv
#elif NK_TARGET_V128RELAXED
#define NK_REDUCE_MOMENTS_U64_BACKEND_ nk_reduce_moments_u64_v128relaxed
#else
#define NK_REDUCE_MOMENTS_U64_BACKEND_ nk_reduce_moments_u64_serial
#endif
    NK_REDUCE_MOMENTS_U64_BACKEND_(d, n, s, sum, sumsq);
#undef NK_REDUCE_MOMENTS_U64_BACKEND_
}
|
|
1304
|
+
|
|
1305
|
+
/**
 *  @brief Compile-time dispatcher for the `u64` min/max reduction.
 *
 *  Exactly one kernel call survives preprocessing. Targets are tried in this order:
 *  Skylake, Haswell, NEON, RVV, V128 relaxed, and the serial kernel is the fallback.
 *  All arguments are forwarded to the chosen kernel unchanged.
 *
 *  @param d   Read-only input pointer, forwarded as-is.
 *  @param n   Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s   Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param mn  Non-const pointer handed to the kernel (presumably receives the minimum).
 *  @param mi  Non-const pointer handed to the kernel (presumably receives the minimum's index).
 *  @param mx  Non-const pointer handed to the kernel (presumably receives the maximum).
 *  @param xi  Non-const pointer handed to the kernel (presumably receives the maximum's index).
 */
NK_PUBLIC void nk_reduce_minmax_u64(nk_u64_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *mn, nk_size_t *mi,
                                    nk_u64_t *mx, nk_size_t *xi) {
#if NK_TARGET_SKYLAKE
    nk_reduce_minmax_u64_skylake(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_HASWELL
    nk_reduce_minmax_u64_haswell(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_NEON
    nk_reduce_minmax_u64_neon(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_RVV
    nk_reduce_minmax_u64_rvv(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_V128RELAXED
    nk_reduce_minmax_u64_v128relaxed(d, n, s, mn, mi, mx, xi);
#else
    nk_reduce_minmax_u64_serial(d, n, s, mn, mi, mx, xi); // portable fallback
#endif // target selection
}
|
|
1321
|
+
|
|
1322
|
+
/**
 *  @brief Compile-time dispatcher for the `f16` moments reduction.
 *
 *  The preprocessor keeps exactly one kernel call, choosing the first enabled target in
 *  this order: Skylake, Haswell, NEON half-precision, RVV, V128 relaxed. When none is
 *  enabled the serial kernel is used. The results are written through `nk_f32_t`
 *  pointers, so they are wider than the `f16` input type.
 *
 *  @param d      Read-only input pointer, forwarded as-is.
 *  @param n      Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s      Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param sum    Non-const pointer handed to the kernel (presumably receives the sum).
 *  @param sumsq  Non-const pointer handed to the kernel (presumably receives the sum of squares).
 */
NK_PUBLIC void nk_reduce_moments_f16(nk_f16_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *sum, nk_f32_t *sumsq) {
#if NK_TARGET_SKYLAKE
    nk_reduce_moments_f16_skylake(d, n, s, sum, sumsq);
#elif NK_TARGET_HASWELL
    nk_reduce_moments_f16_haswell(d, n, s, sum, sumsq);
#elif NK_TARGET_NEONHALF
    nk_reduce_moments_f16_neonhalf(d, n, s, sum, sumsq);
#elif NK_TARGET_RVV
    nk_reduce_moments_f16_rvv(d, n, s, sum, sumsq);
#elif NK_TARGET_V128RELAXED
    nk_reduce_moments_f16_v128relaxed(d, n, s, sum, sumsq);
#else
    nk_reduce_moments_f16_serial(d, n, s, sum, sumsq); // portable fallback
#endif // target selection
}
|
|
1337
|
+
|
|
1338
|
+
/**
 *  @brief Compile-time dispatcher for the `f16` min/max reduction.
 *
 *  Exactly one kernel call survives preprocessing. Targets are tried in this order:
 *  Skylake, Haswell, NEON half-precision, RVV, V128 relaxed, and the serial kernel is
 *  the fallback. The NEON half-precision branch also calls the serial kernel.
 *
 *  @param d   Read-only input pointer, forwarded as-is.
 *  @param n   Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s   Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param mn  Non-const pointer handed to the kernel (presumably receives the minimum).
 *  @param mi  Non-const pointer handed to the kernel (presumably receives the minimum's index).
 *  @param mx  Non-const pointer handed to the kernel (presumably receives the maximum).
 *  @param xi  Non-const pointer handed to the kernel (presumably receives the maximum's index).
 */
NK_PUBLIC void nk_reduce_minmax_f16(nk_f16_t const *d, nk_size_t n, nk_size_t s, nk_f16_t *mn, nk_size_t *mi,
                                    nk_f16_t *mx, nk_size_t *xi) {
#if NK_TARGET_SKYLAKE
    nk_reduce_minmax_f16_skylake(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_HASWELL
    nk_reduce_minmax_f16_haswell(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_NEONHALF
    // NOTE(review): this branch routes to the serial kernel rather than a `_neonhalf` one;
    // confirm whether that is deliberate or a placeholder.
    nk_reduce_minmax_f16_serial(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_RVV
    nk_reduce_minmax_f16_rvv(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_V128RELAXED
    nk_reduce_minmax_f16_v128relaxed(d, n, s, mn, mi, mx, xi);
#else
    nk_reduce_minmax_f16_serial(d, n, s, mn, mi, mx, xi); // portable fallback
#endif // target selection
}
|
|
1354
|
+
|
|
1355
|
+
/**
 *  @brief Compile-time dispatcher for the `bf16` moments reduction.
 *
 *  The preprocessor keeps exactly one kernel call, choosing the first enabled target in
 *  this order: Genoa, Skylake, Haswell, NEON BF16-dot, RVV, V128 relaxed. When none is
 *  enabled the serial kernel is used. The results are written through `nk_f32_t` pointers.
 *
 *  @param d      Read-only input pointer, forwarded as-is.
 *  @param n      Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s      Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param sum    Non-const pointer handed to the kernel (presumably receives the sum).
 *  @param sumsq  Non-const pointer handed to the kernel (presumably receives the sum of squares).
 */
NK_PUBLIC void nk_reduce_moments_bf16(nk_bf16_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *sum, nk_f32_t *sumsq) {
#if NK_TARGET_GENOA
    nk_reduce_moments_bf16_genoa(d, n, s, sum, sumsq);
#elif NK_TARGET_SKYLAKE
    nk_reduce_moments_bf16_skylake(d, n, s, sum, sumsq);
#elif NK_TARGET_HASWELL
    nk_reduce_moments_bf16_haswell(d, n, s, sum, sumsq);
#elif NK_TARGET_NEONBFDOT
    nk_reduce_moments_bf16_neonbfdot(d, n, s, sum, sumsq);
#elif NK_TARGET_RVV
    nk_reduce_moments_bf16_rvv(d, n, s, sum, sumsq);
#elif NK_TARGET_V128RELAXED
    nk_reduce_moments_bf16_v128relaxed(d, n, s, sum, sumsq);
#else
    nk_reduce_moments_bf16_serial(d, n, s, sum, sumsq); // portable fallback
#endif // target selection
}
|
|
1372
|
+
|
|
1373
|
+
/**
 *  @brief Compile-time dispatcher for the `bf16` min/max reduction.
 *
 *  Exactly one kernel call survives preprocessing. Targets are tried in this order:
 *  Skylake, Haswell, NEON BF16-dot, RVV, V128 relaxed, and the serial kernel is the
 *  fallback. All arguments are forwarded to the chosen kernel unchanged.
 *
 *  @param d   Read-only input pointer, forwarded as-is.
 *  @param n   Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s   Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param mn  Non-const pointer handed to the kernel (presumably receives the minimum).
 *  @param mi  Non-const pointer handed to the kernel (presumably receives the minimum's index).
 *  @param mx  Non-const pointer handed to the kernel (presumably receives the maximum).
 *  @param xi  Non-const pointer handed to the kernel (presumably receives the maximum's index).
 */
NK_PUBLIC void nk_reduce_minmax_bf16(nk_bf16_t const *d, nk_size_t n, nk_size_t s, nk_bf16_t *mn, nk_size_t *mi,
                                     nk_bf16_t *mx, nk_size_t *xi) {
#if NK_TARGET_SKYLAKE
    nk_reduce_minmax_bf16_skylake(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_HASWELL
    nk_reduce_minmax_bf16_haswell(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_NEONBFDOT
    nk_reduce_minmax_bf16_neonbfdot(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_RVV
    nk_reduce_minmax_bf16_rvv(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_V128RELAXED
    nk_reduce_minmax_bf16_v128relaxed(d, n, s, mn, mi, mx, xi);
#else
    nk_reduce_minmax_bf16_serial(d, n, s, mn, mi, mx, xi); // portable fallback
#endif // target selection
}
|
|
1389
|
+
|
|
1390
|
+
/**
 *  @brief Compile-time dispatcher for the `e4m3` moments reduction.
 *
 *  The preprocessor keeps exactly one kernel call, choosing the first enabled target in
 *  this order: Genoa, Skylake, Haswell, NEON FHM, NEON, RVV, V128 relaxed. When none is
 *  enabled the serial kernel is used. The results are written through `nk_f32_t` pointers.
 *
 *  @param d      Read-only input pointer, forwarded as-is.
 *  @param n      Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s      Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param sum    Non-const pointer handed to the kernel (presumably receives the sum).
 *  @param sumsq  Non-const pointer handed to the kernel (presumably receives the sum of squares).
 */
NK_PUBLIC void nk_reduce_moments_e4m3(nk_e4m3_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *sum, nk_f32_t *sumsq) {
#if NK_TARGET_GENOA
    nk_reduce_moments_e4m3_genoa(d, n, s, sum, sumsq);
#elif NK_TARGET_SKYLAKE
    nk_reduce_moments_e4m3_skylake(d, n, s, sum, sumsq);
#elif NK_TARGET_HASWELL
    nk_reduce_moments_e4m3_haswell(d, n, s, sum, sumsq);
#elif NK_TARGET_NEONFHM
    nk_reduce_moments_e4m3_neonfhm(d, n, s, sum, sumsq);
#elif NK_TARGET_NEON
    nk_reduce_moments_e4m3_neon(d, n, s, sum, sumsq);
#elif NK_TARGET_RVV
    nk_reduce_moments_e4m3_rvv(d, n, s, sum, sumsq);
#elif NK_TARGET_V128RELAXED
    nk_reduce_moments_e4m3_v128relaxed(d, n, s, sum, sumsq);
#else
    nk_reduce_moments_e4m3_serial(d, n, s, sum, sumsq); // portable fallback
#endif // target selection
}
|
|
1409
|
+
|
|
1410
|
+
/**
 *  @brief Compile-time dispatcher for the `e4m3` min/max reduction.
 *
 *  Exactly one kernel call survives preprocessing. Targets are tried in this order:
 *  Skylake, Haswell, NEON FHM, NEON, RVV, V128 relaxed, and the serial kernel is the
 *  fallback. All arguments are forwarded to the chosen kernel unchanged.
 *
 *  @param d   Read-only input pointer, forwarded as-is.
 *  @param n   Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s   Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param mn  Non-const pointer handed to the kernel (presumably receives the minimum).
 *  @param mi  Non-const pointer handed to the kernel (presumably receives the minimum's index).
 *  @param mx  Non-const pointer handed to the kernel (presumably receives the maximum).
 *  @param xi  Non-const pointer handed to the kernel (presumably receives the maximum's index).
 */
NK_PUBLIC void nk_reduce_minmax_e4m3(nk_e4m3_t const *d, nk_size_t n, nk_size_t s, nk_e4m3_t *mn, nk_size_t *mi,
                                     nk_e4m3_t *mx, nk_size_t *xi) {
#if NK_TARGET_SKYLAKE
    nk_reduce_minmax_e4m3_skylake(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_HASWELL
    nk_reduce_minmax_e4m3_haswell(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_NEONFHM
    nk_reduce_minmax_e4m3_neonfhm(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_NEON
    nk_reduce_minmax_e4m3_neon(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_RVV
    nk_reduce_minmax_e4m3_rvv(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_V128RELAXED
    nk_reduce_minmax_e4m3_v128relaxed(d, n, s, mn, mi, mx, xi);
#else
    nk_reduce_minmax_e4m3_serial(d, n, s, mn, mi, mx, xi); // portable fallback
#endif // target selection
}
|
|
1428
|
+
|
|
1429
|
+
/**
 *  @brief Compile-time dispatcher for the `e5m2` moments reduction.
 *
 *  The preprocessor keeps exactly one kernel call, choosing the first enabled target in
 *  this order: Genoa, Skylake, Haswell, NEON FHM, NEON, RVV, V128 relaxed. When none is
 *  enabled the serial kernel is used. The results are written through `nk_f32_t` pointers.
 *
 *  @param d      Read-only input pointer, forwarded as-is.
 *  @param n      Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s      Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param sum    Non-const pointer handed to the kernel (presumably receives the sum).
 *  @param sumsq  Non-const pointer handed to the kernel (presumably receives the sum of squares).
 */
NK_PUBLIC void nk_reduce_moments_e5m2(nk_e5m2_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *sum, nk_f32_t *sumsq) {
#if NK_TARGET_GENOA
    nk_reduce_moments_e5m2_genoa(d, n, s, sum, sumsq);
#elif NK_TARGET_SKYLAKE
    nk_reduce_moments_e5m2_skylake(d, n, s, sum, sumsq);
#elif NK_TARGET_HASWELL
    nk_reduce_moments_e5m2_haswell(d, n, s, sum, sumsq);
#elif NK_TARGET_NEONFHM
    nk_reduce_moments_e5m2_neonfhm(d, n, s, sum, sumsq);
#elif NK_TARGET_NEON
    nk_reduce_moments_e5m2_neon(d, n, s, sum, sumsq);
#elif NK_TARGET_RVV
    nk_reduce_moments_e5m2_rvv(d, n, s, sum, sumsq);
#elif NK_TARGET_V128RELAXED
    nk_reduce_moments_e5m2_v128relaxed(d, n, s, sum, sumsq);
#else
    nk_reduce_moments_e5m2_serial(d, n, s, sum, sumsq); // portable fallback
#endif // target selection
}
|
|
1448
|
+
|
|
1449
|
+
/**
 *  @brief Compile-time dispatcher for the `e5m2` min/max reduction.
 *
 *  Exactly one kernel call survives preprocessing. Targets are tried in this order:
 *  Skylake, Haswell, NEON FHM, NEON, RVV, V128 relaxed, and the serial kernel is the
 *  fallback. All arguments are forwarded to the chosen kernel unchanged.
 *
 *  @param d   Read-only input pointer, forwarded as-is.
 *  @param n   Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s   Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param mn  Non-const pointer handed to the kernel (presumably receives the minimum).
 *  @param mi  Non-const pointer handed to the kernel (presumably receives the minimum's index).
 *  @param mx  Non-const pointer handed to the kernel (presumably receives the maximum).
 *  @param xi  Non-const pointer handed to the kernel (presumably receives the maximum's index).
 */
NK_PUBLIC void nk_reduce_minmax_e5m2(nk_e5m2_t const *d, nk_size_t n, nk_size_t s, nk_e5m2_t *mn, nk_size_t *mi,
                                     nk_e5m2_t *mx, nk_size_t *xi) {
#if NK_TARGET_SKYLAKE
    nk_reduce_minmax_e5m2_skylake(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_HASWELL
    nk_reduce_minmax_e5m2_haswell(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_NEONFHM
    nk_reduce_minmax_e5m2_neonfhm(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_NEON
    nk_reduce_minmax_e5m2_neon(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_RVV
    nk_reduce_minmax_e5m2_rvv(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_V128RELAXED
    nk_reduce_minmax_e5m2_v128relaxed(d, n, s, mn, mi, mx, xi);
#else
    nk_reduce_minmax_e5m2_serial(d, n, s, mn, mi, mx, xi); // portable fallback
#endif // target selection
}
|
|
1467
|
+
|
|
1468
|
+
/**
 *  @brief Compile-time dispatcher for the `e2m3` moments reduction.
 *
 *  The preprocessor keeps exactly one kernel call, choosing the first enabled target in
 *  this order: Ice Lake, Skylake, Sierra, Alder, Haswell, NEON SDOT, NEON, RVV,
 *  V128 relaxed. When none is enabled the serial kernel is used. The results are
 *  written through `nk_f32_t` pointers.
 *
 *  @param d      Read-only input pointer, forwarded as-is.
 *  @param n      Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s      Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param sum    Non-const pointer handed to the kernel (presumably receives the sum).
 *  @param sumsq  Non-const pointer handed to the kernel (presumably receives the sum of squares).
 */
NK_PUBLIC void nk_reduce_moments_e2m3(nk_e2m3_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *sum, nk_f32_t *sumsq) {
#if NK_TARGET_ICELAKE
    nk_reduce_moments_e2m3_icelake(d, n, s, sum, sumsq);
#elif NK_TARGET_SKYLAKE
    nk_reduce_moments_e2m3_skylake(d, n, s, sum, sumsq);
#elif NK_TARGET_SIERRA
    nk_reduce_moments_e2m3_sierra(d, n, s, sum, sumsq);
#elif NK_TARGET_ALDER
    nk_reduce_moments_e2m3_alder(d, n, s, sum, sumsq);
#elif NK_TARGET_HASWELL
    nk_reduce_moments_e2m3_haswell(d, n, s, sum, sumsq);
#elif NK_TARGET_NEONSDOT
    nk_reduce_moments_e2m3_neonsdot(d, n, s, sum, sumsq);
#elif NK_TARGET_NEON
    nk_reduce_moments_e2m3_neon(d, n, s, sum, sumsq);
#elif NK_TARGET_RVV
    nk_reduce_moments_e2m3_rvv(d, n, s, sum, sumsq);
#elif NK_TARGET_V128RELAXED
    nk_reduce_moments_e2m3_v128relaxed(d, n, s, sum, sumsq);
#else
    nk_reduce_moments_e2m3_serial(d, n, s, sum, sumsq); // portable fallback
#endif // target selection
}
|
|
1491
|
+
|
|
1492
|
+
/**
 *  @brief Compile-time dispatcher for the `e2m3` min/max reduction.
 *
 *  Exactly one kernel call survives preprocessing. Targets are tried in this order:
 *  Skylake, Haswell, NEON, RVV, V128 relaxed, and the serial kernel is the fallback.
 *  All arguments are forwarded to the chosen kernel unchanged.
 *
 *  @param d   Read-only input pointer, forwarded as-is.
 *  @param n   Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s   Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param mn  Non-const pointer handed to the kernel (presumably receives the minimum).
 *  @param mi  Non-const pointer handed to the kernel (presumably receives the minimum's index).
 *  @param mx  Non-const pointer handed to the kernel (presumably receives the maximum).
 *  @param xi  Non-const pointer handed to the kernel (presumably receives the maximum's index).
 */
NK_PUBLIC void nk_reduce_minmax_e2m3(nk_e2m3_t const *d, nk_size_t n, nk_size_t s, nk_e2m3_t *mn, nk_size_t *mi,
                                     nk_e2m3_t *mx, nk_size_t *xi) {
#if NK_TARGET_SKYLAKE
    nk_reduce_minmax_e2m3_skylake(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_HASWELL
    nk_reduce_minmax_e2m3_haswell(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_NEON
    nk_reduce_minmax_e2m3_neon(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_RVV
    nk_reduce_minmax_e2m3_rvv(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_V128RELAXED
    nk_reduce_minmax_e2m3_v128relaxed(d, n, s, mn, mi, mx, xi);
#else
    nk_reduce_minmax_e2m3_serial(d, n, s, mn, mi, mx, xi); // portable fallback
#endif // target selection
}
|
|
1508
|
+
|
|
1509
|
+
/**
 *  @brief Compile-time dispatcher for the `e3m2` moments reduction.
 *
 *  The preprocessor keeps exactly one kernel call, choosing the first enabled target in
 *  this order: Ice Lake, Skylake, Alder, Haswell, NEON, RVV, V128 relaxed. When none is
 *  enabled the serial kernel is used. The results are written through `nk_f32_t` pointers.
 *
 *  @param d      Read-only input pointer, forwarded as-is.
 *  @param n      Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s      Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param sum    Non-const pointer handed to the kernel (presumably receives the sum).
 *  @param sumsq  Non-const pointer handed to the kernel (presumably receives the sum of squares).
 */
NK_PUBLIC void nk_reduce_moments_e3m2(nk_e3m2_t const *d, nk_size_t n, nk_size_t s, nk_f32_t *sum, nk_f32_t *sumsq) {
#if NK_TARGET_ICELAKE
    nk_reduce_moments_e3m2_icelake(d, n, s, sum, sumsq);
#elif NK_TARGET_SKYLAKE
    nk_reduce_moments_e3m2_skylake(d, n, s, sum, sumsq);
#elif NK_TARGET_ALDER
    nk_reduce_moments_e3m2_alder(d, n, s, sum, sumsq);
#elif NK_TARGET_HASWELL
    nk_reduce_moments_e3m2_haswell(d, n, s, sum, sumsq);
#elif NK_TARGET_NEON
    nk_reduce_moments_e3m2_neon(d, n, s, sum, sumsq);
#elif NK_TARGET_RVV
    nk_reduce_moments_e3m2_rvv(d, n, s, sum, sumsq);
#elif NK_TARGET_V128RELAXED
    nk_reduce_moments_e3m2_v128relaxed(d, n, s, sum, sumsq);
#else
    nk_reduce_moments_e3m2_serial(d, n, s, sum, sumsq); // portable fallback
#endif // target selection
}
|
|
1528
|
+
|
|
1529
|
+
/**
 *  @brief Compile-time dispatcher for the `e3m2` min/max reduction.
 *
 *  Exactly one kernel call survives preprocessing. Targets are tried in this order:
 *  Skylake, Haswell, NEON, RVV, V128 relaxed, and the serial kernel is the fallback.
 *  All arguments are forwarded to the chosen kernel unchanged.
 *
 *  @param d   Read-only input pointer, forwarded as-is.
 *  @param n   Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s   Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param mn  Non-const pointer handed to the kernel (presumably receives the minimum).
 *  @param mi  Non-const pointer handed to the kernel (presumably receives the minimum's index).
 *  @param mx  Non-const pointer handed to the kernel (presumably receives the maximum).
 *  @param xi  Non-const pointer handed to the kernel (presumably receives the maximum's index).
 */
NK_PUBLIC void nk_reduce_minmax_e3m2(nk_e3m2_t const *d, nk_size_t n, nk_size_t s, nk_e3m2_t *mn, nk_size_t *mi,
                                     nk_e3m2_t *mx, nk_size_t *xi) {
#if NK_TARGET_SKYLAKE
    nk_reduce_minmax_e3m2_skylake(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_HASWELL
    nk_reduce_minmax_e3m2_haswell(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_NEON
    nk_reduce_minmax_e3m2_neon(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_RVV
    nk_reduce_minmax_e3m2_rvv(d, n, s, mn, mi, mx, xi);
#elif NK_TARGET_V128RELAXED
    nk_reduce_minmax_e3m2_v128relaxed(d, n, s, mn, mi, mx, xi);
#else
    nk_reduce_minmax_e3m2_serial(d, n, s, mn, mi, mx, xi); // portable fallback
#endif // target selection
}
|
|
1545
|
+
|
|
1546
|
+
/**
 *  @brief Compile-time dispatcher for the `i4` moments reduction.
 *
 *  Only Skylake and Haswell kernels are selectable here, tried in that order. Every
 *  other build uses the serial kernel. All arguments are forwarded unchanged.
 *
 *  @param d      Read-only pointer to `nk_i4x2_t` input, forwarded as-is.
 *  @param n      Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s      Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param sum    Non-const pointer handed to the kernel (presumably receives the sum).
 *  @param sumsq  Non-const pointer handed to the kernel (presumably receives the sum of squares).
 */
NK_PUBLIC void nk_reduce_moments_i4(nk_i4x2_t const *d, nk_size_t n, nk_size_t s, nk_i64_t *sum, nk_u64_t *sumsq) {
#if NK_TARGET_SKYLAKE
    nk_reduce_moments_i4_skylake(d, n, s, sum, sumsq);
#elif NK_TARGET_HASWELL
    nk_reduce_moments_i4_haswell(d, n, s, sum, sumsq);
#else
    nk_reduce_moments_i4_serial(d, n, s, sum, sumsq); // portable fallback
#endif // target selection
}
|
|
1555
|
+
|
|
1556
|
+
/**
 *  @brief Entry point for the `i4` min/max reduction.
 *
 *  There is no target selection here: the call always goes to the serial kernel, with
 *  all arguments forwarded unchanged. The extrema are written through `nk_i8_t` pointers.
 *
 *  @param d   Read-only pointer to `nk_i4x2_t` input, forwarded as-is.
 *  @param n   Forwarded as-is (presumably the element count; confirm against the kernel).
 *  @param s   Forwarded as-is (presumably the stride; confirm against the kernel).
 *  @param mn  Non-const pointer handed to the kernel (presumably receives the minimum).
 *  @param mi  Non-const pointer handed to the kernel (presumably receives the minimum's index).
 *  @param mx  Non-const pointer handed to the kernel (presumably receives the maximum).
 *  @param xi  Non-const pointer handed to the kernel (presumably receives the maximum's index).
 */
NK_PUBLIC void nk_reduce_minmax_i4(nk_i4x2_t const *d, nk_size_t n, nk_size_t s, nk_i8_t *mn, nk_size_t *mi,
                                   nk_i8_t *mx, nk_size_t *xi) {
    nk_reduce_minmax_i4_serial(d, n, s, mn, mi, mx, xi);
}
|
|
1560
|
+
|
|
1561
|
+
/**
 *  @brief Compile-time dispatcher for the `u4` moments reduction.
 *
 *  Only Skylake and Haswell kernels are selectable here, tried in that order. Every
 *  other build uses the serial kernel. All arguments are forwarded unchanged.
 *
 *  @param d      Read-only pointer to `nk_u4x2_t` input, forwarded as-is.
 *  @param n      Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s      Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param sum    Non-const pointer handed to the kernel (presumably receives the sum).
 *  @param sumsq  Non-const pointer handed to the kernel (presumably receives the sum of squares).
 */
NK_PUBLIC void nk_reduce_moments_u4(nk_u4x2_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *sum, nk_u64_t *sumsq) {
#if NK_TARGET_SKYLAKE
    nk_reduce_moments_u4_skylake(d, n, s, sum, sumsq);
#elif NK_TARGET_HASWELL
    nk_reduce_moments_u4_haswell(d, n, s, sum, sumsq);
#else
    nk_reduce_moments_u4_serial(d, n, s, sum, sumsq); // portable fallback
#endif // target selection
}
|
|
1570
|
+
|
|
1571
|
+
/**
 *  @brief Entry point for the `u4` min/max reduction.
 *
 *  There is no target selection here: the call always goes to the serial kernel, with
 *  all arguments forwarded unchanged. The extrema are written through `nk_u8_t` pointers.
 *
 *  @param d   Read-only pointer to `nk_u4x2_t` input, forwarded as-is.
 *  @param n   Forwarded as-is (presumably the element count; confirm against the kernel).
 *  @param s   Forwarded as-is (presumably the stride; confirm against the kernel).
 *  @param mn  Non-const pointer handed to the kernel (presumably receives the minimum).
 *  @param mi  Non-const pointer handed to the kernel (presumably receives the minimum's index).
 *  @param mx  Non-const pointer handed to the kernel (presumably receives the maximum).
 *  @param xi  Non-const pointer handed to the kernel (presumably receives the maximum's index).
 */
NK_PUBLIC void nk_reduce_minmax_u4(nk_u4x2_t const *d, nk_size_t n, nk_size_t s, nk_u8_t *mn, nk_size_t *mi,
                                   nk_u8_t *mx, nk_size_t *xi) {
    nk_reduce_minmax_u4_serial(d, n, s, mn, mi, mx, xi);
}
|
|
1575
|
+
|
|
1576
|
+
/**
 *  @brief Compile-time dispatcher for the `u1` moments reduction.
 *
 *  Only Skylake and Haswell kernels are selectable here, tried in that order. Every
 *  other build uses the serial kernel. All arguments are forwarded unchanged.
 *
 *  @param d      Read-only pointer to `nk_u1x8_t` input, forwarded as-is.
 *  @param n      Forwarded as-is (presumably the element count; confirm against the kernels).
 *  @param s      Forwarded as-is (presumably the stride; confirm against the kernels).
 *  @param sum    Non-const pointer handed to the kernel (presumably receives the sum).
 *  @param sumsq  Non-const pointer handed to the kernel (presumably receives the sum of squares).
 */
NK_PUBLIC void nk_reduce_moments_u1(nk_u1x8_t const *d, nk_size_t n, nk_size_t s, nk_u64_t *sum, nk_u64_t *sumsq) {
#if NK_TARGET_SKYLAKE
    nk_reduce_moments_u1_skylake(d, n, s, sum, sumsq);
#elif NK_TARGET_HASWELL
    nk_reduce_moments_u1_haswell(d, n, s, sum, sumsq);
#else
    nk_reduce_moments_u1_serial(d, n, s, sum, sumsq); // portable fallback
#endif // target selection
}
|
|
1585
|
+
|
|
1586
|
+
/**
 *  @brief Entry point for the `u1` min/max reduction.
 *
 *  There is no target selection here: the call always goes to the serial kernel, with
 *  all arguments forwarded unchanged. The extrema are written through `nk_u8_t` pointers.
 *
 *  @param d   Read-only pointer to `nk_u1x8_t` input, forwarded as-is.
 *  @param n   Forwarded as-is (presumably the element count; confirm against the kernel).
 *  @param s   Forwarded as-is (presumably the stride; confirm against the kernel).
 *  @param mn  Non-const pointer handed to the kernel (presumably receives the minimum).
 *  @param mi  Non-const pointer handed to the kernel (presumably receives the minimum's index).
 *  @param mx  Non-const pointer handed to the kernel (presumably receives the maximum).
 *  @param xi  Non-const pointer handed to the kernel (presumably receives the maximum's index).
 */
NK_PUBLIC void nk_reduce_minmax_u1(nk_u1x8_t const *d, nk_size_t n, nk_size_t s, nk_u8_t *mn, nk_size_t *mi,
                                   nk_u8_t *mx, nk_size_t *xi) {
    nk_reduce_minmax_u1_serial(d, n, s, mn, mi, mx, xi);
}
|
|
1590
|
+
|
|
1591
|
+
#endif // !NK_DYNAMIC_DISPATCH
|
|
1592
|
+
|
|
1593
|
+
#ifdef __cplusplus
|
|
1594
|
+
} // extern "C"
|
|
1595
|
+
#endif
|
|
1596
|
+
|
|
1597
|
+
#endif // NK_REDUCE_H
|