numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,757 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief Serial fallbacks for the redesigned reduction API (moments + minmax).
|
|
3
|
+
* @file include/numkong/reduce/serial.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date February 11, 2026
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/reduce.h
|
|
8
|
+
*
|
|
9
|
+
* Provides serial (non-SIMD) implementations of:
|
|
10
|
+
* - `nk_reduce_moments_*_serial` — sum + sum-of-squares in one pass
|
|
11
|
+
* - `nk_reduce_minmax_*_serial` — min + max with indices in one pass
|
|
12
|
+
*/
|
|
13
|
+
#ifndef NK_REDUCE_SERIAL_H
|
|
14
|
+
#define NK_REDUCE_SERIAL_H
|
|
15
|
+
|
|
16
|
+
#include "numkong/types.h"
|
|
17
|
+
#include "numkong/scalar/serial.h"
|
|
18
|
+
#include "numkong/cast/serial.h"
|
|
19
|
+
#include "numkong/scalar/serial.h"
|
|
20
|
+
|
|
21
|
+
#if defined(__cplusplus)
|
|
22
|
+
extern "C" {
|
|
23
|
+
#endif
|
|
24
|
+
|
|
25
|
+
/**
 *  @brief Folds `count` value/compensation pairs into a single `nk_f64_t` total.
 *
 *  For every index the entry of `values` is added first and the entry of `compensations`
 *  second. Each addition is a branch-free TwoSum step: the rounding error of the addition
 *  is recovered and collected in a separate accumulator that is added back once at the end.
 *
 *  @param values Primary terms; `count` entries are read.
 *  @param compensations Secondary terms, read at the same indices as `values`.
 *  @param count Number of pairs to fold in; nothing is read when it is not positive.
 *  @return The running total plus the collected rounding error.
 */
NK_INTERNAL nk_f64_t nk_reduce_sum_f64_serial_(nk_f64_t const *values, nk_f64_t const *compensations, int count) {
    nk_f64_t total = 0, error = 0;
    for (int index = 0; index < count; index++) {
        // Same order as a hand-unrolled version: the value first, then its compensation term.
        nk_f64_t const terms[2] = {values[index], compensations[index]};
        for (int term_index = 0; term_index < 2; term_index++) {
            nk_f64_t const term = terms[term_index];
            nk_f64_t const next_total = total + term;
            // The part of `next_total` that is attributable to `term` after rounding.
            nk_f64_t const term_as_added = next_total - total;
            error += (total - (next_total - term_as_added)) + (term - term_as_added);
            total = next_total;
        }
    }
    return total + error;
}
|
|
41
|
+
|
|
42
|
+
NK_PUBLIC void nk_reduce_moments_f32_serial( //
|
|
43
|
+
nk_f32_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
44
|
+
nk_f64_t *sum_ptr, nk_f64_t *sumsq_ptr) {
|
|
45
|
+
nk_f64_t running_sum = 0, sum_compensation = 0;
|
|
46
|
+
nk_f64_t running_sumsq = 0, sumsq_compensation = 0;
|
|
47
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
48
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
49
|
+
nk_f64_t val = (nk_f64_t)(*(nk_f32_t const *)ptr);
|
|
50
|
+
nk_f64_t tentative_sum = running_sum + val;
|
|
51
|
+
if (nk_f64_abs_(running_sum) >= nk_f64_abs_(val)) sum_compensation += (running_sum - tentative_sum) + val;
|
|
52
|
+
else sum_compensation += (val - tentative_sum) + running_sum;
|
|
53
|
+
running_sum = tentative_sum;
|
|
54
|
+
|
|
55
|
+
nk_f64_t squared_value = val * val;
|
|
56
|
+
nk_f64_t tentative_sumsq = running_sumsq + squared_value;
|
|
57
|
+
if (running_sumsq >= squared_value) sumsq_compensation += (running_sumsq - tentative_sumsq) + squared_value;
|
|
58
|
+
else sumsq_compensation += (squared_value - tentative_sumsq) + running_sumsq;
|
|
59
|
+
running_sumsq = tentative_sumsq;
|
|
60
|
+
}
|
|
61
|
+
*sum_ptr = running_sum + sum_compensation, *sumsq_ptr = running_sumsq + sumsq_compensation;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/**
 *  @brief One-pass sum and sum-of-squares over strided `nk_f64_t` values.
 *
 *  Both accumulators use Neumaier-style compensated summation: the rounding error of each
 *  addition is recovered from whichever operand has the larger magnitude and kept in a
 *  separate term that is added to the result at the very end.
 *
 *  @param data Address of the first element.
 *  @param count Number of elements to read.
 *  @param stride_bytes Distance in bytes between consecutive elements.
 *  @param sum_ptr Receives the compensated sum.
 *  @param sumsq_ptr Receives the compensated sum of squares.
 */
NK_PUBLIC void nk_reduce_moments_f64_serial(                       //
    nk_f64_t const *data, nk_size_t count, nk_size_t stride_bytes, //
    nk_f64_t *sum_ptr, nk_f64_t *sumsq_ptr) {
    nk_f64_t sum = 0, sum_error = 0;
    nk_f64_t sumsq = 0, sumsq_error = 0;
    unsigned char const *cursor = (unsigned char const *)data;
    for (nk_size_t remaining = count; remaining != 0; --remaining, cursor += stride_bytes) {
        nk_f64_t const value = *(nk_f64_t const *)cursor;

        // First moment: the operand with the larger magnitude goes first in the error recovery.
        nk_f64_t const next_sum = sum + value;
        sum_error += (nk_f64_abs_(sum) >= nk_f64_abs_(value)) ? (sum - next_sum) + value //
                                                               : (value - next_sum) + sum;
        sum = next_sum;

        // Second moment: both operands are non-negative, so they are compared directly.
        nk_f64_t const square = value * value;
        nk_f64_t const next_sumsq = sumsq + square;
        sumsq_error += (sumsq >= square) ? (sumsq - next_sumsq) + square //
                                         : (square - next_sumsq) + sumsq;
        sumsq = next_sumsq;
    }
    *sum_ptr = sum + sum_error;
    *sumsq_ptr = sumsq + sumsq_error;
}
|
|
85
|
+
|
|
86
|
+
NK_PUBLIC void nk_reduce_moments_i8_serial( //
|
|
87
|
+
nk_i8_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
88
|
+
nk_i64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
|
|
89
|
+
nk_i64_t sum = 0;
|
|
90
|
+
nk_u64_t sumsq = 0;
|
|
91
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
92
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
93
|
+
nk_i64_t val = (nk_i64_t)(*(nk_i8_t const *)ptr);
|
|
94
|
+
sum += val;
|
|
95
|
+
sumsq += (nk_u64_t)(val * val);
|
|
96
|
+
}
|
|
97
|
+
*sum_ptr = sum, *sumsq_ptr = sumsq;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
NK_PUBLIC void nk_reduce_moments_u8_serial( //
|
|
101
|
+
nk_u8_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
102
|
+
nk_u64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
|
|
103
|
+
nk_u64_t sum = 0, sumsq = 0;
|
|
104
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
105
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
106
|
+
nk_u64_t val = (nk_u64_t)(*(nk_u8_t const *)ptr);
|
|
107
|
+
sum += val;
|
|
108
|
+
sumsq += val * val;
|
|
109
|
+
}
|
|
110
|
+
*sum_ptr = sum, *sumsq_ptr = sumsq;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
NK_PUBLIC void nk_reduce_moments_i16_serial( //
|
|
114
|
+
nk_i16_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
115
|
+
nk_i64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
|
|
116
|
+
nk_i64_t sum = 0;
|
|
117
|
+
nk_u64_t sumsq = 0;
|
|
118
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
119
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
120
|
+
nk_i64_t val = (nk_i64_t)(*(nk_i16_t const *)ptr);
|
|
121
|
+
sum += val;
|
|
122
|
+
sumsq += (nk_u64_t)(val * val);
|
|
123
|
+
}
|
|
124
|
+
*sum_ptr = sum, *sumsq_ptr = sumsq;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
NK_PUBLIC void nk_reduce_moments_u16_serial( //
|
|
128
|
+
nk_u16_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
129
|
+
nk_u64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
|
|
130
|
+
nk_u64_t sum = 0, sumsq = 0;
|
|
131
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
132
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
133
|
+
nk_u64_t val = (nk_u64_t)(*(nk_u16_t const *)ptr);
|
|
134
|
+
sum += val;
|
|
135
|
+
sumsq += val * val;
|
|
136
|
+
}
|
|
137
|
+
*sum_ptr = sum, *sumsq_ptr = sumsq;
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
NK_PUBLIC void nk_reduce_moments_i32_serial( //
|
|
141
|
+
nk_i32_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
142
|
+
nk_i64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
|
|
143
|
+
nk_u64_t sum_lower = 0;
|
|
144
|
+
nk_i64_t sum_upper = 0;
|
|
145
|
+
nk_u64_t sumsq = 0;
|
|
146
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
147
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
148
|
+
nk_i64_t val = (nk_i64_t)(*(nk_i32_t const *)ptr);
|
|
149
|
+
nk_u64_t product = (nk_u64_t)(val * val);
|
|
150
|
+
nk_u64_t sum_before = sum_lower;
|
|
151
|
+
sum_lower += (nk_u64_t)val;
|
|
152
|
+
if (sum_lower < sum_before) sum_upper++;
|
|
153
|
+
sum_upper += (val >> 63);
|
|
154
|
+
sumsq = nk_u64_saturating_add_serial(sumsq, product);
|
|
155
|
+
}
|
|
156
|
+
nk_i64_t sum_lower_signed = (nk_i64_t)sum_lower;
|
|
157
|
+
if (sum_upper == (sum_lower_signed >> 63)) *sum_ptr = sum_lower_signed;
|
|
158
|
+
else if (sum_upper >= 0) *sum_ptr = NK_I64_MAX;
|
|
159
|
+
else *sum_ptr = NK_I64_MIN;
|
|
160
|
+
*sumsq_ptr = sumsq;
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
NK_PUBLIC void nk_reduce_moments_u32_serial( //
|
|
164
|
+
nk_u32_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
165
|
+
nk_u64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
|
|
166
|
+
nk_u64_t sum = 0, sumsq = 0;
|
|
167
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
168
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
169
|
+
nk_u64_t val = (nk_u64_t)(*(nk_u32_t const *)ptr);
|
|
170
|
+
nk_u64_t product = val * val;
|
|
171
|
+
sum = nk_u64_saturating_add_serial(sum, val);
|
|
172
|
+
sumsq = nk_u64_saturating_add_serial(sumsq, product);
|
|
173
|
+
}
|
|
174
|
+
*sum_ptr = sum, *sumsq_ptr = sumsq;
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
NK_PUBLIC void nk_reduce_moments_i64_serial( //
|
|
178
|
+
nk_i64_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
179
|
+
nk_i64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
|
|
180
|
+
nk_u64_t sum_lower = 0;
|
|
181
|
+
nk_i64_t sum_upper = 0;
|
|
182
|
+
nk_u64_t sumsq = 0;
|
|
183
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
184
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
185
|
+
nk_i64_t val = *(nk_i64_t const *)ptr;
|
|
186
|
+
nk_i64_t product = nk_i64_saturating_mul_serial(val, val);
|
|
187
|
+
nk_u64_t unsigned_product = (nk_u64_t)product;
|
|
188
|
+
sumsq = nk_u64_saturating_add_serial(sumsq, unsigned_product);
|
|
189
|
+
nk_u64_t sum_before = sum_lower;
|
|
190
|
+
sum_lower += (nk_u64_t)val;
|
|
191
|
+
if (sum_lower < sum_before) sum_upper++;
|
|
192
|
+
sum_upper += (val >> 63);
|
|
193
|
+
}
|
|
194
|
+
nk_i64_t sum_lower_signed = (nk_i64_t)sum_lower;
|
|
195
|
+
if (sum_upper == (sum_lower_signed >> 63)) *sum_ptr = sum_lower_signed;
|
|
196
|
+
else if (sum_upper >= 0) *sum_ptr = NK_I64_MAX;
|
|
197
|
+
else *sum_ptr = NK_I64_MIN;
|
|
198
|
+
*sumsq_ptr = sumsq;
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
/**
 *  @brief One-pass sum and sum-of-squares over strided `nk_u64_t` values.
 *
 *  Squares are formed with `nk_u64_saturating_mul_serial` and both accumulators are
 *  updated through `nk_u64_saturating_add_serial`.
 *
 *  @param data Address of the first element.
 *  @param count Number of elements to read.
 *  @param stride_bytes Distance in bytes between consecutive elements.
 *  @param sum_ptr Receives the accumulated sum.
 *  @param sumsq_ptr Receives the accumulated sum of squares.
 */
NK_PUBLIC void nk_reduce_moments_u64_serial(                       //
    nk_u64_t const *data, nk_size_t count, nk_size_t stride_bytes, //
    nk_u64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
    nk_u64_t total = 0;
    nk_u64_t total_of_squares = 0;
    unsigned char const *cursor = (unsigned char const *)data;
    for (nk_size_t remaining = count; remaining != 0; --remaining, cursor += stride_bytes) {
        nk_u64_t const value = *(nk_u64_t const *)cursor;
        nk_u64_t const square = nk_u64_saturating_mul_serial(value, value);
        total_of_squares = nk_u64_saturating_add_serial(total_of_squares, square);
        total = nk_u64_saturating_add_serial(total, value);
    }
    *sum_ptr = total;
    *sumsq_ptr = total_of_squares;
}
|
|
214
|
+
|
|
215
|
+
NK_PUBLIC void nk_reduce_moments_f16_serial( //
|
|
216
|
+
nk_f16_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
217
|
+
nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr) {
|
|
218
|
+
nk_f32_t running_sum = 0, sum_compensation = 0;
|
|
219
|
+
nk_f32_t running_sumsq = 0, sumsq_compensation = 0;
|
|
220
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
221
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
222
|
+
nk_f32_t val;
|
|
223
|
+
nk_f16_to_f32_serial((nk_f16_t const *)ptr, &val);
|
|
224
|
+
nk_f32_t tentative_sum = running_sum + val;
|
|
225
|
+
nk_f32_t abs_running_sum = nk_f32_abs_(running_sum);
|
|
226
|
+
nk_f32_t abs_val = nk_f32_abs_(val);
|
|
227
|
+
if (abs_running_sum >= abs_val) sum_compensation += (running_sum - tentative_sum) + val;
|
|
228
|
+
else sum_compensation += (val - tentative_sum) + running_sum;
|
|
229
|
+
running_sum = tentative_sum;
|
|
230
|
+
|
|
231
|
+
nk_f32_t squared_value = val * val;
|
|
232
|
+
nk_f32_t tentative_sumsq = running_sumsq + squared_value;
|
|
233
|
+
if (running_sumsq >= squared_value) sumsq_compensation += (running_sumsq - tentative_sumsq) + squared_value;
|
|
234
|
+
else sumsq_compensation += (squared_value - tentative_sumsq) + running_sumsq;
|
|
235
|
+
running_sumsq = tentative_sumsq;
|
|
236
|
+
}
|
|
237
|
+
*sum_ptr = running_sum + sum_compensation, *sumsq_ptr = running_sumsq + sumsq_compensation;
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
NK_PUBLIC void nk_reduce_moments_bf16_serial( //
|
|
241
|
+
nk_bf16_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
242
|
+
nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr) {
|
|
243
|
+
nk_f32_t running_sum = 0, sum_compensation = 0;
|
|
244
|
+
nk_f32_t running_sumsq = 0, sumsq_compensation = 0;
|
|
245
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
246
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
247
|
+
nk_f32_t val;
|
|
248
|
+
nk_bf16_to_f32_serial((nk_bf16_t const *)ptr, &val);
|
|
249
|
+
nk_f32_t tentative_sum = running_sum + val;
|
|
250
|
+
nk_f32_t abs_running_sum = nk_f32_abs_(running_sum);
|
|
251
|
+
nk_f32_t abs_val = nk_f32_abs_(val);
|
|
252
|
+
if (abs_running_sum >= abs_val) sum_compensation += (running_sum - tentative_sum) + val;
|
|
253
|
+
else sum_compensation += (val - tentative_sum) + running_sum;
|
|
254
|
+
running_sum = tentative_sum;
|
|
255
|
+
|
|
256
|
+
nk_f32_t squared_value = val * val;
|
|
257
|
+
nk_f32_t tentative_sumsq = running_sumsq + squared_value;
|
|
258
|
+
if (running_sumsq >= squared_value) sumsq_compensation += (running_sumsq - tentative_sumsq) + squared_value;
|
|
259
|
+
else sumsq_compensation += (squared_value - tentative_sumsq) + running_sumsq;
|
|
260
|
+
running_sumsq = tentative_sumsq;
|
|
261
|
+
}
|
|
262
|
+
*sum_ptr = running_sum + sum_compensation, *sumsq_ptr = running_sumsq + sumsq_compensation;
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
NK_PUBLIC void nk_reduce_moments_e4m3_serial( //
|
|
266
|
+
nk_e4m3_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
267
|
+
nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr) {
|
|
268
|
+
nk_f32_t running_sum = 0, sum_compensation = 0;
|
|
269
|
+
nk_f32_t running_sumsq = 0, sumsq_compensation = 0;
|
|
270
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
271
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
272
|
+
nk_f32_t val;
|
|
273
|
+
nk_e4m3_to_f32_serial((nk_e4m3_t const *)ptr, &val);
|
|
274
|
+
nk_f32_t tentative_sum = running_sum + val;
|
|
275
|
+
nk_f32_t abs_running_sum = nk_f32_abs_(running_sum);
|
|
276
|
+
nk_f32_t abs_val = nk_f32_abs_(val);
|
|
277
|
+
if (abs_running_sum >= abs_val) sum_compensation += (running_sum - tentative_sum) + val;
|
|
278
|
+
else sum_compensation += (val - tentative_sum) + running_sum;
|
|
279
|
+
running_sum = tentative_sum;
|
|
280
|
+
|
|
281
|
+
nk_f32_t squared_value = val * val;
|
|
282
|
+
nk_f32_t tentative_sumsq = running_sumsq + squared_value;
|
|
283
|
+
if (running_sumsq >= squared_value) sumsq_compensation += (running_sumsq - tentative_sumsq) + squared_value;
|
|
284
|
+
else sumsq_compensation += (squared_value - tentative_sumsq) + running_sumsq;
|
|
285
|
+
running_sumsq = tentative_sumsq;
|
|
286
|
+
}
|
|
287
|
+
*sum_ptr = running_sum + sum_compensation, *sumsq_ptr = running_sumsq + sumsq_compensation;
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
NK_PUBLIC void nk_reduce_moments_e5m2_serial( //
|
|
291
|
+
nk_e5m2_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
292
|
+
nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr) {
|
|
293
|
+
nk_f32_t running_sum = 0, sum_compensation = 0;
|
|
294
|
+
nk_f32_t running_sumsq = 0, sumsq_compensation = 0;
|
|
295
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
296
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
297
|
+
nk_f32_t val;
|
|
298
|
+
nk_e5m2_to_f32_serial((nk_e5m2_t const *)ptr, &val);
|
|
299
|
+
nk_f32_t tentative_sum = running_sum + val;
|
|
300
|
+
nk_f32_t abs_running_sum = nk_f32_abs_(running_sum);
|
|
301
|
+
nk_f32_t abs_val = nk_f32_abs_(val);
|
|
302
|
+
if (abs_running_sum >= abs_val) sum_compensation += (running_sum - tentative_sum) + val;
|
|
303
|
+
else sum_compensation += (val - tentative_sum) + running_sum;
|
|
304
|
+
running_sum = tentative_sum;
|
|
305
|
+
|
|
306
|
+
nk_f32_t squared_value = val * val;
|
|
307
|
+
nk_f32_t tentative_sumsq = running_sumsq + squared_value;
|
|
308
|
+
if (running_sumsq >= squared_value) sumsq_compensation += (running_sumsq - tentative_sumsq) + squared_value;
|
|
309
|
+
else sumsq_compensation += (squared_value - tentative_sumsq) + running_sumsq;
|
|
310
|
+
running_sumsq = tentative_sumsq;
|
|
311
|
+
}
|
|
312
|
+
*sum_ptr = running_sum + sum_compensation, *sumsq_ptr = running_sumsq + sumsq_compensation;
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
NK_PUBLIC void nk_reduce_moments_e2m3_serial( //
|
|
316
|
+
nk_e2m3_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
317
|
+
nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr) {
|
|
318
|
+
nk_f32_t running_sum = 0, sum_compensation = 0;
|
|
319
|
+
nk_f32_t running_sumsq = 0, sumsq_compensation = 0;
|
|
320
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
321
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
322
|
+
nk_f32_t val;
|
|
323
|
+
nk_e2m3_to_f32_serial((nk_e2m3_t const *)ptr, &val);
|
|
324
|
+
nk_f32_t tentative_sum = running_sum + val;
|
|
325
|
+
nk_f32_t abs_running_sum = nk_f32_abs_(running_sum);
|
|
326
|
+
nk_f32_t abs_val = nk_f32_abs_(val);
|
|
327
|
+
if (abs_running_sum >= abs_val) sum_compensation += (running_sum - tentative_sum) + val;
|
|
328
|
+
else sum_compensation += (val - tentative_sum) + running_sum;
|
|
329
|
+
running_sum = tentative_sum;
|
|
330
|
+
|
|
331
|
+
nk_f32_t squared_value = val * val;
|
|
332
|
+
nk_f32_t tentative_sumsq = running_sumsq + squared_value;
|
|
333
|
+
if (running_sumsq >= squared_value) sumsq_compensation += (running_sumsq - tentative_sumsq) + squared_value;
|
|
334
|
+
else sumsq_compensation += (squared_value - tentative_sumsq) + running_sumsq;
|
|
335
|
+
running_sumsq = tentative_sumsq;
|
|
336
|
+
}
|
|
337
|
+
*sum_ptr = running_sum + sum_compensation, *sumsq_ptr = running_sumsq + sumsq_compensation;
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
NK_PUBLIC void nk_reduce_moments_e3m2_serial( //
|
|
341
|
+
nk_e3m2_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
342
|
+
nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr) {
|
|
343
|
+
nk_f32_t running_sum = 0, sum_compensation = 0;
|
|
344
|
+
nk_f32_t running_sumsq = 0, sumsq_compensation = 0;
|
|
345
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
346
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
347
|
+
nk_f32_t val;
|
|
348
|
+
nk_e3m2_to_f32_serial((nk_e3m2_t const *)ptr, &val);
|
|
349
|
+
nk_f32_t tentative_sum = running_sum + val;
|
|
350
|
+
nk_f32_t abs_running_sum = nk_f32_abs_(running_sum);
|
|
351
|
+
nk_f32_t abs_val = nk_f32_abs_(val);
|
|
352
|
+
if (abs_running_sum >= abs_val) sum_compensation += (running_sum - tentative_sum) + val;
|
|
353
|
+
else sum_compensation += (val - tentative_sum) + running_sum;
|
|
354
|
+
running_sum = tentative_sum;
|
|
355
|
+
|
|
356
|
+
nk_f32_t squared_value = val * val;
|
|
357
|
+
nk_f32_t tentative_sumsq = running_sumsq + squared_value;
|
|
358
|
+
if (running_sumsq >= squared_value) sumsq_compensation += (running_sumsq - tentative_sumsq) + squared_value;
|
|
359
|
+
else sumsq_compensation += (squared_value - tentative_sumsq) + running_sumsq;
|
|
360
|
+
running_sumsq = tentative_sumsq;
|
|
361
|
+
}
|
|
362
|
+
*sum_ptr = running_sum + sum_compensation, *sumsq_ptr = running_sumsq + sumsq_compensation;
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
NK_PUBLIC void nk_reduce_moments_i4_serial( //
|
|
366
|
+
nk_i4x2_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
367
|
+
nk_i64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
|
|
368
|
+
count = nk_size_round_up_to_multiple_(count, 2);
|
|
369
|
+
nk_i64_t sum = 0;
|
|
370
|
+
nk_u64_t sumsq = 0;
|
|
371
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
372
|
+
for (nk_size_t i = 0; i < count; i += 2) {
|
|
373
|
+
unsigned char byte_val = ptr[(i / 2) * stride_bytes];
|
|
374
|
+
nk_i64_t low = (nk_i64_t)nk_i4x2_low_(byte_val);
|
|
375
|
+
nk_i64_t high = (nk_i64_t)nk_i4x2_high_(byte_val);
|
|
376
|
+
sum += low + high;
|
|
377
|
+
sumsq += (nk_u64_t)(low * low) + (nk_u64_t)(high * high);
|
|
378
|
+
}
|
|
379
|
+
*sum_ptr = sum, *sumsq_ptr = sumsq;
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
NK_PUBLIC void nk_reduce_moments_u4_serial( //
|
|
383
|
+
nk_u4x2_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
384
|
+
nk_u64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
|
|
385
|
+
count = nk_size_round_up_to_multiple_(count, 2);
|
|
386
|
+
nk_u64_t sum = 0, sumsq = 0;
|
|
387
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
388
|
+
for (nk_size_t i = 0; i < count; i += 2) {
|
|
389
|
+
unsigned char byte_val = ptr[(i / 2) * stride_bytes];
|
|
390
|
+
nk_u64_t low = nk_u4x2_low_(byte_val);
|
|
391
|
+
nk_u64_t high = nk_u4x2_high_(byte_val);
|
|
392
|
+
sum += low + high, sumsq += low * low + high * high;
|
|
393
|
+
}
|
|
394
|
+
*sum_ptr = sum, *sumsq_ptr = sumsq;
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
NK_PUBLIC void nk_reduce_moments_u1_serial( //
|
|
398
|
+
nk_u1x8_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
399
|
+
nk_u64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
|
|
400
|
+
count = nk_size_round_up_to_multiple_(count, 8);
|
|
401
|
+
nk_u64_t sum = 0;
|
|
402
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
403
|
+
for (nk_size_t i = 0; i < count; i += 8) {
|
|
404
|
+
unsigned char byte_val = ptr[(i / 8) * stride_bytes];
|
|
405
|
+
sum += nk_u64_popcount_(byte_val);
|
|
406
|
+
}
|
|
407
|
+
*sum_ptr = sum, *sumsq_ptr = sum; // 0^2 = 0, 1^2 = 1, so sumsq == sum
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
NK_PUBLIC void nk_reduce_minmax_f32_serial( //
|
|
411
|
+
nk_f32_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
412
|
+
nk_f32_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
413
|
+
nk_f32_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
414
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
415
|
+
nk_f32_t min_value = NK_F32_MAX, max_value = NK_F32_MIN;
|
|
416
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
417
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
418
|
+
nk_f32_t val = *(nk_f32_t const *)ptr;
|
|
419
|
+
if (val < min_value) min_value = val, min_idx = i;
|
|
420
|
+
if (val > max_value) max_value = val, max_idx = i;
|
|
421
|
+
}
|
|
422
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
423
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
NK_PUBLIC void nk_reduce_minmax_f64_serial( //
|
|
427
|
+
nk_f64_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
428
|
+
nk_f64_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
429
|
+
nk_f64_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
430
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
431
|
+
nk_f64_t min_value = NK_F64_MAX, max_value = NK_F64_MIN;
|
|
432
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
433
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
434
|
+
nk_f64_t val = *(nk_f64_t const *)ptr;
|
|
435
|
+
if (val < min_value) min_value = val, min_idx = i;
|
|
436
|
+
if (val > max_value) max_value = val, max_idx = i;
|
|
437
|
+
}
|
|
438
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
439
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
NK_PUBLIC void nk_reduce_minmax_i8_serial( //
|
|
443
|
+
nk_i8_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
444
|
+
nk_i8_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
445
|
+
nk_i8_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
446
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
447
|
+
nk_i8_t min_value = NK_I8_MAX, max_value = NK_I8_MIN;
|
|
448
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
449
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
450
|
+
nk_i8_t val = *(nk_i8_t const *)ptr;
|
|
451
|
+
if (val < min_value) min_value = val, min_idx = i;
|
|
452
|
+
if (val > max_value) max_value = val, max_idx = i;
|
|
453
|
+
}
|
|
454
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
455
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
NK_PUBLIC void nk_reduce_minmax_u8_serial( //
|
|
459
|
+
nk_u8_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
460
|
+
nk_u8_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
461
|
+
nk_u8_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
462
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
463
|
+
nk_u8_t min_value = NK_U8_MAX, max_value = NK_U8_MIN;
|
|
464
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
465
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
466
|
+
nk_u8_t val = *(nk_u8_t const *)ptr;
|
|
467
|
+
if (val < min_value) min_value = val, min_idx = i;
|
|
468
|
+
if (val > max_value) max_value = val, max_idx = i;
|
|
469
|
+
}
|
|
470
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
471
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
472
|
+
}
|
|
473
|
+
|
|
474
|
+
NK_PUBLIC void nk_reduce_minmax_i16_serial( //
|
|
475
|
+
nk_i16_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
476
|
+
nk_i16_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
477
|
+
nk_i16_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
478
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
479
|
+
nk_i16_t min_value = NK_I16_MAX, max_value = NK_I16_MIN;
|
|
480
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
481
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
482
|
+
nk_i16_t val = *(nk_i16_t const *)ptr;
|
|
483
|
+
if (val < min_value) min_value = val, min_idx = i;
|
|
484
|
+
if (val > max_value) max_value = val, max_idx = i;
|
|
485
|
+
}
|
|
486
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
487
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
NK_PUBLIC void nk_reduce_minmax_u16_serial( //
|
|
491
|
+
nk_u16_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
492
|
+
nk_u16_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
493
|
+
nk_u16_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
494
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
495
|
+
nk_u16_t min_value = NK_U16_MAX, max_value = NK_U16_MIN;
|
|
496
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
497
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
498
|
+
nk_u16_t val = *(nk_u16_t const *)ptr;
|
|
499
|
+
if (val < min_value) min_value = val, min_idx = i;
|
|
500
|
+
if (val > max_value) max_value = val, max_idx = i;
|
|
501
|
+
}
|
|
502
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
503
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
NK_PUBLIC void nk_reduce_minmax_i32_serial( //
|
|
507
|
+
nk_i32_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
508
|
+
nk_i32_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
509
|
+
nk_i32_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
510
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
511
|
+
nk_i32_t min_value = NK_I32_MAX, max_value = NK_I32_MIN;
|
|
512
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
513
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
514
|
+
nk_i32_t val = *(nk_i32_t const *)ptr;
|
|
515
|
+
if (val < min_value) min_value = val, min_idx = i;
|
|
516
|
+
if (val > max_value) max_value = val, max_idx = i;
|
|
517
|
+
}
|
|
518
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
519
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
NK_PUBLIC void nk_reduce_minmax_u32_serial( //
|
|
523
|
+
nk_u32_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
524
|
+
nk_u32_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
525
|
+
nk_u32_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
526
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
527
|
+
nk_u32_t min_value = NK_U32_MAX, max_value = NK_U32_MIN;
|
|
528
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
529
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
530
|
+
nk_u32_t val = *(nk_u32_t const *)ptr;
|
|
531
|
+
if (val < min_value) min_value = val, min_idx = i;
|
|
532
|
+
if (val > max_value) max_value = val, max_idx = i;
|
|
533
|
+
}
|
|
534
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
535
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
NK_PUBLIC void nk_reduce_minmax_i64_serial( //
|
|
539
|
+
nk_i64_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
540
|
+
nk_i64_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
541
|
+
nk_i64_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
542
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
543
|
+
nk_i64_t min_value = NK_I64_MAX, max_value = NK_I64_MIN;
|
|
544
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
545
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
546
|
+
nk_i64_t val = *(nk_i64_t const *)ptr;
|
|
547
|
+
if (val < min_value) min_value = val, min_idx = i;
|
|
548
|
+
if (val > max_value) max_value = val, max_idx = i;
|
|
549
|
+
}
|
|
550
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
551
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
NK_PUBLIC void nk_reduce_minmax_u64_serial( //
|
|
555
|
+
nk_u64_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
556
|
+
nk_u64_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
557
|
+
nk_u64_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
558
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
559
|
+
nk_u64_t min_value = NK_U64_MAX, max_value = NK_U64_MIN;
|
|
560
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
561
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
562
|
+
nk_u64_t val = *(nk_u64_t const *)ptr;
|
|
563
|
+
if (val < min_value) min_value = val, min_idx = i;
|
|
564
|
+
if (val > max_value) max_value = val, max_idx = i;
|
|
565
|
+
}
|
|
566
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
567
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
568
|
+
}
|
|
569
|
+
|
|
570
|
+
NK_PUBLIC void nk_reduce_minmax_f16_serial( //
|
|
571
|
+
nk_f16_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
572
|
+
nk_f16_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
573
|
+
nk_f16_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
574
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
575
|
+
nk_f16_t min_value = nk_f16_from_u16_(NK_F16_MAX), max_value = nk_f16_from_u16_(NK_F16_MIN);
|
|
576
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
577
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
578
|
+
nk_f16_t raw_value = *(nk_f16_t const *)ptr;
|
|
579
|
+
nk_fui16_t raw_fui;
|
|
580
|
+
raw_fui.f = raw_value;
|
|
581
|
+
if (nk_f16_is_nan_(raw_fui.u)) continue;
|
|
582
|
+
if (min_idx == NK_SIZE_MAX || nk_f16_order_serial(raw_value, min_value) < 0) min_value = raw_value, min_idx = i;
|
|
583
|
+
if (max_idx == NK_SIZE_MAX || nk_f16_order_serial(raw_value, max_value) > 0) max_value = raw_value, max_idx = i;
|
|
584
|
+
}
|
|
585
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
586
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
NK_PUBLIC void nk_reduce_minmax_bf16_serial( //
|
|
590
|
+
nk_bf16_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
591
|
+
nk_bf16_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
592
|
+
nk_bf16_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
593
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
594
|
+
nk_bf16_t min_value = nk_bf16_from_u16_(NK_BF16_MAX), max_value = nk_bf16_from_u16_(NK_BF16_MIN);
|
|
595
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
596
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
597
|
+
nk_bf16_t raw_value = *(nk_bf16_t const *)ptr;
|
|
598
|
+
nk_fui16_t raw_fui;
|
|
599
|
+
raw_fui.bf = raw_value;
|
|
600
|
+
if (nk_bf16_is_nan_(raw_fui.u)) continue;
|
|
601
|
+
if (min_idx == NK_SIZE_MAX || nk_bf16_order_serial(raw_value, min_value) < 0)
|
|
602
|
+
min_value = raw_value, min_idx = i;
|
|
603
|
+
if (max_idx == NK_SIZE_MAX || nk_bf16_order_serial(raw_value, max_value) > 0)
|
|
604
|
+
max_value = raw_value, max_idx = i;
|
|
605
|
+
}
|
|
606
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
607
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
NK_PUBLIC void nk_reduce_minmax_e4m3_serial( //
|
|
611
|
+
nk_e4m3_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
612
|
+
nk_e4m3_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
613
|
+
nk_e4m3_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
614
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
615
|
+
nk_e4m3_t min_value = NK_E4M3_MAX, max_value = NK_E4M3_MIN;
|
|
616
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
617
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
618
|
+
nk_e4m3_t raw_value = *(nk_e4m3_t const *)ptr;
|
|
619
|
+
if (nk_e4m3_is_nan_(raw_value)) continue;
|
|
620
|
+
if (min_idx == NK_SIZE_MAX || nk_e4m3_order_serial(raw_value, min_value) < 0)
|
|
621
|
+
min_value = raw_value, min_idx = i;
|
|
622
|
+
if (max_idx == NK_SIZE_MAX || nk_e4m3_order_serial(raw_value, max_value) > 0)
|
|
623
|
+
max_value = raw_value, max_idx = i;
|
|
624
|
+
}
|
|
625
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
626
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
627
|
+
}
|
|
628
|
+
|
|
629
|
+
NK_PUBLIC void nk_reduce_minmax_e5m2_serial( //
|
|
630
|
+
nk_e5m2_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
631
|
+
nk_e5m2_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
632
|
+
nk_e5m2_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
633
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
634
|
+
nk_e5m2_t min_value = NK_E5M2_MAX, max_value = NK_E5M2_MIN;
|
|
635
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
636
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
637
|
+
nk_e5m2_t raw_value = *(nk_e5m2_t const *)ptr;
|
|
638
|
+
if (nk_e5m2_is_nan_(raw_value)) continue;
|
|
639
|
+
if (min_idx == NK_SIZE_MAX || nk_e5m2_order_serial(raw_value, min_value) < 0)
|
|
640
|
+
min_value = raw_value, min_idx = i;
|
|
641
|
+
if (max_idx == NK_SIZE_MAX || nk_e5m2_order_serial(raw_value, max_value) > 0)
|
|
642
|
+
max_value = raw_value, max_idx = i;
|
|
643
|
+
}
|
|
644
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
645
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
646
|
+
}
|
|
647
|
+
|
|
648
|
+
NK_PUBLIC void nk_reduce_minmax_e2m3_serial( //
|
|
649
|
+
nk_e2m3_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
650
|
+
nk_e2m3_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
651
|
+
nk_e2m3_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
652
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
653
|
+
nk_e2m3_t min_value = NK_E2M3_MAX, max_value = NK_E2M3_MIN;
|
|
654
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
655
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
656
|
+
nk_e2m3_t raw_value = *(nk_e2m3_t const *)ptr;
|
|
657
|
+
if (nk_e2m3_order_serial(raw_value, min_value) < 0) min_value = raw_value, min_idx = i;
|
|
658
|
+
if (nk_e2m3_order_serial(raw_value, max_value) > 0) max_value = raw_value, max_idx = i;
|
|
659
|
+
}
|
|
660
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
661
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
662
|
+
}
|
|
663
|
+
|
|
664
|
+
NK_PUBLIC void nk_reduce_minmax_e3m2_serial( //
|
|
665
|
+
nk_e3m2_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
666
|
+
nk_e3m2_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
667
|
+
nk_e3m2_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
668
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
669
|
+
nk_e3m2_t min_value = NK_E3M2_MAX, max_value = NK_E3M2_MIN;
|
|
670
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
671
|
+
for (nk_size_t i = 0; i < count; ++i, ptr += stride_bytes) {
|
|
672
|
+
nk_e3m2_t raw_value = *(nk_e3m2_t const *)ptr;
|
|
673
|
+
if (nk_e3m2_order_serial(raw_value, min_value) < 0) min_value = raw_value, min_idx = i;
|
|
674
|
+
if (nk_e3m2_order_serial(raw_value, max_value) > 0) max_value = raw_value, max_idx = i;
|
|
675
|
+
}
|
|
676
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
677
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
678
|
+
}
|
|
679
|
+
|
|
680
|
+
NK_PUBLIC void nk_reduce_minmax_i4_serial( //
|
|
681
|
+
nk_i4x2_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
682
|
+
nk_i8_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
683
|
+
nk_i8_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
684
|
+
count = nk_size_round_up_to_multiple_(count, 2);
|
|
685
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
686
|
+
nk_i8_t min_value = 7, max_value = -8; // i4 range: -8 to 7
|
|
687
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
688
|
+
for (nk_size_t i = 0; i < count; ++i) {
|
|
689
|
+
nk_size_t byte_idx = i / 2;
|
|
690
|
+
unsigned char byte_val = ptr[byte_idx * stride_bytes];
|
|
691
|
+
nk_i8_t val = nk_i4x2_get_(byte_val, (int)(i & 1));
|
|
692
|
+
if (val < min_value) min_value = val, min_idx = i;
|
|
693
|
+
if (val > max_value) max_value = val, max_idx = i;
|
|
694
|
+
}
|
|
695
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
696
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
697
|
+
}
|
|
698
|
+
|
|
699
|
+
NK_PUBLIC void nk_reduce_minmax_u4_serial( //
|
|
700
|
+
nk_u4x2_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
701
|
+
nk_u8_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
702
|
+
nk_u8_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
703
|
+
count = nk_size_round_up_to_multiple_(count, 2);
|
|
704
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
705
|
+
nk_u8_t min_value = 15, max_value = 0; // u4 range: 0 to 15
|
|
706
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
707
|
+
for (nk_size_t i = 0; i < count; ++i) {
|
|
708
|
+
nk_size_t byte_idx = i / 2;
|
|
709
|
+
unsigned char byte_val = ptr[byte_idx * stride_bytes];
|
|
710
|
+
nk_u8_t nibble = nk_u4x2_get_(byte_val, (int)(i & 1));
|
|
711
|
+
if (nibble < min_value) min_value = nibble, min_idx = i;
|
|
712
|
+
if (nibble > max_value) max_value = nibble, max_idx = i;
|
|
713
|
+
}
|
|
714
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
715
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
NK_PUBLIC void nk_reduce_minmax_u1_serial( //
|
|
719
|
+
nk_u1x8_t const *data, nk_size_t count, nk_size_t stride_bytes, //
|
|
720
|
+
nk_u8_t *min_value_ptr, nk_size_t *min_index_ptr, //
|
|
721
|
+
nk_u8_t *max_value_ptr, nk_size_t *max_index_ptr) {
|
|
722
|
+
count = nk_size_round_up_to_multiple_(count, 8);
|
|
723
|
+
unsigned char const *ptr = (unsigned char const *)data;
|
|
724
|
+
nk_u8_t min_value = 1, max_value = 0;
|
|
725
|
+
nk_size_t min_idx = NK_SIZE_MAX, max_idx = NK_SIZE_MAX;
|
|
726
|
+
for (nk_size_t i = 0; i < count; ++i) {
|
|
727
|
+
nk_size_t byte_idx = i / 8;
|
|
728
|
+
unsigned char byte_val = ptr[byte_idx * stride_bytes];
|
|
729
|
+
nk_u8_t bit = (byte_val >> (i % 8)) & 1;
|
|
730
|
+
if (bit < min_value) {
|
|
731
|
+
min_value = bit;
|
|
732
|
+
min_idx = i;
|
|
733
|
+
if (min_value == 0 && max_value == 1) {
|
|
734
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
735
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
736
|
+
return;
|
|
737
|
+
}
|
|
738
|
+
}
|
|
739
|
+
if (bit > max_value) {
|
|
740
|
+
max_value = bit;
|
|
741
|
+
max_idx = i;
|
|
742
|
+
if (min_value == 0 && max_value == 1) {
|
|
743
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
744
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
745
|
+
return;
|
|
746
|
+
}
|
|
747
|
+
}
|
|
748
|
+
}
|
|
749
|
+
*min_value_ptr = min_value, *min_index_ptr = min_idx;
|
|
750
|
+
*max_value_ptr = max_value, *max_index_ptr = max_idx;
|
|
751
|
+
}
|
|
752
|
+
|
|
753
|
+
#if defined(__cplusplus)
|
|
754
|
+
} // extern "C"
|
|
755
|
+
#endif
|
|
756
|
+
|
|
757
|
+
#endif // NK_REDUCE_SERIAL_H
|