numkong 7.0.0 → 7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
Diff of package/include/numkong/reduce/haswell.h (inferred from the nk_reduce_*_haswell_ symbols below). The deleted side is truncated in this rendering; "…" marks cut-off text.

@@ -47,18 +47,18 @@ extern "C" {
 
 /** @brief Horizontal sum of 4 doubles in a YMM register. */
 NK_INTERNAL nk_f64_t nk_reduce_add_f64x4_haswell_(__m256d sum_f64x4) {
-    __m128d …
-    __m128d …
-    __m128d sum_f64x2 = _mm_add_pd(…
+    __m128d low_f64x2 = _mm256_castpd256_pd128(sum_f64x4);
+    __m128d high_f64x2 = _mm256_extractf128_pd(sum_f64x4, 1);
+    __m128d sum_f64x2 = _mm_add_pd(low_f64x2, high_f64x2);
     sum_f64x2 = _mm_hadd_pd(sum_f64x2, sum_f64x2);
     return _mm_cvtsd_f64(sum_f64x2);
 }
 
 /** @brief Horizontal sum of 8 floats in a YMM register (native f32 precision). */
 NK_INTERNAL nk_f32_t nk_reduce_add_f32x8_haswell_(__m256 sum_f32x8) {
-    __m128 …
-    __m128 …
-    __m128 sum_f32x4 = _mm_add_ps(…
+    __m128 low_f32x4 = _mm256_castps256_ps128(sum_f32x8);
+    __m128 high_f32x4 = _mm256_extractf128_ps(sum_f32x8, 1);
+    __m128 sum_f32x4 = _mm_add_ps(low_f32x4, high_f32x4);
     sum_f32x4 = _mm_hadd_ps(sum_f32x4, sum_f32x4);
     sum_f32x4 = _mm_hadd_ps(sum_f32x4, sum_f32x4);
     return _mm_cvtss_f32(sum_f32x4);
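
The rewritten reductions all follow the standard AVX2 split-and-fold pattern: cast out the low 128-bit lane (free, no instruction emitted), extract the high lane, combine the two, then fold within the XMM register. A minimal standalone restatement of the f32 case, assuming only <immintrin.h>:

#include <immintrin.h>

/* Standalone equivalent of the f32x8 horizontal add above: split the
 * YMM register into its two XMM halves, add them, then collapse the
 * remaining four lanes with two horizontal adds. */
static float hsum_f32x8(__m256 v) {
    __m128 low = _mm256_castps256_ps128(v);    /* lanes 0..3, zero-cost */
    __m128 high = _mm256_extractf128_ps(v, 1); /* lanes 4..7 */
    __m128 sum = _mm_add_ps(low, high);        /* 4 partial sums */
    sum = _mm_hadd_ps(sum, sum);               /* 2 partial sums */
    sum = _mm_hadd_ps(sum, sum);               /* total in lane 0 */
    return _mm_cvtss_f32(sum);
}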
@@ -66,9 +66,9 @@ NK_INTERNAL nk_f32_t nk_reduce_add_f32x8_haswell_(__m256 sum_f32x8) {
 
 /** @brief Horizontal sum of 8 i32s in a YMM register. */
 NK_INTERNAL nk_i32_t nk_reduce_add_i32x8_haswell_(__m256i sum_i32x8) {
-    __m128i …
-    __m128i …
-    __m128i sum_i32x4 = _mm_add_epi32(…
+    __m128i low_i32x4 = _mm256_castsi256_si128(sum_i32x8);
+    __m128i high_i32x4 = _mm256_extracti128_si256(sum_i32x8, 1);
+    __m128i sum_i32x4 = _mm_add_epi32(low_i32x4, high_i32x4);
     sum_i32x4 = _mm_hadd_epi32(sum_i32x4, sum_i32x4);
     sum_i32x4 = _mm_hadd_epi32(sum_i32x4, sum_i32x4);
     return _mm_cvtsi128_si32(sum_i32x4);

@@ -76,19 +76,19 @@ NK_INTERNAL nk_i32_t nk_reduce_add_i32x8_haswell_(__m256i sum_i32x8) {
 
 /** @brief Horizontal sum of 4 i64s in a YMM register. */
 NK_INTERNAL nk_i64_t nk_reduce_add_i64x4_haswell_(__m256i sum_i64x4) {
-    __m128i …
-    __m128i …
-    __m128i sum_i64x2 = _mm_add_epi64(…
-    __m128i …
-    __m128i …
-    return _mm_cvtsi128_si64(…
+    __m128i low_i64x2 = _mm256_castsi256_si128(sum_i64x4);
+    __m128i high_i64x2 = _mm256_extracti128_si256(sum_i64x4, 1);
+    __m128i sum_i64x2 = _mm_add_epi64(low_i64x2, high_i64x2);
+    __m128i high_lane_i64x2 = _mm_unpackhi_epi64(sum_i64x2, sum_i64x2);
+    __m128i final_i64x2 = _mm_add_epi64(sum_i64x2, high_lane_i64x2);
+    return _mm_cvtsi128_si64(final_i64x2);
 }
 
 /** @brief Horizontal min of 8 signed i8s in a YMM register. */
 NK_INTERNAL nk_i8_t nk_reduce_min_i8x32_haswell_(__m256i min_i8x32) {
-    __m128i …
-    __m128i …
-    __m128i min_i8x16 = _mm_min_epi8(…
+    __m128i low_i8x16 = _mm256_castsi256_si128(min_i8x32);
+    __m128i high_i8x16 = _mm256_extracti128_si256(min_i8x32, 1);
+    __m128i min_i8x16 = _mm_min_epi8(low_i8x16, high_i8x16);
     min_i8x16 = _mm_min_epi8(min_i8x16, _mm_shuffle_epi32(min_i8x16, _MM_SHUFFLE(2, 3, 0, 1)));
     min_i8x16 = _mm_min_epi8(min_i8x16, _mm_shuffle_epi32(min_i8x16, _MM_SHUFFLE(1, 0, 3, 2)));
     min_i8x16 = _mm_min_epi8(min_i8x16, _mm_srli_si128(min_i8x16, 2));
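
There is no _mm_hadd_epi64, so the i64 variant finishes its fold by moving the upper 64-bit lane down with an unpack and adding once more. The same step in isolation, as a sketch assuming only <immintrin.h> and <stdint.h>:

#include <immintrin.h>
#include <stdint.h>

/* Horizontal sum of 4 i64s, mirroring the diff: lane split, one vector
 * add, then an unpackhi fold because no 64-bit hadd exists. */
static int64_t hsum_i64x4(__m256i v) {
    __m128i low = _mm256_castsi256_si128(v);
    __m128i high = _mm256_extracti128_si256(v, 1);
    __m128i sum = _mm_add_epi64(low, high);       /* two partial sums */
    __m128i upper = _mm_unpackhi_epi64(sum, sum); /* broadcast lane 1 */
    sum = _mm_add_epi64(sum, upper);              /* total in lane 0 */
    return _mm_cvtsi128_si64(sum);
}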
@@ -98,9 +98,9 @@ NK_INTERNAL nk_i8_t nk_reduce_min_i8x32_haswell_(__m256i min_i8x32) {
 
 /** @brief Horizontal max of 8 signed i8s in a YMM register. */
 NK_INTERNAL nk_i8_t nk_reduce_max_i8x32_haswell_(__m256i max_i8x32) {
-    __m128i …
-    __m128i …
-    __m128i max_i8x16 = _mm_max_epi8(…
+    __m128i low_i8x16 = _mm256_castsi256_si128(max_i8x32);
+    __m128i high_i8x16 = _mm256_extracti128_si256(max_i8x32, 1);
+    __m128i max_i8x16 = _mm_max_epi8(low_i8x16, high_i8x16);
     max_i8x16 = _mm_max_epi8(max_i8x16, _mm_shuffle_epi32(max_i8x16, _MM_SHUFFLE(2, 3, 0, 1)));
     max_i8x16 = _mm_max_epi8(max_i8x16, _mm_shuffle_epi32(max_i8x16, _MM_SHUFFLE(1, 0, 3, 2)));
     max_i8x16 = _mm_max_epi8(max_i8x16, _mm_srli_si128(max_i8x16, 2));

@@ -110,9 +110,9 @@ NK_INTERNAL nk_i8_t nk_reduce_max_i8x32_haswell_(__m256i max_i8x32) {
 
 /** @brief Horizontal min of 8 unsigned u8s in a YMM register. */
 NK_INTERNAL nk_u8_t nk_reduce_min_u8x32_haswell_(__m256i min_u8x32) {
-    __m128i …
-    __m128i …
-    __m128i min_u8x16 = _mm_min_epu8(…
+    __m128i low_u8x16 = _mm256_castsi256_si128(min_u8x32);
+    __m128i high_u8x16 = _mm256_extracti128_si256(min_u8x32, 1);
+    __m128i min_u8x16 = _mm_min_epu8(low_u8x16, high_u8x16);
     min_u8x16 = _mm_min_epu8(min_u8x16, _mm_shuffle_epi32(min_u8x16, _MM_SHUFFLE(2, 3, 0, 1)));
     min_u8x16 = _mm_min_epu8(min_u8x16, _mm_shuffle_epi32(min_u8x16, _MM_SHUFFLE(1, 0, 3, 2)));
     min_u8x16 = _mm_min_epu8(min_u8x16, _mm_srli_si128(min_u8x16, 2));

@@ -122,9 +122,9 @@ NK_INTERNAL nk_u8_t nk_reduce_min_u8x32_haswell_(__m256i min_u8x32) {
 
 /** @brief Horizontal max of 8 unsigned u8s in a YMM register. */
 NK_INTERNAL nk_u8_t nk_reduce_max_u8x32_haswell_(__m256i max_u8x32) {
-    __m128i …
-    __m128i …
-    __m128i max_u8x16 = _mm_max_epu8(…
+    __m128i low_u8x16 = _mm256_castsi256_si128(max_u8x32);
+    __m128i high_u8x16 = _mm256_extracti128_si256(max_u8x32, 1);
+    __m128i max_u8x16 = _mm_max_epu8(low_u8x16, high_u8x16);
     max_u8x16 = _mm_max_epu8(max_u8x16, _mm_shuffle_epi32(max_u8x16, _MM_SHUFFLE(2, 3, 0, 1)));
     max_u8x16 = _mm_max_epu8(max_u8x16, _mm_shuffle_epi32(max_u8x16, _MM_SHUFFLE(1, 0, 3, 2)));
     max_u8x16 = _mm_max_epu8(max_u8x16, _mm_srli_si128(max_u8x16, 2));

@@ -134,9 +134,9 @@ NK_INTERNAL nk_u8_t nk_reduce_max_u8x32_haswell_(__m256i max_u8x32) {
 
 /** @brief Horizontal min of 16 signed i16s in a YMM register. */
 NK_INTERNAL nk_i16_t nk_reduce_min_i16x16_haswell_(__m256i min_i16x16) {
-    __m128i …
-    __m128i …
-    __m128i min_i16x8 = _mm_min_epi16(…
+    __m128i low_i16x8 = _mm256_castsi256_si128(min_i16x16);
+    __m128i high_i16x8 = _mm256_extracti128_si256(min_i16x16, 1);
+    __m128i min_i16x8 = _mm_min_epi16(low_i16x8, high_i16x8);
     min_i16x8 = _mm_min_epi16(min_i16x8, _mm_shuffle_epi32(min_i16x8, _MM_SHUFFLE(2, 3, 0, 1)));
     min_i16x8 = _mm_min_epi16(min_i16x8, _mm_shuffle_epi32(min_i16x8, _MM_SHUFFLE(1, 0, 3, 2)));
     min_i16x8 = _mm_min_epi16(min_i16x8, _mm_srli_si128(min_i16x8, 2));

@@ -145,9 +145,9 @@ NK_INTERNAL nk_i16_t nk_reduce_min_i16x16_haswell_(__m256i min_i16x16) {
 
 /** @brief Horizontal max of 16 signed i16s in a YMM register. */
 NK_INTERNAL nk_i16_t nk_reduce_max_i16x16_haswell_(__m256i max_i16x16) {
-    __m128i …
-    __m128i …
-    __m128i max_i16x8 = _mm_max_epi16(…
+    __m128i low_i16x8 = _mm256_castsi256_si128(max_i16x16);
+    __m128i high_i16x8 = _mm256_extracti128_si256(max_i16x16, 1);
+    __m128i max_i16x8 = _mm_max_epi16(low_i16x8, high_i16x8);
     max_i16x8 = _mm_max_epi16(max_i16x8, _mm_shuffle_epi32(max_i16x8, _MM_SHUFFLE(2, 3, 0, 1)));
     max_i16x8 = _mm_max_epi16(max_i16x8, _mm_shuffle_epi32(max_i16x8, _MM_SHUFFLE(1, 0, 3, 2)));
     max_i16x8 = _mm_max_epi16(max_i16x8, _mm_srli_si128(max_i16x8, 2));

@@ -156,9 +156,9 @@ NK_INTERNAL nk_i16_t nk_reduce_max_i16x16_haswell_(__m256i max_i16x16) {
 
 /** @brief Horizontal min of 16 unsigned u16s in a YMM register. */
 NK_INTERNAL nk_u16_t nk_reduce_min_u16x16_haswell_(__m256i min_u16x16) {
-    __m128i …
-    __m128i …
-    __m128i min_u16x8 = _mm_min_epu16(…
+    __m128i low_u16x8 = _mm256_castsi256_si128(min_u16x16);
+    __m128i high_u16x8 = _mm256_extracti128_si256(min_u16x16, 1);
+    __m128i min_u16x8 = _mm_min_epu16(low_u16x8, high_u16x8);
     min_u16x8 = _mm_min_epu16(min_u16x8, _mm_shuffle_epi32(min_u16x8, _MM_SHUFFLE(2, 3, 0, 1)));
     min_u16x8 = _mm_min_epu16(min_u16x8, _mm_shuffle_epi32(min_u16x8, _MM_SHUFFLE(1, 0, 3, 2)));
     min_u16x8 = _mm_min_epu16(min_u16x8, _mm_srli_si128(min_u16x8, 2));

@@ -167,9 +167,9 @@ NK_INTERNAL nk_u16_t nk_reduce_min_u16x16_haswell_(__m256i min_u16x16) {
 
 /** @brief Horizontal max of 16 unsigned u16s in a YMM register. */
 NK_INTERNAL nk_u16_t nk_reduce_max_u16x16_haswell_(__m256i max_u16x16) {
-    __m128i …
-    __m128i …
-    __m128i max_u16x8 = _mm_max_epu16(…
+    __m128i low_u16x8 = _mm256_castsi256_si128(max_u16x16);
+    __m128i high_u16x8 = _mm256_extracti128_si256(max_u16x16, 1);
+    __m128i max_u16x8 = _mm_max_epu16(low_u16x8, high_u16x8);
     max_u16x8 = _mm_max_epu16(max_u16x8, _mm_shuffle_epi32(max_u16x8, _MM_SHUFFLE(2, 3, 0, 1)));
     max_u16x8 = _mm_max_epu16(max_u16x8, _mm_shuffle_epi32(max_u16x8, _MM_SHUFFLE(1, 0, 3, 2)));
     max_u16x8 = _mm_max_epu16(max_u16x8, _mm_srli_si128(max_u16x8, 2));

@@ -178,9 +178,9 @@ NK_INTERNAL nk_u16_t nk_reduce_max_u16x16_haswell_(__m256i max_u16x16) {
 
 /** @brief Horizontal min of 8 signed i32s in a YMM register. */
 NK_INTERNAL nk_i32_t nk_reduce_min_i32x8_haswell_(__m256i min_i32x8) {
-    __m128i …
-    __m128i …
-    __m128i min_i32x4 = _mm_min_epi32(…
+    __m128i low_i32x4 = _mm256_castsi256_si128(min_i32x8);
+    __m128i high_i32x4 = _mm256_extracti128_si256(min_i32x8, 1);
+    __m128i min_i32x4 = _mm_min_epi32(low_i32x4, high_i32x4);
     min_i32x4 = _mm_min_epi32(min_i32x4, _mm_shuffle_epi32(min_i32x4, _MM_SHUFFLE(2, 3, 0, 1)));
     min_i32x4 = _mm_min_epi32(min_i32x4, _mm_shuffle_epi32(min_i32x4, _MM_SHUFFLE(1, 0, 3, 2)));
     return _mm_cvtsi128_si32(min_i32x4);

@@ -188,9 +188,9 @@ NK_INTERNAL nk_i32_t nk_reduce_min_i32x8_haswell_(__m256i min_i32x8) {
 
 /** @brief Horizontal max of 8 signed i32s in a YMM register. */
 NK_INTERNAL nk_i32_t nk_reduce_max_i32x8_haswell_(__m256i max_i32x8) {
-    __m128i …
-    __m128i …
-    __m128i max_i32x4 = _mm_max_epi32(…
+    __m128i low_i32x4 = _mm256_castsi256_si128(max_i32x8);
+    __m128i high_i32x4 = _mm256_extracti128_si256(max_i32x8, 1);
+    __m128i max_i32x4 = _mm_max_epi32(low_i32x4, high_i32x4);
     max_i32x4 = _mm_max_epi32(max_i32x4, _mm_shuffle_epi32(max_i32x4, _MM_SHUFFLE(2, 3, 0, 1)));
     max_i32x4 = _mm_max_epi32(max_i32x4, _mm_shuffle_epi32(max_i32x4, _MM_SHUFFLE(1, 0, 3, 2)));
     return _mm_cvtsi128_si32(max_i32x4);

@@ -198,9 +198,9 @@ NK_INTERNAL nk_i32_t nk_reduce_max_i32x8_haswell_(__m256i max_i32x8) {
 
 /** @brief Horizontal min of 8 unsigned u32s in a YMM register. */
 NK_INTERNAL nk_u32_t nk_reduce_min_u32x8_haswell_(__m256i min_u32x8) {
-    __m128i …
-    __m128i …
-    __m128i min_u32x4 = _mm_min_epu32(…
+    __m128i low_u32x4 = _mm256_castsi256_si128(min_u32x8);
+    __m128i high_u32x4 = _mm256_extracti128_si256(min_u32x8, 1);
+    __m128i min_u32x4 = _mm_min_epu32(low_u32x4, high_u32x4);
     min_u32x4 = _mm_min_epu32(min_u32x4, _mm_shuffle_epi32(min_u32x4, _MM_SHUFFLE(2, 3, 0, 1)));
     min_u32x4 = _mm_min_epu32(min_u32x4, _mm_shuffle_epi32(min_u32x4, _MM_SHUFFLE(1, 0, 3, 2)));
     return (nk_u32_t)_mm_cvtsi128_si32(min_u32x4);

@@ -208,9 +208,9 @@ NK_INTERNAL nk_u32_t nk_reduce_min_u32x8_haswell_(__m256i min_u32x8) {
 
 /** @brief Horizontal max of 8 unsigned u32s in a YMM register. */
 NK_INTERNAL nk_u32_t nk_reduce_max_u32x8_haswell_(__m256i max_u32x8) {
-    __m128i …
-    __m128i …
-    __m128i max_u32x4 = _mm_max_epu32(…
+    __m128i low_u32x4 = _mm256_castsi256_si128(max_u32x8);
+    __m128i high_u32x4 = _mm256_extracti128_si256(max_u32x8, 1);
+    __m128i max_u32x4 = _mm_max_epu32(low_u32x4, high_u32x4);
     max_u32x4 = _mm_max_epu32(max_u32x4, _mm_shuffle_epi32(max_u32x4, _MM_SHUFFLE(2, 3, 0, 1)));
     max_u32x4 = _mm_max_epu32(max_u32x4, _mm_shuffle_epi32(max_u32x4, _MM_SHUFFLE(1, 0, 3, 2)));
     return (nk_u32_t)_mm_cvtsi128_si32(max_u32x4);
@@ -218,61 +218,63 @@ NK_INTERNAL nk_u32_t nk_reduce_max_u32x8_haswell_(__m256i max_u32x8) {
 
 /** @brief Horizontal min of 4 signed i64s in a YMM register using comparison+blend. */
 NK_INTERNAL nk_i64_t nk_reduce_min_i64x4_haswell_(__m256i min_i64x4) {
-    __m128i …
-    __m128i …
-    __m128i cmp_i64x2 = _mm_cmpgt_epi64(…
-    __m128i min_i64x2 = _mm_blendv_epi8(…
-    __m128i …
-    __m128i …
-    __m128i …
-    return _mm_cvtsi128_si64(…
+    __m128i low_i64x2 = _mm256_castsi256_si128(min_i64x4);
+    __m128i high_i64x2 = _mm256_extracti128_si256(min_i64x4, 1);
+    __m128i cmp_i64x2 = _mm_cmpgt_epi64(low_i64x2, high_i64x2);
+    __m128i min_i64x2 = _mm_blendv_epi8(low_i64x2, high_i64x2, cmp_i64x2);
+    __m128i high_lane_i64x2 = _mm_unpackhi_epi64(min_i64x2, min_i64x2);
+    __m128i cmp_final_i64x2 = _mm_cmpgt_epi64(min_i64x2, high_lane_i64x2);
+    __m128i result_i64x2 = _mm_blendv_epi8(min_i64x2, high_lane_i64x2, cmp_final_i64x2);
+    return _mm_cvtsi128_si64(result_i64x2);
 }
 
 /** @brief Horizontal max of 4 signed i64s in a YMM register using comparison+blend. */
 NK_INTERNAL nk_i64_t nk_reduce_max_i64x4_haswell_(__m256i max_i64x4) {
-    __m128i …
-    __m128i …
-    __m128i cmp_i64x2 = _mm_cmpgt_epi64(…
-    __m128i max_i64x2 = _mm_blendv_epi8(…
-    __m128i …
-    __m128i …
-    __m128i …
-    return _mm_cvtsi128_si64(…
+    __m128i low_i64x2 = _mm256_castsi256_si128(max_i64x4);
+    __m128i high_i64x2 = _mm256_extracti128_si256(max_i64x4, 1);
+    __m128i cmp_i64x2 = _mm_cmpgt_epi64(low_i64x2, high_i64x2);
+    __m128i max_i64x2 = _mm_blendv_epi8(high_i64x2, low_i64x2, cmp_i64x2);
+    __m128i high_lane_i64x2 = _mm_unpackhi_epi64(max_i64x2, max_i64x2);
+    __m128i cmp_final_i64x2 = _mm_cmpgt_epi64(max_i64x2, high_lane_i64x2);
+    __m128i result_i64x2 = _mm_blendv_epi8(high_lane_i64x2, max_i64x2, cmp_final_i64x2);
+    return _mm_cvtsi128_si64(result_i64x2);
 }
 
 /** @brief Horizontal min of 4 unsigned u64s in a YMM register using XOR trick for unsigned comparison. */
 NK_INTERNAL nk_u64_t nk_reduce_min_u64x4_haswell_(__m256i min_u64x4) {
-    __m128i …
-    __m128i …
-    __m128i …
-    __m128i cmp_i64x2 = _mm_cmpgt_epi64(_mm_xor_si128(…
-    …
-    __m128i …
-    __m128i …
-    …
-    …
-    …
+    __m128i sign_bit_i64x2 = _mm_set1_epi64x((nk_i64_t)0x8000000000000000ull);
+    __m128i low_u64x2 = _mm256_castsi256_si128(min_u64x4);
+    __m128i high_u64x2 = _mm256_extracti128_si256(min_u64x4, 1);
+    __m128i cmp_i64x2 = _mm_cmpgt_epi64(_mm_xor_si128(low_u64x2, sign_bit_i64x2),
+                                        _mm_xor_si128(high_u64x2, sign_bit_i64x2));
+    __m128i min_u64x2 = _mm_blendv_epi8(low_u64x2, high_u64x2, cmp_i64x2);
+    __m128i high_lane_u64x2 = _mm_unpackhi_epi64(min_u64x2, min_u64x2);
+    __m128i cmp_final_i64x2 = _mm_cmpgt_epi64(_mm_xor_si128(min_u64x2, sign_bit_i64x2),
+                                              _mm_xor_si128(high_lane_u64x2, sign_bit_i64x2));
+    __m128i result_u64x2 = _mm_blendv_epi8(min_u64x2, high_lane_u64x2, cmp_final_i64x2);
+    return (nk_u64_t)_mm_cvtsi128_si64(result_u64x2);
 }
 
 /** @brief Horizontal max of 4 unsigned u64s in a YMM register using XOR trick for unsigned comparison. */
 NK_INTERNAL nk_u64_t nk_reduce_max_u64x4_haswell_(__m256i max_u64x4) {
-    __m128i …
-    __m128i …
-    __m128i …
-    __m128i cmp_i64x2 = _mm_cmpgt_epi64(_mm_xor_si128(…
-    …
-    __m128i …
-    __m128i …
-    …
-    …
-    …
+    __m128i sign_bit_i64x2 = _mm_set1_epi64x((nk_i64_t)0x8000000000000000ull);
+    __m128i low_u64x2 = _mm256_castsi256_si128(max_u64x4);
+    __m128i high_u64x2 = _mm256_extracti128_si256(max_u64x4, 1);
+    __m128i cmp_i64x2 = _mm_cmpgt_epi64(_mm_xor_si128(low_u64x2, sign_bit_i64x2),
+                                        _mm_xor_si128(high_u64x2, sign_bit_i64x2));
+    __m128i max_u64x2 = _mm_blendv_epi8(high_u64x2, low_u64x2, cmp_i64x2);
+    __m128i high_lane_u64x2 = _mm_unpackhi_epi64(max_u64x2, max_u64x2);
+    __m128i cmp_final_i64x2 = _mm_cmpgt_epi64(_mm_xor_si128(max_u64x2, sign_bit_i64x2),
+                                              _mm_xor_si128(high_lane_u64x2, sign_bit_i64x2));
+    __m128i result_u64x2 = _mm_blendv_epi8(high_lane_u64x2, max_u64x2, cmp_final_i64x2);
+    return (nk_u64_t)_mm_cvtsi128_si64(result_u64x2);
 }
 
 /** @brief Horizontal min of 8 floats in a YMM register. */
 NK_INTERNAL nk_f32_t nk_reduce_min_f32x8_haswell_(__m256 min_f32x8) {
-    __m128 …
-    __m128 …
-    __m128 min_f32x4 = _mm_min_ps(…
+    __m128 low_f32x4 = _mm256_castps256_ps128(min_f32x8);
+    __m128 high_f32x4 = _mm256_extractf128_ps(min_f32x8, 1);
+    __m128 min_f32x4 = _mm_min_ps(low_f32x4, high_f32x4);
     min_f32x4 = _mm_min_ps(min_f32x4, _mm_shuffle_ps(min_f32x4, min_f32x4, _MM_SHUFFLE(2, 3, 0, 1)));
     min_f32x4 = _mm_min_ps(min_f32x4, _mm_shuffle_ps(min_f32x4, min_f32x4, _MM_SHUFFLE(1, 0, 3, 2)));
     return _mm_cvtss_f32(min_f32x4);
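
AVX2 offers only a signed 64-bit compare (_mm_cmpgt_epi64); the u64 variants above make it behave as an unsigned compare by XORing both operands with the sign bit, which maps unsigned order onto signed order. The core of the trick, restated as a self-contained sketch:

#include <immintrin.h>
#include <stdint.h>

/* Unsigned 64-bit minimum of two XMM registers via the sign-bit-flip
 * trick used in the diff: after biasing both operands by 2^63, signed
 * cmpgt gives the unsigned ordering. */
static __m128i min_u64x2(__m128i a, __m128i b) {
    const __m128i bias = _mm_set1_epi64x((int64_t)0x8000000000000000ull);
    __m128i a_gt_b = _mm_cmpgt_epi64(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
    return _mm_blendv_epi8(a, b, a_gt_b); /* take b wherever a > b */
}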
@@ -280,9 +282,9 @@ NK_INTERNAL nk_f32_t nk_reduce_min_f32x8_haswell_(__m256 min_f32x8) {
 
 /** @brief Horizontal max of 8 floats in a YMM register. */
 NK_INTERNAL nk_f32_t nk_reduce_max_f32x8_haswell_(__m256 max_f32x8) {
-    __m128 …
-    __m128 …
-    __m128 max_f32x4 = _mm_max_ps(…
+    __m128 low_f32x4 = _mm256_castps256_ps128(max_f32x8);
+    __m128 high_f32x4 = _mm256_extractf128_ps(max_f32x8, 1);
+    __m128 max_f32x4 = _mm_max_ps(low_f32x4, high_f32x4);
     max_f32x4 = _mm_max_ps(max_f32x4, _mm_shuffle_ps(max_f32x4, max_f32x4, _MM_SHUFFLE(2, 3, 0, 1)));
     max_f32x4 = _mm_max_ps(max_f32x4, _mm_shuffle_ps(max_f32x4, max_f32x4, _MM_SHUFFLE(1, 0, 3, 2)));
     return _mm_cvtss_f32(max_f32x4);

@@ -290,18 +292,18 @@ NK_INTERNAL nk_f32_t nk_reduce_max_f32x8_haswell_(__m256 max_f32x8) {
 
 /** @brief Horizontal min of 4 doubles in a YMM register. */
 NK_INTERNAL nk_f64_t nk_reduce_min_f64x4_haswell_(__m256d min_f64x4) {
-    __m128d …
-    __m128d …
-    __m128d min_f64x2 = _mm_min_pd(…
+    __m128d low_f64x2 = _mm256_castpd256_pd128(min_f64x4);
+    __m128d high_f64x2 = _mm256_extractf128_pd(min_f64x4, 1);
+    __m128d min_f64x2 = _mm_min_pd(low_f64x2, high_f64x2);
     min_f64x2 = _mm_min_pd(min_f64x2, _mm_shuffle_pd(min_f64x2, min_f64x2, 1));
     return _mm_cvtsd_f64(min_f64x2);
 }
 
 /** @brief Horizontal max of 4 doubles in a YMM register. */
 NK_INTERNAL nk_f64_t nk_reduce_max_f64x4_haswell_(__m256d max_f64x4) {
-    __m128d …
-    __m128d …
-    __m128d max_f64x2 = _mm_max_pd(…
+    __m128d low_f64x2 = _mm256_castpd256_pd128(max_f64x4);
+    __m128d high_f64x2 = _mm256_extractf128_pd(max_f64x4, 1);
+    __m128d max_f64x2 = _mm_max_pd(low_f64x2, high_f64x2);
     max_f64x2 = _mm_max_pd(max_f64x2, _mm_shuffle_pd(max_f64x2, max_f64x2, 1));
     return _mm_cvtsd_f64(max_f64x2);
 }
@@ -529,7 +531,7 @@ NK_INTERNAL void nk_reduce_moments_f32_haswell_strided_( //
     __m256d sumsq_low_f64x4 = _mm256_setzero_pd(), sumsq_high_f64x4 = _mm256_setzero_pd();
     nk_size_t idx = 0, total = count * stride_elements;
     nk_size_t step = nk_size_round_up_to_multiple_(8, stride_elements);
-    for (; idx + …
+    for (; idx + stride_elements + 7 <= total; idx += step) {
         __m128 low_f32x4 = _mm_blendv_ps(zero_f32x4, _mm_loadu_ps(data_ptr + idx), blend_low_f32x4);
         __m128 high_f32x4 = _mm_blendv_ps(zero_f32x4, _mm_loadu_ps(data_ptr + idx + 4), blend_high_f32x4);
         __m256d low_f64x4 = _mm256_cvtps_pd(low_f32x4);

@@ -767,7 +769,7 @@ NK_INTERNAL void nk_reduce_moments_f64_haswell_strided_( //
     __m256d sumsq_comp_f64x4 = _mm256_setzero_pd();
     nk_size_t idx = 0, total = count * stride_elements;
     nk_size_t step = nk_size_round_up_to_multiple_(4, stride_elements);
-    for (; idx + …
+    for (; idx + stride_elements + 3 <= total; idx += step) {
         __m256d val_f64x4 = _mm256_blendv_pd(zero_f64x4, _mm256_loadu_pd(data_ptr + idx), blend_f64x4);
         __m256d tentative_f64x4 = _mm256_add_pd(sum_f64x4, val_f64x4);
         __m256d round_f64x4 = _mm256_sub_pd(tentative_f64x4, sum_f64x4);

@@ -979,7 +981,7 @@ NK_INTERNAL void nk_reduce_moments_i8_haswell_strided_( //
     nk_size_t total_scalars = count * stride_elements;
     nk_size_t vector_element_count = 0;
     nk_size_t step = elements_per_vector * stride_elements;
-    for (; idx_scalars + …
+    for (; idx_scalars + stride_elements + 31 <= total_scalars; idx_scalars += step) {
         __m256i data_i8x32 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx_scalars));
         data_i8x32 = _mm256_and_si256(data_i8x32, stride_mask_i8x32);
         __m256i unsigned_u8x32 = _mm256_xor_si256(data_i8x32, masked_bias_i8x32);

@@ -1179,7 +1181,7 @@ NK_INTERNAL void nk_reduce_moments_u8_haswell_strided_( //
     nk_size_t idx_scalars = 0;
     nk_size_t total_scalars = count * stride_elements;
     nk_size_t step = nk_size_round_up_to_multiple_(32, stride_elements);
-    for (; idx_scalars + …
+    for (; idx_scalars + stride_elements + 31 <= total_scalars; idx_scalars += step) {
         __m256i data_u8x32 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx_scalars));
         data_u8x32 = _mm256_and_si256(data_u8x32, stride_mask_u8x32);
         sum_u64x4 = _mm256_add_epi64(sum_u64x4, _mm256_sad_epu8(data_u8x32, zero_u8x32));

@@ -1375,7 +1377,7 @@ NK_INTERNAL void nk_reduce_moments_i16_haswell_strided_( //
     nk_size_t idx_scalars = 0;
     nk_size_t total_scalars = count * stride_elements;
     nk_size_t step = nk_size_round_up_to_multiple_(16, stride_elements);
-    for (; idx_scalars + …
+    for (; idx_scalars + stride_elements + 15 <= total_scalars; idx_scalars += step) {
         __m256i data_i16x16 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx_scalars));
         data_i16x16 = _mm256_and_si256(data_i16x16, stride_mask_i16x16);
         sum_i32x8 = _mm256_add_epi32(sum_i32x8, _mm256_madd_epi16(data_i16x16, ones_i16x16));
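
The five strided hunks above all rewrite the same loop-entry guard so that an iteration is entered only when every scalar its full-width vector load will touch is still in bounds. A scalar restatement of the shape of that guard; round_up_to_multiple_ here is a hypothetical stand-in for nk_size_round_up_to_multiple_, whose exact semantics are not visible in this diff:

#include <stddef.h>

static size_t round_up_to_multiple_(size_t value, size_t multiple) {
    return ((value + multiple - 1) / multiple) * multiple;
}

/* Sum every stride-th double; advance by a step rounded up to the
 * stride and mirror the corrected guard (idx + stride + 3 <= total)
 * so the 4-wide window never reads past the buffer. */
static double sum_strided(double const *data, size_t count, size_t stride) {
    double sum = 0.0;
    size_t total = count * stride;
    size_t step = round_up_to_multiple_(4, stride); /* 4 doubles per YMM */
    size_t idx = 0;
    for (; idx + stride + 3 <= total; idx += step)
        for (size_t lane = 0; lane < 4; lane++)     /* "vector" window */
            if ((idx + lane) % stride == 0) sum += data[idx + lane];
    for (; idx < total; idx++)                      /* scalar tail */
        if (idx % stride == 0) sum += data[idx];
    return sum;
}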
@@ -1566,19 +1568,19 @@ NK_INTERNAL void nk_reduce_moments_u16_haswell_strided_( //
     nk_size_t idx_scalars = 0;
     nk_size_t total_scalars = count * stride_elements;
     nk_size_t step = nk_size_round_up_to_multiple_(16, stride_elements);
-    for (; idx_scalars + …
+    for (; idx_scalars + stride_elements + 15 <= total_scalars; idx_scalars += step) {
         __m256i data_u16x16 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx_scalars));
         data_u16x16 = _mm256_and_si256(data_u16x16, stride_mask_i16x16);
-        __m256i …
-        __m256i …
-        sum_u32x8 = _mm256_add_epi32(sum_u32x8, …
-        sum_u32x8 = _mm256_add_epi32(sum_u32x8, …
-        __m256i …
-        __m256i …
-        sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_castsi256_si128(…
-        sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_extracti128_si256(…
-        sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_castsi256_si128(…
-        sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_extracti128_si256(…
+        __m256i low_u32x8 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(data_u16x16));
+        __m256i high_u32x8 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(data_u16x16, 1));
+        sum_u32x8 = _mm256_add_epi32(sum_u32x8, low_u32x8);
+        sum_u32x8 = _mm256_add_epi32(sum_u32x8, high_u32x8);
+        __m256i low_sq_u32x8 = _mm256_mullo_epi32(low_u32x8, low_u32x8);
+        __m256i high_sq_u32x8 = _mm256_mullo_epi32(high_u32x8, high_u32x8);
+        sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_castsi256_si128(low_sq_u32x8)));
+        sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_extracti128_si256(low_sq_u32x8, 1)));
+        sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_castsi256_si128(high_sq_u32x8)));
+        sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_cvtepu32_epi64(_mm256_extracti128_si256(high_sq_u32x8, 1)));
     }
     __m256i sum_u64x4 = _mm256_add_epi64( //
         _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum_u32x8)), //
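
The widening order in this u16 kernel matters: the square of 0xFFFF needs 32 bits, and many such squares need 64, so each u16 is widened to u32 before multiplying and each square to u64 before accumulating. A scalar restatement of the same ordering:

#include <stdint.h>
#include <stddef.h>

/* Scalar model of the u16 moments kernel's widening order: u16 -> u32
 * before the multiply (like cvtepu16_epi32), u32 -> u64 before the
 * accumulate (like cvtepu32_epi64). */
static void moments_u16(uint16_t const *data, size_t count,
                        uint64_t *sum_ptr, uint64_t *sumsq_ptr) {
    uint64_t sum = 0, sumsq = 0;
    for (size_t i = 0; i < count; i++) {
        uint32_t widened = data[i];
        uint32_t square = widened * widened; /* fits: max 0xFFFE0001 */
        sum += widened;
        sumsq += (uint64_t)square;
    }
    *sum_ptr = sum;
    *sumsq_ptr = sumsq;
}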
@@ -1730,8 +1732,8 @@ NK_INTERNAL void nk_reduce_moments_i32_haswell_contiguous_( //
     nk_i32_t const *data_ptr, nk_size_t count, //
     nk_i64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
 
-    __m256i …
-    __m256i …
+    __m256i sum_low_i64x4 = _mm256_setzero_si256();
+    __m256i sum_high_i64x4 = _mm256_setzero_si256();
     __m256i sumsq_u64x4 = _mm256_setzero_si256();
     int sumsq_overflow_mask = 0;
     __m256i sign_bit_i64x4 = _mm256_set1_epi64x((nk_i64_t)0x8000000000000000ULL);

@@ -1739,25 +1741,25 @@ NK_INTERNAL void nk_reduce_moments_i32_haswell_contiguous_( //
     for (; idx + 8 <= count; idx += 8) {
         __m256i data_i32x8 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx));
         // 128-bit sum: lo half
-        __m256i …
-        __m256i sum_before_i64x4 = …
-        …
-        __m256i result_biased_i64x4 = _mm256_xor_si256(…
+        __m256i widened_low_i64x4 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(data_i32x8));
+        __m256i sum_before_i64x4 = sum_low_i64x4;
+        sum_low_i64x4 = _mm256_add_epi64(sum_low_i64x4, widened_low_i64x4);
+        __m256i result_biased_i64x4 = _mm256_xor_si256(sum_low_i64x4, sign_bit_i64x4);
         __m256i before_biased_i64x4 = _mm256_xor_si256(sum_before_i64x4, sign_bit_i64x4);
         __m256i carry_mask_i64x4 = _mm256_cmpgt_epi64(before_biased_i64x4, result_biased_i64x4);
-        …
-        __m256i sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), …
-        …
+        sum_high_i64x4 = _mm256_sub_epi64(sum_high_i64x4, carry_mask_i64x4);
+        __m256i sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), widened_low_i64x4);
+        sum_high_i64x4 = _mm256_add_epi64(sum_high_i64x4, sign_ext_i64x4);
         // 128-bit sum: hi half
-        __m256i …
-        sum_before_i64x4 = …
-        …
-        result_biased_i64x4 = _mm256_xor_si256(…
+        __m256i widened_high_i64x4 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(data_i32x8, 1));
+        sum_before_i64x4 = sum_low_i64x4;
+        sum_low_i64x4 = _mm256_add_epi64(sum_low_i64x4, widened_high_i64x4);
+        result_biased_i64x4 = _mm256_xor_si256(sum_low_i64x4, sign_bit_i64x4);
         before_biased_i64x4 = _mm256_xor_si256(sum_before_i64x4, sign_bit_i64x4);
         carry_mask_i64x4 = _mm256_cmpgt_epi64(before_biased_i64x4, result_biased_i64x4);
-        …
-        sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), …
-        …
+        sum_high_i64x4 = _mm256_sub_epi64(sum_high_i64x4, carry_mask_i64x4);
+        sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), widened_high_i64x4);
+        sum_high_i64x4 = _mm256_add_epi64(sum_high_i64x4, sign_ext_i64x4);
         // Sumsq: running mask + wrapping add with unsigned carry detection
         __m256i even_sq_u64x4 = _mm256_mul_epi32(data_i32x8, data_i32x8);
         __m256i odd_i32x8 = _mm256_srli_epi64(data_i32x8, 32);

@@ -1780,24 +1782,24 @@ NK_INTERNAL void nk_reduce_moments_i32_haswell_contiguous_( //
         nk_b256_vec_t tail_vec;
         nk_partial_load_b32x8_serial_(data_ptr + idx, &tail_vec, remaining);
         __m256i data_i32x8 = tail_vec.ymm;
-        __m256i …
-        __m256i sum_before_i64x4 = …
-        …
-        __m256i result_biased_i64x4 = _mm256_xor_si256(…
+        __m256i widened_low_i64x4 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(data_i32x8));
+        __m256i sum_before_i64x4 = sum_low_i64x4;
+        sum_low_i64x4 = _mm256_add_epi64(sum_low_i64x4, widened_low_i64x4);
+        __m256i result_biased_i64x4 = _mm256_xor_si256(sum_low_i64x4, sign_bit_i64x4);
         __m256i before_biased_i64x4 = _mm256_xor_si256(sum_before_i64x4, sign_bit_i64x4);
         __m256i carry_mask_i64x4 = _mm256_cmpgt_epi64(before_biased_i64x4, result_biased_i64x4);
-        …
-        __m256i sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), …
-        …
-        __m256i …
-        sum_before_i64x4 = …
-        …
-        result_biased_i64x4 = _mm256_xor_si256(…
+        sum_high_i64x4 = _mm256_sub_epi64(sum_high_i64x4, carry_mask_i64x4);
+        __m256i sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), widened_low_i64x4);
+        sum_high_i64x4 = _mm256_add_epi64(sum_high_i64x4, sign_ext_i64x4);
+        __m256i widened_high_i64x4 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(data_i32x8, 1));
+        sum_before_i64x4 = sum_low_i64x4;
+        sum_low_i64x4 = _mm256_add_epi64(sum_low_i64x4, widened_high_i64x4);
+        result_biased_i64x4 = _mm256_xor_si256(sum_low_i64x4, sign_bit_i64x4);
         before_biased_i64x4 = _mm256_xor_si256(sum_before_i64x4, sign_bit_i64x4);
         carry_mask_i64x4 = _mm256_cmpgt_epi64(before_biased_i64x4, result_biased_i64x4);
-        …
-        sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), …
-        …
+        sum_high_i64x4 = _mm256_sub_epi64(sum_high_i64x4, carry_mask_i64x4);
+        sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), widened_high_i64x4);
+        sum_high_i64x4 = _mm256_add_epi64(sum_high_i64x4, sign_ext_i64x4);
         __m256i even_sq_u64x4 = _mm256_mul_epi32(data_i32x8, data_i32x8);
         __m256i odd_i32x8 = _mm256_srli_epi64(data_i32x8, 32);
         __m256i odd_sq_u64x4 = _mm256_mul_epi32(odd_i32x8, odd_i32x8);

@@ -1820,20 +1822,20 @@ NK_INTERNAL void nk_reduce_moments_i32_haswell_contiguous_( //
     else sumsq = nk_reduce_sadd_u64x4_haswell_(sumsq_u64x4);
     // Sum: horizontal 128-bit reduction (4 lanes → scalar)
     nk_b256_vec_t lower_vec, upper_vec;
-    lower_vec.ymm = …
-    upper_vec.ymm = …
-    nk_u64_t …
-    nk_i64_t …
+    lower_vec.ymm = sum_low_i64x4;
+    upper_vec.ymm = sum_high_i64x4;
+    nk_u64_t sum_low = 0;
+    nk_i64_t sum_high = 0;
     for (int i = 0; i < 4; i++) {
-        nk_u64_t sum_before = …
-        …
-        if (…
-        …
+        nk_u64_t sum_before = sum_low;
+        sum_low += lower_vec.u64s[i];
+        if (sum_low < sum_before) sum_high++;
+        sum_high += upper_vec.i64s[i];
     }
     *sumsq_ptr = sumsq;
-    nk_i64_t …
-    if (…
-    else if (…
+    nk_i64_t sum_low_signed = (nk_i64_t)sum_low;
+    if (sum_high == (sum_low_signed >> 63)) *sum_ptr = sum_low_signed;
+    else if (sum_high >= 0) *sum_ptr = NK_I64_MAX;
     else *sum_ptr = NK_I64_MIN;
 }
 
@@ -2114,8 +2116,8 @@ NK_INTERNAL void nk_reduce_moments_i64_haswell_contiguous_( //
     nk_i64_t const *data_ptr, nk_size_t count, //
     nk_i64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
 
-    __m256i …
-    __m256i …
+    __m256i sum_low_u64x4 = _mm256_setzero_si256();
+    __m256i sum_high_i64x4 = _mm256_setzero_si256();
     __m256i sumsq_u64x4 = _mm256_setzero_si256();
     int sumsq_overflow_mask = 0;
     __m256i sign_bit_i64x4 = _mm256_set1_epi64x((nk_i64_t)0x8000000000000000ULL);

@@ -2130,26 +2132,26 @@ NK_INTERNAL void nk_reduce_moments_i64_haswell_contiguous_( //
         sumsq_overflow_mask |= _mm256_movemask_pd(
             _mm256_castsi256_pd(_mm256_cmpgt_epi64(sq_before_biased_u64x4, sq_result_biased_u64x4)));
         // Vectorized 128-bit carry-propagating sum
-        __m256i sum_before_u64x4 = …
-        …
+        __m256i sum_before_u64x4 = sum_low_u64x4;
+        sum_low_u64x4 = _mm256_add_epi64(sum_low_u64x4, data_i64x4);
         __m256i before_biased_u64x4 = _mm256_xor_si256(sum_before_u64x4, sign_bit_i64x4);
-        __m256i result_biased_u64x4 = _mm256_xor_si256(…
+        __m256i result_biased_u64x4 = _mm256_xor_si256(sum_low_u64x4, sign_bit_i64x4);
         __m256i carry_u64x4 = _mm256_cmpgt_epi64(before_biased_u64x4, result_biased_u64x4);
-        …
+        sum_high_i64x4 = _mm256_sub_epi64(sum_high_i64x4, carry_u64x4);
         __m256i sign_ext_i64x4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), data_i64x4);
-        …
+        sum_high_i64x4 = _mm256_add_epi64(sum_high_i64x4, sign_ext_i64x4);
     }
-    // Horizontal reduction of 4 lanes to scalar (…
+    // Horizontal reduction of 4 lanes to scalar (sum_low, sum_high)
     nk_b256_vec_t lower_vec, upper_vec;
-    lower_vec.ymm = …
-    upper_vec.ymm = …
-    nk_u64_t …
-    nk_i64_t …
+    lower_vec.ymm = sum_low_u64x4;
+    upper_vec.ymm = sum_high_i64x4;
+    nk_u64_t sum_low = 0;
+    nk_i64_t sum_high = 0;
     for (int i = 0; i < 4; i++) {
-        nk_u64_t before = …
-        …
-        if (…
-        …
+        nk_u64_t before = sum_low;
+        sum_low += lower_vec.u64s[i];
+        if (sum_low < before) sum_high++;
+        sum_high += upper_vec.i64s[i];
     }
     nk_u64_t sumsq;
     if (sumsq_overflow_mask) sumsq = NK_U64_MAX;

@@ -2159,15 +2161,15 @@ NK_INTERNAL void nk_reduce_moments_i64_haswell_contiguous_( //
         nk_i64_t product = nk_i64_saturating_mul_serial(val, val);
         nk_u64_t unsigned_product = (nk_u64_t)product;
         sumsq = nk_u64_saturating_add_serial(sumsq, unsigned_product);
-        nk_u64_t before = …
-        …
-        if (…
-        …
+        nk_u64_t before = sum_low;
+        sum_low += (nk_u64_t)val;
+        if (sum_low < before) sum_high++;
+        sum_high += (val >> 63);
     }
     *sumsq_ptr = sumsq;
-    nk_i64_t …
-    if (…
-    else if (…
+    nk_i64_t sum_low_signed = (nk_i64_t)sum_low;
+    if (sum_high == (sum_low_signed >> 63)) *sum_ptr = sum_low_signed;
+    else if (sum_high >= 0) *sum_ptr = NK_I64_MAX;
     else *sum_ptr = NK_I64_MIN;
 }
 
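
Both the i32 and i64 moment kernels above keep the running sum as a 128-bit accumulator: a wrapping unsigned low word plus a signed high word that absorbs carries (detected by the unsigned "result < before" trick) and each operand's sign extension; the final fold saturates when the high word disagrees with the sign extension of the low word. A scalar model of the same arithmetic:

#include <stdint.h>
#include <stddef.h>

/* Scalar model of the 128-bit accumulator in the diff: sum_low wraps
 * as u64, sum_high takes the unsigned carry plus the operand's sign
 * extension, and the result saturates when it no longer fits in i64. */
static int64_t sum_i64_saturating(int64_t const *data, size_t count) {
    uint64_t sum_low = 0;
    int64_t sum_high = 0;
    for (size_t i = 0; i < count; i++) {
        uint64_t before = sum_low;
        sum_low += (uint64_t)data[i];
        if (sum_low < before) sum_high++; /* carry out of the low word */
        sum_high += data[i] >> 63;        /* operand's sign extension */
    }
    int64_t low_signed = (int64_t)sum_low;
    if (sum_high == (low_signed >> 63)) return low_signed; /* fits */
    return sum_high >= 0 ? INT64_MAX : INT64_MIN;          /* saturate */
}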
@@ -2925,9 +2927,9 @@ NK_PUBLIC void nk_reduce_moments_e2m3_haswell( //
 
 NK_INTERNAL __m256i nk_fp6x32_to_u8x32_comparable_haswell_(__m256i raw_i8x32) {
     raw_i8x32 = _mm256_and_si256(raw_i8x32, _mm256_set1_epi8(0x3F)); // mask to 6 valid bits
-    __m256i …
-    __m256i neg_i8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(raw_i8x32, …
-    __m256i pos_xor_i8x32 = …
+    __m256i sign_mask_i8x32 = _mm256_set1_epi8(0x20);
+    __m256i neg_i8x32 = _mm256_cmpeq_epi8(_mm256_and_si256(raw_i8x32, sign_mask_i8x32), sign_mask_i8x32);
+    __m256i pos_xor_i8x32 = sign_mask_i8x32; // flip sign bit only
     __m256i neg_xor_i8x32 = _mm256_set1_epi8(0x3F); // flip all 6 bits
     __m256i xor_i8x32 = _mm256_blendv_epi8(pos_xor_i8x32, neg_xor_i8x32, neg_i8x32);
     return _mm256_xor_si256(raw_i8x32, xor_i8x32);
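
This helper maps a 6-bit sign-magnitude float pattern to an unsigned key whose integer order matches the numeric order, so the min/max loops that follow can use plain epu8 operations. Positives get only the sign bit flipped (placing them above all negatives); negatives get all six bits flipped (reversing their order). A scalar model of the transform:

#include <stdint.h>

/* Scalar model of nk_fp6x32_to_u8x32_comparable_haswell_: produce a
 * monotone unsigned key for a 6-bit sign-magnitude pattern. */
static uint8_t fp6_comparable_key(uint8_t raw) {
    raw &= 0x3F;                   /* keep the 6 valid bits */
    return raw & 0x20 ? raw ^ 0x3F /* negative: flip all 6 bits */
                      : raw ^ 0x20; /* positive: flip the sign bit */
}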
@@ -2960,15 +2962,15 @@ NK_INTERNAL void nk_reduce_minmax_e2m3_haswell_contiguous_( //
     for (; idx + 32 <= count; idx += 32) {
         __m256i data_i8x32 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx));
         __m256i data_cmp_u8x32 = nk_fp6x32_to_u8x32_comparable_haswell_(data_i8x32);
-        __m256i …
-        __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(…
+        __m256i new_min_u8x32 = _mm256_min_epu8(min_vec.ymm, data_cmp_u8x32);
+        __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_min_u8x32, min_vec.ymm),
                                                      _mm256_set1_epi8((char)0xFF));
-        min_vec.ymm = …
+        min_vec.ymm = new_min_u8x32;
         min_loop_cycle_u8x32 = _mm256_blendv_epi8(min_loop_cycle_u8x32, current_loop_cycle_u8x32, min_changed_i8x32);
-        __m256i …
-        __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(…
+        __m256i new_max_u8x32 = _mm256_max_epu8(max_vec.ymm, data_cmp_u8x32);
+        __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_max_u8x32, max_vec.ymm),
                                                      _mm256_set1_epi8((char)0xFF));
-        max_vec.ymm = …
+        max_vec.ymm = new_max_u8x32;
         max_loop_cycle_u8x32 = _mm256_blendv_epi8(max_loop_cycle_u8x32, current_loop_cycle_u8x32, max_changed_i8x32);
         current_loop_cycle_u8x32 = _mm256_add_epi8(current_loop_cycle_u8x32, one_u8x32);
     }

@@ -2984,15 +2986,15 @@ NK_INTERNAL void nk_reduce_minmax_e2m3_haswell_contiguous_( //
         __m256i valid_b8x32 = _mm256_cmpgt_epi8(_mm256_set1_epi8((char)remaining), lane_indices_u8x32);
         __m256i data_min_u8x32 = _mm256_blendv_epi8(_mm256_set1_epi8(0x3F), data_cmp_u8x32, valid_b8x32);
         __m256i data_max_u8x32 = _mm256_blendv_epi8(_mm256_setzero_si256(), data_cmp_u8x32, valid_b8x32);
-        __m256i …
-        __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(…
+        __m256i new_min_u8x32 = _mm256_min_epu8(min_vec.ymm, data_min_u8x32);
+        __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_min_u8x32, min_vec.ymm),
                                                      _mm256_set1_epi8((char)0xFF));
-        min_vec.ymm = …
+        min_vec.ymm = new_min_u8x32;
         min_loop_cycle_u8x32 = _mm256_blendv_epi8(min_loop_cycle_u8x32, current_loop_cycle_u8x32, min_changed_i8x32);
-        __m256i …
-        __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(…
+        __m256i new_max_u8x32 = _mm256_max_epu8(max_vec.ymm, data_max_u8x32);
+        __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_max_u8x32, max_vec.ymm),
                                                      _mm256_set1_epi8((char)0xFF));
-        max_vec.ymm = …
+        max_vec.ymm = new_max_u8x32;
         max_loop_cycle_u8x32 = _mm256_blendv_epi8(max_loop_cycle_u8x32, current_loop_cycle_u8x32, max_changed_i8x32);
     }
 

@@ -3149,15 +3151,15 @@ NK_INTERNAL void nk_reduce_minmax_e3m2_haswell_contiguous_( //
     for (; idx + 32 <= count; idx += 32) {
         __m256i data_i8x32 = _mm256_loadu_si256((__m256i const *)(data_ptr + idx));
         __m256i data_cmp_u8x32 = nk_fp6x32_to_u8x32_comparable_haswell_(data_i8x32);
-        __m256i …
-        __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(…
+        __m256i new_min_u8x32 = _mm256_min_epu8(min_vec.ymm, data_cmp_u8x32);
+        __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_min_u8x32, min_vec.ymm),
                                                      _mm256_set1_epi8((char)0xFF));
-        min_vec.ymm = …
+        min_vec.ymm = new_min_u8x32;
         min_loop_cycle_u8x32 = _mm256_blendv_epi8(min_loop_cycle_u8x32, current_loop_cycle_u8x32, min_changed_i8x32);
-        __m256i …
-        __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(…
+        __m256i new_max_u8x32 = _mm256_max_epu8(max_vec.ymm, data_cmp_u8x32);
+        __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_max_u8x32, max_vec.ymm),
                                                      _mm256_set1_epi8((char)0xFF));
-        max_vec.ymm = …
+        max_vec.ymm = new_max_u8x32;
         max_loop_cycle_u8x32 = _mm256_blendv_epi8(max_loop_cycle_u8x32, current_loop_cycle_u8x32, max_changed_i8x32);
         current_loop_cycle_u8x32 = _mm256_add_epi8(current_loop_cycle_u8x32, one_u8x32);
     }

@@ -3172,15 +3174,15 @@ NK_INTERNAL void nk_reduce_minmax_e3m2_haswell_contiguous_( //
     __m256i valid_b8x32 = _mm256_cmpgt_epi8(_mm256_set1_epi8((char)remaining), lane_indices_u8x32);
     __m256i data_min_u8x32 = _mm256_blendv_epi8(_mm256_set1_epi8(0x3F), data_cmp_u8x32, valid_b8x32);
     __m256i data_max_u8x32 = _mm256_blendv_epi8(_mm256_setzero_si256(), data_cmp_u8x32, valid_b8x32);
-    __m256i …
-    __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(…
+    __m256i new_min_u8x32 = _mm256_min_epu8(min_vec.ymm, data_min_u8x32);
+    __m256i min_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_min_u8x32, min_vec.ymm),
                                                  _mm256_set1_epi8((char)0xFF));
-    min_vec.ymm = …
+    min_vec.ymm = new_min_u8x32;
     min_loop_cycle_u8x32 = _mm256_blendv_epi8(min_loop_cycle_u8x32, current_loop_cycle_u8x32, min_changed_i8x32);
-    __m256i …
-    __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(…
+    __m256i new_max_u8x32 = _mm256_max_epu8(max_vec.ymm, data_max_u8x32);
+    __m256i max_changed_i8x32 = _mm256_xor_si256(_mm256_cmpeq_epi8(new_max_u8x32, max_vec.ymm),
                                                  _mm256_set1_epi8((char)0xFF));
-    max_vec.ymm = …
+    max_vec.ymm = new_max_u8x32;
     max_loop_cycle_u8x32 = _mm256_blendv_epi8(max_loop_cycle_u8x32, current_loop_cycle_u8x32, max_changed_i8x32);
 }
 
@@ -3645,14 +3647,14 @@ NK_INTERNAL void nk_reduce_moments_i4_haswell_contiguous_( //
             ptr += 32, count_bytes -= 32;
         }
         __m256i raw_i8x32 = raw_vec.ymm;
-        __m256i …
-        __m256i …
-        __m256i …
-        __m256i …
-        __m256i …
-        sum_u64x4 = _mm256_add_epi64(sum_u64x4, _mm256_sad_epu8(…
-        __m256i low_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, …
-        __m256i high_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, …
+        __m256i low_u4_u8x32 = _mm256_and_si256(raw_i8x32, mask_0f_i8x32);
+        __m256i high_u4_u8x32 = _mm256_and_si256(_mm256_srli_epi16(raw_i8x32, 4), mask_0f_i8x32);
+        __m256i low_biased_u4_u8x32 = _mm256_xor_si256(low_u4_u8x32, eight_i8x32);
+        __m256i high_biased_u4_u8x32 = _mm256_xor_si256(high_u4_u8x32, eight_i8x32);
+        __m256i pair_sum_u8x32 = _mm256_add_epi8(low_biased_u4_u8x32, high_biased_u4_u8x32);
+        sum_u64x4 = _mm256_add_epi64(sum_u64x4, _mm256_sad_epu8(pair_sum_u8x32, zero_i8x32));
+        __m256i low_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, low_u4_u8x32);
+        __m256i high_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, high_u4_u8x32);
         sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_sad_epu8(low_sq_u8x32, zero_i8x32));
         sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_sad_epu8(high_sq_u8x32, zero_i8x32));
     }

@@ -3702,12 +3704,12 @@ NK_INTERNAL void nk_reduce_moments_u4_haswell_contiguous_( //
             ptr += 32, count_bytes -= 32;
         }
         __m256i raw_i8x32 = raw_vec.ymm;
-        __m256i …
-        __m256i …
-        __m256i …
-        sum_u64x4 = _mm256_add_epi64(sum_u64x4, _mm256_sad_epu8(…
-        __m256i low_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, …
-        __m256i high_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, …
+        __m256i low_u4_u8x32 = _mm256_and_si256(raw_i8x32, mask_0f_i8x32);
+        __m256i high_u4_u8x32 = _mm256_and_si256(_mm256_srli_epi16(raw_i8x32, 4), mask_0f_i8x32);
+        __m256i pair_sum_u8x32 = _mm256_add_epi8(low_u4_u8x32, high_u4_u8x32);
+        sum_u64x4 = _mm256_add_epi64(sum_u64x4, _mm256_sad_epu8(pair_sum_u8x32, zero_i8x32));
+        __m256i low_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, low_u4_u8x32);
+        __m256i high_sq_u8x32 = _mm256_shuffle_epi8(sq_lut_u8x32, high_u4_u8x32);
         sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_sad_epu8(low_sq_u8x32, zero_i8x32));
         sumsq_u64x4 = _mm256_add_epi64(sumsq_u64x4, _mm256_sad_epu8(high_sq_u8x32, zero_i8x32));
     }
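
These nibble kernels split each byte into two 4-bit values, look their squares up in a 16-entry table via _mm256_shuffle_epi8, and reduce bytes to 64-bit lanes with _mm256_sad_epu8 against zero; the i4 variant additionally XORs nibbles with 8 so the signed range maps onto an unsigned one before summing. A self-contained sketch of the unsigned case, assuming the square of any nibble (at most 15 * 15 = 225) fits in one byte:

#include <immintrin.h>
#include <stdint.h>

/* Moments of 64 packed u4 values: extract both nibbles, look up their
 * squares via PSHUFB (a per-128-bit-lane 16-entry LUT), and fold bytes
 * into four u64 lanes with SAD against zero. */
static void moments_u4x64(__m256i packed, uint64_t sum[4], uint64_t sumsq[4]) {
    const __m256i mask_0f = _mm256_set1_epi8(0x0F);
    const __m256i zero = _mm256_setzero_si256();
    const __m256i sq_lut = _mm256_setr_epi8( /* squares of 0..15, per lane */
        0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121,
        (char)144, (char)169, (char)196, (char)225,
        0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121,
        (char)144, (char)169, (char)196, (char)225);
    __m256i low = _mm256_and_si256(packed, mask_0f);
    __m256i high = _mm256_and_si256(_mm256_srli_epi16(packed, 4), mask_0f);
    __m256i pair_sum = _mm256_add_epi8(low, high);     /* max 30, no overflow */
    __m256i low_sq = _mm256_shuffle_epi8(sq_lut, low);
    __m256i high_sq = _mm256_shuffle_epi8(sq_lut, high);
    _mm256_storeu_si256((__m256i *)sum, _mm256_sad_epu8(pair_sum, zero));
    _mm256_storeu_si256((__m256i *)sumsq,
                        _mm256_add_epi64(_mm256_sad_epu8(low_sq, zero),
                                         _mm256_sad_epu8(high_sq, zero)));
}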