numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Elementwise Arithmetic for Ice Lake.
|
|
3
|
+
* @file include/numkong/each/icelake.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date December 27, 2025
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/each.h
|
|
8
|
+
*
|
|
9
|
+
* @section ice_elementwise_instructions Relevant Instructions
|
|
10
|
+
*
|
|
11
|
+
* Intrinsic Instruction Ice Genoa
|
|
12
|
+
* _mm512_add_epi8 VPADDB (ZMM, ZMM, ZMM) 1cy @ p05 1cy @ p0123
|
|
13
|
+
* _mm512_adds_epi8 VPADDSB (ZMM, ZMM, ZMM) 1cy @ p05 1cy @ p0123
|
|
14
|
+
* _mm512_add_epi32 VPADDD (ZMM, ZMM, ZMM) 1cy @ p05 1cy @ p0123
|
|
15
|
+
* _mm512_cmpgt_epi32_mask VPCMPGTD (K, ZMM, ZMM) 3cy @ p5 3cy @ p0
|
|
16
|
+
* _mm512_mask_blend_epi32 VPBLENDMD (ZMM, K, ZMM, ZMM) 1cy @ p05 1cy @ p0123
|
|
17
|
+
* _mm512_maskz_loadu_epi8 VMOVDQU8 (ZMM {K}, M512) 7cy @ p23 7cy @ p23
|
|
18
|
+
*
|
|
19
|
+
* Ice Lake inherits Skylake's AVX-512 execution but reduces frequency throttling on client chips.
|
|
20
|
+
* Integer saturation arithmetic (VPADDSB, VPADDUSB) provides 1cy latency for overflow-safe addition.
|
|
21
|
+
* For i32/i64 saturation, manual overflow detection via compare-and-blend is required.
|
|
22
|
+
*/
|
|
23
|
+
#ifndef NK_EACH_ICELAKE_H
|
|
24
|
+
#define NK_EACH_ICELAKE_H
|
|
25
|
+
|
|
26
|
+
#if NK_TARGET_X86_
|
|
27
|
+
#if NK_TARGET_ICELAKE
|
|
28
|
+
|
|
29
|
+
#include "numkong/types.h"
|
|
30
|
+
|
|
31
|
+
#if defined(__cplusplus)
|
|
32
|
+
extern "C" {
|
|
33
|
+
#endif
|
|
34
|
+
|
|
35
|
+
#if defined(__clang__)
|
|
36
|
+
#pragma clang attribute push( \
|
|
37
|
+
__attribute__((target("avx2,avx512f,avx512vl,avx512bw,avx512dq,avx512vnni,f16c,fma,bmi,bmi2"))), \
|
|
38
|
+
apply_to = function)
|
|
39
|
+
#elif defined(__GNUC__)
|
|
40
|
+
#pragma GCC push_options
|
|
41
|
+
#pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vnni", "f16c", "fma", "bmi", "bmi2")
|
|
42
|
+
#endif
|
|
43
|
+
|
|
44
|
+
/**
 *  @brief Element-wise saturating sum of two `i8` arrays, processed 64 lanes per step.
 *
 *  @param[in] a First input array of @p n elements.
 *  @param[in] b Second input array of @p n elements.
 *  @param[in] n Number of elements to process.
 *  @param[out] result Output array; only the first @p n elements are written.
 *
 *  Full 64-element blocks use unmasked loads. The final partial block (including `n == 0`)
 *  uses a mask with the low `n` bits set, so lanes past the end are neither loaded nor stored.
 */
NK_PUBLIC void nk_each_sum_i8_icelake(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i8_t *result) {
    __mmask64 lanes = 0xFFFFFFFFFFFFFFFF;
    do {
        __m512i lhs_i8x64, rhs_i8x64;
        if (n >= 64) {
            lhs_i8x64 = _mm512_loadu_epi8(a);
            rhs_i8x64 = _mm512_loadu_epi8(b);
            a += 64, b += 64, n -= 64;
        }
        else {
            // Tail: keep only the low `n` lanes active and finish after this pass.
            lanes = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
            lhs_i8x64 = _mm512_maskz_loadu_epi8(lanes, a);
            rhs_i8x64 = _mm512_maskz_loadu_epi8(lanes, b);
            n = 0;
        }
        // Signed saturating add, written back through the same lane mask.
        _mm512_mask_storeu_epi8(result, lanes, _mm512_adds_epi8(lhs_i8x64, rhs_i8x64));
        result += 64;
    } while (n);
}
|
|
65
|
+
|
|
66
|
+
/**
 *  @brief Element-wise saturating sum of two `u8` arrays, processed 64 lanes per step.
 *
 *  @param[in] a First input array of @p n elements.
 *  @param[in] b Second input array of @p n elements.
 *  @param[in] n Number of elements to process.
 *  @param[out] result Output array; only the first @p n elements are written.
 *
 *  Full 64-element blocks use unmasked loads. The final partial block (including `n == 0`)
 *  uses a mask with the low `n` bits set, so lanes past the end are neither loaded nor stored.
 */
NK_PUBLIC void nk_each_sum_u8_icelake(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u8_t *result) {
    __mmask64 lanes = 0xFFFFFFFFFFFFFFFF;
    do {
        __m512i lhs_u8x64, rhs_u8x64;
        if (n >= 64) {
            lhs_u8x64 = _mm512_loadu_epi8(a);
            rhs_u8x64 = _mm512_loadu_epi8(b);
            a += 64, b += 64, n -= 64;
        }
        else {
            // Tail: keep only the low `n` lanes active and finish after this pass.
            lanes = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
            lhs_u8x64 = _mm512_maskz_loadu_epi8(lanes, a);
            rhs_u8x64 = _mm512_maskz_loadu_epi8(lanes, b);
            n = 0;
        }
        // Unsigned saturating add, written back through the same lane mask.
        _mm512_mask_storeu_epi8(result, lanes, _mm512_adds_epu8(lhs_u8x64, rhs_u8x64));
        result += 64;
    } while (n);
}
|
|
87
|
+
|
|
88
|
+
/**
 *  @brief Element-wise saturating sum of two `i16` arrays, processed 32 lanes per step.
 *
 *  @param[in] a First input array of @p n elements.
 *  @param[in] b Second input array of @p n elements.
 *  @param[in] n Number of elements to process.
 *  @param[out] result Output array; only the first @p n elements are written.
 *
 *  Full 32-element blocks use unmasked loads. The final partial block (including `n == 0`)
 *  uses a mask with the low `n` bits set, so lanes past the end are neither loaded nor stored.
 */
NK_PUBLIC void nk_each_sum_i16_icelake(nk_i16_t const *a, nk_i16_t const *b, nk_size_t n, nk_i16_t *result) {
    __mmask32 lanes = 0xFFFFFFFF;
    do {
        __m512i lhs_i16x32, rhs_i16x32;
        if (n >= 32) {
            lhs_i16x32 = _mm512_loadu_epi16(a);
            rhs_i16x32 = _mm512_loadu_epi16(b);
            a += 32, b += 32, n -= 32;
        }
        else {
            // Tail: keep only the low `n` lanes active and finish after this pass.
            lanes = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
            lhs_i16x32 = _mm512_maskz_loadu_epi16(lanes, a);
            rhs_i16x32 = _mm512_maskz_loadu_epi16(lanes, b);
            n = 0;
        }
        // Signed saturating add, written back through the same lane mask.
        _mm512_mask_storeu_epi16(result, lanes, _mm512_adds_epi16(lhs_i16x32, rhs_i16x32));
        result += 32;
    } while (n);
}
|
|
109
|
+
|
|
110
|
+
/**
 *  @brief Element-wise saturating sum of two `u16` arrays, processed 32 lanes per step.
 *
 *  @param[in] a First input array of @p n elements.
 *  @param[in] b Second input array of @p n elements.
 *  @param[in] n Number of elements to process.
 *  @param[out] result Output array; only the first @p n elements are written.
 *
 *  Full 32-element blocks use unmasked loads. The final partial block (including `n == 0`)
 *  uses a mask with the low `n` bits set, so lanes past the end are neither loaded nor stored.
 */
NK_PUBLIC void nk_each_sum_u16_icelake(nk_u16_t const *a, nk_u16_t const *b, nk_size_t n, nk_u16_t *result) {
    __mmask32 lanes = 0xFFFFFFFF;
    do {
        __m512i lhs_u16x32, rhs_u16x32;
        if (n >= 32) {
            lhs_u16x32 = _mm512_loadu_epi16(a);
            rhs_u16x32 = _mm512_loadu_epi16(b);
            a += 32, b += 32, n -= 32;
        }
        else {
            // Tail: keep only the low `n` lanes active and finish after this pass.
            lanes = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
            lhs_u16x32 = _mm512_maskz_loadu_epi16(lanes, a);
            rhs_u16x32 = _mm512_maskz_loadu_epi16(lanes, b);
            n = 0;
        }
        // Unsigned saturating add, written back through the same lane mask.
        _mm512_mask_storeu_epi16(result, lanes, _mm512_adds_epu16(lhs_u16x32, rhs_u16x32));
        result += 32;
    } while (n);
}
|
|
131
|
+
|
|
132
|
+
/**
 *  @brief Signed saturating addition of sixteen 32-bit lanes.
 *
 *  @param[in] a First addend vector.
 *  @param[in] b Second addend vector.
 *  @return Per-lane `a + b`, replaced by INT32_MAX or INT32_MIN in lanes where the
 *          wrapping sum overflowed.
 */
NK_INTERNAL __m512i _mm512_adds_epi32_icelake(__m512i a, __m512i b) {
    __m512i const wrapped_i32x16 = _mm512_add_epi32(a, b);
    // Truth table 0x42 evaluates `~(a ^ b) & (wrapped ^ a)` bitwise: the sign bit of the
    // outcome is set exactly when both inputs share a sign that the wrapped sum lost.
    __m512i const flipped_i32x16 = _mm512_ternarylogic_epi64(a, b, wrapped_i32x16, 0x42);
    __mmask16 const overflowed_b32x16 = _mm512_movepi32_mask(flipped_i32x16);
    // The clamp direction follows the sign of `a`: negative lanes clamp to the minimum,
    // non-negative lanes clamp to the maximum.
    __mmask16 const negative_b32x16 = _mm512_movepi32_mask(a);
    __m512i const clamp_i32x16 = _mm512_mask_blend_epi32( //
        negative_b32x16, _mm512_set1_epi32(0x7FFFFFFF), _mm512_set1_epi32(-2147483647 - 1));
    return _mm512_mask_blend_epi32(overflowed_b32x16, wrapped_i32x16, clamp_i32x16);
}
|
|
144
|
+
|
|
145
|
+
/**
 *  @brief Unsigned saturating addition of sixteen 32-bit lanes.
 *
 *  @param[in] a First addend vector.
 *  @param[in] b Second addend vector.
 *  @return Per-lane `a + b`, replaced by the all-ones value (UINT32_MAX) in lanes where
 *          the wrapping sum overflowed.
 */
NK_INTERNAL __m512i _mm512_adds_epu32_icelake(__m512i a, __m512i b) {
    __m512i const wrapped_u32x16 = _mm512_add_epi32(a, b);
    // A wrapped unsigned sum is strictly smaller than the addend it started from.
    __mmask16 const carried_b32x16 = _mm512_cmp_epu32_mask(wrapped_u32x16, a, _MM_CMPINT_LT);
    __m512i const ceiling_u32x16 = _mm512_set1_epi32((int)0xFFFFFFFFu);
    return _mm512_mask_blend_epi32(carried_b32x16, wrapped_u32x16, ceiling_u32x16);
}
|
|
151
|
+
|
|
152
|
+
/**
 *  @brief Signed saturating addition of eight 64-bit lanes.
 *
 *  @param[in] a First addend vector.
 *  @param[in] b Second addend vector.
 *  @return Per-lane `a + b`, replaced by INT64_MAX or INT64_MIN in lanes where the
 *          wrapping sum overflowed.
 */
NK_INTERNAL __m512i _mm512_adds_epi64_icelake(__m512i a, __m512i b) {
    __m512i const wrapped_i64x8 = _mm512_add_epi64(a, b);
    // Truth table 0x42 evaluates `~(a ^ b) & (wrapped ^ a)` bitwise: the sign bit of the
    // outcome is set exactly when both inputs share a sign that the wrapped sum lost.
    __m512i const flipped_i64x8 = _mm512_ternarylogic_epi64(a, b, wrapped_i64x8, 0x42);
    __mmask8 const overflowed_b64x8 = _mm512_movepi64_mask(flipped_i64x8);
    // The clamp direction follows the sign of `a`: negative lanes clamp to the minimum,
    // non-negative lanes clamp to the maximum.
    __mmask8 const negative_b64x8 = _mm512_movepi64_mask(a);
    __m512i const clamp_i64x8 = _mm512_mask_blend_epi64( //
        negative_b64x8, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFFll), _mm512_set1_epi64(-9223372036854775807ll - 1));
    return _mm512_mask_blend_epi64(overflowed_b64x8, wrapped_i64x8, clamp_i64x8);
}
|
|
164
|
+
|
|
165
|
+
/**
 *  @brief Unsigned saturating addition of eight 64-bit lanes.
 *
 *  @param[in] a First addend vector.
 *  @param[in] b Second addend vector.
 *  @return Per-lane `a + b`, replaced by the all-ones value (UINT64_MAX) in lanes where
 *          the wrapping sum overflowed.
 */
NK_INTERNAL __m512i _mm512_adds_epu64_icelake(__m512i a, __m512i b) {
    __m512i const wrapped_u64x8 = _mm512_add_epi64(a, b);
    // A wrapped unsigned sum is strictly smaller than the addend it started from.
    __mmask8 const carried_b64x8 = _mm512_cmp_epu64_mask(wrapped_u64x8, a, _MM_CMPINT_LT);
    __m512i const ceiling_u64x8 = _mm512_set1_epi64((long long)0xFFFFFFFFFFFFFFFFull);
    return _mm512_mask_blend_epi64(carried_b64x8, wrapped_u64x8, ceiling_u64x8);
}
|
|
171
|
+
|
|
172
|
+
/**
 *  @brief Element-wise saturating sum of two `i32` arrays, processed 16 lanes per step.
 *
 *  @param[in] a First input array of @p n elements.
 *  @param[in] b Second input array of @p n elements.
 *  @param[in] n Number of elements to process.
 *  @param[out] result Output array; only the first @p n elements are written.
 *
 *  The per-vector addition is delegated to `_mm512_adds_epi32_icelake`. The final partial
 *  block (including `n == 0`) uses a mask with the low `n` bits set, so lanes past the end
 *  are neither loaded nor stored.
 */
NK_PUBLIC void nk_each_sum_i32_icelake(nk_i32_t const *a, nk_i32_t const *b, nk_size_t n, nk_i32_t *result) {
    __mmask16 lanes = 0xFFFF;
    do {
        __m512i lhs_i32x16, rhs_i32x16;
        if (n >= 16) {
            lhs_i32x16 = _mm512_loadu_epi32(a);
            rhs_i32x16 = _mm512_loadu_epi32(b);
            a += 16, b += 16, n -= 16;
        }
        else {
            // Tail: keep only the low `n` lanes active and finish after this pass.
            lanes = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
            lhs_i32x16 = _mm512_maskz_loadu_epi32(lanes, a);
            rhs_i32x16 = _mm512_maskz_loadu_epi32(lanes, b);
            n = 0;
        }
        _mm512_mask_storeu_epi32(result, lanes, _mm512_adds_epi32_icelake(lhs_i32x16, rhs_i32x16));
        result += 16;
    } while (n);
}
|
|
193
|
+
|
|
194
|
+
/**
 *  @brief Element-wise saturating sum of two `u32` arrays, processed 16 lanes per step.
 *
 *  @param[in] a First input array of @p n elements.
 *  @param[in] b Second input array of @p n elements.
 *  @param[in] n Number of elements to process.
 *  @param[out] result Output array; only the first @p n elements are written.
 *
 *  The per-vector addition is delegated to `_mm512_adds_epu32_icelake`. The final partial
 *  block (including `n == 0`) uses a mask with the low `n` bits set, so lanes past the end
 *  are neither loaded nor stored.
 */
NK_PUBLIC void nk_each_sum_u32_icelake(nk_u32_t const *a, nk_u32_t const *b, nk_size_t n, nk_u32_t *result) {
    __mmask16 lanes = 0xFFFF;
    do {
        __m512i lhs_u32x16, rhs_u32x16;
        if (n >= 16) {
            lhs_u32x16 = _mm512_loadu_epi32(a);
            rhs_u32x16 = _mm512_loadu_epi32(b);
            a += 16, b += 16, n -= 16;
        }
        else {
            // Tail: keep only the low `n` lanes active and finish after this pass.
            lanes = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
            lhs_u32x16 = _mm512_maskz_loadu_epi32(lanes, a);
            rhs_u32x16 = _mm512_maskz_loadu_epi32(lanes, b);
            n = 0;
        }
        _mm512_mask_storeu_epi32(result, lanes, _mm512_adds_epu32_icelake(lhs_u32x16, rhs_u32x16));
        result += 16;
    } while (n);
}
|
|
215
|
+
|
|
216
|
+
/**
 *  @brief Element-wise saturating sum of two `i64` arrays, processed 8 lanes per step.
 *
 *  @param[in] a First input array of @p n elements.
 *  @param[in] b Second input array of @p n elements.
 *  @param[in] n Number of elements to process.
 *  @param[out] result Output array; only the first @p n elements are written.
 *
 *  The per-vector addition is delegated to `_mm512_adds_epi64_icelake`. The final partial
 *  block (including `n == 0`) uses a mask with the low `n` bits set, so lanes past the end
 *  are neither loaded nor stored.
 */
NK_PUBLIC void nk_each_sum_i64_icelake(nk_i64_t const *a, nk_i64_t const *b, nk_size_t n, nk_i64_t *result) {
    __mmask8 lanes = 0xFF;
    do {
        __m512i lhs_i64x8, rhs_i64x8;
        if (n >= 8) {
            lhs_i64x8 = _mm512_loadu_epi64(a);
            rhs_i64x8 = _mm512_loadu_epi64(b);
            a += 8, b += 8, n -= 8;
        }
        else {
            // Tail: keep only the low `n` lanes active and finish after this pass.
            lanes = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
            lhs_i64x8 = _mm512_maskz_loadu_epi64(lanes, a);
            rhs_i64x8 = _mm512_maskz_loadu_epi64(lanes, b);
            n = 0;
        }
        _mm512_mask_storeu_epi64(result, lanes, _mm512_adds_epi64_icelake(lhs_i64x8, rhs_i64x8));
        result += 8;
    } while (n);
}
|
|
237
|
+
|
|
238
|
+
/**
 *  @brief Element-wise saturating sum of two `u64` arrays, processed 8 lanes per step.
 *
 *  @param[in] a First input array of @p n elements.
 *  @param[in] b Second input array of @p n elements.
 *  @param[in] n Number of elements to process.
 *  @param[out] result Output array; only the first @p n elements are written.
 *
 *  The per-vector addition is delegated to `_mm512_adds_epu64_icelake`. The final partial
 *  block (including `n == 0`) uses a mask with the low `n` bits set, so lanes past the end
 *  are neither loaded nor stored.
 */
NK_PUBLIC void nk_each_sum_u64_icelake(nk_u64_t const *a, nk_u64_t const *b, nk_size_t n, nk_u64_t *result) {
    __mmask8 lanes = 0xFF;
    do {
        __m512i lhs_u64x8, rhs_u64x8;
        if (n >= 8) {
            lhs_u64x8 = _mm512_loadu_epi64(a);
            rhs_u64x8 = _mm512_loadu_epi64(b);
            a += 8, b += 8, n -= 8;
        }
        else {
            // Tail: keep only the low `n` lanes active and finish after this pass.
            lanes = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
            lhs_u64x8 = _mm512_maskz_loadu_epi64(lanes, a);
            rhs_u64x8 = _mm512_maskz_loadu_epi64(lanes, b);
            n = 0;
        }
        _mm512_mask_storeu_epi64(result, lanes, _mm512_adds_epu64_icelake(lhs_u64x8, rhs_u64x8));
        result += 8;
    } while (n);
}
|
|
259
|
+
|
|
260
|
+
#if defined(__clang__)
|
|
261
|
+
#pragma clang attribute pop
|
|
262
|
+
#elif defined(__GNUC__)
|
|
263
|
+
#pragma GCC pop_options
|
|
264
|
+
#endif
|
|
265
|
+
|
|
266
|
+
#if defined(__cplusplus)
|
|
267
|
+
} // extern "C"
|
|
268
|
+
#endif
|
|
269
|
+
|
|
270
|
+
#endif // NK_TARGET_ICELAKE
|
|
271
|
+
#endif // NK_TARGET_X86_
|
|
272
|
+
#endif // NK_EACH_ICELAKE_H
|