numkong 7.0.0 → 7.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -122
- package/binding.gyp +25 -491
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/* NumKong ISA probe: SVE I8 signed-dot (FEAT_SVEDot) */
|
|
2
|
+
#if defined(_WIN32)
|
|
3
|
+
#error "SVE/SME not supported on Windows ARM"
|
|
4
|
+
#endif
|
|
5
|
+
|
|
6
|
+
#if defined(__APPLE__) && defined(__aarch64__)
|
|
7
|
+
#error "SVE not available on Apple Silicon"
|
|
8
|
+
#endif
|
|
9
|
+
|
|
10
|
+
#if !defined(__ARM_FEATURE_SVE)
|
|
11
|
+
#error "Feature not available"
|
|
12
|
+
#endif
|
|
13
|
+
#include <arm_sve.h>
|
|
14
|
+
/* Builds two all-ones int8 vectors, feeds them through the SVE signed dot
 * product into a zeroed int32 accumulator, and horizontally adds the lanes.
 * Returns 0 when the reduced total is non-negative, 1 otherwise. */
int test_svesdot(void) {
    svint8_t lhs = svdup_s8(1);
    svint8_t rhs = svdup_s8(1);
    svint32_t lanes = svdup_s32(0);
    lanes = svdot_s32(lanes, lhs, rhs);
    int total = (int)svaddv_s32(svptrue_b32(), lanes);
    if (total >= 0) return 0;
    return 1;
}

/* Probe entry point: the exit status is the probe result. */
int main(void) { return test_svesdot(); }
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/* NumKong ISA probe: LoongArch LASX (256-bit SIMD) */
|
|
2
|
+
#if !defined(__loongarch_asx)
|
|
3
|
+
#error "Feature not available"
|
|
4
|
+
#endif
|
|
5
|
+
#include <lasxintrin.h>
|
|
6
|
+
/* Adds a vector of 1s to a vector of 2s with LASX and checks that lane 0
 * holds 3. Exit status 0 means the expected value was produced. */
int main(void) {
    __m256i ones = __lasx_xvreplgr2vr_w(1);
    __m256i twos = __lasx_xvreplgr2vr_w(2);
    __m256i total = __lasx_xvadd_w(ones, twos);
    int lane0 = __lasx_xvpickve2gr_w(total, 0);
    if (lane0 == 3) return 0;
    return 1;
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/* NumKong ISA probe: Power VSX (POWER9+ 128-bit SIMD) */
|
|
2
|
+
#if !defined(__VSX__)
|
|
3
|
+
#error "Feature not available"
|
|
4
|
+
#endif
|
|
5
|
+
#include <altivec.h>
|
|
6
|
+
/* Computes 1.0 * 2.0 + 1.0 with a vector multiply-add and checks that
 * element 0 equals 3.0. Exit status 0 means the expected value was produced. */
int main(void) {
    __vector float ones = vec_splats(1.0f);
    __vector float twos = vec_splats(2.0f);
    __vector float fused = vec_madd(ones, twos, ones);
    /* vec_extract requires POWER9+ */
    float lane0 = vec_extract(fused, 0);
    if (lane0 == 3.0f) return 0;
    return 1;
}
|
package/probes/probe.js
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* NumKong ISA probe script for Node.js / node-gyp builds.
|
|
4
|
+
*
|
|
5
|
+
* Try-compiles each probe .c file from probes/ to determine which ISA
|
|
6
|
+
* extensions the current compiler supports. Writes results to
|
|
7
|
+
* nk_probes.h (in the package root) as #define NK_TARGET_FOO 1/0.
|
|
8
|
+
*
|
|
9
|
+
* Usage: node probes/probe.js
|
|
10
|
+
* Called automatically via package.json "preinstall" hook.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
const { execSync, execFileSync } = require("child_process");
|
|
14
|
+
const fs = require("fs");
|
|
15
|
+
const path = require("path");
|
|
16
|
+
const os = require("os");
|
|
17
|
+
|
|
18
|
+
const cc = process.env.CC || (process.platform === "win32" ? "cl.exe" : "cc");
|
|
19
|
+
const isWin = process.platform === "win32";
|
|
20
|
+
|
|
21
|
+
/**
 * Try to compile a single probe source file to an object file.
 *
 * The compiler is invoked directly with an argument vector (no shell), so
 * characters such as quotes, `$` or `;` in the probe path or in the compiler
 * name are never interpreted as shell syntax. The object file is written into
 * a freshly created private temporary directory, so concurrent runs cannot
 * overwrite or delete each other's output.
 *
 * @param {string} probeFile - Path to the probe `.c` file.
 * @param {string[]} flags - Extra compiler flags enabling the ISA under test.
 * @returns {boolean} `true` if the compiler exited successfully, `false` on
 *   any failure (compile error, missing compiler, timeout, temp-dir failure).
 */
function probeIsa(probeFile, flags) {
  const objName = `nk_probe_${path.basename(probeFile, ".c")}${isWin ? ".obj" : ".o"}`;
  let tmpDir = null;
  let tmpObj = null;
  try {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nk_probe_"));
    tmpObj = path.join(tmpDir, objName);
    const args = isWin
      ? ["/c", ...flags, probeFile, `/Fo${tmpObj}`, "/nologo"]
      : ["-c", ...flags, probeFile, "-o", tmpObj];
    // stdio "pipe" captures compiler diagnostics instead of printing them.
    execFileSync(cc, args, { stdio: "pipe", timeout: 30000 });
    return true;
  } catch {
    // Any failure simply means "this ISA is not usable with this compiler".
    return false;
  } finally {
    // Best-effort cleanup: the object file does not exist when compilation failed.
    try {
      if (tmpObj !== null) fs.unlinkSync(tmpObj);
    } catch {
      /* nothing to remove */
    }
    try {
      if (tmpDir !== null) fs.rmdirSync(tmpDir);
    } catch {
      /* directory missing or not empty; leave it for the OS to reap */
    }
  }
}
|
|
41
|
+
|
|
42
|
+
// Probe table: [define, probeFile, gccFlags, msvcFlags]
|
|
43
|
+
// x86 probes: GCC flags are minimal — each implies its prerequisites.
|
|
44
|
+
// E.g., -mavx512vnni implies -mavx512f; -mavxvnni implies -mavx2.
|
|
45
|
+
const PROBES = [
|
|
46
|
+
// x86
|
|
47
|
+
["NK_TARGET_HASWELL", "probes/x86_haswell.c", ["-mavx2", "-mfma", "-mf16c"], ["/arch:AVX2"]],
|
|
48
|
+
["NK_TARGET_SKYLAKE", "probes/x86_skylake.c", ["-mavx512f", "-mavx512bw", "-mavx512dq", "-mavx512vl"], ["/arch:AVX512"]],
|
|
49
|
+
["NK_TARGET_ICELAKE", "probes/x86_icelake.c", ["-mavx512vnni", "-mavx512vl"], ["/arch:AVX512"]],
|
|
50
|
+
["NK_TARGET_GENOA", "probes/x86_genoa.c", ["-mavx512bf16", "-mavx512vl"], ["/arch:AVX512"]],
|
|
51
|
+
["NK_TARGET_SAPPHIRE", "probes/x86_sapphire.c", ["-mavx512fp16", "-mavx512vl"], ["/arch:AVX512"]],
|
|
52
|
+
["NK_TARGET_SAPPHIREAMX", "probes/x86_sapphireamx.c", ["-mamx-tile", "-mamx-int8"], ["/arch:AVX512"]],
|
|
53
|
+
["NK_TARGET_GRANITEAMX", "probes/x86_graniteamx.c", ["-mamx-tile", "-mamx-fp16"], ["/arch:AVX512"]],
|
|
54
|
+
["NK_TARGET_DIAMOND", "probes/x86_diamond.c", ["-mavx10.2-512"], ["/arch:AVX10.2"]],
|
|
55
|
+
["NK_TARGET_TURIN", "probes/x86_turin.c", ["-mavx512vp2intersect"], ["/arch:AVX512"]],
|
|
56
|
+
["NK_TARGET_ALDER", "probes/x86_alder.c", ["-mavxvnni"], ["/arch:AVX2"]],
|
|
57
|
+
["NK_TARGET_SIERRA", "probes/x86_sierra.c", ["-mavxvnniint8"], ["/arch:AVX2"]],
|
|
58
|
+
// ARM NEON probes — msvcFlags is empty for base NEON only (the rest pass /arch:armv8.x); MSVC does not define
|
|
59
|
+
// __ARM_FEATURE_* macros via /arch: flags. For MSVC header-only builds,
|
|
60
|
+
// types.h infers features from __ARM_ARCH level instead.
|
|
61
|
+
["NK_TARGET_NEON", "probes/arm_neon.c", ["-march=armv8-a+simd"], []], // FEAT_AdvSIMD
|
|
62
|
+
["NK_TARGET_NEONHALF", "probes/arm_neon_half.c", ["-march=armv8.2-a+simd+fp16"], ["/arch:armv8.2"]], // FEAT_FP16
|
|
63
|
+
["NK_TARGET_NEONSDOT", "probes/arm_neon_sdot.c", ["-march=armv8.2-a+dotprod"], ["/arch:armv8.4"]], // FEAT_DotProd
|
|
64
|
+
["NK_TARGET_NEONBFDOT", "probes/arm_neon_bfdot.c", ["-march=armv8.6-a+simd+bf16"], ["/arch:armv8.6"]], // FEAT_BF16
|
|
65
|
+
["NK_TARGET_NEONFHM", "probes/arm_neon_fhm.c", ["-march=armv8.2-a+simd+fp16+fp16fml"], ["/arch:armv8.4"]], // FEAT_FHM
|
|
66
|
+
// ARM SVE/SME
|
|
67
|
+
["NK_TARGET_SVE", "probes/arm_sve.c", ["-march=armv8.2-a+sve"], []],
|
|
68
|
+
["NK_TARGET_SVEHALF", "probes/arm_sve_half.c", ["-march=armv8.2-a+sve+fp16"], []],
|
|
69
|
+
["NK_TARGET_SVEBFDOT", "probes/arm_sve_bfdot.c", ["-march=armv8.2-a+sve+bf16"], []],
|
|
70
|
+
["NK_TARGET_SVESDOT", "probes/arm_sve_sdot.c", ["-march=armv8.2-a+sve+dotprod"], []],
|
|
71
|
+
["NK_TARGET_SVE2", "probes/arm_sve2.c", ["-march=armv8.2-a+sve2"], []],
|
|
72
|
+
["NK_TARGET_SVE2P1", "probes/arm_sve2p1.c", ["-march=armv8.2-a+sve2p1"], []],
|
|
73
|
+
["NK_TARGET_NEONFP8", "probes/arm_neonfp8.c", ["-march=armv8-a+simd+fp8dot4"], []],
|
|
74
|
+
["NK_TARGET_SME", "probes/arm_sme.c", ["-march=armv8-a+sme"], []],
|
|
75
|
+
["NK_TARGET_SME2", "probes/arm_sme2.c", ["-march=armv8-a+sme2"], []],
|
|
76
|
+
["NK_TARGET_SME2P1", "probes/arm_sme2p1.c", ["-march=armv8-a+sme2p1"], []],
|
|
77
|
+
["NK_TARGET_SMEF64", "probes/arm_sme_f64.c", ["-march=armv8-a+sme+sme-f64f64"], []],
|
|
78
|
+
["NK_TARGET_SMEHALF", "probes/arm_sme_half.c", ["-march=armv8-a+sme+sme-f16f16"], []],
|
|
79
|
+
["NK_TARGET_SMEBF16", "probes/arm_sme_bf16.c", ["-march=armv8-a+sme2+b16b16"], []],
|
|
80
|
+
["NK_TARGET_SMEBI32", "probes/arm_sme_bi32.c", ["-march=armv8-a+sme2+sme-i16i32"], []],
|
|
81
|
+
["NK_TARGET_SMELUT2", "probes/arm_sme_lut2.c", ["-march=armv8-a+sme2+lut"], []],
|
|
82
|
+
["NK_TARGET_SMEFA64", "probes/arm_sme_fa64.c", ["-march=armv8-a+sme+sme-fa64"], []],
|
|
83
|
+
// RISC-V
|
|
84
|
+
["NK_TARGET_RVV", "probes/riscv_rvv.c", ["-march=rv64gcv"], []],
|
|
85
|
+
["NK_TARGET_RVVHALF", "probes/riscv_rvv_half.c", ["-march=rv64gcv_zvfh"], []],
|
|
86
|
+
["NK_TARGET_RVVBF16", "probes/riscv_rvv_bf16.c", ["-march=rv64gcv_zvfbfwma"], []],
|
|
87
|
+
["NK_TARGET_RVVBB", "probes/riscv_rvv_bb.c", ["-march=rv64gcv_zvbb"], []],
|
|
88
|
+
// LoongArch
|
|
89
|
+
["NK_TARGET_LOONGSONASX", "probes/loongarch_lasx.c", ["-mlasx"], []],
|
|
90
|
+
// Power
|
|
91
|
+
["NK_TARGET_POWERVSX", "probes/power_vsx.c", ["-mcpu=power9", "-mvsx"], []],
|
|
92
|
+
// WASM
|
|
93
|
+
["NK_TARGET_V128RELAXED", "probes/wasm_v128relaxed.c", ["-mrelaxed-simd"], []],
|
|
94
|
+
];
|
|
95
|
+
|
|
96
|
+
/**
 * Run every entry of the PROBES table through probeIsa() and write the
 * outcome as `#define <NAME> 1|0` lines to nk_probes.h in the directory one
 * level above this script. Supported probes are logged as they are found,
 * followed by a one-line summary.
 */
function main() {
  const packageDir = path.join(__dirname, "..");
  // The parent of this script's directory; created recursively if absent.
  fs.mkdirSync(packageDir, { recursive: true });

  const headerLines = [
    "/* Auto-generated by scripts/probe_isa.js — do not edit */",
    `/* Compiler: ${cc}, Platform: ${process.platform}, Arch: ${process.arch} */`,
    "",
  ];

  let supportedCount = 0;
  PROBES.forEach(([macro, source, gccFlags, msvcFlags]) => {
    // Pick the flag set that matches the compiler family for this platform.
    const chosenFlags = isWin ? msvcFlags : gccFlags;
    const ok = probeIsa(path.join(packageDir, source), chosenFlags);
    headerLines.push(`#define ${macro} ${ok ? 1 : 0}`);
    if (!ok) {
      return;
    }
    supportedCount++;
    console.log(`[NumKong] Probe ${macro}: supported`);
  });

  // Trailing empty entry so the joined header ends with a newline.
  headerLines.push("");
  const headerPath = path.join(packageDir, "nk_probes.h");
  fs.writeFileSync(headerPath, headerLines.join("\n"));
  console.log(
    `[NumKong] Wrote ${headerPath} (${supportedCount} ISAs enabled out of ${PROBES.length})`,
  );
}
|
|
126
|
+
|
|
127
|
+
main();
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/* NumKong ISA probe: RVV 1.0 (RISC-V Vector Extension) */
|
|
2
|
+
#if !defined(__riscv_v)
|
|
3
|
+
#error "Feature not available"
|
|
4
|
+
#endif
|
|
5
|
+
#include <riscv_vector.h>
|
|
6
|
+
/* Adds vectors of 1.0 and 2.0 with RVV, sum-reduces the result, and checks
 * that the reduced value is positive. Exit status 0 means it was. */
int main(void) {
    size_t vl = __riscv_vsetvl_e32m1(4);
    vfloat32m1_t ones = __riscv_vfmv_v_f_f32m1(1.0f, vl);
    vfloat32m1_t twos = __riscv_vfmv_v_f_f32m1(2.0f, vl);
    vfloat32m1_t added = __riscv_vfadd_vv_f32m1(ones, twos, vl);
    vfloat32m1_t seed = __riscv_vfmv_v_f_f32m1(0.0f, vl);
    vfloat32m1_t reduced = __riscv_vfredusum_vs_f32m1_f32m1(added, seed, vl);
    float total = __riscv_vfmv_f_s_f32m1_f32(reduced);
    if (total > 0.0f) return 0;
    return 1;
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/* NumKong ISA probe: RVV Zvbb (basic bit-manipulation) */
|
|
2
|
+
#if !defined(__riscv_zvbb)
|
|
3
|
+
#error "Feature not available"
|
|
4
|
+
#endif
|
|
5
|
+
#include <riscv_vector.h>
|
|
6
|
+
/* Popcounts four 0xFF bytes with the Zvbb vcpop.v instruction and checks
 * that the per-lane counts sum to 32. Exit status 0 means they did. */
int main(void) {
    size_t vl = __riscv_vsetvl_e8m1(4);
    vuint8m1_t all_bits = __riscv_vmv_v_x_u8m1(0xFF, vl);
    /* vcpop.v — per-element popcount, the key Zvbb instruction */
    vuint8m1_t counts = __riscv_vcpop_v_u8m1(all_bits, vl);
    vuint8m1_t seed = __riscv_vmv_v_x_u8m1(0, 1);
    vuint8m1_t reduced = __riscv_vredsum_vs_u8m1_u8m1(counts, seed, vl);
    unsigned char total = __riscv_vmv_x_s_u8m1_u8(reduced);
    /* Each lane is 0xFF → popcount 8, sum of 4 lanes = 32 */
    if (total == 32) return 0;
    return 1;
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/* NumKong ISA probe: RVV Zvfbfwma (BF16 widening FMA) */
|
|
2
|
+
#if !defined(__riscv_zvfbfwma)
|
|
3
|
+
#error "Feature not available"
|
|
4
|
+
#endif
|
|
5
|
+
#include <riscv_vector.h>
|
|
6
|
+
/* Squares a vector of bf16 1.0 values into an f32 accumulator with the
 * Zvfbfwma widening multiply-accumulate, sum-reduces it, and checks that the
 * total is positive. Exit status 0 means it was. */
int main(void) {
    size_t vl16 = __riscv_vsetvl_e16m1(4);
    vuint16m1_t bits = __riscv_vmv_v_x_u16m1(0x3F80, vl16); /* bf16 1.0 */
    vbfloat16m1_t ones = __riscv_vreinterpret_v_u16m1_bf16m1(bits);
    size_t vl32 = __riscv_vsetvl_e32m2(4);
    vfloat32m2_t acc = __riscv_vfmv_v_f_f32m2(0.0f, vl32);
    acc = __riscv_vfwmaccbf16_vv_f32m2(acc, ones, ones, vl16);
    vfloat32m1_t seed = __riscv_vfmv_v_f_f32m1(0.0f, __riscv_vsetvl_e32m1(1));
    vfloat32m1_t reduced = __riscv_vfredusum_vs_f32m2_f32m1(acc, seed, vl32);
    float total = __riscv_vfmv_f_s_f32m1_f32(reduced);
    if (total > 0.0f) return 0;
    return 1;
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/* NumKong ISA probe: RVV Zvfh (half-precision vector) */
|
|
2
|
+
#if !defined(__riscv_zvfh)
|
|
3
|
+
#error "Feature not available"
|
|
4
|
+
#endif
|
|
5
|
+
#include <riscv_vector.h>
|
|
6
|
+
/* Widens a vector of half-precision 1.0 values to f32 (Zvfh), sum-reduces
 * it, and checks that the total is positive. Exit status 0 means it was. */
int main(void) {
    size_t vl = __riscv_vsetvl_e16m1(4);
    vfloat16m1_t halves = __riscv_vfmv_v_f_f16m1((_Float16)1.0f, vl);
    vfloat32m2_t widened = __riscv_vfwcvt_f_f_v_f32m2(halves, vl);
    vfloat32m1_t seed = __riscv_vfmv_v_f_f32m1(0.0f, __riscv_vsetvl_e32m1(1));
    vfloat32m1_t reduced = __riscv_vfredusum_vs_f32m2_f32m1(widened, seed, vl);
    float total = __riscv_vfmv_f_s_f32m1_f32(reduced);
    if (total > 0.0f) return 0;
    return 1;
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/* NumKong ISA probe: WASM Relaxed SIMD (v128) */
|
|
2
|
+
#if !defined(__wasm_relaxed_simd__)
|
|
3
|
+
#error "WASM Relaxed SIMD not available"
|
|
4
|
+
#endif
|
|
5
|
+
#include <wasm_simd128.h>
|
|
6
|
+
/* Evaluates a relaxed multiply-add on splatted 1.0 and 2.0 inputs and checks
 * that lane 0 is positive. Exit status 0 means it was. */
int main(void) {
    v128_t ones = wasm_f32x4_splat(1.0f);
    v128_t twos = wasm_f32x4_splat(2.0f);
    v128_t fused = wasm_f32x4_relaxed_madd(ones, twos, ones);
    if (wasm_f32x4_extract_lane(fused, 0) > 0.0f) return 0;
    return 1;
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/* NumKong ISA probe: Alder Lake (AVX-VNNI 256-bit) */
|
|
2
|
+
#if defined(__APPLE__)
|
|
3
|
+
#error "AVX-512 not available on macOS"
|
|
4
|
+
#endif
|
|
5
|
+
|
|
6
|
+
#if !defined(__AVXVNNI__)
|
|
7
|
+
#error "Feature not available"
|
|
8
|
+
#endif
|
|
9
|
+
#include <immintrin.h>
|
|
10
|
+
/* Runs the 256-bit AVX-VNNI byte dot product on vectors of 2s and 3s and
 * checks that the first 32-bit lane is 4 * (2 * 3) = 24. The inputs come from
 * a volatile, presumably so the compiler cannot fold the computation away. */
int main(void) {
    volatile int two = 2;
    __m256i lhs = _mm256_set1_epi8((char)two);
    __m256i rhs = _mm256_set1_epi8((char)(two + 1));
    __m256i sums = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), lhs, rhs);
    if (_mm256_extract_epi32(sums, 0) == 24) return 0;
    return 1;
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/* NumKong ISA probe: Diamond Rapids (AVX10.2) */
|
|
2
|
+
#if defined(__APPLE__)
|
|
3
|
+
#error "AVX-512 not available on macOS"
|
|
4
|
+
#endif
|
|
5
|
+
|
|
6
|
+
#if !defined(__AVX512FP16__)
|
|
7
|
+
#error "Feature not available"
|
|
8
|
+
#endif
|
|
9
|
+
#include <immintrin.h>
|
|
10
|
+
int main(void) {
|
|
11
|
+
volatile int one = 1;
|
|
12
|
+
__m256i acc = _mm256_setzero_si256();
|
|
13
|
+
__m256i a = _mm256_set1_epi8((char)one);
|
|
14
|
+
__m256i b = _mm256_set1_epi8((char)one);
|
|
15
|
+
acc = _mm256_dpbe4ss_epi32(acc, a, b);
|
|
16
|
+
return _mm256_extract_epi32(acc, 0) != 0 ? 0 : 1;
|
|
17
|
+
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/* NumKong ISA probe: Genoa (AVX-512F/BW/DQ/VL + BF16) */
|
|
2
|
+
#if defined(__APPLE__)
|
|
3
|
+
#error "AVX-512 not available on macOS"
|
|
4
|
+
#endif
|
|
5
|
+
|
|
6
|
+
#if !defined(__AVX512BF16__)
|
|
7
|
+
#error "Feature not available"
|
|
8
|
+
#endif
|
|
9
|
+
#include <immintrin.h>
|
|
10
|
+
/* Converts a vector of f32 1.0 values to bf16, widens the 256-bit result
 * into the low half of a zeroed 512-bit register, and runs the AVX-512 BF16
 * dot product of it with itself. Exit status 0 when the reduced sum is
 * non-negative. */
int main(void) {
    volatile float one = 1.0f;
    __m512 ones = _mm512_set1_ps(one);
    __m256bh halves = _mm512_cvtneps_pbh(ones);
    __m512i padded = _mm512_inserti64x4(_mm512_setzero_si512(), (__m256i)halves, 0);
    __m512bh wide = (__m512bh)_mm512_castsi512_ps(padded);
    __m512 dots = _mm512_dpbf16_ps(_mm512_setzero_ps(), wide, wide);
    if (_mm512_reduce_add_ps(dots) >= 0.0f) return 0;
    return 1;
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/* NumKong ISA probe: Granite Rapids AMX (AMX-TILE + AMX-FP16) */
|
|
2
|
+
#if defined(__APPLE__)
|
|
3
|
+
#error "AVX-512 not available on macOS"
|
|
4
|
+
#endif
|
|
5
|
+
|
|
6
|
+
#if defined(__FreeBSD__)
|
|
7
|
+
#error "AMX not supported on FreeBSD"
|
|
8
|
+
#endif
|
|
9
|
+
|
|
10
|
+
#if !defined(__AMX_FP16__)
|
|
11
|
+
#error "Feature not available"
|
|
12
|
+
#endif
|
|
13
|
+
#include <immintrin.h>
|
|
14
|
+
#include <amxfp16intrin.h>
|
|
15
|
+
/* Issues a single AMX tile-release and exits with status 0. The status is
 * read back from a volatile rather than returned as a literal. */
int main(void) {
    volatile int status = 0;
    _tile_release();
    return status;
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/* NumKong ISA probe: Haswell (AVX2 + FMA + F16C) */
|
|
2
|
+
#if !defined(__AVX2__)
|
|
3
|
+
#error "Feature not available"
|
|
4
|
+
#endif
|
|
5
|
+
#include <immintrin.h>
|
|
6
|
+
/* Doubles a vector of 1s with a 256-bit integer add and checks that lane 0
 * is 2. The input comes from a volatile, presumably so the compiler cannot
 * fold the computation away. */
int main(void) {
    volatile int one = 1;
    __m256i ones = _mm256_set1_epi32(one);
    __m256i doubled = _mm256_add_epi32(ones, ones);
    if (_mm256_extract_epi32(doubled, 0) == 2) return 0;
    return 1;
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/* NumKong ISA probe: Ice Lake (AVX-512F/BW/DQ/VL + VNNI + VBMI + VPOPCNTDQ) */
|
|
2
|
+
#if defined(__APPLE__)
|
|
3
|
+
#error "AVX-512 not available on macOS"
|
|
4
|
+
#endif
|
|
5
|
+
|
|
6
|
+
#if !defined(__AVX512VNNI__)
|
|
7
|
+
#error "Feature not available"
|
|
8
|
+
#endif
|
|
9
|
+
#include <immintrin.h>
|
|
10
|
+
/* Runs the 512-bit AVX-512 VNNI byte dot product on two vectors of 1s and
 * checks that the 16 lanes (4 each) reduce to 64. The inputs come from a
 * volatile, presumably so the compiler cannot fold the computation away. */
int main(void) {
    volatile int one = 1;
    __m512i lhs = _mm512_set1_epi8((char)one);
    __m512i rhs = _mm512_set1_epi8((char)one);
    __m512i sums = _mm512_dpbusd_epi32(_mm512_setzero_si512(), lhs, rhs);
    if ((int)_mm512_reduce_add_epi32(sums) == 64) return 0;
    return 1;
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/* NumKong ISA probe: Sapphire Rapids (AVX-512F/BW/DQ/VL + FP16) */
|
|
2
|
+
#if defined(__APPLE__)
|
|
3
|
+
#error "AVX-512 not available on macOS"
|
|
4
|
+
#endif
|
|
5
|
+
|
|
6
|
+
#if !defined(__AVX512FP16__)
|
|
7
|
+
#error "Feature not available"
|
|
8
|
+
#endif
|
|
9
|
+
#include <immintrin.h>
|
|
10
|
+
int main(void) {
|
|
11
|
+
volatile float one = 1.0f;
|
|
12
|
+
__m512h a = _mm512_set1_ph((_Float16)one);
|
|
13
|
+
__m512h b = _mm512_set1_ph((_Float16)(one + one));
|
|
14
|
+
__m512h c = _mm512_fmadd_ph(a, b, a);
|
|
15
|
+
return (int)_mm_extract_epi16(_mm256_castsi256_si128(_mm512_castsi512_si256((__m512i)c)), 0) != 0 ? 0 : 1;
|
|
16
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/*
 * NumKong ISA probe: Sapphire Rapids AMX (AMX-TILE + AMX-INT8).
 *
 * Translation fails on macOS, on FreeBSD, and whenever the compiler does not
 * define __AMX_INT8__. Otherwise the program issues a single _tile_release()
 * and exits with status 0.
 */
#ifdef __APPLE__
#error "AVX-512 not available on macOS"
#endif

#ifdef __FreeBSD__
#error "AMX not supported on FreeBSD"
#endif

#ifndef __AMX_INT8__
#error "Feature not available"
#endif

#include <immintrin.h>

int main(void) {
    /* Exit status is read back through a volatile object. */
    volatile int exit_status = 0;
    _tile_release();
    return exit_status;
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/* NumKong ISA probe: Sierra Forest (AVXVNNIINT8) */
|
|
2
|
+
#if defined(__APPLE__)
|
|
3
|
+
#error "AVX-512 not available on macOS"
|
|
4
|
+
#endif
|
|
5
|
+
|
|
6
|
+
#if !defined(__AVXVNNIINT8__)
|
|
7
|
+
#error "Feature not available"
|
|
8
|
+
#endif
|
|
9
|
+
#include <immintrin.h>
|
|
10
|
+
int main(void) {
|
|
11
|
+
volatile int two = 2;
|
|
12
|
+
__m256i acc = _mm256_setzero_si256();
|
|
13
|
+
__m256i a = _mm256_set1_epi8((char)two);
|
|
14
|
+
__m256i b = _mm256_set1_epi8((char)(two + 1));
|
|
15
|
+
acc = _mm256_dpbssd_epi32(acc, a, b);
|
|
16
|
+
return _mm256_extract_epi32(acc, 0) == 24 ? 0 : 1;
|
|
17
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/*
 * NumKong ISA probe: Skylake (AVX-512F/BW/DQ/VL).
 *
 * Translation fails on macOS and whenever __AVX512F__ is not defined.
 * At run time the program adds a 512-bit vector of ones to itself and sums
 * the 16 lanes; the total must be 32. Exit status is 0 on a match, else 1.
 */
#ifdef __APPLE__
#error "AVX-512 not available on macOS"
#endif

#ifndef __AVX512F__
#error "Feature not available"
#endif

#include <immintrin.h>

int main(void) {
    volatile int unit = 1;
    __m512i ones = _mm512_set1_epi32(unit);
    __m512i doubled = _mm512_add_epi32(ones, ones);
    int total = (int)_mm512_reduce_add_epi32(doubled);
    if (total != 32) {
        return 1;
    }
    return 0;
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/*
 * NumKong ISA probe: Turin (AVX-512F + VP2INTERSECT).
 *
 * Translation fails on macOS and whenever __AVX512VP2INTERSECT__ is not
 * defined. At run time the program intersects two vectors that both hold 42
 * in every lane and exits with 0 when the first result mask is non-zero,
 * otherwise with 1.
 */
#ifdef __APPLE__
#error "AVX-512 not available on macOS"
#endif

#ifndef __AVX512VP2INTERSECT__
#error "Feature not available"
#endif

#include <immintrin.h>

int main(void) {
    volatile int needle = 42;
    __m512i left = _mm512_set1_epi32(needle);
    __m512i right = _mm512_set1_epi32(needle);
    __mmask16 left_hits, right_hits;
    _mm512_2intersect_epi32(left, right, &left_hits, &right_hits);
    if (left_hits == 0) {
        return 1;
    }
    return 0;
}
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
/**
 * Minified Emscripten-style loader for the NumKong WebAssembly build.
 *
 * Sets up the linear memory, instantiates `numkong.wasm` (or whatever
 * `moduleArg.instantiateWasm` / `moduleArg.wasmBinary` / `moduleArg.locateFile`
 * provide), mirrors every wasm export onto the module object under its
 * `_nk_*` / `_malloc` / `_free` name, and runs the runtime initializer.
 *
 * @param {object} [moduleArg={}] Optional overrides. Keys read here:
 *   `locateFile`, `wasmMemory`, `INITIAL_MEMORY`, `wasmBinary`,
 *   `instantiateWasm`, `preInit`, `preRun`, `postRun`, `onAbort`,
 *   `onRuntimeInitialized`, `setStatus`, `print`, `printErr`, `arguments`,
 *   `thisProgram`, `noExitRuntime`.
 * @returns {Promise<object>} Resolves to the populated module object
 *   (the same object as `moduleArg`).
 * @throws {WebAssembly.RuntimeError} Via `abort()` when the binary cannot be
 *   fetched or instantiated.
 *
 * NOTE(review): this file looks like generated build output; the
 * `nk_has_relaxed` fix below presumably has to be mirrored in whatever
 * source produces it — confirm before regenerating.
 */
async function NumKongModule(moduleArg={}){
  var moduleRtn;var Module=moduleArg;
  var ENVIRONMENT_IS_WEB=!!globalThis.window;var ENVIRONMENT_IS_WORKER=!!globalThis.WorkerGlobalScope;var ENVIRONMENT_IS_NODE=globalThis.process?.versions?.node&&globalThis.process?.type!="renderer";
  if(ENVIRONMENT_IS_NODE){const{createRequire}=await import("node:module");var require=createRequire(import.meta.url)}
  var arguments_=[];var thisProgram="./this.program";var quit_=(status,toThrow)=>{throw toThrow};var _scriptName=import.meta.url;var scriptDirectory="";
  function locateFile(path){if(Module["locateFile"]){return Module["locateFile"](path,scriptDirectory)}return scriptDirectory+path}
  var readAsync,readBinary;
  if(ENVIRONMENT_IS_NODE){var fs=require("node:fs");if(_scriptName.startsWith("file:")){scriptDirectory=require("node:path").dirname(require("node:url").fileURLToPath(_scriptName))+"/"}readBinary=filename=>{filename=isFileURI(filename)?new URL(filename):filename;var ret=fs.readFileSync(filename);return ret};readAsync=async(filename,binary=true)=>{filename=isFileURI(filename)?new URL(filename):filename;var ret=fs.readFileSync(filename,binary?undefined:"utf8");return ret};if(process.argv.length>1){thisProgram=process.argv[1].replace(/\\/g,"/")}arguments_=process.argv.slice(2);quit_=(status,toThrow)=>{process.exitCode=status;throw toThrow}}
  else if(ENVIRONMENT_IS_WEB||ENVIRONMENT_IS_WORKER){try{scriptDirectory=new URL(".",_scriptName).href}catch{}{readAsync=async url=>{var response=await fetch(url,{credentials:"same-origin"});if(response.ok){return response.arrayBuffer()}throw new Error(response.status+" : "+response.url)}}}
  else{}
  var out=console.log.bind(console);var err=console.error.bind(console);var wasmBinary;var ABORT=false;var isFileURI=filename=>filename.startsWith("file://");
  class EmscriptenEH{}
  class EmscriptenSjLj extends EmscriptenEH{}
  var readyPromiseResolve,readyPromiseReject;var runtimeInitialized=false;
  // Rebuild every typed-array view; called after the memory is created or grown.
  function updateMemoryViews(){var b=wasmMemory.buffer;HEAP8=new Int8Array(b);HEAP16=new Int16Array(b);HEAPU8=new Uint8Array(b);HEAPU16=new Uint16Array(b);HEAP32=new Int32Array(b);HEAPU32=new Uint32Array(b);HEAPF32=new Float32Array(b);HEAPF64=new Float64Array(b);HEAP64=new BigInt64Array(b);HEAPU64=new BigUint64Array(b)}
  // Use the caller's memory if given, else allocate INITIAL_MEMORY bytes (default 16 MiB), growable to 32768 pages.
  function initMemory(){if(Module["wasmMemory"]){wasmMemory=Module["wasmMemory"]}else{var INITIAL_MEMORY=Module["INITIAL_MEMORY"]||16777216;wasmMemory=new WebAssembly.Memory({initial:INITIAL_MEMORY/65536,maximum:32768})}updateMemoryViews()}
  function preRun(){if(Module["preRun"]){if(typeof Module["preRun"]=="function")Module["preRun"]=[Module["preRun"]];while(Module["preRun"].length){addOnPreRun(Module["preRun"].shift())}}callRuntimeCallbacks(onPreRuns)}
  function initRuntime(){runtimeInitialized=true;wasmExports["d"]()}
  function postRun(){if(Module["postRun"]){if(typeof Module["postRun"]=="function")Module["postRun"]=[Module["postRun"]];while(Module["postRun"].length){addOnPostRun(Module["postRun"].shift())}}callRuntimeCallbacks(onPostRuns)}
  function abort(what){Module["onAbort"]?.(what);what=`Aborted(${what})`;err(what);ABORT=true;what+=". Build with -sASSERTIONS for more info.";var e=new WebAssembly.RuntimeError(what);readyPromiseReject?.(e);throw e}
  var wasmBinaryFile;
  function findWasmBinary(){if(Module["locateFile"]){return locateFile("numkong.wasm")}return new URL("numkong.wasm",import.meta.url).href}
  function getBinarySync(file){if(file==wasmBinaryFile&&wasmBinary){return new Uint8Array(wasmBinary)}if(readBinary){return readBinary(file)}throw"both async and sync fetching of the wasm failed"}
  async function getWasmBinary(binaryFile){if(!wasmBinary){try{var response=await readAsync(binaryFile);return new Uint8Array(response)}catch{}}return getBinarySync(binaryFile)}
  async function instantiateArrayBuffer(binaryFile,imports){try{var binary=await getWasmBinary(binaryFile);var instance=await WebAssembly.instantiate(binary,imports);return instance}catch(reason){err(`failed to asynchronously prepare wasm: ${reason}`);abort(reason)}}
  async function instantiateAsync(binary,binaryFile,imports){if(!binary&&!ENVIRONMENT_IS_NODE){try{var response=fetch(binaryFile,{credentials:"same-origin"});var instantiationResult=await WebAssembly.instantiateStreaming(response,imports);return instantiationResult}catch(reason){err(`wasm streaming compile failed: ${reason}`);err("falling back to ArrayBuffer instantiation")}}return instantiateArrayBuffer(binaryFile,imports)}
  function getWasmImports(){var imports={a:wasmImports};return imports}
  async function createWasm(){function receiveInstance(instance,module){wasmExports=instance.exports;assignWasmExports(wasmExports);return wasmExports}function receiveInstantiationResult(result){return receiveInstance(result["instance"])}var info=getWasmImports();if(Module["instantiateWasm"]){return new Promise((resolve,reject)=>{Module["instantiateWasm"](info,(inst,mod)=>{resolve(receiveInstance(inst,mod))})})}wasmBinaryFile??=findWasmBinary();var result=await instantiateAsync(wasmBinary,wasmBinaryFile,info);var exports=receiveInstantiationResult(result);return exports}
  class ExitStatus{name="ExitStatus";constructor(status){this.message=`Program terminated with exit(${status})`;this.status=status}}
  var HEAP16;var HEAP32;var HEAP64;var HEAP8;var HEAPF32;var HEAPF64;var HEAPU16;var HEAPU32;var HEAPU64;var HEAPU8;
  var callRuntimeCallbacks=callbacks=>{while(callbacks.length>0){callbacks.shift()(Module)}};
  var onPostRuns=[];var addOnPostRun=cb=>onPostRuns.push(cb);var onPreRuns=[];var addOnPreRun=cb=>onPreRuns.push(cb);var noExitRuntime=true;var wasmMemory;
  var getHeapMax=()=>2147483648;
  var alignMemory=(size,alignment)=>Math.ceil(size/alignment)*alignment;
  var growMemory=size=>{var oldHeapSize=wasmMemory.buffer.byteLength;var pages=(size-oldHeapSize+65535)/65536|0;try{wasmMemory.grow(pages);updateMemoryViews();return 1}catch(e){}};
  var _emscripten_resize_heap=requestedSize=>{var oldSize=HEAPU8.length;requestedSize>>>=0;var maxHeapSize=getHeapMax();if(requestedSize>maxHeapSize){return false}for(var cutDown=1;cutDown<=4;cutDown*=2){var overGrownHeapSize=oldSize*(1+.2/cutDown);overGrownHeapSize=Math.min(overGrownHeapSize,requestedSize+100663296);var newSize=Math.min(maxHeapSize,alignMemory(Math.max(requestedSize,overGrownHeapSize),65536));var replacement=growMemory(newSize);if(replacement){return true}}return false};
  {initMemory();if(Module["noExitRuntime"])noExitRuntime=Module["noExitRuntime"];if(Module["print"])out=Module["print"];if(Module["printErr"])err=Module["printErr"];if(Module["wasmBinary"])wasmBinary=Module["wasmBinary"];if(Module["arguments"])arguments_=Module["arguments"];if(Module["thisProgram"])thisProgram=Module["thisProgram"];if(Module["preInit"]){if(typeof Module["preInit"]=="function")Module["preInit"]=[Module["preInit"]];while(Module["preInit"].length>0){Module["preInit"].shift()()}}}
  Module["wasmMemory"]=wasmMemory;
  // Imported by the wasm side as import "c". Returns 1 when the engine accepts
  // a tiny module whose only function is (v128, v128, v128) -> v128 with body
  //   local.get 0; local.get 1; local.get 2; f32x4.relaxed_madd; end
  // Layout: header (8 bytes) | type section, 8 bytes | function section,
  // 2 bytes | code section, 13 bytes holding one 11-byte body.
  // The previous array declared the code section as 9 bytes with a 7-byte
  // body while 13 / 11 bytes followed, and encoded the opcode as 0xFD 0xAF 0x01
  // rather than the standardized 0xFD 0x85 0x02 (0x105); the malformed module
  // could never validate, so detection always reported 0.
  function nk_has_relaxed(){var test=new Uint8Array([0,97,115,109,1,0,0,0,1,8,1,96,3,123,123,123,1,123,3,2,1,0,10,13,1,11,0,32,0,32,1,32,2,253,133,2,11]);try{return WebAssembly.validate(test)?1:0}catch(e){return 0}}
  var _nk_dot_f64,_nk_dot_f32,_nk_dot_bf16,_nk_dot_f16,_nk_dot_i8,_nk_dot_u8,
    _nk_angular_f64,_nk_angular_f32,_nk_angular_bf16,_nk_angular_f16,_nk_angular_i8,_nk_angular_u8,
    _nk_euclidean_f64,_nk_euclidean_f32,_nk_euclidean_bf16,_nk_euclidean_f16,_nk_euclidean_i8,_nk_euclidean_u8,
    _nk_sqeuclidean_f64,_nk_sqeuclidean_f32,_nk_sqeuclidean_bf16,_nk_sqeuclidean_f16,_nk_sqeuclidean_i8,_nk_sqeuclidean_u8,
    _nk_hamming_u8,_nk_hamming_u1,_nk_jaccard_u16,_nk_jaccard_u1,
    _nk_kld_f64,_nk_kld_f32,_nk_jsd_f64,_nk_jsd_f32,
    _nk_dots_packed_size_f64,_nk_dots_packed_size_f32,_nk_dots_packed_size_bf16,_nk_dots_packed_size_f16,_nk_dots_packed_size_i8,_nk_dots_packed_size_u8,
    _nk_dots_pack_f64,_nk_dots_pack_f32,_nk_dots_pack_bf16,_nk_dots_pack_f16,_nk_dots_pack_i8,_nk_dots_pack_u8,
    _nk_dots_packed_f64,_nk_dots_packed_f32,_nk_dots_packed_bf16,_nk_dots_packed_f16,_nk_dots_packed_i8,_nk_dots_packed_u8,
    _nk_dots_symmetric_f64,_nk_dots_symmetric_f32,_nk_dots_symmetric_bf16,_nk_dots_symmetric_f16,_nk_dots_symmetric_i8,_nk_dots_symmetric_u8,
    _nk_angulars_packed_f64,_nk_angulars_packed_f32,_nk_angulars_packed_bf16,_nk_angulars_packed_f16,
    _nk_angulars_symmetric_f64,_nk_angulars_symmetric_f32,_nk_angulars_symmetric_bf16,_nk_angulars_symmetric_f16,
    _nk_euclideans_packed_f64,_nk_euclideans_packed_f32,_nk_euclideans_packed_bf16,_nk_euclideans_packed_f16,
    _nk_euclideans_symmetric_f64,_nk_euclideans_symmetric_f32,_nk_euclideans_symmetric_bf16,_nk_euclideans_symmetric_f16,
    _nk_capabilities,_malloc,_free,__indirect_function_table;
  // Map the minified wasm export keys onto their public names, both as locals and on Module.
  function assignWasmExports(wasmExports){
    _nk_dot_f64=Module["_nk_dot_f64"]=wasmExports["e"];
    _nk_dot_f32=Module["_nk_dot_f32"]=wasmExports["f"];
    _nk_dot_bf16=Module["_nk_dot_bf16"]=wasmExports["g"];
    _nk_dot_f16=Module["_nk_dot_f16"]=wasmExports["h"];
    _nk_dot_i8=Module["_nk_dot_i8"]=wasmExports["i"];
    _nk_dot_u8=Module["_nk_dot_u8"]=wasmExports["j"];
    _nk_angular_f64=Module["_nk_angular_f64"]=wasmExports["k"];
    _nk_angular_f32=Module["_nk_angular_f32"]=wasmExports["l"];
    _nk_angular_bf16=Module["_nk_angular_bf16"]=wasmExports["m"];
    _nk_angular_f16=Module["_nk_angular_f16"]=wasmExports["n"];
    _nk_angular_i8=Module["_nk_angular_i8"]=wasmExports["o"];
    _nk_angular_u8=Module["_nk_angular_u8"]=wasmExports["p"];
    _nk_euclidean_f64=Module["_nk_euclidean_f64"]=wasmExports["q"];
    _nk_euclidean_f32=Module["_nk_euclidean_f32"]=wasmExports["r"];
    _nk_euclidean_bf16=Module["_nk_euclidean_bf16"]=wasmExports["s"];
    _nk_euclidean_f16=Module["_nk_euclidean_f16"]=wasmExports["t"];
    _nk_euclidean_i8=Module["_nk_euclidean_i8"]=wasmExports["u"];
    _nk_euclidean_u8=Module["_nk_euclidean_u8"]=wasmExports["v"];
    _nk_sqeuclidean_f64=Module["_nk_sqeuclidean_f64"]=wasmExports["w"];
    _nk_sqeuclidean_f32=Module["_nk_sqeuclidean_f32"]=wasmExports["x"];
    _nk_sqeuclidean_bf16=Module["_nk_sqeuclidean_bf16"]=wasmExports["y"];
    _nk_sqeuclidean_f16=Module["_nk_sqeuclidean_f16"]=wasmExports["z"];
    _nk_sqeuclidean_i8=Module["_nk_sqeuclidean_i8"]=wasmExports["A"];
    _nk_sqeuclidean_u8=Module["_nk_sqeuclidean_u8"]=wasmExports["B"];
    _nk_hamming_u8=Module["_nk_hamming_u8"]=wasmExports["C"];
    _nk_hamming_u1=Module["_nk_hamming_u1"]=wasmExports["D"];
    _nk_jaccard_u16=Module["_nk_jaccard_u16"]=wasmExports["E"];
    _nk_jaccard_u1=Module["_nk_jaccard_u1"]=wasmExports["F"];
    _nk_kld_f64=Module["_nk_kld_f64"]=wasmExports["G"];
    _nk_kld_f32=Module["_nk_kld_f32"]=wasmExports["H"];
    _nk_jsd_f64=Module["_nk_jsd_f64"]=wasmExports["I"];
    _nk_jsd_f32=Module["_nk_jsd_f32"]=wasmExports["J"];
    _nk_dots_packed_size_f64=Module["_nk_dots_packed_size_f64"]=wasmExports["K"];
    _nk_dots_packed_size_f32=Module["_nk_dots_packed_size_f32"]=wasmExports["L"];
    _nk_dots_packed_size_bf16=Module["_nk_dots_packed_size_bf16"]=wasmExports["M"];
    _nk_dots_packed_size_f16=Module["_nk_dots_packed_size_f16"]=wasmExports["N"];
    _nk_dots_packed_size_i8=Module["_nk_dots_packed_size_i8"]=wasmExports["O"];
    _nk_dots_packed_size_u8=Module["_nk_dots_packed_size_u8"]=wasmExports["P"];
    _nk_dots_pack_f64=Module["_nk_dots_pack_f64"]=wasmExports["Q"];
    _nk_dots_pack_f32=Module["_nk_dots_pack_f32"]=wasmExports["R"];
    _nk_dots_pack_bf16=Module["_nk_dots_pack_bf16"]=wasmExports["S"];
    _nk_dots_pack_f16=Module["_nk_dots_pack_f16"]=wasmExports["T"];
    _nk_dots_pack_i8=Module["_nk_dots_pack_i8"]=wasmExports["U"];
    _nk_dots_pack_u8=Module["_nk_dots_pack_u8"]=wasmExports["V"];
    _nk_dots_packed_f64=Module["_nk_dots_packed_f64"]=wasmExports["W"];
    _nk_dots_packed_f32=Module["_nk_dots_packed_f32"]=wasmExports["X"];
    _nk_dots_packed_bf16=Module["_nk_dots_packed_bf16"]=wasmExports["Y"];
    _nk_dots_packed_f16=Module["_nk_dots_packed_f16"]=wasmExports["Z"];
    _nk_dots_packed_i8=Module["_nk_dots_packed_i8"]=wasmExports["_"];
    _nk_dots_packed_u8=Module["_nk_dots_packed_u8"]=wasmExports["$"];
    _nk_dots_symmetric_f64=Module["_nk_dots_symmetric_f64"]=wasmExports["aa"];
    _nk_dots_symmetric_f32=Module["_nk_dots_symmetric_f32"]=wasmExports["ba"];
    _nk_dots_symmetric_bf16=Module["_nk_dots_symmetric_bf16"]=wasmExports["ca"];
    _nk_dots_symmetric_f16=Module["_nk_dots_symmetric_f16"]=wasmExports["da"];
    _nk_dots_symmetric_i8=Module["_nk_dots_symmetric_i8"]=wasmExports["ea"];
    _nk_dots_symmetric_u8=Module["_nk_dots_symmetric_u8"]=wasmExports["fa"];
    _nk_angulars_packed_f64=Module["_nk_angulars_packed_f64"]=wasmExports["ga"];
    _nk_angulars_packed_f32=Module["_nk_angulars_packed_f32"]=wasmExports["ha"];
    _nk_angulars_packed_bf16=Module["_nk_angulars_packed_bf16"]=wasmExports["ia"];
    _nk_angulars_packed_f16=Module["_nk_angulars_packed_f16"]=wasmExports["ja"];
    _nk_angulars_symmetric_f64=Module["_nk_angulars_symmetric_f64"]=wasmExports["ka"];
    _nk_angulars_symmetric_f32=Module["_nk_angulars_symmetric_f32"]=wasmExports["la"];
    _nk_angulars_symmetric_bf16=Module["_nk_angulars_symmetric_bf16"]=wasmExports["ma"];
    _nk_angulars_symmetric_f16=Module["_nk_angulars_symmetric_f16"]=wasmExports["na"];
    _nk_euclideans_packed_f64=Module["_nk_euclideans_packed_f64"]=wasmExports["oa"];
    _nk_euclideans_packed_f32=Module["_nk_euclideans_packed_f32"]=wasmExports["pa"];
    _nk_euclideans_packed_bf16=Module["_nk_euclideans_packed_bf16"]=wasmExports["qa"];
    _nk_euclideans_packed_f16=Module["_nk_euclideans_packed_f16"]=wasmExports["ra"];
    _nk_euclideans_symmetric_f64=Module["_nk_euclideans_symmetric_f64"]=wasmExports["sa"];
    _nk_euclideans_symmetric_f32=Module["_nk_euclideans_symmetric_f32"]=wasmExports["ta"];
    _nk_euclideans_symmetric_bf16=Module["_nk_euclideans_symmetric_bf16"]=wasmExports["ua"];
    _nk_euclideans_symmetric_f16=Module["_nk_euclideans_symmetric_f16"]=wasmExports["va"];
    _nk_capabilities=Module["_nk_capabilities"]=wasmExports["wa"];
    _malloc=Module["_malloc"]=wasmExports["xa"];
    _free=Module["_free"]=wasmExports["ya"];
    __indirect_function_table=wasmExports["__indirect_function_table"]
  }
  var wasmImports={b:_emscripten_resize_heap,a:wasmMemory,c:nk_has_relaxed};
  function run(){preRun();function doRun(){Module["calledRun"]=true;if(ABORT)return;initRuntime();readyPromiseResolve?.(Module);Module["onRuntimeInitialized"]?.();postRun()}if(Module["setStatus"]){Module["setStatus"]("Running...");setTimeout(()=>{setTimeout(()=>Module["setStatus"](""),1);doRun()},1)}else{doRun()}}
  var wasmExports;
  wasmExports=await (createWasm());
  run();
  // When setStatus defers doRun() to a timer, hand back a promise that doRun() resolves later.
  if(runtimeInitialized){moduleRtn=Module}else{moduleRtn=new Promise((resolve,reject)=>{readyPromiseResolve=resolve;readyPromiseReject=reject})}
  return moduleRtn
}
export default NumKongModule;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
 * @brief Self-contained browser ESM entry point for NumKong WASM.
 * @file javascript/numkong-browser.ts
 *
 * Importing this module auto-initializes the Emscripten module through
 * top-level await. Both the Emscripten glue (`numkong-emscripten.js`) and the
 * binary (`numkong.wasm`) have to sit next to this file, i.e. in the same
 * directory or under the same CDN prefix.
 *
 * Usage:
 *   import { dot, euclidean } from './numkong.js';
 *   console.log(dot(new Float32Array([1,2,3]), new Float32Array([4,5,6])));
 */

// Tensor containers, dtype helpers and the custom typed arrays.
export {
  TensorBase,
  VectorBase,
  VectorView,
  Vector,
  MatrixBase,
  Matrix,
  PackedMatrix,
  DType,
  TypedArray,
  KernelFamily,
  dtypeToString,
  outputDtype,
  Float16Array,
  BFloat16Array,
  E4M3Array,
  E5M2Array,
  BinaryArray,
  isFloat16Array,
  isBFloat16Array,
  isE4M3Array,
  isE5M2Array,
  isBinaryArray,
} from './types.js';

// Distance kernels, capability queries and packed/symmetric batch operations.
export {
  dot,
  inner,
  euclidean,
  sqeuclidean,
  angular,
  hamming,
  jaccard,
  kullbackleibler,
  jensenshannon,
  getCapabilities,
  hasCapability,
  dotsPack,
  dotsPackedSize,
  dotsPacked,
  angularsPacked,
  euclideansPacked,
  dotsSymmetric,
  angularsSymmetric,
  euclideansSymmetric,
} from './numkong-wasm.js';
|