numkong 7.4.5 → 7.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/binding.gyp +81 -5
- package/c/dispatch_f16.c +23 -0
- package/c/numkong.c +0 -13
- package/include/numkong/attention/sme.h +34 -31
- package/include/numkong/capabilities.h +2 -15
- package/include/numkong/cast/neon.h +15 -0
- package/include/numkong/curved/smef64.h +82 -62
- package/include/numkong/dot/rvvbf16.h +1 -1
- package/include/numkong/dot/rvvhalf.h +1 -1
- package/include/numkong/dot/sve.h +6 -5
- package/include/numkong/dot/svebfdot.h +2 -1
- package/include/numkong/dot/svehalf.h +6 -5
- package/include/numkong/dot/svesdot.h +3 -2
- package/include/numkong/dots/graniteamx.h +733 -0
- package/include/numkong/dots/serial.h +11 -4
- package/include/numkong/dots/sme.h +172 -140
- package/include/numkong/dots/smebi32.h +14 -11
- package/include/numkong/dots/smef64.h +31 -26
- package/include/numkong/dots.h +29 -3
- package/include/numkong/each/serial.h +22 -0
- package/include/numkong/geospatial/haswell.h +1 -1
- package/include/numkong/geospatial/neon.h +1 -1
- package/include/numkong/geospatial/serial.h +1 -1
- package/include/numkong/geospatial/skylake.h +1 -1
- package/include/numkong/maxsim/sme.h +34 -33
- package/include/numkong/mesh/serial.h +22 -0
- package/include/numkong/reduce/neon.h +29 -0
- package/include/numkong/reduce/neonbfdot.h +2 -2
- package/include/numkong/reduce/neonfhm.h +4 -4
- package/include/numkong/reduce/sve.h +52 -0
- package/include/numkong/reduce.h +4 -0
- package/include/numkong/set/sve.h +6 -5
- package/include/numkong/sets/smebi32.h +35 -30
- package/include/numkong/sparse/sve2.h +3 -2
- package/include/numkong/spatial/sve.h +7 -6
- package/include/numkong/spatial/svebfdot.h +7 -4
- package/include/numkong/spatial/svehalf.h +5 -4
- package/include/numkong/spatial/svesdot.h +9 -8
- package/include/numkong/spatials/graniteamx.h +173 -0
- package/include/numkong/spatials/serial.h +22 -0
- package/include/numkong/spatials/sme.h +391 -350
- package/include/numkong/spatials/smef64.h +79 -70
- package/include/numkong/spatials.h +37 -4
- package/include/numkong/types.h +59 -0
- package/javascript/dist/cjs/numkong.js +13 -0
- package/javascript/dist/esm/numkong.js +13 -0
- package/javascript/numkong.c +56 -12
- package/javascript/numkong.ts +13 -0
- package/package.json +7 -7
- package/probes/probe.js +2 -2
- package/wasm/numkong.wasm +0 -0
|
@@ -2433,11 +2433,14 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
|
|
|
2433
2433
|
|
|
2434
2434
|
/* Optimize serial GEMM instantiations for size rather than speed.
|
|
2435
2435
|
* These fallback kernels are only used when no SIMD backend is available, so aggressive inlining/unrolling from -O3
|
|
2436
|
-
* wastes
|
|
2437
|
-
* of `__attribute__((optimize("Os"))` isn't supported on Clang, so this flag only applies to GCC builds.
|
|
2436
|
+
* wastes over 1 MB of binary space with negligible performance benefit on the serial path.
|
|
2438
2437
|
*/
|
|
2439
2438
|
#if defined(NDEBUG)
|
|
2440
|
-
#if defined(
|
|
2439
|
+
#if defined(_MSC_VER)
|
|
2440
|
+
#pragma optimize("s", on)
|
|
2441
|
+
#elif defined(__clang__)
|
|
2442
|
+
#pragma clang attribute push(__attribute__((minsize)), apply_to = function)
|
|
2443
|
+
#elif defined(__GNUC__)
|
|
2441
2444
|
#pragma GCC push_options
|
|
2442
2445
|
#pragma GCC optimize("Os")
|
|
2443
2446
|
#endif
|
|
@@ -2677,7 +2680,11 @@ nk_define_cross_packed_(dots, u1, serial, u1x8, u1x8, u32, nk_b128_vec_t, nk_dot
|
|
|
2677
2680
|
/*depth_simd_dimensions=*/128, /*dimensions_per_value=*/8)
|
|
2678
2681
|
|
|
2679
2682
|
#if defined(NDEBUG)
|
|
2680
|
-
#if defined(
|
|
2683
|
+
#if defined(_MSC_VER)
|
|
2684
|
+
#pragma optimize("", on)
|
|
2685
|
+
#elif defined(__clang__)
|
|
2686
|
+
#pragma clang attribute pop
|
|
2687
|
+
#elif defined(__GNUC__)
|
|
2681
2688
|
#pragma GCC pop_options
|
|
2682
2689
|
#endif
|
|
2683
2690
|
#endif
|