numkong 7.5.0 → 7.6.0
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/binding.gyp +18 -0
- package/c/dispatch_e5m2.c +23 -3
- package/include/numkong/capabilities.h +1 -1
- package/include/numkong/cast/README.md +3 -0
- package/include/numkong/cast/haswell.h +28 -64
- package/include/numkong/cast/serial.h +17 -0
- package/include/numkong/cast/skylake.h +67 -52
- package/include/numkong/cast.h +1 -0
- package/include/numkong/dot/README.md +1 -0
- package/include/numkong/dot/haswell.h +92 -13
- package/include/numkong/dot/serial.h +15 -0
- package/include/numkong/dot/skylake.h +61 -14
- package/include/numkong/dots/README.md +2 -0
- package/include/numkong/dots/graniteamx.h +434 -0
- package/include/numkong/dots/haswell.h +28 -28
- package/include/numkong/dots/sapphireamx.h +1 -1
- package/include/numkong/dots/serial.h +23 -8
- package/include/numkong/dots/skylake.h +28 -23
- package/include/numkong/dots.h +12 -0
- package/include/numkong/each/serial.h +18 -1
- package/include/numkong/geospatial/serial.h +14 -3
- package/include/numkong/maxsim/serial.h +15 -0
- package/include/numkong/mesh/README.md +50 -44
- package/include/numkong/mesh/genoa.h +462 -0
- package/include/numkong/mesh/haswell.h +806 -933
- package/include/numkong/mesh/neon.h +871 -943
- package/include/numkong/mesh/neonbfdot.h +382 -522
- package/include/numkong/mesh/neonfhm.h +676 -0
- package/include/numkong/mesh/rvv.h +404 -319
- package/include/numkong/mesh/serial.h +204 -162
- package/include/numkong/mesh/skylake.h +1029 -1585
- package/include/numkong/mesh/v128relaxed.h +403 -377
- package/include/numkong/mesh.h +38 -0
- package/include/numkong/reduce/serial.h +15 -1
- package/include/numkong/sparse/serial.h +17 -2
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +98 -56
- package/include/numkong/spatial/serial.h +15 -0
- package/include/numkong/spatial/skylake.h +114 -54
- package/include/numkong/spatial.h +0 -12
- package/include/numkong/spatials/graniteamx.h +128 -0
- package/include/numkong/spatials/serial.h +18 -1
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials.h +17 -0
- package/include/numkong/tensor.hpp +107 -23
- package/javascript/numkong.c +3 -2
- package/package.json +7 -7
- package/wasm/numkong.wasm +0 -0
|
@@ -522,7 +522,7 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
|
|
|
522
522
|
load_a_vec_fn, partial_load_a_vec_fn, load_b_vec_fn, partial_load_b_vec_fn, \
|
|
523
523
|
inner_product_fn, reduce_accumulators_fn, store_fn, partial_store_fn, \
|
|
524
524
|
depth_simd_dimensions, dimensions_per_value) \
|
|
525
|
-
|
|
525
|
+
NK_INTERNAL void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_aligned_( \
|
|
526
526
|
nk_##input_value_type##_t const *a_matrix, void const *b_packed_buffer, nk_##result_value_type##_t *c_matrix, \
|
|
527
527
|
nk_size_t row_count, nk_size_t column_count, nk_size_t depth, nk_size_t a_stride_in_bytes, \
|
|
528
528
|
nk_size_t c_stride_in_bytes) { \
|
|
@@ -698,7 +698,7 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
|
|
|
698
698
|
} \
|
|
699
699
|
} \
|
|
700
700
|
} \
|
|
701
|
-
|
|
701
|
+
NK_INTERNAL void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_1x8_aligned_( \
|
|
702
702
|
nk_##input_value_type##_t const *a_matrix, void const *b_packed_buffer, nk_##result_value_type##_t *c_matrix, \
|
|
703
703
|
nk_size_t row_count, nk_size_t column_count, nk_size_t depth, nk_size_t a_stride_in_bytes, \
|
|
704
704
|
nk_size_t c_stride_in_bytes) { \
|
|
@@ -1090,7 +1090,7 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
|
|
|
1090
1090
|
norm_value_type, vec_type, state_type, result_vec_type, init_accumulator_fn, load_a_vec_fn, partial_load_a_vec_fn, \
|
|
1091
1091
|
load_b_vec_fn, partial_load_b_vec_fn, inner_product_fn, compensated_finalize_fn, store_fn, partial_store_fn, \
|
|
1092
1092
|
load_sum_fn, partial_load_sum_fn, compute_a_sum_fn, depth_simd_dimensions, dimensions_per_value) \
|
|
1093
|
-
|
|
1093
|
+
NK_INTERNAL void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_aligned_( \
|
|
1094
1094
|
nk_##input_value_type##_t const *a_matrix, void const *b_packed_buffer, nk_##result_value_type##_t *c_matrix, \
|
|
1095
1095
|
nk_size_t row_count, nk_size_t column_count, nk_size_t depth, nk_size_t a_stride_in_bytes, \
|
|
1096
1096
|
nk_size_t c_stride_in_bytes) { \
|
|
@@ -1200,7 +1200,7 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
|
|
|
1200
1200
|
} \
|
|
1201
1201
|
} \
|
|
1202
1202
|
} \
|
|
1203
|
-
|
|
1203
|
+
NK_INTERNAL void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_1x8_aligned_( \
|
|
1204
1204
|
nk_##input_value_type##_t const *a_matrix, void const *b_packed_buffer, nk_##result_value_type##_t *c_matrix, \
|
|
1205
1205
|
nk_size_t row_count, nk_size_t column_count, nk_size_t depth, nk_size_t a_stride_in_bytes, \
|
|
1206
1206
|
nk_size_t c_stride_in_bytes) { \
|
|
@@ -2431,10 +2431,19 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
|
|
|
2431
2431
|
} \
|
|
2432
2432
|
}
|
|
2433
2433
|
|
|
2434
|
-
/*
|
|
2435
|
-
*
|
|
2436
|
-
* wastes
|
|
2437
|
-
|
|
2434
|
+
/* Keep the serial instantiations below actually scalar, regardless of build type.
|
|
2435
|
+
* Without this, -O3 + LTO can vectorize or clone the serial kernels under AVX-512
|
|
2436
|
+
* callers in dispatch_*.c, which wastes ~1 MB of binary and — more importantly —
|
|
2437
|
+
* breaks the nk_*_serial-as-scalar-oracle contract that tests and the numerical-
|
|
2438
|
+
* stability docs in this header rely on. */
|
|
2439
|
+
#if defined(__clang__)
|
|
2440
|
+
#pragma clang attribute push(__attribute__((noinline)), apply_to = function)
|
|
2441
|
+
#elif defined(__GNUC__)
|
|
2442
|
+
#pragma GCC push_options
|
|
2443
|
+
#pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
|
|
2444
|
+
#endif
|
|
2445
|
+
|
|
2446
|
+
/* Size bias for release. Gated on NDEBUG so Debug builds keep -O0 for stepping. */
|
|
2438
2447
|
#if defined(NDEBUG)
|
|
2439
2448
|
#if defined(_MSC_VER)
|
|
2440
2449
|
#pragma optimize("s", on)
|
|
@@ -2689,6 +2698,12 @@ nk_define_cross_packed_(dots, u1, serial, u1x8, u1x8, u32, nk_b128_vec_t, nk_dot
|
|
|
2689
2698
|
#endif
|
|
2690
2699
|
#endif
|
|
2691
2700
|
|
|
2701
|
+
#if defined(__clang__)
|
|
2702
|
+
#pragma clang attribute pop
|
|
2703
|
+
#elif defined(__GNUC__)
|
|
2704
|
+
#pragma GCC pop_options
|
|
2705
|
+
#endif
|
|
2706
|
+
|
|
2692
2707
|
/* BF16 compact: truncate F32 → BF16 in-place.
|
|
2693
2708
|
* Reads F32 matrix with c_stride_in_bytes, writes BF16 tightly packed (stride_in_bytes = column_count × sizeof(bf16)).
|
|
2694
2709
|
*/
|
|
@@ -114,45 +114,50 @@ nk_define_cross_packed_(dots, f16, skylake, f16, f32, f32, nk_b512_vec_t, nk_dot
|
|
|
114
114
|
nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_,
|
|
115
115
|
/*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
|
|
116
116
|
|
|
117
|
-
/* E4M3 GEMM:
|
|
118
|
-
|
|
117
|
+
/* E4M3 GEMM: F16-pack with asymmetric A/B representations at compute time. Pack converts
|
|
118
|
+
* E4M3 → F16 once (~10 ops/16 elements, 2 bytes/elt stored). A-stream uses the Giesen E4M3→F32
|
|
119
|
+
* cast (identical cost to F32-pack path). B-loader widens F16 → F32 inline (1 vcvtph2ps per 16
|
|
120
|
+
* lanes). Update takes both as F32 → plain fmadd. Saves 2 bytes/elt vs F32-pack; inner loop
|
|
121
|
+
* adds one vcvtph2ps per B-read. Symmetric uses E4M3→F32 for both sides (no pack involved). */
|
|
122
|
+
nk_define_cross_pack_size_(dots, e4m3, skylake, e4m3, f16, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/16,
|
|
119
123
|
/*dimensions_per_value=*/1)
|
|
120
|
-
nk_define_cross_pack_(dots, e4m3, skylake, e4m3,
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
+
nk_define_cross_pack_(dots, e4m3, skylake, e4m3, f16, nk_b256_vec_t, nk_load_e4m3x16_to_f16x16_skylake_,
|
|
125
|
+
nk_partial_load_e4m3x16_to_f16x16_skylake_, nk_store_b256_haswell_,
|
|
126
|
+
nk_partial_store_b16x16_serial_,
|
|
127
|
+
/*simd_width=*/16, /*norm_value_type=*/f32, nk_dots_reduce_sumsq_e4m3_,
|
|
128
|
+
/*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
|
|
124
129
|
nk_define_cross_symmetric_(dots, e4m3, skylake, e4m3, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
|
|
125
130
|
nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_e4m3x16_to_f32x16_skylake_,
|
|
126
131
|
nk_partial_load_e4m3x16_to_f32x16_skylake_, nk_dot_through_f32_update_skylake_,
|
|
127
132
|
nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_,
|
|
128
133
|
nk_partial_store_b32x4_skylake_,
|
|
129
134
|
/*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
|
|
130
|
-
nk_define_cross_packed_(dots, e4m3, skylake, e4m3,
|
|
135
|
+
nk_define_cross_packed_(dots, e4m3, skylake, e4m3, f16, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
|
|
131
136
|
nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_e4m3x16_to_f32x16_skylake_,
|
|
132
|
-
nk_partial_load_e4m3x16_to_f32x16_skylake_,
|
|
133
|
-
|
|
137
|
+
nk_partial_load_e4m3x16_to_f32x16_skylake_, nk_load_f16x16_to_f32x16_skylake_,
|
|
138
|
+
nk_partial_load_f16x16_to_f32x16_skylake_, nk_dot_through_f32_update_skylake_,
|
|
134
139
|
nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_,
|
|
135
140
|
/*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
|
|
136
141
|
|
|
137
|
-
/* E5M2 GEMM: depth_simd_dimensions=
|
|
138
|
-
nk_define_cross_pack_size_(dots, e5m2, skylake, e5m2, f32, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/
|
|
142
|
+
/* E5M2 GEMM: depth_simd_dimensions=64 (byte-level batch; widen inside the update helper) */
|
|
143
|
+
nk_define_cross_pack_size_(dots, e5m2, skylake, e5m2, f32, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/64,
|
|
139
144
|
/*dimensions_per_value=*/1)
|
|
140
|
-
nk_define_cross_pack_(dots, e5m2, skylake, e5m2, f32, nk_b512_vec_t,
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
145
|
+
nk_define_cross_pack_(dots, e5m2, skylake, e5m2, f32, nk_b512_vec_t, nk_load_b512_skylake_,
|
|
146
|
+
nk_partial_load_b8x64_skylake_, nk_store_b512_skylake_, nk_partial_store_b8x64_skylake_,
|
|
147
|
+
/*simd_width=*/64, /*norm_value_type=*/f32, nk_dots_reduce_sumsq_e5m2_,
|
|
148
|
+
/*depth_simd_dimensions=*/64, /*dimensions_per_value=*/1)
|
|
144
149
|
nk_define_cross_symmetric_(dots, e5m2, skylake, e5m2, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
|
|
145
|
-
nk_b128_vec_t, nk_dot_through_f32_init_skylake_,
|
|
146
|
-
|
|
150
|
+
nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_b512_skylake_,
|
|
151
|
+
nk_partial_load_b8x64_skylake_, nk_dot_e5m2x64_update_skylake_,
|
|
147
152
|
nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_,
|
|
148
153
|
nk_partial_store_b32x4_skylake_,
|
|
149
|
-
/*depth_simd_dimensions=*/
|
|
154
|
+
/*depth_simd_dimensions=*/64, /*dimensions_per_value=*/1)
|
|
150
155
|
nk_define_cross_packed_(dots, e5m2, skylake, e5m2, f32, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
|
|
151
|
-
nk_b128_vec_t, nk_dot_through_f32_init_skylake_,
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
/*depth_simd_dimensions=*/
|
|
156
|
+
nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_b512_skylake_,
|
|
157
|
+
nk_partial_load_b8x64_skylake_, nk_load_b512_skylake_, nk_partial_load_b8x64_skylake_,
|
|
158
|
+
nk_dot_e5m2x64_update_skylake_, nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_,
|
|
159
|
+
nk_partial_store_b32x4_skylake_,
|
|
160
|
+
/*depth_simd_dimensions=*/64, /*dimensions_per_value=*/1)
|
|
156
161
|
|
|
157
162
|
/* E2M3 GEMM: integer LUT path, depth_simd_dimensions=64 (64 e2m3s = 64 bytes = AVX-512 register width) */
|
|
158
163
|
nk_define_cross_pack_size_(dots, e2m3, skylake, e2m3, e2m3, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/64,
|
package/include/numkong/dots.h
CHANGED
|
@@ -698,6 +698,18 @@ NK_PUBLIC void nk_dots_packed_f16_graniteamx(nk_f16_t const *a, void const *b_pa
|
|
|
698
698
|
NK_PUBLIC void nk_dots_symmetric_f16_graniteamx(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
699
699
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
700
700
|
nk_size_t row_start, nk_size_t row_count);
|
|
701
|
+
/** @copydoc nk_dots_packed_size_f16 */
|
|
702
|
+
NK_PUBLIC nk_size_t nk_dots_packed_size_e5m2_graniteamx(nk_size_t width, nk_size_t depth);
|
|
703
|
+
/** @copydoc nk_dots_pack_f16 */
|
|
704
|
+
NK_PUBLIC void nk_dots_pack_e5m2_graniteamx(nk_e5m2_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
|
|
705
|
+
void *b_packed);
|
|
706
|
+
/** @copydoc nk_dots_packed_f16 */
|
|
707
|
+
NK_PUBLIC void nk_dots_packed_e5m2_graniteamx(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
|
|
708
|
+
nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
|
|
709
|
+
/** @copydoc nk_dots_symmetric_f16 */
|
|
710
|
+
NK_PUBLIC void nk_dots_symmetric_e5m2_graniteamx(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
711
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
712
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
701
713
|
#endif // NK_TARGET_GRANITEAMX
|
|
702
714
|
|
|
703
715
|
/* ARM SME backends using Scalable Matrix Extension.
|
|
@@ -76,7 +76,18 @@ extern "C" {
|
|
|
76
76
|
} \
|
|
77
77
|
}
|
|
78
78
|
|
|
79
|
-
/*
|
|
79
|
+
/* Keep the serial instantiations below actually scalar, regardless of build type.
|
|
80
|
+
* Without this, -O3 + LTO can vectorize or clone the serial kernels under AVX-512
|
|
81
|
+
* callers in dispatch_*.c, which wastes binary size and breaks the nk_*_serial-as-scalar-oracle
|
|
82
|
+
* contract. See dots/serial.h. */
|
|
83
|
+
#if defined(__clang__)
|
|
84
|
+
#pragma clang attribute push(__attribute__((noinline)), apply_to = function)
|
|
85
|
+
#elif defined(__GNUC__)
|
|
86
|
+
#pragma GCC push_options
|
|
87
|
+
#pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
|
|
88
|
+
#endif
|
|
89
|
+
|
|
90
|
+
/* Size bias for release. Gated on NDEBUG so Debug builds keep -O0 for stepping. */
|
|
80
91
|
#if defined(NDEBUG)
|
|
81
92
|
#if defined(_MSC_VER)
|
|
82
93
|
#pragma optimize("s", on)
|
|
@@ -275,6 +286,12 @@ NK_PUBLIC void nk_each_fma_f64c_serial(nk_f64c_t const *a, nk_f64c_t const *b, n
|
|
|
275
286
|
#endif
|
|
276
287
|
#endif
|
|
277
288
|
|
|
289
|
+
#if defined(__clang__)
|
|
290
|
+
#pragma clang attribute pop
|
|
291
|
+
#elif defined(__GNUC__)
|
|
292
|
+
#pragma GCC pop_options
|
|
293
|
+
#endif
|
|
294
|
+
|
|
278
295
|
#if defined(__cplusplus)
|
|
279
296
|
} // extern "C"
|
|
280
297
|
#endif
|
|
@@ -17,9 +17,14 @@
|
|
|
17
17
|
extern "C" {
|
|
18
18
|
#endif
|
|
19
19
|
|
|
20
|
-
/*
|
|
21
|
-
*
|
|
22
|
-
|
|
20
|
+
/* Keep the serial instantiations below actually scalar, regardless of build type.
|
|
21
|
+
* See dots/serial.h for rationale. */
|
|
22
|
+
#if defined(__clang__)
|
|
23
|
+
#pragma clang attribute push(__attribute__((noinline)), apply_to = function)
|
|
24
|
+
#elif defined(__GNUC__)
|
|
25
|
+
#pragma GCC push_options
|
|
26
|
+
#pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
|
|
27
|
+
#endif
|
|
23
28
|
|
|
24
29
|
NK_PUBLIC void nk_haversine_f64_serial( //
|
|
25
30
|
nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
|
|
@@ -302,6 +307,12 @@ NK_PUBLIC void nk_vincenty_f32_serial( //
|
|
|
302
307
|
}
|
|
303
308
|
}
|
|
304
309
|
|
|
310
|
+
#if defined(__clang__)
|
|
311
|
+
#pragma clang attribute pop
|
|
312
|
+
#elif defined(__GNUC__)
|
|
313
|
+
#pragma GCC pop_options
|
|
314
|
+
#endif
|
|
315
|
+
|
|
305
316
|
#if defined(__cplusplus)
|
|
306
317
|
} // extern "C"
|
|
307
318
|
#endif
|
|
@@ -71,6 +71,15 @@ NK_STATIC_ASSERT(sizeof(nk_maxsim_vector_metadata_t) == 12, nk_maxsim_vector_met
|
|
|
71
71
|
*/
|
|
72
72
|
typedef void (*nk_maxsim_to_f32_t)(void const *source, nk_f32_t *destination);
|
|
73
73
|
|
|
74
|
+
/* Keep the serial instantiations below actually scalar, regardless of build type.
|
|
75
|
+
* See dots/serial.h for rationale. */
|
|
76
|
+
#if defined(__clang__)
|
|
77
|
+
#pragma clang attribute push(__attribute__((noinline)), apply_to = function)
|
|
78
|
+
#elif defined(__GNUC__)
|
|
79
|
+
#pragma GCC push_options
|
|
80
|
+
#pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
|
|
81
|
+
#endif
|
|
82
|
+
|
|
74
83
|
/** @brief Identity conversion for f32 sources — just a typed memcpy. */
|
|
75
84
|
NK_INTERNAL void nk_f32_to_f32_(void const *source, nk_f32_t *destination) { *destination = *(nk_f32_t const *)source; }
|
|
76
85
|
|
|
@@ -483,6 +492,12 @@ NK_PUBLIC void nk_maxsim_packed_f16_serial( //
|
|
|
483
492
|
*result = (nk_f32_t)total_angular_distance;
|
|
484
493
|
}
|
|
485
494
|
|
|
495
|
+
#if defined(__clang__)
|
|
496
|
+
#pragma clang attribute pop
|
|
497
|
+
#elif defined(__GNUC__)
|
|
498
|
+
#pragma GCC pop_options
|
|
499
|
+
#endif
|
|
500
|
+
|
|
486
501
|
#if defined(__cplusplus)
|
|
487
502
|
} // extern "C"
|
|
488
503
|
#endif
|
|
@@ -105,67 +105,73 @@ Each kernel runs for at least 20 seconds per configuration.
|
|
|
105
105
|
Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
|
|
106
106
|
Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
|
|
107
107
|
|
|
108
|
-
### Intel
|
|
108
|
+
### Intel Granite Rapids
|
|
109
|
+
|
|
110
|
+
Xeon 6776P, 2.3 GHz base, `cpu_scaling_enabled=false`.
|
|
111
|
+
Serial kernels compiled with `-fno-tree-vectorize`.
|
|
109
112
|
|
|
110
113
|
#### Native
|
|
111
114
|
|
|
112
115
|
| Kernel | 256 | 1024 | 4096 |
|
|
113
116
|
| :------------------------ | -----------------------: | -----------------------: | -----------------------: |
|
|
114
117
|
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
115
|
-
| `nk_rmsd_f64_serial` |
|
|
116
|
-
| `nk_kabsch_f64_serial` |
|
|
117
|
-
| `nk_umeyama_f64_serial` |
|
|
118
|
-
| `nk_rmsd_f64_haswell` |
|
|
119
|
-
| `nk_kabsch_f64_haswell` |
|
|
120
|
-
| `nk_umeyama_f64_haswell` |
|
|
121
|
-
| `nk_rmsd_f64_skylake` |
|
|
122
|
-
| `nk_kabsch_f64_skylake` |
|
|
123
|
-
| `nk_umeyama_f64_skylake` |
|
|
118
|
+
| `nk_rmsd_f64_serial` | 93.7 mp/s, 0.5 ulp | 87.4 mp/s, 0.5 ulp | 69.8 mp/s, 0.5 ulp |
|
|
119
|
+
| `nk_kabsch_f64_serial` | 11.8 mp/s, 0.8 ulp | 13.6 mp/s, 0.8 ulp | 12.8 mp/s, 0.8 ulp |
|
|
120
|
+
| `nk_umeyama_f64_serial` | 10.4 mp/s, 0.3 ulp | 11.7 mp/s, 0.3 ulp | 11.5 mp/s, 0.3 ulp |
|
|
121
|
+
| `nk_rmsd_f64_haswell` | 523 mp/s, 0.3 ulp | 564 mp/s, 0.4 ulp | 449 mp/s, 0.8 ulp |
|
|
122
|
+
| `nk_kabsch_f64_haswell` | 65.3 mp/s, 0.5 ulp | 203 mp/s, 0.9 ulp | 326 mp/s, 1.5 ulp |
|
|
123
|
+
| `nk_umeyama_f64_haswell` | 68.0 mp/s, 0.5 ulp | 200 mp/s, 0.8 ulp | 324 mp/s, 1.5 ulp |
|
|
124
|
+
| `nk_rmsd_f64_skylake` | 546 mp/s, 0.2 ulp | 587 mp/s, 0.3 ulp | 583 mp/s, 0.4 ulp |
|
|
125
|
+
| `nk_kabsch_f64_skylake` | 34.5 mp/s, 0.4 ulp | 107 mp/s, 0.5 ulp | 261 mp/s, 0.8 ulp |
|
|
126
|
+
| `nk_umeyama_f64_skylake` | 24.3 mp/s, 0.3 ulp | 82.7 mp/s, 0.5 ulp | 201 mp/s, 0.8 ulp |
|
|
124
127
|
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
125
|
-
| `nk_rmsd_f32_serial` |
|
|
126
|
-
| `nk_kabsch_f32_serial` |
|
|
127
|
-
| `nk_umeyama_f32_serial` |
|
|
128
|
-
| `nk_rmsd_f32_haswell` |
|
|
129
|
-
| `nk_kabsch_f32_haswell` |
|
|
130
|
-
| `nk_umeyama_f32_haswell` |
|
|
131
|
-
| `nk_rmsd_f32_skylake` | 1,
|
|
132
|
-
| `nk_kabsch_f32_skylake` |
|
|
133
|
-
| `nk_umeyama_f32_skylake` |
|
|
128
|
+
| `nk_rmsd_f32_serial` | 68.9 mp/s, 0.5 ulp | 70.7 mp/s, 0.5 ulp | 72.1 mp/s, 0.5 ulp |
|
|
129
|
+
| `nk_kabsch_f32_serial` | 11.2 mp/s, 0.8 ulp | 12.8 mp/s, 0.8 ulp | 14.0 mp/s, 0.9 ulp |
|
|
130
|
+
| `nk_umeyama_f32_serial` | 10.1 mp/s, 0.3 ulp | 11.2 mp/s, 0.3 ulp | 12.1 mp/s, 0.4 ulp |
|
|
131
|
+
| `nk_rmsd_f32_haswell` | 686 mp/s, 0.3 ulp | 848 mp/s, 0.5 ulp | 841 mp/s, 0.9 ulp |
|
|
132
|
+
| `nk_kabsch_f32_haswell` | 90.4 mp/s, 0.9 ulp | 250 mp/s, 1.3 ulp | 455 mp/s, 7.6 ulp |
|
|
133
|
+
| `nk_umeyama_f32_haswell` | 87.7 mp/s, 0.3 ulp | 250 mp/s, 0.4 ulp | 374 mp/s, 0.7 ulp |
|
|
134
|
+
| `nk_rmsd_f32_skylake` | 1,016 mp/s, 1.2 ulp | 1,112 mp/s, 1.2 ulp | 1,042 mp/s, 4.3 ulp |
|
|
135
|
+
| `nk_kabsch_f32_skylake` | 81.8 mp/s, 0.9 ulp | 241 mp/s, 4.1 ulp | 549 mp/s, 3.1 ulp |
|
|
136
|
+
| `nk_umeyama_f32_skylake` | 58.0 mp/s, 0.6 ulp | 168 mp/s, 2.9 ulp | 459 mp/s, 2.1 ulp |
|
|
134
137
|
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
135
|
-
| `nk_rmsd_bf16_haswell` |
|
|
136
|
-
| `nk_kabsch_bf16_haswell` |
|
|
137
|
-
| `nk_umeyama_bf16_haswell` |
|
|
138
|
-
| `nk_rmsd_bf16_skylake` | 1,
|
|
139
|
-
| `nk_kabsch_bf16_skylake` |
|
|
140
|
-
| `nk_umeyama_bf16_skylake` |
|
|
138
|
+
| `nk_rmsd_bf16_haswell` | 284 mp/s, 0.3 ulp | 281 mp/s, 3.5 ulp | 273 mp/s, 12.8 ulp |
|
|
139
|
+
| `nk_kabsch_bf16_haswell` | 36.2 mp/s, 0.4 ulp | 106 mp/s, 7.6 ulp | 186 mp/s, 33.0 ulp |
|
|
140
|
+
| `nk_umeyama_bf16_haswell` | 34.5 mp/s, 0.3 ulp | 102 mp/s, 5.3 ulp | 186 mp/s, 23.1 ulp |
|
|
141
|
+
| `nk_rmsd_bf16_skylake` | 1,837 mp/s, 0.4 ulp | 2,357 mp/s, 5.4 ulp | 2,422 mp/s, 11.8 ulp |
|
|
142
|
+
| `nk_kabsch_bf16_skylake` | 34.1 mp/s, 0.3 ulp | 131 mp/s, 3.2 ulp | 487 mp/s, 20.4 ulp |
|
|
143
|
+
| `nk_umeyama_bf16_skylake` | 34.6 mp/s, 0.3 ulp | 130 mp/s, 2.2 ulp | 394 mp/s, 14.3 ulp |
|
|
144
|
+
| `nk_rmsd_bf16_genoa` | 1,743 mp/s, 0.3 ulp | 2,323 mp/s, 3.1 ulp | 2,066 mp/s, 20.2 ulp |
|
|
145
|
+
| `nk_kabsch_bf16_genoa` | 33.4 mp/s, 0.3 ulp | 133 mp/s, 3.2 ulp | 405 mp/s, 20.3 ulp |
|
|
146
|
+
| `nk_umeyama_bf16_genoa` | 33.2 mp/s, 0.3 ulp | 129 mp/s, 2.2 ulp | 439 mp/s, 14.3 ulp |
|
|
141
147
|
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
142
|
-
| `nk_rmsd_f16_haswell` |
|
|
143
|
-
| `nk_kabsch_f16_haswell` |
|
|
144
|
-
| `nk_umeyama_f16_haswell` |
|
|
145
|
-
| `nk_rmsd_f16_skylake` | 1,
|
|
146
|
-
| `nk_kabsch_f16_skylake` |
|
|
147
|
-
| `nk_umeyama_f16_skylake` |
|
|
148
|
+
| `nk_rmsd_f16_haswell` | 273 mp/s, 0.2 ulp | 274 mp/s, 0.7 ulp | 291 mp/s, 2.5 ulp |
|
|
149
|
+
| `nk_kabsch_f16_haswell` | 34.4 mp/s, 0.5 ulp | 98.0 mp/s, 1.8 ulp | 197 mp/s, 8.2 ulp |
|
|
150
|
+
| `nk_umeyama_f16_haswell` | 35.5 mp/s, 0.4 ulp | 97.9 mp/s, 1.2 ulp | 196 mp/s, 5.7 ulp |
|
|
151
|
+
| `nk_rmsd_f16_skylake` | 1,834 mp/s, 0.3 ulp | 2,341 mp/s, 1.3 ulp | 2,418 mp/s, 3.9 ulp |
|
|
152
|
+
| `nk_kabsch_f16_skylake` | 34.0 mp/s, 0.7 ulp | 132 mp/s, 0.5 ulp | 480 mp/s, 4.7 ulp |
|
|
153
|
+
| `nk_umeyama_f16_skylake` | 33.8 mp/s, 0.5 ulp | 127 mp/s, 0.4 ulp | 481 mp/s, 3.3 ulp |
|
|
148
154
|
|
|
149
155
|
#### WASM
|
|
150
156
|
|
|
151
|
-
Measured with Wasmtime
|
|
157
|
+
Measured with Wasmtime v43 (Cranelift backend), WASI-SDK 24, `-msimd128 -mrelaxed-simd`.
|
|
152
158
|
|
|
153
159
|
| Kernel | 256 | 1024 | 4096 |
|
|
154
160
|
| :--------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
155
161
|
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
156
|
-
| `nk_rmsd_f64_serial` |
|
|
157
|
-
| `nk_rmsd_f64_v128relaxed` |
|
|
158
|
-
| `nk_kabsch_f64_serial` |
|
|
159
|
-
| `nk_kabsch_f64_v128relaxed` |
|
|
160
|
-
| `nk_umeyama_f64_serial` |
|
|
161
|
-
| `nk_umeyama_f64_v128relaxed` |
|
|
162
|
+
| `nk_rmsd_f64_serial` | 89.9 mp/s, 0.5 ulp | 86.1 mp/s, 0.5 ulp | 73.4 mp/s, 0.5 ulp |
|
|
163
|
+
| `nk_rmsd_f64_v128relaxed` | 485 mp/s, 0.4 ulp | 552 mp/s, 0.7 ulp | 412 mp/s, 1.3 ulp |
|
|
164
|
+
| `nk_kabsch_f64_serial` | 12.1 mp/s, 0.8 ulp | 13.9 mp/s, 0.8 ulp | 14.0 mp/s, 0.9 ulp |
|
|
165
|
+
| `nk_kabsch_f64_v128relaxed` | 66.0 mp/s, 0.9 ulp | 188 mp/s, 1.7 ulp | 177 mp/s, 3.1 ulp |
|
|
166
|
+
| `nk_umeyama_f64_serial` | 10.8 mp/s, 0.3 ulp | 12.3 mp/s, 0.3 ulp | 12.2 mp/s, 0.4 ulp |
|
|
167
|
+
| `nk_umeyama_f64_v128relaxed` | 64.0 mp/s, 0.8 ulp | 187 mp/s, 1.6 ulp | 178 mp/s, 3.2 ulp |
|
|
162
168
|
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
163
|
-
| `nk_rmsd_f32_serial` |
|
|
164
|
-
| `nk_rmsd_f32_v128relaxed` |
|
|
165
|
-
| `nk_kabsch_f32_serial` |
|
|
166
|
-
| `nk_kabsch_f32_v128relaxed` |
|
|
167
|
-
| `nk_umeyama_f32_serial` |
|
|
168
|
-
| `nk_umeyama_f32_v128relaxed` |
|
|
169
|
+
| `nk_rmsd_f32_serial` | 80.6 mp/s, 0.5 ulp | 82.7 mp/s, 0.5 ulp | 70.3 mp/s, 0.5 ulp |
|
|
170
|
+
| `nk_rmsd_f32_v128relaxed` | 452 mp/s, 1.5 ulp | 416 mp/s, 1.3 ulp | 399 mp/s, 4.8 ulp |
|
|
171
|
+
| `nk_kabsch_f32_serial` | 11.4 mp/s, 0.8 ulp | 12.8 mp/s, 0.9 ulp | 12.7 mp/s, 0.8 ulp |
|
|
172
|
+
| `nk_kabsch_f32_v128relaxed` | 79.5 mp/s, 4.2 ulp | 132 mp/s, 3.9 ulp | 177 mp/s, 14.3 ulp |
|
|
173
|
+
| `nk_umeyama_f32_serial` | 10.1 mp/s, 0.3 ulp | 11.2 mp/s, 0.3 ulp | 11.2 mp/s, 0.3 ulp |
|
|
174
|
+
| `nk_umeyama_f32_v128relaxed` | 79.4 mp/s, 2.8 ulp | 138 mp/s, 2.8 ulp | 194 mp/s, 10.1 ulp |
|
|
169
175
|
|
|
170
176
|
|
|
171
177
|
### Apple M5
|