numkong 7.5.0 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/binding.gyp +18 -0
  2. package/c/dispatch_e5m2.c +23 -3
  3. package/include/numkong/capabilities.h +1 -1
  4. package/include/numkong/cast/README.md +3 -0
  5. package/include/numkong/cast/haswell.h +28 -64
  6. package/include/numkong/cast/serial.h +17 -0
  7. package/include/numkong/cast/skylake.h +67 -52
  8. package/include/numkong/cast.h +1 -0
  9. package/include/numkong/dot/README.md +1 -0
  10. package/include/numkong/dot/haswell.h +92 -13
  11. package/include/numkong/dot/serial.h +15 -0
  12. package/include/numkong/dot/skylake.h +61 -14
  13. package/include/numkong/dots/README.md +2 -0
  14. package/include/numkong/dots/graniteamx.h +434 -0
  15. package/include/numkong/dots/haswell.h +28 -28
  16. package/include/numkong/dots/sapphireamx.h +1 -1
  17. package/include/numkong/dots/serial.h +23 -8
  18. package/include/numkong/dots/skylake.h +28 -23
  19. package/include/numkong/dots.h +12 -0
  20. package/include/numkong/each/serial.h +18 -1
  21. package/include/numkong/geospatial/serial.h +14 -3
  22. package/include/numkong/maxsim/serial.h +15 -0
  23. package/include/numkong/mesh/README.md +50 -44
  24. package/include/numkong/mesh/genoa.h +462 -0
  25. package/include/numkong/mesh/haswell.h +806 -933
  26. package/include/numkong/mesh/neon.h +871 -943
  27. package/include/numkong/mesh/neonbfdot.h +382 -522
  28. package/include/numkong/mesh/neonfhm.h +676 -0
  29. package/include/numkong/mesh/rvv.h +404 -319
  30. package/include/numkong/mesh/serial.h +204 -162
  31. package/include/numkong/mesh/skylake.h +1029 -1585
  32. package/include/numkong/mesh/v128relaxed.h +403 -377
  33. package/include/numkong/mesh.h +38 -0
  34. package/include/numkong/reduce/serial.h +15 -1
  35. package/include/numkong/sparse/serial.h +17 -2
  36. package/include/numkong/spatial/genoa.h +0 -68
  37. package/include/numkong/spatial/haswell.h +98 -56
  38. package/include/numkong/spatial/serial.h +15 -0
  39. package/include/numkong/spatial/skylake.h +114 -54
  40. package/include/numkong/spatial.h +0 -12
  41. package/include/numkong/spatials/graniteamx.h +128 -0
  42. package/include/numkong/spatials/serial.h +18 -1
  43. package/include/numkong/spatials/skylake.h +2 -2
  44. package/include/numkong/spatials.h +17 -0
  45. package/include/numkong/tensor.hpp +107 -23
  46. package/javascript/numkong.c +3 -2
  47. package/package.json +7 -7
  48. package/wasm/numkong.wasm +0 -0
@@ -522,7 +522,7 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
522
522
  load_a_vec_fn, partial_load_a_vec_fn, load_b_vec_fn, partial_load_b_vec_fn, \
523
523
  inner_product_fn, reduce_accumulators_fn, store_fn, partial_store_fn, \
524
524
  depth_simd_dimensions, dimensions_per_value) \
525
- NK_PUBLIC void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_aligned_( \
525
+ NK_INTERNAL void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_aligned_( \
526
526
  nk_##input_value_type##_t const *a_matrix, void const *b_packed_buffer, nk_##result_value_type##_t *c_matrix, \
527
527
  nk_size_t row_count, nk_size_t column_count, nk_size_t depth, nk_size_t a_stride_in_bytes, \
528
528
  nk_size_t c_stride_in_bytes) { \
@@ -698,7 +698,7 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
698
698
  } \
699
699
  } \
700
700
  } \
701
- NK_PUBLIC void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_1x8_aligned_( \
701
+ NK_INTERNAL void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_1x8_aligned_( \
702
702
  nk_##input_value_type##_t const *a_matrix, void const *b_packed_buffer, nk_##result_value_type##_t *c_matrix, \
703
703
  nk_size_t row_count, nk_size_t column_count, nk_size_t depth, nk_size_t a_stride_in_bytes, \
704
704
  nk_size_t c_stride_in_bytes) { \
@@ -1090,7 +1090,7 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
1090
1090
  norm_value_type, vec_type, state_type, result_vec_type, init_accumulator_fn, load_a_vec_fn, partial_load_a_vec_fn, \
1091
1091
  load_b_vec_fn, partial_load_b_vec_fn, inner_product_fn, compensated_finalize_fn, store_fn, partial_store_fn, \
1092
1092
  load_sum_fn, partial_load_sum_fn, compute_a_sum_fn, depth_simd_dimensions, dimensions_per_value) \
1093
- NK_PUBLIC void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_aligned_( \
1093
+ NK_INTERNAL void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_aligned_( \
1094
1094
  nk_##input_value_type##_t const *a_matrix, void const *b_packed_buffer, nk_##result_value_type##_t *c_matrix, \
1095
1095
  nk_size_t row_count, nk_size_t column_count, nk_size_t depth, nk_size_t a_stride_in_bytes, \
1096
1096
  nk_size_t c_stride_in_bytes) { \
@@ -1200,7 +1200,7 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
1200
1200
  } \
1201
1201
  } \
1202
1202
  } \
1203
- NK_PUBLIC void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_1x8_aligned_( \
1203
+ NK_INTERNAL void nk_##api_name##_packed_##input_type_name##_##isa_suffix##_1x8_aligned_( \
1204
1204
  nk_##input_value_type##_t const *a_matrix, void const *b_packed_buffer, nk_##result_value_type##_t *c_matrix, \
1205
1205
  nk_size_t row_count, nk_size_t column_count, nk_size_t depth, nk_size_t a_stride_in_bytes, \
1206
1206
  nk_size_t c_stride_in_bytes) { \
@@ -2431,10 +2431,19 @@ NK_INTERNAL nk_i32_t nk_dots_reduce_sum_i4_(nk_i4x2_t const *data, nk_size_t cou
2431
2431
  } \
2432
2432
  }
2433
2433
 
2434
- /* Optimize serial GEMM instantiations for size rather than speed.
2435
- * These fallback kernels are only used when no SIMD backend is available, so aggressive inlining/unrolling from -O3
2436
- * wastes over 1 MB of binary space with negligible performance benefit on the serial path.
2437
- */
2434
+ /* Keep the serial instantiations below actually scalar, regardless of build type.
2435
+ * Without this, -O3 + LTO can vectorize or clone the serial kernels under AVX-512
2436
+ * callers in dispatch_*.c, which wastes ~1 MB of binary and more importantly
2437
+ * breaks the nk_*_serial-as-scalar-oracle contract that tests and the numerical-
2438
+ * stability docs in this header rely on. */
2439
+ #if defined(__clang__)
2440
+ #pragma clang attribute push(__attribute__((noinline)), apply_to = function)
2441
+ #elif defined(__GNUC__)
2442
+ #pragma GCC push_options
2443
+ #pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
2444
+ #endif
2445
+
2446
+ /* Size bias for release. Gated on NDEBUG so Debug builds keep -O0 for stepping. */
2438
2447
  #if defined(NDEBUG)
2439
2448
  #if defined(_MSC_VER)
2440
2449
  #pragma optimize("s", on)
@@ -2689,6 +2698,12 @@ nk_define_cross_packed_(dots, u1, serial, u1x8, u1x8, u32, nk_b128_vec_t, nk_dot
2689
2698
  #endif
2690
2699
  #endif
2691
2700
 
2701
+ #if defined(__clang__)
2702
+ #pragma clang attribute pop
2703
+ #elif defined(__GNUC__)
2704
+ #pragma GCC pop_options
2705
+ #endif
2706
+
2692
2707
  /* BF16 compact: truncate F32 → BF16 in-place.
2693
2708
  * Reads F32 matrix with c_stride_in_bytes, writes BF16 tightly packed (stride_in_bytes = column_count × sizeof(bf16)).
2694
2709
  */
@@ -114,45 +114,50 @@ nk_define_cross_packed_(dots, f16, skylake, f16, f32, f32, nk_b512_vec_t, nk_dot
114
114
  nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_,
115
115
  /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
116
116
 
117
- /* E4M3 GEMM: depth_simd_dimensions=16 (16 e4m3s = 16 bytes = quarter cache line), F32 accumulator */
118
- nk_define_cross_pack_size_(dots, e4m3, skylake, e4m3, f32, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/16,
117
+ /* E4M3 GEMM: F16-pack with asymmetric A/B representations at compute time. Pack converts
118
+ * E4M3 → F16 once (~10 ops/16 elements, 2 bytes/elt stored). A-stream uses the Giesen E4M3→F32
119
+ * cast (identical cost to F32-pack path). B-loader widens F16 → F32 inline (1 vcvtph2ps per 16
120
+ * lanes). Update takes both as F32 → plain fmadd. Saves 2 bytes/elt vs F32-pack; inner loop
121
+ * adds one cvtph2ps per B-read. Symmetric uses E4M3→F32 for both sides (no pack involved). */
122
+ nk_define_cross_pack_size_(dots, e4m3, skylake, e4m3, f16, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/16,
119
123
  /*dimensions_per_value=*/1)
120
- nk_define_cross_pack_(dots, e4m3, skylake, e4m3, f32, nk_b512_vec_t, nk_load_e4m3x16_to_f32x16_skylake_,
121
- nk_partial_load_e4m3x16_to_f32x16_skylake_, nk_store_b512_skylake_,
122
- nk_partial_store_b32x16_skylake_, /*simd_width=*/16, /*norm_value_type=*/f32,
123
- nk_dots_reduce_sumsq_e4m3_, /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
124
+ nk_define_cross_pack_(dots, e4m3, skylake, e4m3, f16, nk_b256_vec_t, nk_load_e4m3x16_to_f16x16_skylake_,
125
+ nk_partial_load_e4m3x16_to_f16x16_skylake_, nk_store_b256_haswell_,
126
+ nk_partial_store_b16x16_serial_,
127
+ /*simd_width=*/16, /*norm_value_type=*/f32, nk_dots_reduce_sumsq_e4m3_,
128
+ /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
124
129
  nk_define_cross_symmetric_(dots, e4m3, skylake, e4m3, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
125
130
  nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_e4m3x16_to_f32x16_skylake_,
126
131
  nk_partial_load_e4m3x16_to_f32x16_skylake_, nk_dot_through_f32_update_skylake_,
127
132
  nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_,
128
133
  nk_partial_store_b32x4_skylake_,
129
134
  /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
130
- nk_define_cross_packed_(dots, e4m3, skylake, e4m3, f32, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
135
+ nk_define_cross_packed_(dots, e4m3, skylake, e4m3, f16, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
131
136
  nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_e4m3x16_to_f32x16_skylake_,
132
- nk_partial_load_e4m3x16_to_f32x16_skylake_, nk_load_b512_skylake_,
133
- nk_partial_load_b32x16_skylake_, nk_dot_through_f32_update_skylake_,
137
+ nk_partial_load_e4m3x16_to_f32x16_skylake_, nk_load_f16x16_to_f32x16_skylake_,
138
+ nk_partial_load_f16x16_to_f32x16_skylake_, nk_dot_through_f32_update_skylake_,
134
139
  nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_,
135
140
  /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
136
141
 
137
- /* E5M2 GEMM: depth_simd_dimensions=16 (16 e5m2s = 16 bytes = quarter cache line), F32 accumulator */
138
- nk_define_cross_pack_size_(dots, e5m2, skylake, e5m2, f32, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/16,
142
+ /* E5M2 GEMM: depth_simd_dimensions=64 (byte-level batch; widen inside the update helper) */
143
+ nk_define_cross_pack_size_(dots, e5m2, skylake, e5m2, f32, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/64,
139
144
  /*dimensions_per_value=*/1)
140
- nk_define_cross_pack_(dots, e5m2, skylake, e5m2, f32, nk_b512_vec_t, nk_load_e5m2x16_to_f32x16_skylake_,
141
- nk_partial_load_e5m2x16_to_f32x16_skylake_, nk_store_b512_skylake_,
142
- nk_partial_store_b32x16_skylake_, /*simd_width=*/16, /*norm_value_type=*/f32,
143
- nk_dots_reduce_sumsq_e5m2_, /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
145
+ nk_define_cross_pack_(dots, e5m2, skylake, e5m2, f32, nk_b512_vec_t, nk_load_b512_skylake_,
146
+ nk_partial_load_b8x64_skylake_, nk_store_b512_skylake_, nk_partial_store_b8x64_skylake_,
147
+ /*simd_width=*/64, /*norm_value_type=*/f32, nk_dots_reduce_sumsq_e5m2_,
148
+ /*depth_simd_dimensions=*/64, /*dimensions_per_value=*/1)
144
149
  nk_define_cross_symmetric_(dots, e5m2, skylake, e5m2, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
145
- nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_e5m2x16_to_f32x16_skylake_,
146
- nk_partial_load_e5m2x16_to_f32x16_skylake_, nk_dot_through_f32_update_skylake_,
150
+ nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_b512_skylake_,
151
+ nk_partial_load_b8x64_skylake_, nk_dot_e5m2x64_update_skylake_,
147
152
  nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_,
148
153
  nk_partial_store_b32x4_skylake_,
149
- /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
154
+ /*depth_simd_dimensions=*/64, /*dimensions_per_value=*/1)
150
155
  nk_define_cross_packed_(dots, e5m2, skylake, e5m2, f32, f32, nk_b512_vec_t, nk_dot_through_f32_state_skylake_t_,
151
- nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_e5m2x16_to_f32x16_skylake_,
152
- nk_partial_load_e5m2x16_to_f32x16_skylake_, nk_load_b512_skylake_,
153
- nk_partial_load_b32x16_skylake_, nk_dot_through_f32_update_skylake_,
154
- nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_, nk_partial_store_b32x4_skylake_,
155
- /*depth_simd_dimensions=*/16, /*dimensions_per_value=*/1)
156
+ nk_b128_vec_t, nk_dot_through_f32_init_skylake_, nk_load_b512_skylake_,
157
+ nk_partial_load_b8x64_skylake_, nk_load_b512_skylake_, nk_partial_load_b8x64_skylake_,
158
+ nk_dot_e5m2x64_update_skylake_, nk_dot_through_f32_finalize_skylake_, nk_store_b128_haswell_,
159
+ nk_partial_store_b32x4_skylake_,
160
+ /*depth_simd_dimensions=*/64, /*dimensions_per_value=*/1)
156
161
 
157
162
  /* E2M3 GEMM: integer LUT path, depth_simd_dimensions=64 (64 e2m3s = 64 bytes = AVX-512 register width) */
158
163
  nk_define_cross_pack_size_(dots, e2m3, skylake, e2m3, e2m3, /*norm_value_type=*/f32, /*depth_simd_dimensions=*/64,
@@ -698,6 +698,18 @@ NK_PUBLIC void nk_dots_packed_f16_graniteamx(nk_f16_t const *a, void const *b_pa
698
698
  NK_PUBLIC void nk_dots_symmetric_f16_graniteamx(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
699
699
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
700
700
  nk_size_t row_start, nk_size_t row_count);
701
+ /** @copydoc nk_dots_packed_size_f16 */
702
+ NK_PUBLIC nk_size_t nk_dots_packed_size_e5m2_graniteamx(nk_size_t width, nk_size_t depth);
703
+ /** @copydoc nk_dots_pack_f16 */
704
+ NK_PUBLIC void nk_dots_pack_e5m2_graniteamx(nk_e5m2_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
705
+ void *b_packed);
706
+ /** @copydoc nk_dots_packed_f16 */
707
+ NK_PUBLIC void nk_dots_packed_e5m2_graniteamx(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
708
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
709
+ /** @copydoc nk_dots_symmetric_f16 */
710
+ NK_PUBLIC void nk_dots_symmetric_e5m2_graniteamx(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
711
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
712
+ nk_size_t row_start, nk_size_t row_count);
701
713
  #endif // NK_TARGET_GRANITEAMX
702
714
 
703
715
  /* ARM SME backends using Scalable Matrix Extension.
@@ -76,7 +76,18 @@ extern "C" {
76
76
  } \
77
77
  }
78
78
 
79
- /* Optimize serial fallbacks for size see dots/serial.h for rationale. */
79
+ /* Keep the serial instantiations below actually scalar, regardless of build type.
80
+ * Without this, -O3 + LTO can vectorize or clone the serial kernels under AVX-512
81
+ * callers in dispatch_*.c, which wastes binary and breaks the nk_*_serial-as-scalar-oracle
82
+ * contract. See dots/serial.h. */
83
+ #if defined(__clang__)
84
+ #pragma clang attribute push(__attribute__((noinline)), apply_to = function)
85
+ #elif defined(__GNUC__)
86
+ #pragma GCC push_options
87
+ #pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
88
+ #endif
89
+
90
+ /* Size bias for release. Gated on NDEBUG so Debug builds keep -O0 for stepping. */
80
91
  #if defined(NDEBUG)
81
92
  #if defined(_MSC_VER)
82
93
  #pragma optimize("s", on)
@@ -275,6 +286,12 @@ NK_PUBLIC void nk_each_fma_f64c_serial(nk_f64c_t const *a, nk_f64c_t const *b, n
275
286
  #endif
276
287
  #endif
277
288
 
289
+ #if defined(__clang__)
290
+ #pragma clang attribute pop
291
+ #elif defined(__GNUC__)
292
+ #pragma GCC pop_options
293
+ #endif
294
+
278
295
  #if defined(__cplusplus)
279
296
  } // extern "C"
280
297
  #endif
@@ -17,9 +17,14 @@
17
17
  extern "C" {
18
18
  #endif
19
19
 
20
- /* Serial implementations of geospatial distance functions.
21
- * These use the trigonometric functions from trigonometry.h for sin, cos, and atan2.
22
- */
20
+ /* Keep the serial instantiations below actually scalar, regardless of build type.
21
+ * See dots/serial.h for rationale. */
22
+ #if defined(__clang__)
23
+ #pragma clang attribute push(__attribute__((noinline)), apply_to = function)
24
+ #elif defined(__GNUC__)
25
+ #pragma GCC push_options
26
+ #pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
27
+ #endif
23
28
 
24
29
  NK_PUBLIC void nk_haversine_f64_serial( //
25
30
  nk_f64_t const *a_lats, nk_f64_t const *a_lons, //
@@ -302,6 +307,12 @@ NK_PUBLIC void nk_vincenty_f32_serial( //
302
307
  }
303
308
  }
304
309
 
310
+ #if defined(__clang__)
311
+ #pragma clang attribute pop
312
+ #elif defined(__GNUC__)
313
+ #pragma GCC pop_options
314
+ #endif
315
+
305
316
  #if defined(__cplusplus)
306
317
  } // extern "C"
307
318
  #endif
@@ -71,6 +71,15 @@ NK_STATIC_ASSERT(sizeof(nk_maxsim_vector_metadata_t) == 12, nk_maxsim_vector_met
71
71
  */
72
72
  typedef void (*nk_maxsim_to_f32_t)(void const *source, nk_f32_t *destination);
73
73
 
74
+ /* Keep the serial instantiations below actually scalar, regardless of build type.
75
+ * See dots/serial.h for rationale. */
76
+ #if defined(__clang__)
77
+ #pragma clang attribute push(__attribute__((noinline)), apply_to = function)
78
+ #elif defined(__GNUC__)
79
+ #pragma GCC push_options
80
+ #pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
81
+ #endif
82
+
74
83
  /** @brief Identity conversion for f32 sources — just a typed memcpy. */
75
84
  NK_INTERNAL void nk_f32_to_f32_(void const *source, nk_f32_t *destination) { *destination = *(nk_f32_t const *)source; }
76
85
 
@@ -483,6 +492,12 @@ NK_PUBLIC void nk_maxsim_packed_f16_serial( //
483
492
  *result = (nk_f32_t)total_angular_distance;
484
493
  }
485
494
 
495
+ #if defined(__clang__)
496
+ #pragma clang attribute pop
497
+ #elif defined(__GNUC__)
498
+ #pragma GCC pop_options
499
+ #endif
500
+
486
501
  #if defined(__cplusplus)
487
502
  } // extern "C"
488
503
  #endif
@@ -105,67 +105,73 @@ Each kernel runs for at least 20 seconds per configuration.
105
105
  Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
106
106
  Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
107
107
 
108
- ### Intel Sapphire Rapids
108
+ ### Intel Granite Rapids
109
+
110
+ Xeon 6776P, 2.3 GHz base, `cpu_scaling_enabled=false`.
111
+ Serial kernels compiled with `-fno-tree-vectorize`.
109
112
 
110
113
  #### Native
111
114
 
112
115
  | Kernel | 256 | 1024 | 4096 |
113
116
  | :------------------------ | -----------------------: | -----------------------: | -----------------------: |
114
117
  | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
115
- | `nk_rmsd_f64_serial` | 354 mp/s, 1.4 ulp | 176 mp/s, 2.7 ulp | 159 mp/s, 5.0 ulp |
116
- | `nk_kabsch_f64_serial` | 71.1 mp/s, 1.4 ulp | 70.8 mp/s, 2.7 ulp | 80.3 mp/s, 5.2 ulp |
117
- | `nk_umeyama_f64_serial` | 70.1 mp/s, 1.0 ulp | 75.1 mp/s, 1.8 ulp | 79.1 mp/s, 3.9 ulp |
118
- | `nk_rmsd_f64_haswell` | 405 mp/s, 0.3 ulp | 260 mp/s, 0.4 ulp | 192 mp/s, 0.8 ulp |
119
- | `nk_kabsch_f64_haswell` | 82.1 mp/s, 0.9 ulp | 105 mp/s, 1.3 ulp | 133 mp/s, 2.3 ulp |
120
- | `nk_umeyama_f64_haswell` | 82.6 mp/s, 0.4 ulp | 119 mp/s, 0.8 ulp | 134 mp/s, 1.5 ulp |
121
- | `nk_rmsd_f64_skylake` | 540 mp/s, 0.3 ulp | 219 mp/s, 0.3 ulp | 213 mp/s, 0.5 ulp |
122
- | `nk_kabsch_f64_skylake` | 96.8 mp/s, 0.7 ulp | 115 mp/s, 0.9 ulp | 159 mp/s, 1.1 ulp |
123
- | `nk_umeyama_f64_skylake` | 101 mp/s, 0.2 ulp | 119 mp/s, 0.4 ulp | 157 mp/s, 0.8 ulp |
118
+ | `nk_rmsd_f64_serial` | 93.7 mp/s, 0.5 ulp | 87.4 mp/s, 0.5 ulp | 69.8 mp/s, 0.5 ulp |
119
+ | `nk_kabsch_f64_serial` | 11.8 mp/s, 0.8 ulp | 13.6 mp/s, 0.8 ulp | 12.8 mp/s, 0.8 ulp |
120
+ | `nk_umeyama_f64_serial` | 10.4 mp/s, 0.3 ulp | 11.7 mp/s, 0.3 ulp | 11.5 mp/s, 0.3 ulp |
121
+ | `nk_rmsd_f64_haswell` | 523 mp/s, 0.3 ulp | 564 mp/s, 0.4 ulp | 449 mp/s, 0.8 ulp |
122
+ | `nk_kabsch_f64_haswell` | 65.3 mp/s, 0.5 ulp | 203 mp/s, 0.9 ulp | 326 mp/s, 1.5 ulp |
123
+ | `nk_umeyama_f64_haswell` | 68.0 mp/s, 0.5 ulp | 200 mp/s, 0.8 ulp | 324 mp/s, 1.5 ulp |
124
+ | `nk_rmsd_f64_skylake` | 546 mp/s, 0.2 ulp | 587 mp/s, 0.3 ulp | 583 mp/s, 0.4 ulp |
125
+ | `nk_kabsch_f64_skylake` | 34.5 mp/s, 0.4 ulp | 107 mp/s, 0.5 ulp | 261 mp/s, 0.8 ulp |
126
+ | `nk_umeyama_f64_skylake` | 24.3 mp/s, 0.3 ulp | 82.7 mp/s, 0.5 ulp | 201 mp/s, 0.8 ulp |
124
127
  | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
125
- | `nk_rmsd_f32_serial` | 480 mp/s, 1.4 ulp | 314 mp/s, 2.7 ulp | 270 mp/s, 5.4 ulp |
126
- | `nk_kabsch_f32_serial` | 83.2 mp/s, 1.5 ulp | 91.6 mp/s, 2.6 ulp | 110 mp/s, 5.3 ulp |
127
- | `nk_umeyama_f32_serial` | 80.4 mp/s, 1.0 ulp | 104 mp/s, 1.9 ulp | 106 mp/s, 3.7 ulp |
128
- | `nk_rmsd_f32_haswell` | 447 mp/s, 0.3 ulp | 484 mp/s, 0.3 ulp | 350 mp/s, 0.4 ulp |
129
- | `nk_kabsch_f32_haswell` | 101 mp/s, 0.7 ulp | 192 mp/s, 0.9 ulp | 213 mp/s, 1.3 ulp |
130
- | `nk_umeyama_f32_haswell` | 97.4 mp/s, 0.3 ulp | 155 mp/s, 0.4 ulp | 207 mp/s, 0.8 ulp |
131
- | `nk_rmsd_f32_skylake` | 1,000 mp/s, 0.7 ulp | 974 mp/s, 1.2 ulp | 786 mp/s, 2.4 ulp |
132
- | `nk_kabsch_f32_skylake` | 97.5 mp/s, 0.7 ulp | 232 mp/s, 0.7 ulp | 332 mp/s, 0.9 ulp |
133
- | `nk_umeyama_f32_skylake` | 92.5 mp/s, 0.2 ulp | 227 mp/s, 0.2 ulp | 325 mp/s, 0.3 ulp |
128
+ | `nk_rmsd_f32_serial` | 68.9 mp/s, 0.5 ulp | 70.7 mp/s, 0.5 ulp | 72.1 mp/s, 0.5 ulp |
129
+ | `nk_kabsch_f32_serial` | 11.2 mp/s, 0.8 ulp | 12.8 mp/s, 0.8 ulp | 14.0 mp/s, 0.9 ulp |
130
+ | `nk_umeyama_f32_serial` | 10.1 mp/s, 0.3 ulp | 11.2 mp/s, 0.3 ulp | 12.1 mp/s, 0.4 ulp |
131
+ | `nk_rmsd_f32_haswell` | 686 mp/s, 0.3 ulp | 848 mp/s, 0.5 ulp | 841 mp/s, 0.9 ulp |
132
+ | `nk_kabsch_f32_haswell` | 90.4 mp/s, 0.9 ulp | 250 mp/s, 1.3 ulp | 455 mp/s, 7.6 ulp |
133
+ | `nk_umeyama_f32_haswell` | 87.7 mp/s, 0.3 ulp | 250 mp/s, 0.4 ulp | 374 mp/s, 0.7 ulp |
134
+ | `nk_rmsd_f32_skylake` | 1,016 mp/s, 1.2 ulp | 1,112 mp/s, 1.2 ulp | 1,042 mp/s, 4.3 ulp |
135
+ | `nk_kabsch_f32_skylake` | 81.8 mp/s, 0.9 ulp | 241 mp/s, 4.1 ulp | 549 mp/s, 3.1 ulp |
136
+ | `nk_umeyama_f32_skylake` | 58.0 mp/s, 0.6 ulp | 168 mp/s, 2.9 ulp | 459 mp/s, 2.1 ulp |
134
137
  | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
135
- | `nk_rmsd_bf16_haswell` | 511 mp/s, 0.3 ulp | 481 mp/s, 3.5 ulp | 497 mp/s, 12.8 ulp |
136
- | `nk_kabsch_bf16_haswell` | 52.4 mp/s, 0.7 ulp | 65.3 mp/s, 0.9 ulp | 74.8 mp/s, 1.3 ulp |
137
- | `nk_umeyama_bf16_haswell` | 51.5 mp/s, 0.2 ulp | 69.2 mp/s, 0.4 ulp | 74.6 mp/s, 0.8 ulp |
138
- | `nk_rmsd_bf16_skylake` | 1,765 mp/s, 0.3 ulp | 1,945 mp/s, 0.5 ulp | 2,056 mp/s, 6.0 ulp |
139
- | `nk_kabsch_bf16_skylake` | 132 mp/s, 0.7 ulp | 370 mp/s, 0.8 ulp | 689 mp/s, 0.9 ulp |
140
- | `nk_umeyama_bf16_skylake` | 130 mp/s, 0.2 ulp | 366 mp/s, 0.3 ulp | 689 mp/s, 0.5 ulp |
138
+ | `nk_rmsd_bf16_haswell` | 284 mp/s, 0.3 ulp | 281 mp/s, 3.5 ulp | 273 mp/s, 12.8 ulp |
139
+ | `nk_kabsch_bf16_haswell` | 36.2 mp/s, 0.4 ulp | 106 mp/s, 7.6 ulp | 186 mp/s, 33.0 ulp |
140
+ | `nk_umeyama_bf16_haswell` | 34.5 mp/s, 0.3 ulp | 102 mp/s, 5.3 ulp | 186 mp/s, 23.1 ulp |
141
+ | `nk_rmsd_bf16_skylake` | 1,837 mp/s, 0.4 ulp | 2,357 mp/s, 5.4 ulp | 2,422 mp/s, 11.8 ulp |
142
+ | `nk_kabsch_bf16_skylake` | 34.1 mp/s, 0.3 ulp | 131 mp/s, 3.2 ulp | 487 mp/s, 20.4 ulp |
143
+ | `nk_umeyama_bf16_skylake` | 34.6 mp/s, 0.3 ulp | 130 mp/s, 2.2 ulp | 394 mp/s, 14.3 ulp |
144
+ | `nk_rmsd_bf16_genoa` | 1,743 mp/s, 0.3 ulp | 2,323 mp/s, 3.1 ulp | 2,066 mp/s, 20.2 ulp |
145
+ | `nk_kabsch_bf16_genoa` | 33.4 mp/s, 0.3 ulp | 133 mp/s, 3.2 ulp | 405 mp/s, 20.3 ulp |
146
+ | `nk_umeyama_bf16_genoa` | 33.2 mp/s, 0.3 ulp | 129 mp/s, 2.2 ulp | 439 mp/s, 14.3 ulp |
141
147
  | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
142
- | `nk_rmsd_f16_haswell` | 415 mp/s, 0.3 ulp | 497 mp/s, 0.7 ulp | 458 mp/s, 2.5 ulp |
143
- | `nk_kabsch_f16_haswell` | 151 mp/s, 0.7 ulp | 222 mp/s, 0.9 ulp | 221 mp/s, 1.4 ulp |
144
- | `nk_umeyama_f16_haswell` | 186 mp/s, 0.2 ulp | 232 mp/s, 0.5 ulp | 222 mp/s, 0.9 ulp |
145
- | `nk_rmsd_f16_skylake` | 1,813 mp/s, 0.3 ulp | 1,982 mp/s, 0.4 ulp | 2,049 mp/s, 1.8 ulp |
146
- | `nk_kabsch_f16_skylake` | 367 mp/s, 0.7 ulp | 695 mp/s, 0.7 ulp | 903 mp/s, 0.9 ulp |
147
- | `nk_umeyama_f16_skylake` | 341 mp/s, 0.2 ulp | 686 mp/s, 0.2 ulp | 882 mp/s, 0.4 ulp |
148
+ | `nk_rmsd_f16_haswell` | 273 mp/s, 0.2 ulp | 274 mp/s, 0.7 ulp | 291 mp/s, 2.5 ulp |
149
+ | `nk_kabsch_f16_haswell` | 34.4 mp/s, 0.5 ulp | 98.0 mp/s, 1.8 ulp | 197 mp/s, 8.2 ulp |
150
+ | `nk_umeyama_f16_haswell` | 35.5 mp/s, 0.4 ulp | 97.9 mp/s, 1.2 ulp | 196 mp/s, 5.7 ulp |
151
+ | `nk_rmsd_f16_skylake` | 1,834 mp/s, 0.3 ulp | 2,341 mp/s, 1.3 ulp | 2,418 mp/s, 3.9 ulp |
152
+ | `nk_kabsch_f16_skylake` | 34.0 mp/s, 0.7 ulp | 132 mp/s, 0.5 ulp | 480 mp/s, 4.7 ulp |
153
+ | `nk_umeyama_f16_skylake` | 33.8 mp/s, 0.5 ulp | 127 mp/s, 0.4 ulp | 481 mp/s, 3.3 ulp |
148
154
 
149
155
  #### WASM
150
156
 
151
- Measured with Wasmtime v42 (Cranelift backend).
157
+ Measured with Wasmtime v43 (Cranelift backend), WASI-SDK 24, `-msimd128 -mrelaxed-simd`.
152
158
 
153
159
  | Kernel | 256 | 1024 | 4096 |
154
160
  | :--------------------------- | -----------------------: | -----------------------: | -----------------------: |
155
161
  | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
156
- | `nk_rmsd_f64_serial` | 178 mp/s, 1.4 ulp | 158 mp/s, 2.6 ulp | ? mp/s, 5.3 ulp |
157
- | `nk_rmsd_f64_v128relaxed` | 273 mp/s, 0.4 ulp | 307 mp/s, 0.7 ulp | ? mp/s, 1.3 ulp |
158
- | `nk_kabsch_f64_serial` | 37.7 mp/s, 1.4 ulp | 51.7 mp/s, 2.5 ulp | ? mp/s, 5.2 ulp |
159
- | `nk_kabsch_f64_v128relaxed` | 31.7 mp/s, 1.2 ulp | 56.9 mp/s, 2.3 ulp | ? mp/s, 4.5 ulp |
160
- | `nk_umeyama_f64_serial` | 36.5 mp/s, 0.9 ulp | 49.6 mp/s, 1.9 ulp | ? mp/s, 3.6 ulp |
161
- | `nk_umeyama_f64_v128relaxed` | 32.6 mp/s, 0.8 ulp | 55.5 mp/s, 1.5 ulp | ? mp/s, 3.2 ulp |
162
+ | `nk_rmsd_f64_serial` | 89.9 mp/s, 0.5 ulp | 86.1 mp/s, 0.5 ulp | 73.4 mp/s, 0.5 ulp |
163
+ | `nk_rmsd_f64_v128relaxed` | 485 mp/s, 0.4 ulp | 552 mp/s, 0.7 ulp | 412 mp/s, 1.3 ulp |
164
+ | `nk_kabsch_f64_serial` | 12.1 mp/s, 0.8 ulp | 13.9 mp/s, 0.8 ulp | 14.0 mp/s, 0.9 ulp |
165
+ | `nk_kabsch_f64_v128relaxed` | 66.0 mp/s, 0.9 ulp | 188 mp/s, 1.7 ulp | 177 mp/s, 3.1 ulp |
166
+ | `nk_umeyama_f64_serial` | 10.8 mp/s, 0.3 ulp | 12.3 mp/s, 0.3 ulp | 12.2 mp/s, 0.4 ulp |
167
+ | `nk_umeyama_f64_v128relaxed` | 64.0 mp/s, 0.8 ulp | 187 mp/s, 1.6 ulp | 178 mp/s, 3.2 ulp |
162
168
  | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
163
- | `nk_rmsd_f32_serial` | 105 mp/s, 1.4 ulp | 122 mp/s, 2.7 ulp | ? mp/s, 5.2 ulp |
164
- | `nk_rmsd_f32_v128relaxed` | 213 mp/s, 0.3 ulp | 258 mp/s, 0.4 ulp | ? mp/s, 0.8 ulp |
165
- | `nk_kabsch_f32_serial` | 15.5 mp/s, 1.4 ulp | 32.8 mp/s, 2.6 ulp | ? mp/s, 5.1 ulp |
166
- | `nk_kabsch_f32_v128relaxed` | 13.5 mp/s, 0.9 ulp | 46.2 mp/s, 1.3 ulp | ? mp/s, 2.5 ulp |
167
- | `nk_umeyama_f32_serial` | 15.2 mp/s, 1.0 ulp | 37.4 mp/s, 1.8 ulp | ? mp/s, 3.7 ulp |
168
- | `nk_umeyama_f32_v128relaxed` | 18.3 mp/s, 0.4 ulp | 38.9 mp/s, 0.8 ulp | ? mp/s, 1.5 ulp |
169
+ | `nk_rmsd_f32_serial` | 80.6 mp/s, 0.5 ulp | 82.7 mp/s, 0.5 ulp | 70.3 mp/s, 0.5 ulp |
170
+ | `nk_rmsd_f32_v128relaxed` | 452 mp/s, 1.5 ulp | 416 mp/s, 1.3 ulp | 399 mp/s, 4.8 ulp |
171
+ | `nk_kabsch_f32_serial` | 11.4 mp/s, 0.8 ulp | 12.8 mp/s, 0.9 ulp | 12.7 mp/s, 0.8 ulp |
172
+ | `nk_kabsch_f32_v128relaxed` | 79.5 mp/s, 4.2 ulp | 132 mp/s, 3.9 ulp | 177 mp/s, 14.3 ulp |
173
+ | `nk_umeyama_f32_serial` | 10.1 mp/s, 0.3 ulp | 11.2 mp/s, 0.3 ulp | 11.2 mp/s, 0.3 ulp |
174
+ | `nk_umeyama_f32_v128relaxed` | 79.4 mp/s, 2.8 ulp | 138 mp/s, 2.8 ulp | 194 mp/s, 10.1 ulp |
169
175
 
170
176
 
171
177
  ### Apple M5