numkong 7.0.0 → 7.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -122
- package/binding.gyp +25 -491
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -46,7 +46,7 @@ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_v128relaxed(nk_size_t vector_count
|
|
|
46
46
|
}
|
|
47
47
|
|
|
48
48
|
NK_PUBLIC void nk_maxsim_pack_bf16_v128relaxed( //
|
|
49
|
-
nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t
|
|
49
|
+
nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride_in_bytes, void *packed) {
|
|
50
50
|
|
|
51
51
|
nk_size_t const element_bytes = sizeof(nk_bf16_t);
|
|
52
52
|
nk_size_t depth_i8_padded = nk_maxsim_packed_header_setup_(packed, vector_count, depth, 16, element_bytes);
|
|
@@ -58,7 +58,7 @@ NK_PUBLIC void nk_maxsim_pack_bf16_v128relaxed( //
|
|
|
58
58
|
nk_size_t const original_stride = header->original_stride_bytes;
|
|
59
59
|
|
|
60
60
|
for (nk_size_t vector_index = 0; vector_index < vector_count; vector_index++) {
|
|
61
|
-
char const *source_row = (char const *)vectors + vector_index *
|
|
61
|
+
char const *source_row = (char const *)vectors + vector_index * stride_in_bytes;
|
|
62
62
|
nk_f32_t norm_sq;
|
|
63
63
|
nk_maxsim_quantize_vector_(source_row, element_bytes, depth, depth_i8_padded, 63.0f,
|
|
64
64
|
(nk_maxsim_to_f32_t)nk_bf16_to_f32_serial,
|
|
@@ -72,7 +72,7 @@ NK_PUBLIC void nk_maxsim_pack_bf16_v128relaxed( //
|
|
|
72
72
|
}
|
|
73
73
|
|
|
74
74
|
NK_PUBLIC void nk_maxsim_pack_f32_v128relaxed( //
|
|
75
|
-
nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t
|
|
75
|
+
nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride_in_bytes, void *packed) {
|
|
76
76
|
|
|
77
77
|
nk_size_t const element_bytes = sizeof(nk_f32_t);
|
|
78
78
|
nk_size_t depth_i8_padded = nk_maxsim_packed_header_setup_(packed, vector_count, depth, 16, element_bytes);
|
|
@@ -84,7 +84,7 @@ NK_PUBLIC void nk_maxsim_pack_f32_v128relaxed( //
|
|
|
84
84
|
nk_size_t const original_stride = header->original_stride_bytes;
|
|
85
85
|
|
|
86
86
|
for (nk_size_t vector_index = 0; vector_index < vector_count; vector_index++) {
|
|
87
|
-
char const *source_row = (char const *)vectors + vector_index *
|
|
87
|
+
char const *source_row = (char const *)vectors + vector_index * stride_in_bytes;
|
|
88
88
|
nk_f32_t norm_sq;
|
|
89
89
|
nk_maxsim_quantize_vector_(source_row, element_bytes, depth, depth_i8_padded, 63.0f, nk_f32_to_f32_,
|
|
90
90
|
&quantized_i8[vector_index * depth_i8_padded], &metadata[vector_index], &norm_sq);
|
|
@@ -97,7 +97,7 @@ NK_PUBLIC void nk_maxsim_pack_f32_v128relaxed( //
|
|
|
97
97
|
}
|
|
98
98
|
|
|
99
99
|
NK_PUBLIC void nk_maxsim_pack_f16_v128relaxed( //
|
|
100
|
-
nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t
|
|
100
|
+
nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride_in_bytes, void *packed) {
|
|
101
101
|
|
|
102
102
|
nk_size_t const element_bytes = sizeof(nk_f16_t);
|
|
103
103
|
nk_size_t depth_i8_padded = nk_maxsim_packed_header_setup_(packed, vector_count, depth, 16, element_bytes);
|
|
@@ -109,7 +109,7 @@ NK_PUBLIC void nk_maxsim_pack_f16_v128relaxed( //
|
|
|
109
109
|
nk_size_t const original_stride = header->original_stride_bytes;
|
|
110
110
|
|
|
111
111
|
for (nk_size_t vector_index = 0; vector_index < vector_count; vector_index++) {
|
|
112
|
-
char const *source_row = (char const *)vectors + vector_index *
|
|
112
|
+
char const *source_row = (char const *)vectors + vector_index * stride_in_bytes;
|
|
113
113
|
nk_f32_t norm_sq;
|
|
114
114
|
nk_maxsim_quantize_vector_(source_row, element_bytes, depth, depth_i8_padded, 63.0f,
|
|
115
115
|
(nk_maxsim_to_f32_t)nk_f16_to_f32_serial,
|
|
@@ -6,33 +6,33 @@ Used in structural biology (protein alignment), robotics (point cloud registrati
|
|
|
6
6
|
|
|
7
7
|
Centroid:
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
$$
|
|
10
10
|
\bar{a} = \frac{1}{n}\sum a_i
|
|
11
|
-
|
|
11
|
+
$$
|
|
12
12
|
|
|
13
13
|
Cross-covariance matrix:
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
$$
|
|
16
16
|
H = \sum (a_i - \bar{a})(b_i - \bar{b})^T
|
|
17
|
-
|
|
17
|
+
$$
|
|
18
18
|
|
|
19
19
|
SVD-based rotation:
|
|
20
20
|
|
|
21
|
-
|
|
21
|
+
$$
|
|
22
22
|
H = U \Sigma V^T, \quad R = V U^T
|
|
23
|
-
|
|
23
|
+
$$
|
|
24
24
|
|
|
25
25
|
Umeyama scale factor:
|
|
26
26
|
|
|
27
|
-
|
|
27
|
+
$$
|
|
28
28
|
s = \frac{\text{tr}(\Sigma)}{n \cdot \sigma_a^2}
|
|
29
|
-
|
|
29
|
+
$$
|
|
30
30
|
|
|
31
31
|
RMSD after alignment:
|
|
32
32
|
|
|
33
|
-
|
|
33
|
+
$$
|
|
34
34
|
\text{RMSD} = \sqrt{\frac{1}{n}\sum \|s \cdot R(a_i - \bar{a}) - (b_i - \bar{b})\|^2}
|
|
35
|
-
|
|
35
|
+
$$
|
|
36
36
|
|
|
37
37
|
Reformulating as Python pseudocode:
|
|
38
38
|
|
|
@@ -87,14 +87,14 @@ RVV uses indexed loads with dynamic stride to adapt to variable vector length.
|
|
|
87
87
|
|
|
88
88
|
### Reflection Correction
|
|
89
89
|
|
|
90
|
-
`nk_kabsch_f32_haswell`, `nk_kabsch_f64_skylake` check for improper rotations
|
|
91
|
-
If det(R)
|
|
92
|
-
This ensures the output is always a proper rotation matrix (
|
|
90
|
+
`nk_kabsch_f32_haswell`, `nk_kabsch_f64_skylake` check for improper rotations after computing $R = V U^T$ from the SVD of the cross-covariance matrix $H = U \Sigma V^T$.
|
|
91
|
+
If $\det(R) = -1$ (a reflection rather than a rotation), the last column of $V$ is negated before recomputing $R$.
|
|
92
|
+
This ensures the output is always a proper rotation matrix with $\det(R) = +1$.
|
|
93
93
|
|
|
94
94
|
### Pre-Scaled Rotation for Umeyama
|
|
95
95
|
|
|
96
|
-
`nk_umeyama_f32_haswell`, `nk_umeyama_f64_skylake` fold the computed scale factor into the rotation matrix before applying to points.
|
|
97
|
-
|
|
96
|
+
`nk_umeyama_f32_haswell`, `nk_umeyama_f64_skylake` fold the computed scale factor $s$ into the rotation matrix before applying to points.
|
|
97
|
+
The Umeyama transform is $b_i = s R a_i + t$; by precomputing $R' = s R$ once, the per-point operation reduces to $b_i = R' a_i + t$, avoiding a per-point scalar multiply.
|
|
98
98
|
|
|
99
99
|
### Why SME and SVE Were Removed
|
|
100
100
|
|
|
@@ -142,17 +142,23 @@ Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run
|
|
|
142
142
|
| `nk_rmsd_f32_haswell` | 447 mp/s, 0.3 ulp | 484 mp/s, 0.3 ulp | 350 mp/s, 0.4 ulp |
|
|
143
143
|
| `nk_kabsch_f32_haswell` | 101 mp/s, 0.7 ulp | 192 mp/s, 0.9 ulp | 213 mp/s, 1.3 ulp |
|
|
144
144
|
| `nk_umeyama_f32_haswell` | 97.4 mp/s, 0.3 ulp | 155 mp/s, 0.4 ulp | 207 mp/s, 0.8 ulp |
|
|
145
|
-
| `nk_rmsd_f32_skylake` |
|
|
146
|
-
| `nk_kabsch_f32_skylake` |
|
|
147
|
-
| `nk_umeyama_f32_skylake` |
|
|
145
|
+
| `nk_rmsd_f32_skylake` | 1,000 mp/s, 0.7 ulp | 974 mp/s, 1.2 ulp | 786 mp/s, 2.4 ulp |
|
|
146
|
+
| `nk_kabsch_f32_skylake` | 97.5 mp/s, 0.7 ulp | 232 mp/s, 0.7 ulp | 332 mp/s, 0.9 ulp |
|
|
147
|
+
| `nk_umeyama_f32_skylake` | 92.5 mp/s, 0.2 ulp | 227 mp/s, 0.2 ulp | 325 mp/s, 0.3 ulp |
|
|
148
148
|
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
149
149
|
| `nk_rmsd_bf16_haswell` | 511 mp/s, 0.3 ulp | 481 mp/s, 3.5 ulp | 497 mp/s, 12.8 ulp |
|
|
150
150
|
| `nk_kabsch_bf16_haswell` | 52.4 mp/s, 0.7 ulp | 65.3 mp/s, 0.9 ulp | 74.8 mp/s, 1.3 ulp |
|
|
151
151
|
| `nk_umeyama_bf16_haswell` | 51.5 mp/s, 0.2 ulp | 69.2 mp/s, 0.4 ulp | 74.6 mp/s, 0.8 ulp |
|
|
152
|
+
| `nk_rmsd_bf16_skylake` | 1,765 mp/s, 0.3 ulp | 1,945 mp/s, 0.5 ulp | 2,056 mp/s, 6.0 ulp |
|
|
153
|
+
| `nk_kabsch_bf16_skylake` | 132 mp/s, 0.7 ulp | 370 mp/s, 0.8 ulp | 689 mp/s, 0.9 ulp |
|
|
154
|
+
| `nk_umeyama_bf16_skylake` | 130 mp/s, 0.2 ulp | 366 mp/s, 0.3 ulp | 689 mp/s, 0.5 ulp |
|
|
152
155
|
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
153
156
|
| `nk_rmsd_f16_haswell` | 415 mp/s, 0.3 ulp | 497 mp/s, 0.7 ulp | 458 mp/s, 2.5 ulp |
|
|
154
157
|
| `nk_kabsch_f16_haswell` | 151 mp/s, 0.7 ulp | 222 mp/s, 0.9 ulp | 221 mp/s, 1.4 ulp |
|
|
155
158
|
| `nk_umeyama_f16_haswell` | 186 mp/s, 0.2 ulp | 232 mp/s, 0.5 ulp | 222 mp/s, 0.9 ulp |
|
|
159
|
+
| `nk_rmsd_f16_skylake` | 1,813 mp/s, 0.3 ulp | 1,982 mp/s, 0.4 ulp | 2,049 mp/s, 1.8 ulp |
|
|
160
|
+
| `nk_kabsch_f16_skylake` | 367 mp/s, 0.7 ulp | 695 mp/s, 0.7 ulp | 903 mp/s, 0.9 ulp |
|
|
161
|
+
| `nk_umeyama_f16_skylake` | 341 mp/s, 0.2 ulp | 686 mp/s, 0.2 ulp | 882 mp/s, 0.4 ulp |
|
|
156
162
|
|
|
157
163
|
#### WASM
|
|
158
164
|
|
|
@@ -176,52 +182,52 @@ Measured with Wasmtime v42 (Cranelift backend).
|
|
|
176
182
|
| `nk_umeyama_f32_v128relaxed` | 18.3 mp/s, 0.4 ulp | 38.9 mp/s, 0.8 ulp | ? mp/s, 1.5 ulp |
|
|
177
183
|
|
|
178
184
|
|
|
179
|
-
### Apple
|
|
185
|
+
### Apple M5
|
|
180
186
|
|
|
181
187
|
#### Native
|
|
182
188
|
|
|
183
|
-
| Kernel
|
|
184
|
-
|
|
|
185
|
-
| __f64__
|
|
186
|
-
| `nk_rmsd_f64_serial`
|
|
187
|
-
| `nk_kabsch_f64_serial`
|
|
188
|
-
| `nk_umeyama_f64_serial`
|
|
189
|
-
| `nk_rmsd_f64_neon`
|
|
190
|
-
| `nk_kabsch_f64_neon`
|
|
191
|
-
| `nk_umeyama_f64_neon`
|
|
192
|
-
| __f32__
|
|
193
|
-
| `nk_rmsd_f32_serial`
|
|
194
|
-
| `nk_kabsch_f32_serial`
|
|
195
|
-
| `nk_umeyama_f32_serial`
|
|
196
|
-
| `nk_rmsd_f32_neon`
|
|
197
|
-
| `nk_kabsch_f32_neon`
|
|
198
|
-
| `nk_umeyama_f32_neon`
|
|
199
|
-
| __bf16__
|
|
200
|
-
| `
|
|
201
|
-
| `
|
|
202
|
-
| `
|
|
203
|
-
| __f16__
|
|
204
|
-
| `
|
|
205
|
-
| `
|
|
206
|
-
| `
|
|
189
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
190
|
+
| :-------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
191
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
192
|
+
| `nk_rmsd_f64_serial` | 120 mp/s, 1.4 ulp | 118 mp/s, 2.6 ulp | 121 mp/s, 5.3 ulp |
|
|
193
|
+
| `nk_kabsch_f64_serial` | 40.4 mp/s, 1.4 ulp | 47.3 mp/s, 2.6 ulp | 50.2 mp/s, 5.4 ulp |
|
|
194
|
+
| `nk_umeyama_f64_serial` | 34.5 mp/s, 1.0 ulp | 39.2 mp/s, 1.9 ulp | 41.6 mp/s, 3.7 ulp |
|
|
195
|
+
| `nk_rmsd_f64_neon` | 1,418 mp/s, 0.4 ulp | 1,338 mp/s, 0.7 ulp | 1,419 mp/s, 1.3 ulp |
|
|
196
|
+
| `nk_kabsch_f64_neon` | 119 mp/s, 0.8 ulp | 222 mp/s, 1.3 ulp | 304 mp/s, 2.2 ulp |
|
|
197
|
+
| `nk_umeyama_f64_neon` | 115 mp/s, 0.4 ulp | 220 mp/s, 0.8 ulp | 296 mp/s, 1.6 ulp |
|
|
198
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
199
|
+
| `nk_rmsd_f32_serial` | 122 mp/s, 1.4 ulp | 123 mp/s, 2.6 ulp | 125 mp/s, 5.2 ulp |
|
|
200
|
+
| `nk_kabsch_f32_serial` | 39.4 mp/s, 1.4 ulp | 46.0 mp/s, 2.7 ulp | 49.9 mp/s, 5.0 ulp |
|
|
201
|
+
| `nk_umeyama_f32_serial` | 33.6 mp/s, 0.9 ulp | 38.8 mp/s, 1.8 ulp | 41.4 mp/s, 3.5 ulp |
|
|
202
|
+
| `nk_rmsd_f32_neon` | 1,337 mp/s, 0.3 ulp | 1,377 mp/s, 0.4 ulp | 1,261 mp/s, 0.8 ulp |
|
|
203
|
+
| `nk_kabsch_f32_neon` | 135 mp/s, 0.7 ulp | 288 mp/s, 0.9 ulp | 385 mp/s, 1.4 ulp |
|
|
204
|
+
| `nk_umeyama_f32_neon` | 130 mp/s, 0.3 ulp | 272 mp/s, 0.4 ulp | 367 mp/s, 0.8 ulp |
|
|
205
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
206
|
+
| `nk_rmsd_bf16_neonbfdot` | 2,342 mp/s, 0.5 ulp | 2,378 mp/s, 6.0 ulp | 2,416 mp/s, 10.0 ulp |
|
|
207
|
+
| `nk_kabsch_bf16_neonbfdot` | 180 mp/s, 0.7 ulp | 448 mp/s, 0.9 ulp | 726 mp/s, 1.3 ulp |
|
|
208
|
+
| `nk_umeyama_bf16_neonbfdot` | 176 mp/s, 0.2 ulp | 433 mp/s, 0.4 ulp | 705 mp/s, 0.8 ulp |
|
|
209
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
210
|
+
| `nk_rmsd_f16_neonhalf` | 2,315 mp/s, 0.4 ulp | 2,372 mp/s, 1.7 ulp | 2,423 mp/s, 4.6 ulp |
|
|
211
|
+
| `nk_kabsch_f16_neonhalf` | 178 mp/s, 0.9 ulp | 443 mp/s, 1.3 ulp | 711 mp/s, 2.4 ulp |
|
|
212
|
+
| `nk_umeyama_f16_neonhalf` | 175 mp/s, 0.4 ulp | 408 mp/s, 0.8 ulp | 620 mp/s, 1.5 ulp |
|
|
207
213
|
|
|
208
214
|
#### WASM
|
|
209
215
|
|
|
210
|
-
Measured with Wasmtime
|
|
216
|
+
Measured with Wasmtime v43 (Cranelift backend).
|
|
211
217
|
|
|
212
218
|
| Kernel | 256 | 1024 | 4096 |
|
|
213
219
|
| :--------------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
214
220
|
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
215
|
-
| `nk_rmsd_f64_serial` |
|
|
216
|
-
| `nk_rmsd_f64_v128relaxed` |
|
|
217
|
-
| `nk_kabsch_f64_serial` | 42.
|
|
218
|
-
| `nk_kabsch_f64_v128relaxed` |
|
|
219
|
-
| `nk_umeyama_f64_serial` | 36.1 mp/s, 1.8 ulp |
|
|
220
|
-
| `nk_umeyama_f64_v128relaxed` |
|
|
221
|
+
| `nk_rmsd_f64_serial` | 137 mp/s, 2.6 ulp | 134 mp/s, 2.6 ulp | 142 mp/s, 2.6 ulp |
|
|
222
|
+
| `nk_rmsd_f64_v128relaxed` | 1,377 mp/s, 0.8 ulp | 1,038 mp/s, 0.8 ulp | 1,566 mp/s, 0.8 ulp |
|
|
223
|
+
| `nk_kabsch_f64_serial` | 42.3 mp/s, 2.7 ulp | 50.4 mp/s, 2.7 ulp | 55.5 mp/s, 2.7 ulp |
|
|
224
|
+
| `nk_kabsch_f64_v128relaxed` | 121 mp/s, 2.2 ulp | 225 mp/s, 2.2 ulp | 345 mp/s, 2.2 ulp |
|
|
225
|
+
| `nk_umeyama_f64_serial` | 36.1 mp/s, 1.8 ulp | 41.3 mp/s, 1.8 ulp | 46.0 mp/s, 1.8 ulp |
|
|
226
|
+
| `nk_umeyama_f64_v128relaxed` | 112 mp/s, 1.5 ulp | 207 mp/s, 1.5 ulp | 293 mp/s, 1.5 ulp |
|
|
221
227
|
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
222
|
-
| `nk_rmsd_f32_serial` |
|
|
223
|
-
| `nk_rmsd_f32_v128relaxed` |
|
|
224
|
-
| `nk_kabsch_f32_serial` |
|
|
225
|
-
| `nk_kabsch_f32_v128relaxed` |
|
|
226
|
-
| `nk_umeyama_f32_serial` |
|
|
227
|
-
| `nk_umeyama_f32_v128relaxed` |
|
|
228
|
+
| `nk_rmsd_f32_serial` | 120 mp/s, 2.7 ulp | 120 mp/s, 2.7 ulp | 124 mp/s, 2.7 ulp |
|
|
229
|
+
| `nk_rmsd_f32_v128relaxed` | 1,025 mp/s, 0.5 ulp | 1,038 mp/s, 0.5 ulp | 1,093 mp/s, 0.5 ulp |
|
|
230
|
+
| `nk_kabsch_f32_serial` | 39.6 mp/s, 2.6 ulp | 47.6 mp/s, 2.6 ulp | 51.4 mp/s, 2.6 ulp |
|
|
231
|
+
| `nk_kabsch_f32_v128relaxed` | 125 mp/s, 1.3 ulp | 255 mp/s, 1.3 ulp | 366 mp/s, 1.3 ulp |
|
|
232
|
+
| `nk_umeyama_f32_serial` | 30.5 mp/s, 1.8 ulp | 35.0 mp/s, 1.8 ulp | 38.9 mp/s, 1.8 ulp |
|
|
233
|
+
| `nk_umeyama_f32_v128relaxed` | 118 mp/s, 0.8 ulp | 240 mp/s, 0.8 ulp | 338 mp/s, 0.8 ulp |
|