npm - numkong - Versions diffs - 7.0.0 → 7.4.2 - Mend

numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (315) hide show

package/README.md +197 -124
package/binding.gyp +34 -484
package/c/dispatch_bf16.c +59 -1
package/c/dispatch_e2m3.c +41 -8
package/c/dispatch_e3m2.c +49 -8
package/c/dispatch_e4m3.c +51 -9
package/c/dispatch_e5m2.c +45 -1
package/c/dispatch_f16.c +79 -26
package/c/dispatch_f16c.c +5 -5
package/c/dispatch_f32.c +56 -0
package/c/dispatch_f64.c +52 -0
package/c/dispatch_i4.c +3 -0
package/c/dispatch_i8.c +62 -3
package/c/dispatch_other.c +18 -0
package/c/dispatch_u1.c +54 -9
package/c/dispatch_u4.c +3 -0
package/c/dispatch_u8.c +64 -3
package/c/numkong.c +3 -0
package/include/README.md +79 -9
package/include/numkong/attention/sapphireamx.h +278 -276
package/include/numkong/attention/sme.h +983 -977
package/include/numkong/attention.h +1 -1
package/include/numkong/capabilities.h +289 -94
package/include/numkong/cast/README.md +40 -40
package/include/numkong/cast/diamond.h +64 -0
package/include/numkong/cast/haswell.h +42 -194
package/include/numkong/cast/icelake.h +42 -37
package/include/numkong/cast/loongsonasx.h +252 -0
package/include/numkong/cast/neon.h +216 -249
package/include/numkong/cast/powervsx.h +449 -0
package/include/numkong/cast/rvv.h +223 -274
package/include/numkong/cast/sapphire.h +18 -18
package/include/numkong/cast/serial.h +1018 -944
package/include/numkong/cast/skylake.h +82 -23
package/include/numkong/cast/v128relaxed.h +462 -105
package/include/numkong/cast.h +24 -0
package/include/numkong/cast.hpp +44 -0
package/include/numkong/curved/README.md +17 -17
package/include/numkong/curved/neon.h +131 -7
package/include/numkong/curved/neonbfdot.h +6 -7
package/include/numkong/curved/rvv.h +26 -26
package/include/numkong/curved/smef64.h +186 -182
package/include/numkong/curved.h +14 -18
package/include/numkong/dot/README.md +154 -137
package/include/numkong/dot/alder.h +43 -43
package/include/numkong/dot/diamond.h +158 -0
package/include/numkong/dot/genoa.h +4 -30
package/include/numkong/dot/haswell.h +215 -180
package/include/numkong/dot/icelake.h +190 -76
package/include/numkong/dot/loongsonasx.h +671 -0
package/include/numkong/dot/neon.h +124 -73
package/include/numkong/dot/neonbfdot.h +11 -12
package/include/numkong/dot/neonfhm.h +44 -46
package/include/numkong/dot/neonfp8.h +323 -0
package/include/numkong/dot/neonsdot.h +190 -76
package/include/numkong/dot/powervsx.h +752 -0
package/include/numkong/dot/rvv.h +92 -84
package/include/numkong/dot/rvvbf16.h +12 -12
package/include/numkong/dot/rvvhalf.h +12 -12
package/include/numkong/dot/sapphire.h +4 -4
package/include/numkong/dot/serial.h +66 -30
package/include/numkong/dot/sierra.h +31 -31
package/include/numkong/dot/skylake.h +142 -110
package/include/numkong/dot/sve.h +217 -177
package/include/numkong/dot/svebfdot.h +10 -10
package/include/numkong/dot/svehalf.h +85 -41
package/include/numkong/dot/svesdot.h +89 -0
package/include/numkong/dot/v128relaxed.h +124 -89
package/include/numkong/dot.h +114 -48
package/include/numkong/dots/README.md +203 -203
package/include/numkong/dots/alder.h +12 -9
package/include/numkong/dots/diamond.h +86 -0
package/include/numkong/dots/genoa.h +10 -4
package/include/numkong/dots/haswell.h +63 -48
package/include/numkong/dots/icelake.h +27 -18
package/include/numkong/dots/loongsonasx.h +176 -0
package/include/numkong/dots/neon.h +14 -11
package/include/numkong/dots/neonbfdot.h +4 -3
package/include/numkong/dots/neonfhm.h +11 -9
package/include/numkong/dots/neonfp8.h +99 -0
package/include/numkong/dots/neonsdot.h +48 -12
package/include/numkong/dots/powervsx.h +194 -0
package/include/numkong/dots/rvv.h +451 -344
package/include/numkong/dots/sapphireamx.h +1028 -984
package/include/numkong/dots/serial.h +213 -197
package/include/numkong/dots/sierra.h +10 -7
package/include/numkong/dots/skylake.h +47 -36
package/include/numkong/dots/sme.h +2001 -2364
package/include/numkong/dots/smebi32.h +175 -162
package/include/numkong/dots/smef64.h +328 -323
package/include/numkong/dots/v128relaxed.h +64 -41
package/include/numkong/dots.h +573 -293
package/include/numkong/dots.hpp +45 -43
package/include/numkong/each/README.md +133 -137
package/include/numkong/each/haswell.h +6 -6
package/include/numkong/each/icelake.h +7 -7
package/include/numkong/each/neon.h +76 -42
package/include/numkong/each/neonbfdot.h +11 -12
package/include/numkong/each/neonhalf.h +24 -116
package/include/numkong/each/rvv.h +28 -28
package/include/numkong/each/sapphire.h +27 -161
package/include/numkong/each/serial.h +6 -6
package/include/numkong/each/skylake.h +7 -7
package/include/numkong/each/v128relaxed.h +562 -0
package/include/numkong/each.h +148 -62
package/include/numkong/each.hpp +2 -2
package/include/numkong/geospatial/README.md +18 -18
package/include/numkong/geospatial/haswell.h +365 -325
package/include/numkong/geospatial/neon.h +350 -306
package/include/numkong/geospatial/rvv.h +4 -4
package/include/numkong/geospatial/skylake.h +376 -340
package/include/numkong/geospatial/v128relaxed.h +366 -327
package/include/numkong/geospatial.h +17 -17
package/include/numkong/matrix.hpp +4 -4
package/include/numkong/maxsim/README.md +14 -14
package/include/numkong/maxsim/alder.h +6 -6
package/include/numkong/maxsim/genoa.h +4 -4
package/include/numkong/maxsim/haswell.h +6 -6
package/include/numkong/maxsim/icelake.h +18 -18
package/include/numkong/maxsim/neonsdot.h +21 -21
package/include/numkong/maxsim/sapphireamx.h +14 -14
package/include/numkong/maxsim/serial.h +6 -6
package/include/numkong/maxsim/sme.h +221 -196
package/include/numkong/maxsim/v128relaxed.h +6 -6
package/include/numkong/mesh/README.md +62 -56
package/include/numkong/mesh/haswell.h +339 -464
package/include/numkong/mesh/neon.h +1100 -519
package/include/numkong/mesh/neonbfdot.h +36 -68
package/include/numkong/mesh/rvv.h +530 -435
package/include/numkong/mesh/serial.h +75 -91
package/include/numkong/mesh/skylake.h +1627 -302
package/include/numkong/mesh/v128relaxed.h +443 -330
package/include/numkong/mesh.h +63 -49
package/include/numkong/mesh.hpp +4 -4
package/include/numkong/numkong.h +3 -3
package/include/numkong/numkong.hpp +1 -0
package/include/numkong/probability/README.md +23 -19
package/include/numkong/probability/neon.h +82 -52
package/include/numkong/probability/rvv.h +28 -23
package/include/numkong/probability/serial.h +51 -39
package/include/numkong/probability.h +20 -23
package/include/numkong/random.h +1 -1
package/include/numkong/reduce/README.md +143 -138
package/include/numkong/reduce/alder.h +81 -77
package/include/numkong/reduce/haswell.h +222 -220
package/include/numkong/reduce/neon.h +629 -519
package/include/numkong/reduce/neonbfdot.h +7 -218
package/include/numkong/reduce/neonfhm.h +9 -381
package/include/numkong/reduce/neonsdot.h +9 -9
package/include/numkong/reduce/rvv.h +928 -802
package/include/numkong/reduce/serial.h +23 -27
package/include/numkong/reduce/sierra.h +20 -20
package/include/numkong/reduce/skylake.h +326 -324
package/include/numkong/reduce/v128relaxed.h +52 -52
package/include/numkong/reduce.h +4 -23
package/include/numkong/reduce.hpp +156 -11
package/include/numkong/scalar/README.md +6 -6
package/include/numkong/scalar/haswell.h +26 -17
package/include/numkong/scalar/loongsonasx.h +74 -0
package/include/numkong/scalar/neon.h +9 -9
package/include/numkong/scalar/powervsx.h +96 -0
package/include/numkong/scalar/rvv.h +2 -2
package/include/numkong/scalar/sapphire.h +21 -10
package/include/numkong/scalar/serial.h +21 -21
package/include/numkong/scalar.h +13 -0
package/include/numkong/set/README.md +28 -28
package/include/numkong/set/haswell.h +12 -12
package/include/numkong/set/icelake.h +14 -14
package/include/numkong/set/loongsonasx.h +181 -0
package/include/numkong/set/neon.h +17 -18
package/include/numkong/set/powervsx.h +326 -0
package/include/numkong/set/rvv.h +4 -4
package/include/numkong/set/serial.h +6 -6
package/include/numkong/set/sve.h +60 -59
package/include/numkong/set/v128relaxed.h +6 -6
package/include/numkong/set.h +21 -7
package/include/numkong/sets/README.md +26 -26
package/include/numkong/sets/loongsonasx.h +52 -0
package/include/numkong/sets/powervsx.h +65 -0
package/include/numkong/sets/smebi32.h +395 -364
package/include/numkong/sets.h +83 -40
package/include/numkong/sparse/README.md +4 -4
package/include/numkong/sparse/icelake.h +101 -101
package/include/numkong/sparse/serial.h +1 -1
package/include/numkong/sparse/sve2.h +137 -141
package/include/numkong/sparse/turin.h +12 -12
package/include/numkong/sparse.h +10 -10
package/include/numkong/spatial/README.md +230 -226
package/include/numkong/spatial/alder.h +113 -116
package/include/numkong/spatial/diamond.h +240 -0
package/include/numkong/spatial/genoa.h +0 -68
package/include/numkong/spatial/haswell.h +74 -55
package/include/numkong/spatial/icelake.h +539 -58
package/include/numkong/spatial/loongsonasx.h +483 -0
package/include/numkong/spatial/neon.h +125 -52
package/include/numkong/spatial/neonbfdot.h +8 -9
package/include/numkong/spatial/neonfp8.h +258 -0
package/include/numkong/spatial/neonsdot.h +180 -12
package/include/numkong/spatial/powervsx.h +738 -0
package/include/numkong/spatial/rvv.h +146 -139
package/include/numkong/spatial/rvvbf16.h +17 -12
package/include/numkong/spatial/rvvhalf.h +13 -10
package/include/numkong/spatial/serial.h +13 -12
package/include/numkong/spatial/sierra.h +232 -39
package/include/numkong/spatial/skylake.h +73 -74
package/include/numkong/spatial/sve.h +93 -72
package/include/numkong/spatial/svebfdot.h +29 -29
package/include/numkong/spatial/svehalf.h +52 -26
package/include/numkong/spatial/svesdot.h +142 -0
package/include/numkong/spatial/v128relaxed.h +293 -41
package/include/numkong/spatial.h +338 -82
package/include/numkong/spatials/README.md +194 -194
package/include/numkong/spatials/diamond.h +82 -0
package/include/numkong/spatials/haswell.h +2 -2
package/include/numkong/spatials/loongsonasx.h +153 -0
package/include/numkong/spatials/neonfp8.h +111 -0
package/include/numkong/spatials/neonsdot.h +34 -0
package/include/numkong/spatials/powervsx.h +153 -0
package/include/numkong/spatials/rvv.h +259 -243
package/include/numkong/spatials/sapphireamx.h +173 -173
package/include/numkong/spatials/serial.h +2 -2
package/include/numkong/spatials/skylake.h +2 -2
package/include/numkong/spatials/sme.h +590 -605
package/include/numkong/spatials/smef64.h +139 -130
package/include/numkong/spatials/v128relaxed.h +2 -2
package/include/numkong/spatials.h +820 -500
package/include/numkong/spatials.hpp +49 -48
package/include/numkong/tensor.hpp +406 -17
package/include/numkong/trigonometry/README.md +19 -19
package/include/numkong/trigonometry/haswell.h +402 -401
package/include/numkong/trigonometry/neon.h +386 -387
package/include/numkong/trigonometry/rvv.h +52 -51
package/include/numkong/trigonometry/serial.h +13 -13
package/include/numkong/trigonometry/skylake.h +373 -369
package/include/numkong/trigonometry/v128relaxed.h +375 -374
package/include/numkong/trigonometry.h +13 -13
package/include/numkong/trigonometry.hpp +2 -2
package/include/numkong/types.h +287 -49
package/include/numkong/types.hpp +436 -12
package/include/numkong/vector.hpp +82 -14
package/javascript/dist/cjs/numkong-wasm.js +6 -12
package/javascript/dist/cjs/numkong.d.ts +7 -1
package/javascript/dist/cjs/numkong.js +37 -11
package/javascript/dist/cjs/types.d.ts +9 -0
package/javascript/dist/cjs/types.js +96 -0
package/javascript/dist/esm/numkong-browser.d.ts +14 -0
package/javascript/dist/esm/numkong-browser.js +23 -0
package/javascript/dist/esm/numkong-wasm.js +6 -12
package/javascript/dist/esm/numkong.d.ts +7 -1
package/javascript/dist/esm/numkong.js +37 -11
package/javascript/dist/esm/types.d.ts +9 -0
package/javascript/dist/esm/types.js +96 -0
package/javascript/node-gyp-build.d.ts +4 -1
package/javascript/numkong-browser.ts +40 -0
package/javascript/numkong-wasm.ts +7 -13
package/javascript/numkong.c +5 -26
package/javascript/numkong.ts +36 -11
package/javascript/tsconfig-base.json +1 -0
package/javascript/tsconfig-cjs.json +6 -1
package/javascript/types.ts +110 -0
package/numkong.gypi +101 -0
package/package.json +34 -13
package/probes/arm_neon.c +8 -0
package/probes/arm_neon_bfdot.c +9 -0
package/probes/arm_neon_fhm.c +9 -0
package/probes/arm_neon_half.c +8 -0
package/probes/arm_neon_sdot.c +9 -0
package/probes/arm_neonfp8.c +9 -0
package/probes/arm_sme.c +16 -0
package/probes/arm_sme2.c +16 -0
package/probes/arm_sme2p1.c +16 -0
package/probes/arm_sme_bf16.c +16 -0
package/probes/arm_sme_bi32.c +16 -0
package/probes/arm_sme_f64.c +16 -0
package/probes/arm_sme_fa64.c +14 -0
package/probes/arm_sme_half.c +16 -0
package/probes/arm_sme_lut2.c +15 -0
package/probes/arm_sve.c +18 -0
package/probes/arm_sve2.c +20 -0
package/probes/arm_sve2p1.c +18 -0
package/probes/arm_sve_bfdot.c +20 -0
package/probes/arm_sve_half.c +18 -0
package/probes/arm_sve_sdot.c +21 -0
package/probes/loongarch_lasx.c +12 -0
package/probes/power_vsx.c +12 -0
package/probes/probe.js +127 -0
package/probes/riscv_rvv.c +14 -0
package/probes/riscv_rvv_bb.c +15 -0
package/probes/riscv_rvv_bf16.c +17 -0
package/probes/riscv_rvv_half.c +14 -0
package/probes/wasm_v128relaxed.c +11 -0
package/probes/x86_alder.c +17 -0
package/probes/x86_diamond.c +17 -0
package/probes/x86_genoa.c +17 -0
package/probes/x86_graniteamx.c +19 -0
package/probes/x86_haswell.c +11 -0
package/probes/x86_icelake.c +17 -0
package/probes/x86_sapphire.c +16 -0
package/probes/x86_sapphireamx.c +18 -0
package/probes/x86_sierra.c +17 -0
package/probes/x86_skylake.c +15 -0
package/probes/x86_turin.c +17 -0
package/wasm/numkong-emscripten.js +2 -0
package/wasm/numkong.d.ts +14 -0
package/wasm/numkong.js +1124 -0
package/wasm/numkong.wasm +0 -0
package/include/numkong/curved/neonhalf.h +0 -212
package/include/numkong/dot/neonhalf.h +0 -198
package/include/numkong/dots/neonhalf.h +0 -57
package/include/numkong/mesh/neonhalf.h +0 -616
package/include/numkong/reduce/neonhalf.h +0 -157
package/include/numkong/spatial/neonhalf.h +0 -118
package/include/numkong/spatial/sapphire.h +0 -343
package/include/numkong/spatials/neonhalf.h +0 -58
package/javascript/README.md +0 -246

package/include/numkong/mesh.h CHANGED Viewed

@@ -82,17 +82,17 @@
  *
  *  The SIMD kernels are dominated by FMA, permutes, and gathers:
  *
- *      Intrinsic                     Instruction        Notes
- *      _mm256_fmadd_ps/pd            VFMADD*            FMA on FP ports (Haswell/Skylake: ports 0/1)
- *      _mm256_i32gather_ps           VGATHERDPS         High-latency; memory-bound
- *      _mm512_permutex2var_ps/pd     VPERMT2*           Shuffle-heavy; can bottleneck on shuffle ports
- *      _mm512_reduce_add_ps/pd       (sequence)         Implemented via shuffles + adds
+ *      Intrinsic                  Instruction  Notes
+ *      _mm256_fmadd_ps/pd         VFMADD*      FMA on FP ports (Haswell/Skylake: ports 0/1)
+ *      _mm256_i32gather_ps        VGATHERDPS   High-latency; memory-bound
+ *      _mm512_permutex2var_ps/pd  VPERMT2*     Shuffle-heavy; can bottleneck on shuffle ports
+ *      _mm512_reduce_add_ps/pd    (sequence)   Implemented via shuffles + adds
  *
  *  Gather-heavy tails are intentionally isolated to keep the steady-state loop on contiguous loads.
  *
  *  @section references References
  *
- *  - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
+ *  - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
  *  - Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
  *
  */
@@ -245,6 +245,25 @@ NK_PUBLIC void nk_kabsch_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_si
 /** @copydoc nk_umeyama_f64 */
 NK_PUBLIC void nk_umeyama_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
                                       nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
+/** @copydoc nk_rmsd_f16 */
+NK_PUBLIC void nk_rmsd_f16_skylake(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                   nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+/** @copydoc nk_kabsch_f16 */
+NK_PUBLIC void nk_kabsch_f16_skylake(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                     nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+/** @copydoc nk_umeyama_f16 */
+NK_PUBLIC void nk_umeyama_f16_skylake(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                      nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+/** @copydoc nk_rmsd_bf16 */
+NK_PUBLIC void nk_rmsd_bf16_skylake(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                    nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+/** @copydoc nk_kabsch_bf16 */
+NK_PUBLIC void nk_kabsch_bf16_skylake(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                      nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+/** @copydoc nk_umeyama_bf16 */
+NK_PUBLIC void nk_umeyama_bf16_skylake(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                       nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
 #endif // NK_TARGET_SKYLAKE
 /*  SIMD-powered backends for AVX2 CPUs of Haswell generation and newer.
@@ -313,21 +332,16 @@ NK_PUBLIC void nk_kabsch_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_
 /** @copydoc nk_umeyama_f64 */
 NK_PUBLIC void nk_umeyama_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
                                    nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
-#endif // NK_TARGET_NEON
-/*  SIMD-powered backends for Arm NEON FP16 CPUs.
- */
-#if NK_TARGET_NEONHALF
 /** @copydoc nk_rmsd_f16 */
-NK_PUBLIC void nk_rmsd_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
-                                    nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+NK_PUBLIC void nk_rmsd_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
 /** @copydoc nk_kabsch_f16 */
-NK_PUBLIC void nk_kabsch_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
-                                      nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+NK_PUBLIC void nk_kabsch_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
 /** @copydoc nk_umeyama_f16 */
-NK_PUBLIC void nk_umeyama_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
-                                       nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
-#endif // NK_TARGET_NEONHALF
+NK_PUBLIC void nk_umeyama_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                   nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
+#endif // NK_TARGET_NEON
 /*  SIMD-powered backends for Arm NEON BF16 CPUs.
  */
@@ -406,22 +420,10 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
 #endif // NK_TARGET_V128RELAXED
 /**
- *  @brief  Returns the output dtype for RMSD.
- */
-NK_INTERNAL nk_dtype_t nk_rmsd_output_dtype(nk_dtype_t dtype) {
-    switch (dtype) {
-    case nk_f64_k: return nk_f64_k;
-    case nk_f32_k: return nk_f64_k;
-    case nk_f16_k: return nk_f32_k;
-    case nk_bf16_k: return nk_f32_k;
-    default: return nk_dtype_unknown_k;
-    }
-}
-/**
- *  @brief  Returns the output dtype for Kabsch alignment.
+ *  @brief  Returns the metric output dtype for mesh alignment operations.
+ *  Matches the C++ `mesh_metric_t` alias in types.hpp.
  */
-NK_INTERNAL nk_dtype_t nk_kabsch_output_dtype(nk_dtype_t dtype) {
+NK_INTERNAL nk_dtype_t nk_mesh_metric_dtype(nk_dtype_t dtype) {
     switch (dtype) {
     case nk_f64_k: return nk_f64_k;
     case nk_f32_k: return nk_f64_k;
@@ -432,12 +434,13 @@ NK_INTERNAL nk_dtype_t nk_kabsch_output_dtype(nk_dtype_t dtype) {
 }
 /**
- *  @brief  Returns the output dtype for Umeyama alignment.
+ *  @brief  Returns the transform output dtype for mesh alignment operations.
+ *  Matches the C++ `mesh_transform_t` alias in types.hpp.
  */
-NK_INTERNAL nk_dtype_t nk_umeyama_output_dtype(nk_dtype_t dtype) {
+NK_INTERNAL nk_dtype_t nk_mesh_transform_dtype(nk_dtype_t dtype) {
     switch (dtype) {
     case nk_f64_k: return nk_f64_k;
-    case nk_f32_k: return nk_f64_k;
+    case nk_f32_k: return nk_f32_k;
     case nk_f16_k: return nk_f32_k;
     case nk_bf16_k: return nk_f32_k;
     default: return nk_dtype_unknown_k;
@@ -450,7 +453,6 @@ NK_INTERNAL nk_dtype_t nk_umeyama_output_dtype(nk_dtype_t dtype) {
 #include "numkong/mesh/serial.h"
 #include "numkong/mesh/neon.h"
-#include "numkong/mesh/neonhalf.h"
 #include "numkong/mesh/neonbfdot.h"
 #include "numkong/mesh/haswell.h"
 #include "numkong/mesh/skylake.h"
@@ -499,10 +501,12 @@ NK_PUBLIC void nk_rmsd_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk
 NK_PUBLIC void nk_rmsd_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
                            nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
-#if NK_TARGET_HASWELL
+#if NK_TARGET_SKYLAKE
+    nk_rmsd_f16_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
+#elif NK_TARGET_HASWELL
     nk_rmsd_f16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
-#elif NK_TARGET_NEONHALF
-    nk_rmsd_f16_neonhalf(a, b, n, a_centroid, b_centroid, rotation, scale, result);
+#elif NK_TARGET_NEON
+    nk_rmsd_f16_neon(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_RVV
     nk_rmsd_f16_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #else
@@ -512,7 +516,9 @@ NK_PUBLIC void nk_rmsd_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk
 NK_PUBLIC void nk_rmsd_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
                             nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
-#if NK_TARGET_HASWELL
+#if NK_TARGET_SKYLAKE
+    nk_rmsd_bf16_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
+#elif NK_TARGET_HASWELL
     nk_rmsd_bf16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_NEONBFDOT
     nk_rmsd_bf16_neonbfdot(a, b, n, a_centroid, b_centroid, rotation, scale, result);
@@ -559,10 +565,12 @@ NK_PUBLIC void nk_kabsch_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
 NK_PUBLIC void nk_kabsch_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
                              nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
-#if NK_TARGET_HASWELL
+#if NK_TARGET_SKYLAKE
+    nk_kabsch_f16_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
+#elif NK_TARGET_HASWELL
     nk_kabsch_f16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
-#elif NK_TARGET_NEONHALF
-    nk_kabsch_f16_neonhalf(a, b, n, a_centroid, b_centroid, rotation, scale, result);
+#elif NK_TARGET_NEON
+    nk_kabsch_f16_neon(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_RVV
     nk_kabsch_f16_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #else
@@ -572,7 +580,9 @@ NK_PUBLIC void nk_kabsch_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
 NK_PUBLIC void nk_kabsch_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
                               nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
-#if NK_TARGET_HASWELL
+#if NK_TARGET_SKYLAKE
+    nk_kabsch_bf16_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
+#elif NK_TARGET_HASWELL
     nk_kabsch_bf16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_NEONBFDOT
     nk_kabsch_bf16_neonbfdot(a, b, n, a_centroid, b_centroid, rotation, scale, result);
@@ -619,10 +629,12 @@ NK_PUBLIC void nk_umeyama_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
 NK_PUBLIC void nk_umeyama_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
                               nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
-#if NK_TARGET_HASWELL
+#if NK_TARGET_SKYLAKE
+    nk_umeyama_f16_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
+#elif NK_TARGET_HASWELL
     nk_umeyama_f16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
-#elif NK_TARGET_NEONHALF
-    nk_umeyama_f16_neonhalf(a, b, n, a_centroid, b_centroid, rotation, scale, result);
+#elif NK_TARGET_NEON
+    nk_umeyama_f16_neon(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_RVV
     nk_umeyama_f16_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #else
@@ -632,7 +644,9 @@ NK_PUBLIC void nk_umeyama_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
 NK_PUBLIC void nk_umeyama_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
                                nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
-#if NK_TARGET_HASWELL
+#if NK_TARGET_SKYLAKE
+    nk_umeyama_bf16_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
+#elif NK_TARGET_HASWELL
     nk_umeyama_bf16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
 #elif NK_TARGET_NEONBFDOT
     nk_umeyama_bf16_neonbfdot(a, b, n, a_centroid, b_centroid, rotation, scale, result);

package/include/numkong/mesh.hpp CHANGED Viewed

@@ -17,7 +17,7 @@
 namespace ashvardanian::numkong {
-#pragma region - SVD Helpers for Scalar Fallbacks
+#pragma region SVD Helpers for Scalar Fallbacks
 /** @brief 3x3 matrix determinant. */
 template <typename scalar_type_>
@@ -313,9 +313,9 @@ void svd3x3_(scalar_type_ const *a, scalar_type_ *svd_u, scalar_type_ *svd_s, sc
     svd_s[8] = s3_sq.sqrt();
 }
-#pragma endregion - SVD Helpers for Scalar Fallbacks
+#pragma endregion SVD Helpers for Scalar Fallbacks
-#pragma region - Mesh Alignment Kernels
+#pragma region Mesh Alignment Kernels
 /**
  *  @brief Root Mean Square Deviation between two 3D point clouds (no alignment)
@@ -755,7 +755,7 @@ void umeyama(in_type_ const *a, in_type_ const *b, std::size_t n, transform_type
     }
 }
-#pragma endregion - Mesh Alignment Kernels
+#pragma endregion Mesh Alignment Kernels
 } // namespace ashvardanian::numkong

package/include/numkong/numkong.h CHANGED Viewed

@@ -62,9 +62,9 @@ NK_PUBLIC nk_dtype_t nk_kernel_output_dtype(nk_kernel_kind_t kind, nk_dtype_t in
     case nk_kernel_vincenty_k: return nk_vincenty_output_dtype(input);
     case nk_kernel_kld_k:
     case nk_kernel_jsd_k: return nk_probability_output_dtype(input);
-    case nk_kernel_rmsd_k: return nk_rmsd_output_dtype(input);
-    case nk_kernel_kabsch_k: return nk_kabsch_output_dtype(input);
-    case nk_kernel_umeyama_k: return nk_umeyama_output_dtype(input);
+    case nk_kernel_rmsd_k:
+    case nk_kernel_kabsch_k:
+    case nk_kernel_umeyama_k: return nk_mesh_metric_dtype(input);
     case nk_kernel_sparse_dot_k: return nk_sparse_dot_output_dtype(input);
     case nk_kernel_maxsim_packed_k: return nk_maxsim_output_dtype(input);
     default: return nk_dtype_unknown_k;

package/include/numkong/numkong.hpp CHANGED Viewed

@@ -37,6 +37,7 @@
 #define NK_NUMKONG_HPP
 #include "numkong/random.hpp"
+#include "numkong/cast.hpp"
 #include "numkong/dot.hpp"
 #include "numkong/spatial.hpp"
 #include "numkong/spatials.hpp"

package/include/numkong/probability/README.md CHANGED Viewed

@@ -5,17 +5,21 @@ These are used in variational inference, topic modeling, and distribution compar
 Kullback-Leibler divergence from $P$ to $Q$:
-```math
+$$
 \text{KLD}(P \| Q) = \sum_{i=0}^{n-1} P(i) \log_2 \frac{P(i)}{Q(i)}
-```
+$$
 Jensen-Shannon distance is the square root of the symmetrized KLD through a mixture:
-$$\text{JSD}(P, Q) = \frac{1}{2} \text{KLD}(P \| M) + \frac{1}{2} \text{KLD}(Q \| M)$$
+$$
+\text{JSD}(P, Q) = \frac{1}{2} \text{KLD}(P \| M) + \frac{1}{2} \text{KLD}(Q \| M)
+$$
 where $M = \frac{P + Q}{2}$, yielding the distance:
-$$d_{JS}(P, Q) = \sqrt{\text{JSD}(P, Q)}$$
+$$
+d_{JS}(P, Q) = \sqrt{\text{JSD}(P, Q)}
+$$
 Unlike the raw divergence, $d_{JS}$ is a true metric satisfying the triangle inequality.
@@ -35,9 +39,9 @@ def jsd(p: np.ndarray, q: np.ndarray) -> float:
 ## Use Cases
-__Kullback-Leibler divergence__ is the workhorse of variational inference (ELBO objective), knowledge distillation between neural networks, information gain in decision trees, and measuring fit between a model and observed data.
+__Kullback-Leibler divergence__ is widely used in variational inference (ELBO objective), knowledge distillation between neural networks, information gain in decision trees, and measuring fit between a model and observed data.
-__Jensen-Shannon distance__ sees primary use in microbiome community comparison (enterotyping), where its metric property enables clustering with standard algorithms. It also appears in distribution drift detection, topic model evaluation, and as the theoretical foundation of the original GAN objective — though in practice GAN training uses proxy losses rather than computing JSD directly.
+__Jensen-Shannon distance__ is commonly used in microbiome community comparison (enterotyping), where its metric property enables clustering with standard algorithms. It also appears in distribution drift detection, topic model evaluation, and as the theoretical foundation of the original GAN objective — though in practice GAN training uses proxy losses rather than computing JSD directly.
 ## Input & Output Types
@@ -149,25 +153,25 @@ Measured with Wasmtime v42 (Cranelift backend).
 | `nk_kld_f16_serial`  |    0.118 gb/s, 1.04K ulp |    0.127 gb/s, 4.53K ulp |    0.111 gb/s, 18.3K ulp |
 | `nk_jsd_f16_serial`  |     0.0748 gb/s, 1.4 ulp |     0.0681 gb/s, 2.6 ulp |     0.0857 gb/s, 9.7 ulp |
-### Apple M4
+### Apple M5
 #### Native
 | Kernel                |                      256 |                     1024 |                     4096 |
 | :-------------------- | -----------------------: | -----------------------: | -----------------------: |
 | __f64__               | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
-| `nk_kld_f64_serial`   |      2.21 gb/s, 5.6K ulp |       2.22 gb/s, 25K ulp |       2.18 gb/s, 99K ulp |
-| `nk_jsd_f64_serial`   |       1.40 gb/s, 0.4 ulp |       1.45 gb/s, 0.4 ulp |       1.45 gb/s, 0.5 ulp |
+| `nk_kld_f64_serial`   |      3.22 gb/s, 5.6K ulp |       3.36 gb/s, 25K ulp |       3.32 gb/s, 99K ulp |
+| `nk_jsd_f64_serial`   |       2.06 gb/s, 0.4 ulp |       2.17 gb/s, 0.4 ulp |       2.17 gb/s, 0.5 ulp |
 | __f32__               | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
-| `nk_kld_f32_serial`   |      6.29 gb/s, 1.0K ulp |      6.35 gb/s, 4.5K ulp |       6.22 gb/s, 18K ulp |
-| `nk_jsd_f32_serial`   |       1.21 gb/s, 0.4 ulp |       1.20 gb/s, 0.4 ulp |       1.20 gb/s, 4.6 ulp |
-| `nk_kld_f32_neon`     |      14.5 gb/s, 1.0K ulp |      14.4 gb/s, 4.5K ulp |       12.8 gb/s, 18K ulp |
-| `nk_jsd_f32_neon`     |        6.81 gb/s, 15 ulp |        7.04 gb/s, 14 ulp |       6.78 gb/s, 9.9 ulp |
+| `nk_kld_f32_serial`   |      9.26 gb/s, 1.0K ulp |      8.73 gb/s, 4.5K ulp |       9.10 gb/s, 18K ulp |
+| `nk_jsd_f32_serial`   |       2.08 gb/s, 0.4 ulp |       2.16 gb/s, 0.4 ulp |       2.13 gb/s, 4.6 ulp |
+| `nk_kld_f32_neon`     |      19.0 gb/s, 1.0K ulp |      17.4 gb/s, 4.5K ulp |       18.1 gb/s, 18K ulp |
+| `nk_jsd_f32_neon`     |        9.75 gb/s, 15 ulp |        9.32 gb/s, 14 ulp |       9.62 gb/s, 9.9 ulp |
 | __bf16__              | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
-| `nk_kld_bf16_serial`  |      3.16 gb/s, 1.0K ulp |      2.96 gb/s, 4.5K ulp |       3.16 gb/s, 18K ulp |
-| `nk_jsd_bf16_serial`  |      0.611 gb/s, 1.4 ulp |      0.595 gb/s, 2.9 ulp |      0.613 gb/s, 9.7 ulp |
+| `nk_kld_bf16_serial`  |      4.58 gb/s, 1.0K ulp |      4.47 gb/s, 4.5K ulp |       4.65 gb/s, 18K ulp |
+| `nk_jsd_bf16_serial`  |       1.08 gb/s, 1.4 ulp |       1.07 gb/s, 2.9 ulp |       1.09 gb/s, 9.7 ulp |
 | __f16__               | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
-| `nk_kld_f16_serial`   |      3.15 gb/s, 1.0K ulp |      3.14 gb/s, 4.5K ulp |       2.81 gb/s, 18K ulp |
-| `nk_jsd_f16_serial`   |      0.610 gb/s, 1.4 ulp |      0.611 gb/s, 2.7 ulp |      0.602 gb/s, 8.7 ulp |
-| `nk_kld_f16_neonhalf` |      6.78 gb/s, 1.0K ulp |      6.72 gb/s, 4.5K ulp |       6.09 gb/s, 18K ulp |
-| `nk_jsd_f16_neonhalf` |        3.42 gb/s, 15 ulp |        3.40 gb/s, 14 ulp |       3.14 gb/s, 9.9 ulp |
+| `nk_kld_f16_serial`   |      4.63 gb/s, 1.0K ulp |      4.45 gb/s, 4.5K ulp |       4.55 gb/s, 18K ulp |
+| `nk_jsd_f16_serial`   |       1.03 gb/s, 1.4 ulp |      0.962 gb/s, 2.7 ulp |      0.976 gb/s, 8.7 ulp |
+| `nk_kld_f16_neon`     |      10.2 gb/s, 1.0K ulp |      9.67 gb/s, 4.5K ulp |       9.99 gb/s, 18K ulp |
+| `nk_jsd_f16_neon`     |        5.00 gb/s, 15 ulp |        4.79 gb/s, 14 ulp |       4.94 gb/s, 9.9 ulp |

package/include/numkong/probability/neon.h CHANGED Viewed

@@ -57,8 +57,8 @@ NK_PUBLIC float32x4_t nk_log2_f32x4_neon_(float32x4_t x) {
 NK_PUBLIC void nk_kld_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
     nk_f32_t epsilon = NK_F32_DIVISION_EPSILON;
     float32x4_t epsilon_f32x4 = vdupq_n_f32(epsilon);
-    float64x2_t sum_lower_f64x2 = vdupq_n_f64(0.0);
-    float64x2_t sum_upper_f64x2 = vdupq_n_f64(0.0);
+    float64x2_t sum_low_f64x2 = vdupq_n_f64(0.0);
+    float64x2_t sum_high_f64x2 = vdupq_n_f64(0.0);
     float32x4_t a_f32x4, b_f32x4;
 nk_kld_f32_neon_cycle:
@@ -79,20 +79,20 @@ nk_kld_f32_neon_cycle:
     float32x4_t ratio_f32x4 = vdivq_f32(vaddq_f32(a_f32x4, epsilon_f32x4), vaddq_f32(b_f32x4, epsilon_f32x4));
     float32x4_t log_ratio_f32x4 = nk_log2_f32x4_neon_(ratio_f32x4);
     float32x4_t contribution_f32x4 = vmulq_f32(a_f32x4, log_ratio_f32x4);
-    sum_lower_f64x2 = vaddq_f64(sum_lower_f64x2, vcvt_f64_f32(vget_low_f32(contribution_f32x4)));
-    sum_upper_f64x2 = vaddq_f64(sum_upper_f64x2, vcvt_f64_f32(vget_high_f32(contribution_f32x4)));
+    sum_low_f64x2 = vaddq_f64(sum_low_f64x2, vcvt_f64_f32(vget_low_f32(contribution_f32x4)));
+    sum_high_f64x2 = vaddq_f64(sum_high_f64x2, vcvt_high_f64_f32(contribution_f32x4));
     if (n != 0) goto nk_kld_f32_neon_cycle;
     nk_f64_t log2_normalizer = 0.6931471805599453;
-    nk_f64_t sum = vaddvq_f64(vaddq_f64(sum_lower_f64x2, sum_upper_f64x2)) * log2_normalizer;
+    nk_f64_t sum = vaddvq_f64(vaddq_f64(sum_low_f64x2, sum_high_f64x2)) * log2_normalizer;
     *result = sum;
 }
 NK_PUBLIC void nk_jsd_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
     nk_f32_t epsilon = NK_F32_DIVISION_EPSILON;
     float32x4_t epsilon_f32x4 = vdupq_n_f32(epsilon);
-    float64x2_t sum_lower_f64x2 = vdupq_n_f64(0.0);
-    float64x2_t sum_upper_f64x2 = vdupq_n_f64(0.0);
+    float64x2_t sum_low_f64x2 = vdupq_n_f64(0.0);
+    float64x2_t sum_high_f64x2 = vdupq_n_f64(0.0);
     float32x4_t a_f32x4, b_f32x4;
 nk_jsd_f32_neon_cycle:
@@ -118,12 +118,12 @@ nk_jsd_f32_neon_cycle:
     float32x4_t contribution_a_f32x4 = vmulq_f32(a_f32x4, log_ratio_a_f32x4);
     float32x4_t contribution_b_f32x4 = vmulq_f32(b_f32x4, log_ratio_b_f32x4);
     float32x4_t contribution_f32x4 = vaddq_f32(contribution_a_f32x4, contribution_b_f32x4);
-    sum_lower_f64x2 = vaddq_f64(sum_lower_f64x2, vcvt_f64_f32(vget_low_f32(contribution_f32x4)));
-    sum_upper_f64x2 = vaddq_f64(sum_upper_f64x2, vcvt_f64_f32(vget_high_f32(contribution_f32x4)));
+    sum_low_f64x2 = vaddq_f64(sum_low_f64x2, vcvt_f64_f32(vget_low_f32(contribution_f32x4)));
+    sum_high_f64x2 = vaddq_f64(sum_high_f64x2, vcvt_high_f64_f32(contribution_f32x4));
     if (n != 0) goto nk_jsd_f32_neon_cycle;
     nk_f64_t log2_normalizer = 0.6931471805599453;
-    nk_f64_t sum = vaddvq_f64(vaddq_f64(sum_lower_f64x2, sum_upper_f64x2)) * log2_normalizer / 2.0;
+    nk_f64_t sum = vaddvq_f64(vaddq_f64(sum_low_f64x2, sum_high_f64x2)) * log2_normalizer / 2.0;
     *result = sum > 0 ? nk_f64_sqrt_neon(sum) : 0;
 }
@@ -134,76 +134,106 @@ nk_jsd_f32_neon_cycle:
 #endif
 #endif // NK_TARGET_NEON
-#if NK_TARGET_NEONHALF
+#if NK_TARGET_NEON
 #if defined(__clang__)
-#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function)
+#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function)
 #elif defined(__GNUC__)
 #pragma GCC push_options
-#pragma GCC target("arch=armv8.2-a+simd+fp16")
+#pragma GCC target("arch=armv8.2-a+simd")
 #endif
-NK_PUBLIC void nk_kld_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
+NK_PUBLIC void nk_kld_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
     float32x4_t sum_f32x4 = vdupq_n_f32(0);
     nk_f32_t epsilon = NK_F32_DIVISION_EPSILON;
     float32x4_t epsilon_f32x4 = vdupq_n_f32(epsilon);
-    float32x4_t a_f32x4, b_f32x4;
+    float32x4_t a_low_f32x4, a_high_f32x4, b_low_f32x4, b_high_f32x4;
-nk_kld_f16_neonhalf_cycle:
-    if (n < 4) {
-        nk_b64_vec_t a_vec, b_vec;
-        nk_partial_load_b16x4_serial_(a, &a_vec, n);
-        nk_partial_load_b16x4_serial_(b, &b_vec, n);
-        a_f32x4 = vcvt_f32_f16(vreinterpret_f16_u16(a_vec.u16x4));
-        b_f32x4 = vcvt_f32_f16(vreinterpret_f16_u16(b_vec.u16x4));
+nk_kld_f16_neon_cycle:
+    if (n < 8) {
+        nk_b128_vec_t a_vec, b_vec;
+        nk_partial_load_b16x8_serial_(a, &a_vec, n);
+        nk_partial_load_b16x8_serial_(b, &b_vec, n);
+        float16x8_t a_f16x8 = vreinterpretq_f16_u16(a_vec.u16x8);
+        float16x8_t b_f16x8 = vreinterpretq_f16_u16(b_vec.u16x8);
+        a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
+        a_high_f32x4 = vcvt_high_f32_f16(a_f16x8);
+        b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
+        b_high_f32x4 = vcvt_high_f32_f16(b_f16x8);
         n = 0;
     }
     else {
-        a_f32x4 = vcvt_f32_f16(vld1_f16((nk_f16_for_arm_simd_t const *)a));
-        b_f32x4 = vcvt_f32_f16(vld1_f16((nk_f16_for_arm_simd_t const *)b));
-        n -= 4, a += 4, b += 4;
+        float16x8_t a_f16x8 = vreinterpretq_f16_u16(vld1q_u16((nk_u16_t const *)a));
+        float16x8_t b_f16x8 = vreinterpretq_f16_u16(vld1q_u16((nk_u16_t const *)b));
+        a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
+        a_high_f32x4 = vcvt_high_f32_f16(a_f16x8);
+        b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
+        b_high_f32x4 = vcvt_high_f32_f16(b_f16x8);
+        n -= 8, a += 8, b += 8;
     }
-    float32x4_t ratio_f32x4 = vdivq_f32(vaddq_f32(a_f32x4, epsilon_f32x4), vaddq_f32(b_f32x4, epsilon_f32x4));
-    float32x4_t log_ratio_f32x4 = nk_log2_f32x4_neon_(ratio_f32x4);
-    float32x4_t contribution_f32x4 = vmulq_f32(a_f32x4, log_ratio_f32x4);
-    sum_f32x4 = vaddq_f32(sum_f32x4, contribution_f32x4);
-    if (n) goto nk_kld_f16_neonhalf_cycle;
+    float32x4_t ratio_low_f32x4 = vdivq_f32(vaddq_f32(a_low_f32x4, epsilon_f32x4),
+                                            vaddq_f32(b_low_f32x4, epsilon_f32x4));
+    float32x4_t ratio_high_f32x4 = vdivq_f32(vaddq_f32(a_high_f32x4, epsilon_f32x4),
+                                             vaddq_f32(b_high_f32x4, epsilon_f32x4));
+    float32x4_t log_ratio_low_f32x4 = nk_log2_f32x4_neon_(ratio_low_f32x4);
+    float32x4_t log_ratio_high_f32x4 = nk_log2_f32x4_neon_(ratio_high_f32x4);
+    sum_f32x4 = vfmaq_f32(sum_f32x4, a_low_f32x4, log_ratio_low_f32x4);
+    sum_f32x4 = vfmaq_f32(sum_f32x4, a_high_f32x4, log_ratio_high_f32x4);
+    if (n) goto nk_kld_f16_neon_cycle;
     nk_f32_t log2_normalizer = 0.693147181f;
     nk_f32_t sum = vaddvq_f32(sum_f32x4) * log2_normalizer;
     *result = sum;
 }
-NK_PUBLIC void nk_jsd_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
+NK_PUBLIC void nk_jsd_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
     float32x4_t sum_f32x4 = vdupq_n_f32(0);
     nk_f32_t epsilon = NK_F32_DIVISION_EPSILON;
     float32x4_t epsilon_f32x4 = vdupq_n_f32(epsilon);
-    float32x4_t a_f32x4, b_f32x4;
+    float32x4_t a_low_f32x4, a_high_f32x4, b_low_f32x4, b_high_f32x4;
-nk_jsd_f16_neonhalf_cycle:
-    if (n < 4) {
-        nk_b64_vec_t a_vec, b_vec;
-        nk_partial_load_b16x4_serial_(a, &a_vec, n);
-        nk_partial_load_b16x4_serial_(b, &b_vec, n);
-        a_f32x4 = vcvt_f32_f16(vreinterpret_f16_u16(a_vec.u16x4));
-        b_f32x4 = vcvt_f32_f16(vreinterpret_f16_u16(b_vec.u16x4));
+nk_jsd_f16_neon_cycle:
+    if (n < 8) {
+        nk_b128_vec_t a_vec, b_vec;
+        nk_partial_load_b16x8_serial_(a, &a_vec, n);
+        nk_partial_load_b16x8_serial_(b, &b_vec, n);
+        float16x8_t a_f16x8 = vreinterpretq_f16_u16(a_vec.u16x8);
+        float16x8_t b_f16x8 = vreinterpretq_f16_u16(b_vec.u16x8);
+        a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
+        a_high_f32x4 = vcvt_high_f32_f16(a_f16x8);
+        b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
+        b_high_f32x4 = vcvt_high_f32_f16(b_f16x8);
         n = 0;
     }
     else {
-        a_f32x4 = vcvt_f32_f16(vld1_f16((nk_f16_for_arm_simd_t const *)a));
-        b_f32x4 = vcvt_f32_f16(vld1_f16((nk_f16_for_arm_simd_t const *)b));
-        n -= 4, a += 4, b += 4;
+        float16x8_t a_f16x8 = vreinterpretq_f16_u16(vld1q_u16((nk_u16_t const *)a));
+        float16x8_t b_f16x8 = vreinterpretq_f16_u16(vld1q_u16((nk_u16_t const *)b));
+        a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8));
+        a_high_f32x4 = vcvt_high_f32_f16(a_f16x8);
+        b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
+        b_high_f32x4 = vcvt_high_f32_f16(b_f16x8);
+        n -= 8, a += 8, b += 8;
     }
-    float32x4_t mean_f32x4 = vmulq_n_f32(vaddq_f32(a_f32x4, b_f32x4), 0.5f);
-    float32x4_t ratio_a_f32x4 = vdivq_f32(vaddq_f32(a_f32x4, epsilon_f32x4), vaddq_f32(mean_f32x4, epsilon_f32x4));
-    float32x4_t ratio_b_f32x4 = vdivq_f32(vaddq_f32(b_f32x4, epsilon_f32x4), vaddq_f32(mean_f32x4, epsilon_f32x4));
-    float32x4_t log_ratio_a_f32x4 = nk_log2_f32x4_neon_(ratio_a_f32x4);
-    float32x4_t log_ratio_b_f32x4 = nk_log2_f32x4_neon_(ratio_b_f32x4);
-    float32x4_t contribution_a_f32x4 = vmulq_f32(a_f32x4, log_ratio_a_f32x4);
-    float32x4_t contribution_b_f32x4 = vmulq_f32(b_f32x4, log_ratio_b_f32x4);
-    sum_f32x4 = vaddq_f32(sum_f32x4, vaddq_f32(contribution_a_f32x4, contribution_b_f32x4));
-    if (n) goto nk_jsd_f16_neonhalf_cycle;
+    float32x4_t mean_low_f32x4 = vmulq_n_f32(vaddq_f32(a_low_f32x4, b_low_f32x4), 0.5f);
+    float32x4_t mean_high_f32x4 = vmulq_n_f32(vaddq_f32(a_high_f32x4, b_high_f32x4), 0.5f);
+    float32x4_t ratio_a_low_f32x4 = vdivq_f32(vaddq_f32(a_low_f32x4, epsilon_f32x4),
+                                              vaddq_f32(mean_low_f32x4, epsilon_f32x4));
+    float32x4_t ratio_a_high_f32x4 = vdivq_f32(vaddq_f32(a_high_f32x4, epsilon_f32x4),
+                                               vaddq_f32(mean_high_f32x4, epsilon_f32x4));
+    float32x4_t ratio_b_low_f32x4 = vdivq_f32(vaddq_f32(b_low_f32x4, epsilon_f32x4),
+                                              vaddq_f32(mean_low_f32x4, epsilon_f32x4));
+    float32x4_t ratio_b_high_f32x4 = vdivq_f32(vaddq_f32(b_high_f32x4, epsilon_f32x4),
+                                               vaddq_f32(mean_high_f32x4, epsilon_f32x4));
+    float32x4_t log_ratio_a_low_f32x4 = nk_log2_f32x4_neon_(ratio_a_low_f32x4);
+    float32x4_t log_ratio_a_high_f32x4 = nk_log2_f32x4_neon_(ratio_a_high_f32x4);
+    float32x4_t log_ratio_b_low_f32x4 = nk_log2_f32x4_neon_(ratio_b_low_f32x4);
+    float32x4_t log_ratio_b_high_f32x4 = nk_log2_f32x4_neon_(ratio_b_high_f32x4);
+    sum_f32x4 = vfmaq_f32(sum_f32x4, a_low_f32x4, log_ratio_a_low_f32x4);
+    sum_f32x4 = vfmaq_f32(sum_f32x4, a_high_f32x4, log_ratio_a_high_f32x4);
+    sum_f32x4 = vfmaq_f32(sum_f32x4, b_low_f32x4, log_ratio_b_low_f32x4);
+    sum_f32x4 = vfmaq_f32(sum_f32x4, b_high_f32x4, log_ratio_b_high_f32x4);
+    if (n) goto nk_jsd_f16_neon_cycle;
     nk_f32_t log2_normalizer = 0.693147181f;
     nk_f32_t sum = vaddvq_f32(sum_f32x4) * log2_normalizer / 2;
@@ -215,7 +245,7 @@ nk_jsd_f16_neonhalf_cycle:
 #elif defined(__GNUC__)
 #pragma GCC pop_options
 #endif
-#endif // NK_TARGET_NEONHALF
+#endif // NK_TARGET_NEON
 #if defined(__cplusplus)
 } // extern "C"