npm - numkong - Versions diffs - 7.0.0 → 7.4.1 - Mend

numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (315) hide show

package/README.md +239 -122
package/binding.gyp +25 -491
package/c/dispatch_bf16.c +59 -1
package/c/dispatch_e2m3.c +41 -8
package/c/dispatch_e3m2.c +49 -8
package/c/dispatch_e4m3.c +51 -9
package/c/dispatch_e5m2.c +45 -1
package/c/dispatch_f16.c +79 -26
package/c/dispatch_f16c.c +5 -5
package/c/dispatch_f32.c +56 -0
package/c/dispatch_f64.c +52 -0
package/c/dispatch_i4.c +3 -0
package/c/dispatch_i8.c +62 -3
package/c/dispatch_other.c +18 -0
package/c/dispatch_u1.c +54 -9
package/c/dispatch_u4.c +3 -0
package/c/dispatch_u8.c +64 -3
package/c/numkong.c +3 -0
package/include/README.md +79 -9
package/include/numkong/attention/sapphireamx.h +278 -276
package/include/numkong/attention/sme.h +983 -977
package/include/numkong/attention.h +1 -1
package/include/numkong/capabilities.h +289 -94
package/include/numkong/cast/README.md +40 -40
package/include/numkong/cast/diamond.h +64 -0
package/include/numkong/cast/haswell.h +42 -194
package/include/numkong/cast/icelake.h +42 -37
package/include/numkong/cast/loongsonasx.h +252 -0
package/include/numkong/cast/neon.h +216 -249
package/include/numkong/cast/powervsx.h +449 -0
package/include/numkong/cast/rvv.h +223 -274
package/include/numkong/cast/sapphire.h +18 -18
package/include/numkong/cast/serial.h +1018 -944
package/include/numkong/cast/skylake.h +82 -23
package/include/numkong/cast/v128relaxed.h +462 -105
package/include/numkong/cast.h +24 -0
package/include/numkong/cast.hpp +44 -0
package/include/numkong/curved/README.md +17 -17
package/include/numkong/curved/neon.h +131 -7
package/include/numkong/curved/neonbfdot.h +6 -7
package/include/numkong/curved/rvv.h +26 -26
package/include/numkong/curved/smef64.h +186 -182
package/include/numkong/curved.h +14 -18
package/include/numkong/dot/README.md +154 -137
package/include/numkong/dot/alder.h +43 -43
package/include/numkong/dot/diamond.h +158 -0
package/include/numkong/dot/genoa.h +4 -30
package/include/numkong/dot/haswell.h +215 -180
package/include/numkong/dot/icelake.h +190 -76
package/include/numkong/dot/loongsonasx.h +671 -0
package/include/numkong/dot/neon.h +124 -73
package/include/numkong/dot/neonbfdot.h +11 -12
package/include/numkong/dot/neonfhm.h +44 -46
package/include/numkong/dot/neonfp8.h +323 -0
package/include/numkong/dot/neonsdot.h +190 -76
package/include/numkong/dot/powervsx.h +752 -0
package/include/numkong/dot/rvv.h +92 -84
package/include/numkong/dot/rvvbf16.h +12 -12
package/include/numkong/dot/rvvhalf.h +12 -12
package/include/numkong/dot/sapphire.h +4 -4
package/include/numkong/dot/serial.h +66 -30
package/include/numkong/dot/sierra.h +31 -31
package/include/numkong/dot/skylake.h +142 -110
package/include/numkong/dot/sve.h +217 -177
package/include/numkong/dot/svebfdot.h +10 -10
package/include/numkong/dot/svehalf.h +85 -41
package/include/numkong/dot/svesdot.h +89 -0
package/include/numkong/dot/v128relaxed.h +124 -89
package/include/numkong/dot.h +114 -48
package/include/numkong/dots/README.md +203 -203
package/include/numkong/dots/alder.h +12 -9
package/include/numkong/dots/diamond.h +86 -0
package/include/numkong/dots/genoa.h +10 -4
package/include/numkong/dots/haswell.h +63 -48
package/include/numkong/dots/icelake.h +27 -18
package/include/numkong/dots/loongsonasx.h +176 -0
package/include/numkong/dots/neon.h +14 -11
package/include/numkong/dots/neonbfdot.h +4 -3
package/include/numkong/dots/neonfhm.h +11 -9
package/include/numkong/dots/neonfp8.h +99 -0
package/include/numkong/dots/neonsdot.h +48 -12
package/include/numkong/dots/powervsx.h +194 -0
package/include/numkong/dots/rvv.h +451 -344
package/include/numkong/dots/sapphireamx.h +1028 -984
package/include/numkong/dots/serial.h +213 -197
package/include/numkong/dots/sierra.h +10 -7
package/include/numkong/dots/skylake.h +47 -36
package/include/numkong/dots/sme.h +2001 -2364
package/include/numkong/dots/smebi32.h +175 -162
package/include/numkong/dots/smef64.h +328 -323
package/include/numkong/dots/v128relaxed.h +64 -41
package/include/numkong/dots.h +573 -293
package/include/numkong/dots.hpp +45 -43
package/include/numkong/each/README.md +133 -137
package/include/numkong/each/haswell.h +6 -6
package/include/numkong/each/icelake.h +7 -7
package/include/numkong/each/neon.h +76 -42
package/include/numkong/each/neonbfdot.h +11 -12
package/include/numkong/each/neonhalf.h +24 -116
package/include/numkong/each/rvv.h +28 -28
package/include/numkong/each/sapphire.h +27 -161
package/include/numkong/each/serial.h +6 -6
package/include/numkong/each/skylake.h +7 -7
package/include/numkong/each/v128relaxed.h +562 -0
package/include/numkong/each.h +148 -62
package/include/numkong/each.hpp +2 -2
package/include/numkong/geospatial/README.md +18 -18
package/include/numkong/geospatial/haswell.h +365 -325
package/include/numkong/geospatial/neon.h +350 -306
package/include/numkong/geospatial/rvv.h +4 -4
package/include/numkong/geospatial/skylake.h +376 -340
package/include/numkong/geospatial/v128relaxed.h +366 -327
package/include/numkong/geospatial.h +17 -17
package/include/numkong/matrix.hpp +4 -4
package/include/numkong/maxsim/README.md +14 -14
package/include/numkong/maxsim/alder.h +6 -6
package/include/numkong/maxsim/genoa.h +4 -4
package/include/numkong/maxsim/haswell.h +6 -6
package/include/numkong/maxsim/icelake.h +18 -18
package/include/numkong/maxsim/neonsdot.h +21 -21
package/include/numkong/maxsim/sapphireamx.h +14 -14
package/include/numkong/maxsim/serial.h +6 -6
package/include/numkong/maxsim/sme.h +221 -196
package/include/numkong/maxsim/v128relaxed.h +6 -6
package/include/numkong/mesh/README.md +62 -56
package/include/numkong/mesh/haswell.h +339 -464
package/include/numkong/mesh/neon.h +1100 -519
package/include/numkong/mesh/neonbfdot.h +36 -68
package/include/numkong/mesh/rvv.h +530 -435
package/include/numkong/mesh/serial.h +75 -91
package/include/numkong/mesh/skylake.h +1627 -302
package/include/numkong/mesh/v128relaxed.h +443 -330
package/include/numkong/mesh.h +63 -49
package/include/numkong/mesh.hpp +4 -4
package/include/numkong/numkong.h +3 -3
package/include/numkong/numkong.hpp +1 -0
package/include/numkong/probability/README.md +23 -19
package/include/numkong/probability/neon.h +82 -52
package/include/numkong/probability/rvv.h +28 -23
package/include/numkong/probability/serial.h +51 -39
package/include/numkong/probability.h +20 -23
package/include/numkong/random.h +1 -1
package/include/numkong/reduce/README.md +143 -138
package/include/numkong/reduce/alder.h +81 -77
package/include/numkong/reduce/haswell.h +222 -220
package/include/numkong/reduce/neon.h +629 -519
package/include/numkong/reduce/neonbfdot.h +7 -218
package/include/numkong/reduce/neonfhm.h +9 -381
package/include/numkong/reduce/neonsdot.h +9 -9
package/include/numkong/reduce/rvv.h +928 -802
package/include/numkong/reduce/serial.h +23 -27
package/include/numkong/reduce/sierra.h +20 -20
package/include/numkong/reduce/skylake.h +326 -324
package/include/numkong/reduce/v128relaxed.h +52 -52
package/include/numkong/reduce.h +4 -23
package/include/numkong/reduce.hpp +156 -11
package/include/numkong/scalar/README.md +6 -6
package/include/numkong/scalar/haswell.h +26 -17
package/include/numkong/scalar/loongsonasx.h +74 -0
package/include/numkong/scalar/neon.h +9 -9
package/include/numkong/scalar/powervsx.h +96 -0
package/include/numkong/scalar/rvv.h +2 -2
package/include/numkong/scalar/sapphire.h +21 -10
package/include/numkong/scalar/serial.h +21 -21
package/include/numkong/scalar.h +13 -0
package/include/numkong/set/README.md +28 -28
package/include/numkong/set/haswell.h +12 -12
package/include/numkong/set/icelake.h +14 -14
package/include/numkong/set/loongsonasx.h +181 -0
package/include/numkong/set/neon.h +17 -18
package/include/numkong/set/powervsx.h +326 -0
package/include/numkong/set/rvv.h +4 -4
package/include/numkong/set/serial.h +6 -6
package/include/numkong/set/sve.h +60 -59
package/include/numkong/set/v128relaxed.h +6 -6
package/include/numkong/set.h +21 -7
package/include/numkong/sets/README.md +26 -26
package/include/numkong/sets/loongsonasx.h +52 -0
package/include/numkong/sets/powervsx.h +65 -0
package/include/numkong/sets/smebi32.h +395 -364
package/include/numkong/sets.h +83 -40
package/include/numkong/sparse/README.md +4 -4
package/include/numkong/sparse/icelake.h +101 -101
package/include/numkong/sparse/serial.h +1 -1
package/include/numkong/sparse/sve2.h +137 -141
package/include/numkong/sparse/turin.h +12 -12
package/include/numkong/sparse.h +10 -10
package/include/numkong/spatial/README.md +230 -226
package/include/numkong/spatial/alder.h +113 -116
package/include/numkong/spatial/diamond.h +240 -0
package/include/numkong/spatial/genoa.h +0 -68
package/include/numkong/spatial/haswell.h +74 -55
package/include/numkong/spatial/icelake.h +539 -58
package/include/numkong/spatial/loongsonasx.h +483 -0
package/include/numkong/spatial/neon.h +125 -52
package/include/numkong/spatial/neonbfdot.h +8 -9
package/include/numkong/spatial/neonfp8.h +258 -0
package/include/numkong/spatial/neonsdot.h +180 -12
package/include/numkong/spatial/powervsx.h +738 -0
package/include/numkong/spatial/rvv.h +146 -139
package/include/numkong/spatial/rvvbf16.h +17 -12
package/include/numkong/spatial/rvvhalf.h +13 -10
package/include/numkong/spatial/serial.h +13 -12
package/include/numkong/spatial/sierra.h +232 -39
package/include/numkong/spatial/skylake.h +73 -74
package/include/numkong/spatial/sve.h +93 -72
package/include/numkong/spatial/svebfdot.h +29 -29
package/include/numkong/spatial/svehalf.h +52 -26
package/include/numkong/spatial/svesdot.h +142 -0
package/include/numkong/spatial/v128relaxed.h +293 -41
package/include/numkong/spatial.h +338 -82
package/include/numkong/spatials/README.md +194 -194
package/include/numkong/spatials/diamond.h +82 -0
package/include/numkong/spatials/haswell.h +2 -2
package/include/numkong/spatials/loongsonasx.h +153 -0
package/include/numkong/spatials/neonfp8.h +111 -0
package/include/numkong/spatials/neonsdot.h +34 -0
package/include/numkong/spatials/powervsx.h +153 -0
package/include/numkong/spatials/rvv.h +259 -243
package/include/numkong/spatials/sapphireamx.h +173 -173
package/include/numkong/spatials/serial.h +2 -2
package/include/numkong/spatials/skylake.h +2 -2
package/include/numkong/spatials/sme.h +590 -605
package/include/numkong/spatials/smef64.h +139 -130
package/include/numkong/spatials/v128relaxed.h +2 -2
package/include/numkong/spatials.h +820 -500
package/include/numkong/spatials.hpp +49 -48
package/include/numkong/tensor.hpp +406 -17
package/include/numkong/trigonometry/README.md +19 -19
package/include/numkong/trigonometry/haswell.h +402 -401
package/include/numkong/trigonometry/neon.h +386 -387
package/include/numkong/trigonometry/rvv.h +52 -51
package/include/numkong/trigonometry/serial.h +13 -13
package/include/numkong/trigonometry/skylake.h +373 -369
package/include/numkong/trigonometry/v128relaxed.h +375 -374
package/include/numkong/trigonometry.h +13 -13
package/include/numkong/trigonometry.hpp +2 -2
package/include/numkong/types.h +287 -49
package/include/numkong/types.hpp +436 -12
package/include/numkong/vector.hpp +82 -14
package/javascript/dist/cjs/numkong-wasm.js +6 -12
package/javascript/dist/cjs/numkong.d.ts +7 -1
package/javascript/dist/cjs/numkong.js +37 -11
package/javascript/dist/cjs/types.d.ts +9 -0
package/javascript/dist/cjs/types.js +96 -0
package/javascript/dist/esm/numkong-browser.d.ts +14 -0
package/javascript/dist/esm/numkong-browser.js +23 -0
package/javascript/dist/esm/numkong-wasm.js +6 -12
package/javascript/dist/esm/numkong.d.ts +7 -1
package/javascript/dist/esm/numkong.js +37 -11
package/javascript/dist/esm/types.d.ts +9 -0
package/javascript/dist/esm/types.js +96 -0
package/javascript/node-gyp-build.d.ts +4 -1
package/javascript/numkong-browser.ts +40 -0
package/javascript/numkong-wasm.ts +7 -13
package/javascript/numkong.c +5 -26
package/javascript/numkong.ts +36 -11
package/javascript/tsconfig-base.json +1 -0
package/javascript/tsconfig-cjs.json +6 -1
package/javascript/types.ts +110 -0
package/numkong.gypi +101 -0
package/package.json +34 -13
package/probes/arm_neon.c +8 -0
package/probes/arm_neon_bfdot.c +9 -0
package/probes/arm_neon_fhm.c +9 -0
package/probes/arm_neon_half.c +8 -0
package/probes/arm_neon_sdot.c +9 -0
package/probes/arm_neonfp8.c +9 -0
package/probes/arm_sme.c +16 -0
package/probes/arm_sme2.c +16 -0
package/probes/arm_sme2p1.c +16 -0
package/probes/arm_sme_bf16.c +16 -0
package/probes/arm_sme_bi32.c +16 -0
package/probes/arm_sme_f64.c +16 -0
package/probes/arm_sme_fa64.c +14 -0
package/probes/arm_sme_half.c +16 -0
package/probes/arm_sme_lut2.c +15 -0
package/probes/arm_sve.c +18 -0
package/probes/arm_sve2.c +20 -0
package/probes/arm_sve2p1.c +18 -0
package/probes/arm_sve_bfdot.c +20 -0
package/probes/arm_sve_half.c +18 -0
package/probes/arm_sve_sdot.c +21 -0
package/probes/loongarch_lasx.c +12 -0
package/probes/power_vsx.c +12 -0
package/probes/probe.js +127 -0
package/probes/riscv_rvv.c +14 -0
package/probes/riscv_rvv_bb.c +15 -0
package/probes/riscv_rvv_bf16.c +17 -0
package/probes/riscv_rvv_half.c +14 -0
package/probes/wasm_v128relaxed.c +11 -0
package/probes/x86_alder.c +17 -0
package/probes/x86_diamond.c +17 -0
package/probes/x86_genoa.c +17 -0
package/probes/x86_graniteamx.c +19 -0
package/probes/x86_haswell.c +11 -0
package/probes/x86_icelake.c +17 -0
package/probes/x86_sapphire.c +16 -0
package/probes/x86_sapphireamx.c +18 -0
package/probes/x86_sierra.c +17 -0
package/probes/x86_skylake.c +15 -0
package/probes/x86_turin.c +17 -0
package/wasm/numkong-emscripten.js +2 -0
package/wasm/numkong.d.ts +14 -0
package/wasm/numkong.js +1124 -0
package/wasm/numkong.wasm +0 -0
package/include/numkong/curved/neonhalf.h +0 -212
package/include/numkong/dot/neonhalf.h +0 -198
package/include/numkong/dots/neonhalf.h +0 -57
package/include/numkong/mesh/neonhalf.h +0 -616
package/include/numkong/reduce/neonhalf.h +0 -157
package/include/numkong/spatial/neonhalf.h +0 -118
package/include/numkong/spatial/sapphire.h +0 -343
package/include/numkong/spatials/neonhalf.h +0 -58
package/javascript/README.md +0 -246

package/include/numkong/cast.h CHANGED Viewed

@@ -150,6 +150,20 @@ NK_PUBLIC void nk_f32_to_f16_sapphire(nk_f32_t const *src, nk_f16_t *dest);
 NK_PUBLIC void nk_cast_rvv(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
 #endif // NK_TARGET_RVV
+#if NK_TARGET_POWERVSX
+/** @copydoc nk_cast */
+NK_PUBLIC void nk_cast_powervsx(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
+/** @copydoc nk_f16_to_f32 */
+NK_PUBLIC void nk_f16_to_f32_powervsx(nk_f16_t const *src, nk_f32_t *dest);
+/** @copydoc nk_f32_to_f16 */
+NK_PUBLIC void nk_f32_to_f16_powervsx(nk_f32_t const *src, nk_f16_t *dest);
+#endif // NK_TARGET_POWERVSX
+#if NK_TARGET_V128RELAXED
+/** @copydoc nk_cast */
+NK_PUBLIC void nk_cast_v128relaxed(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type);
+#endif // NK_TARGET_V128RELAXED
 #if defined(__cplusplus)
 } // extern "C"
 #endif
@@ -161,6 +175,8 @@ NK_PUBLIC void nk_cast_rvv(void const *from, nk_dtype_t from_type, nk_size_t n,
 #include "numkong/cast/icelake.h"
 #include "numkong/cast/sapphire.h"
 #include "numkong/cast/rvv.h"
+#include "numkong/cast/powervsx.h"
+#include "numkong/cast/loongsonasx.h"
 #if defined(__cplusplus)
 extern "C" {
@@ -177,10 +193,14 @@ NK_PUBLIC void nk_cast(void const *from, nk_dtype_t from_type, nk_size_t n, void
     nk_cast_skylake(from, from_type, n, to, to_type);
 #elif NK_TARGET_HASWELL
     nk_cast_haswell(from, from_type, n, to, to_type);
+#elif NK_TARGET_POWERVSX
+    nk_cast_powervsx(from, from_type, n, to, to_type);
 #elif NK_TARGET_RVV
     nk_cast_rvv(from, from_type, n, to, to_type);
 #elif NK_TARGET_NEON
     nk_cast_neon(from, from_type, n, to, to_type);
+#elif NK_TARGET_V128RELAXED
+    nk_cast_v128relaxed(from, from_type, n, to, to_type);
 #else
     nk_cast_serial(from, from_type, n, to, to_type);
 #endif
@@ -191,6 +211,8 @@ NK_PUBLIC void nk_f16_to_f32(nk_f16_t const *src, nk_f32_t *dest) {
     nk_f16_to_f32_sapphire(src, dest);
 #elif NK_TARGET_HASWELL
     nk_f16_to_f32_haswell(src, dest);
+#elif NK_TARGET_POWERVSX
+    nk_f16_to_f32_powervsx(src, dest);
 #elif NK_TARGET_NEON
     nk_f16_to_f32_neon(src, dest);
 #else
@@ -203,6 +225,8 @@ NK_PUBLIC void nk_f32_to_f16(nk_f32_t const *src, nk_f16_t *dest) {
     nk_f32_to_f16_sapphire(src, dest);
 #elif NK_TARGET_HASWELL
     nk_f32_to_f16_haswell(src, dest);
+#elif NK_TARGET_POWERVSX
+    nk_f32_to_f16_powervsx(src, dest);
 #elif NK_TARGET_NEON
     nk_f32_to_f16_neon(src, dest);
 #else

package/include/numkong/cast.hpp ADDED Viewed

@@ -0,0 +1,44 @@
+/**
+ *  @brief C++ wrappers for SIMD-accelerated type casting.
+ *  @file include/numkong/cast.hpp
+ *  @author Ash Vardanian
+ *  @date March 20, 2026
+ */
+#ifndef NK_CAST_HPP
+#define NK_CAST_HPP
+#include <cstddef> // `std::size_t`
+#include "numkong/cast.h"
+#include "numkong/types.hpp"
+#include "numkong/vector.hpp"
+namespace ashvardanian::numkong {
+/**
+ *  @brief Elementwise type-cast from one numeric type to another.
+ *  @param[in] from Input array of `n` elements.
+ *  @param[in] n Number of elements.
+ *  @param[out] to Output array of `n` elements.
+ *
+ *  @tparam from_type_ Source element type; its `dtype()` is passed as the source dtype tag.
+ *  @tparam to_type_ Destination element type; its `dtype()` is passed as the destination dtype tag.
+ *  @tparam allow_simd_ When equal to `prefer_simd_k`, forwards to `nk_cast`; any other value forwards to `nk_cast_serial`.
+ */
+template <numeric_dtype from_type_, numeric_dtype to_type_, allow_simd_t allow_simd_ = prefer_simd_k>
+void cast(from_type_ const *from, std::size_t n, to_type_ *to) noexcept {
+    if constexpr (allow_simd_ == prefer_simd_k) nk_cast(from, from_type_::dtype(), n, to, to_type_::dtype()); // branch resolved at compile time
+    else nk_cast_serial(from, from_type_::dtype(), n, to, to_type_::dtype()); // explicit opt-out: always the serial kernel
+}
+/** @brief Elementwise type-cast between vector views. Sizes are expected to match; if they differ, only the first `min(from.size(), to.size())` elements are converted. */
+template <numeric_dtype from_type_, numeric_dtype to_type_, allow_simd_t allow_simd_ = prefer_simd_k>
+void cast(vector_view<from_type_> from, vector_span<to_type_> to) noexcept {
+    std::size_t n = from.size() < to.size() ? from.size() : to.size(); // clamp to the shorter of the two views
+    cast<from_type_, to_type_, allow_simd_>(from.data(), n, to.data()); // delegate to the raw-pointer overload
+}
+} // namespace ashvardanian::numkong
+#endif // NK_CAST_HPP

package/include/numkong/curved/README.md CHANGED Viewed

@@ -6,21 +6,21 @@ These operations are central to Gaussian process inference, metric learning, and
 The bilinear form for real vectors is:
-```math
+$$
 \text{bilinear}(a, b, C) = a^T C b = \sum_{i=0}^{n-1} \sum_{j=0}^{n-1} a_i \cdot c_{ij} \cdot b_j
-```
+$$
 The Mahalanobis distance is:
-```math
+$$
 \text{mahalanobis}(a, b, C) = \sqrt{(a - b)^T C (a - b)}
-```
+$$
 For complex vectors, the bilinear form uses the conjugate transpose:
-```math
+$$
 \text{bilinear}(a, b, C) = a^H C b = \sum_{i=0}^{n-1} \sum_{j=0}^{n-1} \bar{a_i} \cdot c_{ij} \cdot b_j
-```
+$$
 Reformulating as Python pseudocode:
@@ -72,8 +72,8 @@ This nested structure gives $O(n)$ cache-friendly sequential access to the $n \t
 `nk_bilinear_f32_smef64`, `nk_bilinear_f64_smef64`, `nk_bilinear_f32c_smef64`, `nk_bilinear_f64c_smef64`, `nk_mahalanobis_f32_smef64`, `nk_mahalanobis_f64_smef64` use the Scalable Matrix Extension to compute the bilinear form as an outer-product accumulation.
 Each `FMOPA` instruction performs a rank-1 update $a_i \cdot b^T$ into the SME ZA tile array, and the matrix $C$ is streamed row-by-row and multiplied into the accumulator.
-This is fundamentally different from the row-major dot approach — it reformulates $a^T C b$ as a matrix-multiply problem where SME's 2D tile registers can exploit the matrix engine's throughput.
-For dimensions that align to the tile size, this approach achieves near-peak throughput; dimensions that do not align fall back to NEON for cleanup of the residual elements.
+This differs from the row-major dot approach — it reformulates $a^T C b$ as a matrix-multiply problem where SME's 2D tile registers use the matrix engine's throughput.
+For dimensions that align to the tile size, this approach has high throughput; dimensions that do not align fall back to NEON for cleanup of the residual elements.
 ### Complex Bilinear Decomposition
@@ -201,23 +201,23 @@ Measured with Wasmtime v42 (Cranelift backend).
 #### WASM
-Measured with Wasmtime v42 (Cranelift backend).
+Measured with Wasmtime v43 (Cranelift backend).
 | Kernel                     |                     256² |                    1024² |                    4096² |
 | :------------------------- | -----------------------: | -----------------------: | -----------------------: |
 | __f64c__                   | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
-| `nk_bilinear_f64c_serial`  |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| `nk_bilinear_f64c_serial`  |       0.445 gso/s, ? ulp |       0.445 gso/s, ? ulp |       0.445 gso/s, ? ulp |
 | __f32c__                   | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
-| `nk_bilinear_f32c_serial`  |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| `nk_bilinear_f32c_serial`  |        2.83 gso/s, ? ulp |        2.83 gso/s, ? ulp |        2.84 gso/s, ? ulp |
 | __bf16c__                  | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
-| `nk_bilinear_bf16c_serial` |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| `nk_bilinear_bf16c_serial` |        3.05 gso/s, ? ulp |        3.02 gso/s, ? ulp |        3.03 gso/s, ? ulp |
 | __f16c__                   | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
-| `nk_bilinear_f16c_serial`  |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| `nk_bilinear_f16c_serial`  |       0.984 gso/s, ? ulp |       0.992 gso/s, ? ulp |       0.995 gso/s, ? ulp |
 | __f64__                    | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
-| `nk_bilinear_f64_serial`   |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| `nk_bilinear_f64_serial`   |       0.998 gso/s, ? ulp |       0.999 gso/s, ? ulp |       0.999 gso/s, ? ulp |
 | __f32__                    | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
-| `nk_bilinear_f32_serial`   |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| `nk_bilinear_f32_serial`   |        5.00 gso/s, ? ulp |        3.73 gso/s, ? ulp |        3.49 gso/s, ? ulp |
 | __bf16__                   | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
-| `nk_bilinear_bf16_serial`  |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| `nk_bilinear_bf16_serial`  |        4.84 gso/s, ? ulp |        3.83 gso/s, ? ulp |        3.60 gso/s, ? ulp |
 | __f16__                    | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
-| `nk_bilinear_f16_serial`   |           ? gso/s, ? ulp |           ? gso/s, ? ulp |           ? gso/s, ? ulp |
+| `nk_bilinear_f16_serial`   |        1.90 gso/s, ? ulp |        1.75 gso/s, ? ulp |        1.93 gso/s, ? ulp |

package/include/numkong/curved/neon.h CHANGED Viewed

@@ -11,13 +11,12 @@
  *
  *  @section neon_curved_instructions Key NEON Instructions
  *
- *      Intrinsic         Instruction                   Latency     Throughput
- *                                                                  A76     M4+/V1+/Oryon
- *      vfmaq_f64         FMLA (V.2D, V.2D, V.2D)       4cy         2/cy    4/cy
- *      vcvt_f64_f32      FCVTL (V.2D, V.2S)            3cy         2/cy    2/cy
- *      vaddvq_f64        FADDP (V.2D to scalar)        3cy         1/cy    1/cy
- *      vld1_f32          LD1 ({Vt.2S}, [Xn])           4cy         2/cy    2/cy
- *      vld2_f32          LD2 ({Vt.2S, Vt2.2S}, [Xn])   4cy         1/cy    1/cy
+ *      Intrinsic     Instruction                  A76       M5
+ *      vfmaq_f64     FMLA (V.2D, V.2D, V.2D)      4cy @ 2p  3cy @ 4p
+ *      vcvt_f64_f32  FCVTL (V.2D, V.2S)           3cy @ 2p  3cy @ 4p
+ *      vaddvq_f64    FADDP (V.2D to scalar)       3cy @ 1p  3cy @ 2p
+ *      vld1_f32      LD1 ({Vt.2S}, [Xn])          4cy @ 2p  4cy @ 3p
+ *      vld2_f32      LD2 ({Vt.2S, Vt2.2S}, [Xn])  4cy @ 1p  4cy @ 1p
  *
  *  For f32 bilinear and Mahalanobis, we upcast to f64 for accumulation to preserve
  *  precision and avoid catastrophic cancellation in large-magnitude sums.
@@ -190,6 +189,131 @@ NK_PUBLIC void nk_bilinear_f32c_neon(nk_f32c_t const *a_pairs, nk_f32c_t const *
     results->imag = outer_sum_imag_f64;
 }
+NK_PUBLIC void nk_bilinear_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n, // bilinear form: sum over rows of a[row] * (sum over columns of c[row][column] * b[column])
+                                    nk_f32_t *result) { // f16 inputs, f32 accumulation; the scalar is stored to *result (0 when n == 0)
+    nk_f32_t outer_sum = 0; // running total across all rows
+    for (nk_size_t row = 0; row != n; ++row) {
+        nk_f16_t const *c_row = c + row * n; // c is indexed as an n x n row-major matrix
+        nk_f32_t a_row;
+        nk_f16_to_f32_serial(a + row, &a_row); // a[row] is converted once per row, outside the column loops
+        float32x4_t inner_sum_f32x4 = vdupq_n_f32(0); // four f32 partial sums for this row
+        nk_size_t column = 0;
+        for (; column + 8 <= n; column += 8) { // vector body: 8 columns per iteration
+            float16x8_t b_f16x8 = vreinterpretq_f16_u16(vld1q_u16((nk_u16_t const *)(b + column))); // load 8 lanes as raw u16, then reinterpret the bits as f16
+            float16x8_t c_f16x8 = vreinterpretq_f16_u16(vld1q_u16((nk_u16_t const *)(c_row + column)));
+            float32x4_t b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8)); // widen lanes 0..3 to f32
+            float32x4_t b_high_f32x4 = vcvt_high_f32_f16(b_f16x8); // widen lanes 4..7 to f32
+            float32x4_t c_low_f32x4 = vcvt_f32_f16(vget_low_f16(c_f16x8));
+            float32x4_t c_high_f32x4 = vcvt_high_f32_f16(c_f16x8);
+            inner_sum_f32x4 = vfmaq_f32(inner_sum_f32x4, c_low_f32x4, b_low_f32x4); // acc += c * b, low half
+            inner_sum_f32x4 = vfmaq_f32(inner_sum_f32x4, c_high_f32x4, b_high_f32x4); // acc += c * b, high half (same accumulator)
+        }
+        nk_f32_t inner_sum = vaddvq_f32(inner_sum_f32x4); // horizontal add of the four partial sums
+        for (; column < n; ++column) { // scalar tail for the remaining (fewer than 8) columns
+            nk_f32_t b_val, c_val;
+            nk_f16_to_f32_serial(b + column, &b_val);
+            nk_f16_to_f32_serial(c_row + column, &c_val);
+            inner_sum += c_val * b_val;
+        }
+        outer_sum += a_row * inner_sum; // scale the row's dot product by a[row]
+    }
+    *result = outer_sum;
+}
+NK_PUBLIC void nk_mahalanobis_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n, // quadratic form over d = a - b: sum over rows of d[row] * (sum over columns of c[row][column] * d[column])
+                                       nk_f32_t *result) { // f16 inputs, f32 accumulation; *result receives nk_f32_sqrt_neon of the non-negative-clamped sum
+    nk_f32_t outer_sum = 0; // running total across all rows
+    for (nk_size_t row = 0; row != n; ++row) {
+        nk_f16_t const *c_row = c + row * n; // c is indexed as an n x n row-major matrix
+        nk_f32_t a_row, b_row;
+        nk_f16_to_f32_serial(a + row, &a_row);
+        nk_f16_to_f32_serial(b + row, &b_row);
+        nk_f32_t diff_row = a_row - b_row; // d[row], computed once per row
+        float32x4_t inner_sum_f32x4 = vdupq_n_f32(0); // four f32 partial sums for this row
+        nk_size_t column = 0;
+        for (; column + 8 <= n; column += 8) { // vector body: 8 columns per iteration
+            float16x8_t a_f16x8 = vreinterpretq_f16_u16(vld1q_u16((nk_u16_t const *)(a + column))); // load 8 lanes as raw u16, then reinterpret the bits as f16
+            float16x8_t b_f16x8 = vreinterpretq_f16_u16(vld1q_u16((nk_u16_t const *)(b + column)));
+            float16x8_t c_f16x8 = vreinterpretq_f16_u16(vld1q_u16((nk_u16_t const *)(c_row + column)));
+            float32x4_t a_low_f32x4 = vcvt_f32_f16(vget_low_f16(a_f16x8)); // widen lanes 0..3 to f32
+            float32x4_t a_high_f32x4 = vcvt_high_f32_f16(a_f16x8); // widen lanes 4..7 to f32
+            float32x4_t b_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_f16x8));
+            float32x4_t b_high_f32x4 = vcvt_high_f32_f16(b_f16x8);
+            float32x4_t c_low_f32x4 = vcvt_f32_f16(vget_low_f16(c_f16x8));
+            float32x4_t c_high_f32x4 = vcvt_high_f32_f16(c_f16x8);
+            float32x4_t diff_low_f32x4 = vsubq_f32(a_low_f32x4, b_low_f32x4); // the difference is taken in f32, after widening
+            float32x4_t diff_high_f32x4 = vsubq_f32(a_high_f32x4, b_high_f32x4);
+            inner_sum_f32x4 = vfmaq_f32(inner_sum_f32x4, c_low_f32x4, diff_low_f32x4); // acc += c * d, low half
+            inner_sum_f32x4 = vfmaq_f32(inner_sum_f32x4, c_high_f32x4, diff_high_f32x4); // acc += c * d, high half (same accumulator)
+        }
+        nk_f32_t inner_sum = vaddvq_f32(inner_sum_f32x4); // horizontal add of the four partial sums
+        for (; column < n; ++column) { // scalar tail for the remaining (fewer than 8) columns
+            nk_f32_t a_val, b_val, c_val;
+            nk_f16_to_f32_serial(a + column, &a_val);
+            nk_f16_to_f32_serial(b + column, &b_val);
+            nk_f16_to_f32_serial(c_row + column, &c_val);
+            inner_sum += c_val * (a_val - b_val);
+        }
+        outer_sum += diff_row * inner_sum; // scale the row's dot product by d[row]
+    }
+    nk_f32_t quadratic = outer_sum;
+    *result = nk_f32_sqrt_neon(quadratic > 0 ? quadratic : 0); // anything not strictly positive (including NaN, since the comparison is false) is passed on as 0
+}
+NK_PUBLIC void nk_bilinear_f16c_neon(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_f16c_t const *c_pairs, // complex bilinear form: sum over rows of a[row] * (sum over columns of c[row][column] * b[column])
+                                     nk_size_t n, nk_f32c_t *results) { // f16 complex inputs, f32 accumulation; writes results->real and results->imag
+    nk_f32_t outer_sum_real = 0;
+    nk_f32_t outer_sum_imag = 0;
+    for (nk_size_t row = 0; row != n; ++row) {
+        nk_f16c_t const *c_row = c_pairs + row * n; // c_pairs is indexed as an n x n row-major matrix of complex pairs
+        nk_f32_t a_real, a_imag;
+        nk_f16_to_f32_serial(&(a_pairs + row)->real, &a_real); // a[row] is converted once per row
+        nk_f16_to_f32_serial(&(a_pairs + row)->imag, &a_imag);
+        float32x4_t inner_sum_real_f32x4 = vdupq_n_f32(0); // separate partial sums for the real and imaginary parts
+        float32x4_t inner_sum_imag_f32x4 = vdupq_n_f32(0);
+        nk_size_t column = 0;
+        for (; column + 8 <= n; column += 8) { // vector body: 8 complex columns (16 halfwords per operand) per iteration
+            int16x8x2_t b_i16x8x2 = vld2q_s16((short const *)(b_pairs + column)); // de-interleaving load: val[0] = even halfwords, val[1] = odd halfwords
+            int16x8x2_t c_i16x8x2 = vld2q_s16((short const *)(c_row + column)); // assumes nk_f16c_t is two packed 16-bit fields in (real, imag) order - TODO confirm
+            float16x8_t b_real_f16x8 = vreinterpretq_f16_s16(b_i16x8x2.val[0]); // bit reinterpretation only, no numeric conversion
+            float16x8_t b_imag_f16x8 = vreinterpretq_f16_s16(b_i16x8x2.val[1]);
+            float16x8_t c_real_f16x8 = vreinterpretq_f16_s16(c_i16x8x2.val[0]);
+            float16x8_t c_imag_f16x8 = vreinterpretq_f16_s16(c_i16x8x2.val[1]);
+            float32x4_t b_real_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_real_f16x8)); // widen lanes 0..3 to f32
+            float32x4_t b_real_high_f32x4 = vcvt_high_f32_f16(b_real_f16x8); // widen lanes 4..7 to f32
+            float32x4_t b_imag_low_f32x4 = vcvt_f32_f16(vget_low_f16(b_imag_f16x8));
+            float32x4_t b_imag_high_f32x4 = vcvt_high_f32_f16(b_imag_f16x8);
+            float32x4_t c_real_low_f32x4 = vcvt_f32_f16(vget_low_f16(c_real_f16x8));
+            float32x4_t c_real_high_f32x4 = vcvt_high_f32_f16(c_real_f16x8);
+            float32x4_t c_imag_low_f32x4 = vcvt_f32_f16(vget_low_f16(c_imag_f16x8));
+            float32x4_t c_imag_high_f32x4 = vcvt_high_f32_f16(c_imag_f16x8);
+            inner_sum_real_f32x4 = vfmaq_f32(inner_sum_real_f32x4, c_real_low_f32x4, b_real_low_f32x4); // real += c_re * b_re
+            inner_sum_real_f32x4 = vfmsq_f32(inner_sum_real_f32x4, c_imag_low_f32x4, b_imag_low_f32x4); // real -= c_im * b_im
+            inner_sum_real_f32x4 = vfmaq_f32(inner_sum_real_f32x4, c_real_high_f32x4, b_real_high_f32x4);
+            inner_sum_real_f32x4 = vfmsq_f32(inner_sum_real_f32x4, c_imag_high_f32x4, b_imag_high_f32x4);
+            inner_sum_imag_f32x4 = vfmaq_f32(inner_sum_imag_f32x4, c_real_low_f32x4, b_imag_low_f32x4); // imag += c_re * b_im
+            inner_sum_imag_f32x4 = vfmaq_f32(inner_sum_imag_f32x4, c_imag_low_f32x4, b_real_low_f32x4); // imag += c_im * b_re
+            inner_sum_imag_f32x4 = vfmaq_f32(inner_sum_imag_f32x4, c_real_high_f32x4, b_imag_high_f32x4);
+            inner_sum_imag_f32x4 = vfmaq_f32(inner_sum_imag_f32x4, c_imag_high_f32x4, b_real_high_f32x4);
+        }
+        nk_f32_t inner_sum_real = vaddvq_f32(inner_sum_real_f32x4); // horizontal adds of the per-lane partial sums
+        nk_f32_t inner_sum_imag = vaddvq_f32(inner_sum_imag_f32x4);
+        for (; column < n; ++column) { // scalar tail for the remaining (fewer than 8) columns
+            nk_f32_t b_real, b_imag, c_real, c_imag;
+            nk_f16_to_f32_serial(&(b_pairs + column)->real, &b_real);
+            nk_f16_to_f32_serial(&(b_pairs + column)->imag, &b_imag);
+            nk_f16_to_f32_serial(&(c_row + column)->real, &c_real);
+            nk_f16_to_f32_serial(&(c_row + column)->imag, &c_imag);
+            inner_sum_real += c_real * b_real - c_imag * b_imag; // same complex product c * b as the vector body
+            inner_sum_imag += c_real * b_imag + c_imag * b_real;
+        }
+        outer_sum_real += a_real * inner_sum_real - a_imag * inner_sum_imag; // plain complex multiply a * inner; a is NOT conjugated here
+        outer_sum_imag += a_real * inner_sum_imag + a_imag * inner_sum_real; // NOTE(review): curved/README.md describes a^H C b (conjugated a) - confirm the intended convention
+    }
+    results->real = outer_sum_real;
+    results->imag = outer_sum_imag;
+}
 #if defined(__clang__)
 #pragma clang attribute pop
 #elif defined(__GNUC__)

package/include/numkong/curved/neonbfdot.h CHANGED Viewed

@@ -10,13 +10,12 @@
  *
  *  @section curved_neonbfdot_instructions ARM NEON BF16 Instructions (ARMv8.6-BF16)
  *
- *      Intrinsic                   Instruction                     Latency     Throughput
- *                                                                              A76         M4+/V1+/Oryon
- *      vbfdotq_f32                 BFDOT (V.4S, V.8H, V.8H)        3cy         2/cy        4/cy
- *      vcvt_f32_bf16               BFCVTN (V.4H, V.4S)             3cy         2/cy        4/cy
- *      vld1q_bf16                  LD1 (V.8H)                      4cy         2/cy        3/cy
- *      vaddvq_f32                  FADDP+FADDP (V.4S)              4cy         1/cy        2/cy
- *      vfmaq_f32                   FMLA (V.4S, V.4S, V.4S)         4cy         2/cy        4/cy
+ *      Intrinsic      Instruction               A76       M5
+ *      vbfdotq_f32    BFDOT (V.4S, V.8H, V.8H)  3cy @ 2p  2cy @ 1p
+ *      vcvt_f32_bf16  BFCVTN (V.4H, V.4S)       3cy @ 2p  3cy @ 4p
+ *      vld1q_bf16     LD1 (V.8H)                4cy @ 2p  4cy @ 3p
+ *      vaddvq_f32     FADDP+FADDP (V.4S)        5cy @ 1p  8cy @ 1p
+ *      vfmaq_f32      FMLA (V.4S, V.4S, V.4S)   4cy @ 2p  3cy @ 4p
  *
  *  For bilinear forms, BFDOT enables efficient inner-product computation by processing 8 bf16
  *  pairs into 4 f32 results per instruction. For Mahalanobis distance, bf16 inputs are converted

package/include/numkong/curved/rvv.h CHANGED Viewed

@@ -36,10 +36,10 @@ extern "C" {
 NK_PUBLIC void nk_bilinear_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t n,
                                    nk_f64_t *result) {
-    nk_size_t vlmax = __riscv_vsetvlmax_e32m2();
+    nk_size_t max_vector_length = __riscv_vsetvlmax_e32m2();
     nk_f64_t outer_sum = 0;
     for (nk_size_t i = 0; i < n; ++i) {
-        vfloat64m4_t inner_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, vlmax);
+        vfloat64m4_t inner_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, max_vector_length);
         nk_f32_t const *c_row = c + i * n;
         nk_size_t remaining = n;
         for (nk_size_t vector_length; remaining > 0; remaining -= vector_length, c_row += vector_length) {
@@ -50,7 +50,7 @@ NK_PUBLIC void nk_bilinear_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_f32_
         }
         vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
         nk_f64_t inner_val = __riscv_vfmv_f_s_f64m1_f64(
-            __riscv_vfredusum_vs_f64m4_f64m1(inner_f64m4, zero_f64m1, vlmax));
+            __riscv_vfredusum_vs_f64m4_f64m1(inner_f64m4, zero_f64m1, max_vector_length));
         outer_sum += (nk_f64_t)a[i] * inner_val;
     }
     *result = outer_sum;
@@ -58,12 +58,12 @@ NK_PUBLIC void nk_bilinear_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_f32_
 NK_PUBLIC void nk_bilinear_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, nk_size_t n,
                                    nk_f64_t *result) {
-    nk_size_t vlmax = __riscv_vsetvlmax_e64m4();
+    nk_size_t max_vector_length = __riscv_vsetvlmax_e64m4();
     vfloat64m1_t sum_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
     nk_f64_t outer_compensation = 0;
     for (nk_size_t i = 0; i < n; ++i) {
-        vfloat64m4_t inner_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, vlmax);
-        vfloat64m4_t compensation_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, vlmax);
+        vfloat64m4_t inner_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, max_vector_length);
+        vfloat64m4_t compensation_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, max_vector_length);
         nk_f64_t const *c_row = c + i * n;
         nk_size_t remaining = n;
         for (nk_size_t vector_length; remaining > 0; remaining -= vector_length, c_row += vector_length) {
@@ -82,7 +82,7 @@ NK_PUBLIC void nk_bilinear_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_f64_
         }
         vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
         nk_f64_t inner_val = __riscv_vfmv_f_s_f64m1_f64(
-            __riscv_vfredusum_vs_f64m4_f64m1(inner_f64m4, zero_f64m1, vlmax));
+            __riscv_vfredusum_vs_f64m4_f64m1(inner_f64m4, zero_f64m1, max_vector_length));
         nk_f64_t product_outer = a[i] * inner_val;
         nk_f64_t old_sum = __riscv_vfmv_f_s_f64m1_f64(sum_f64m1);
         nk_f64_t new_sum = old_sum + product_outer;
@@ -96,14 +96,14 @@ NK_PUBLIC void nk_bilinear_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_f64_
 NK_PUBLIC void nk_bilinear_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n,
                                    nk_f32_t *result) {
-    nk_size_t vlmax = __riscv_vsetvlmax_e32m2();
+    nk_size_t max_vector_length = __riscv_vsetvlmax_e32m2();
     vfloat32m1_t sum_f32m1 = __riscv_vfmv_v_f_f32m1(0.0f, 1);
     for (nk_size_t i = 0; i < n; ++i) {
         // Convert a[i] from f16 to f32
         nk_f32_t a_i;
         nk_f16_to_f32_serial(a + i, &a_i);
-        vfloat32m2_t inner_f32m2 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
+        vfloat32m2_t inner_f32m2 = __riscv_vfmv_v_f_f32m2(0.0f, max_vector_length);
         nk_f16_t const *c_row = c + i * n;
         nk_size_t remaining = n;
         for (nk_size_t vector_length; remaining > 0; remaining -= vector_length, c_row += vector_length) {
@@ -117,7 +117,7 @@ NK_PUBLIC void nk_bilinear_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_f16_
         }
         vfloat32m1_t zero_f32m1 = __riscv_vfmv_v_f_f32m1(0.0f, 1);
         nk_f32_t inner_val = __riscv_vfmv_f_s_f32m1_f32(
-            __riscv_vfredusum_vs_f32m2_f32m1(inner_f32m2, zero_f32m1, vlmax));
+            __riscv_vfredusum_vs_f32m2_f32m1(inner_f32m2, zero_f32m1, max_vector_length));
         sum_f32m1 = __riscv_vfmv_v_f_f32m1(__riscv_vfmv_f_s_f32m1_f32(sum_f32m1) + a_i * inner_val, 1);
     }
     *result = __riscv_vfmv_f_s_f32m1_f32(sum_f32m1);
@@ -125,14 +125,14 @@ NK_PUBLIC void nk_bilinear_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_f16_
 NK_PUBLIC void nk_bilinear_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_bf16_t const *c, nk_size_t n,
                                     nk_f32_t *result) {
-    nk_size_t vlmax = __riscv_vsetvlmax_e32m2();
+    nk_size_t max_vector_length = __riscv_vsetvlmax_e32m2();
     vfloat32m1_t sum_f32m1 = __riscv_vfmv_v_f_f32m1(0.0f, 1);
     for (nk_size_t i = 0; i < n; ++i) {
         // Convert a[i] from bf16 to f32
         nk_f32_t a_i;
         nk_bf16_to_f32_serial(a + i, &a_i);
-        vfloat32m2_t inner_f32m2 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
+        vfloat32m2_t inner_f32m2 = __riscv_vfmv_v_f_f32m2(0.0f, max_vector_length);
         nk_bf16_t const *c_row = c + i * n;
         nk_size_t remaining = n;
         for (nk_size_t vector_length; remaining > 0; remaining -= vector_length, c_row += vector_length) {
@@ -146,7 +146,7 @@ NK_PUBLIC void nk_bilinear_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_b
         }
         vfloat32m1_t zero_f32m1 = __riscv_vfmv_v_f_f32m1(0.0f, 1);
         nk_f32_t inner_val = __riscv_vfmv_f_s_f32m1_f32(
-            __riscv_vfredusum_vs_f32m2_f32m1(inner_f32m2, zero_f32m1, vlmax));
+            __riscv_vfredusum_vs_f32m2_f32m1(inner_f32m2, zero_f32m1, max_vector_length));
         sum_f32m1 = __riscv_vfmv_v_f_f32m1(__riscv_vfmv_f_s_f32m1_f32(sum_f32m1) + a_i * inner_val, 1);
     }
     *result = __riscv_vfmv_f_s_f32m1_f32(sum_f32m1);
@@ -154,11 +154,11 @@ NK_PUBLIC void nk_bilinear_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_b
 NK_PUBLIC void nk_mahalanobis_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t n,
                                       nk_f64_t *result) {
-    nk_size_t vlmax = __riscv_vsetvlmax_e32m2();
+    nk_size_t max_vector_length = __riscv_vsetvlmax_e32m2();
     nk_f64_t outer_sum = 0;
     for (nk_size_t i = 0; i < n; ++i) {
         nk_f64_t diff_i = (nk_f64_t)a[i] - (nk_f64_t)b[i];
-        vfloat64m4_t inner_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, vlmax);
+        vfloat64m4_t inner_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, max_vector_length);
         nk_f32_t const *c_row = c + i * n;
         nk_size_t remaining = n;
         for (nk_size_t vector_length; remaining > 0; remaining -= vector_length, c_row += vector_length) {
@@ -173,7 +173,7 @@ NK_PUBLIC void nk_mahalanobis_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_f
         }
         vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
         nk_f64_t inner_val = __riscv_vfmv_f_s_f64m1_f64(
-            __riscv_vfredusum_vs_f64m4_f64m1(inner_f64m4, zero_f64m1, vlmax));
+            __riscv_vfredusum_vs_f64m4_f64m1(inner_f64m4, zero_f64m1, max_vector_length));
         outer_sum += diff_i * inner_val;
     }
     *result = nk_f64_sqrt_rvv(outer_sum > 0 ? outer_sum : 0);
@@ -181,13 +181,13 @@ NK_PUBLIC void nk_mahalanobis_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_f
 NK_PUBLIC void nk_mahalanobis_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, nk_size_t n,
                                       nk_f64_t *result) {
-    nk_size_t vlmax = __riscv_vsetvlmax_e64m4();
+    nk_size_t max_vector_length = __riscv_vsetvlmax_e64m4();
     vfloat64m1_t sum_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
     nk_f64_t outer_compensation = 0;
     for (nk_size_t i = 0; i < n; ++i) {
         nk_f64_t diff_i = a[i] - b[i];
-        vfloat64m4_t inner_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, vlmax);
-        vfloat64m4_t compensation_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, vlmax);
+        vfloat64m4_t inner_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, max_vector_length);
+        vfloat64m4_t compensation_f64m4 = __riscv_vfmv_v_f_f64m4(0.0, max_vector_length);
         nk_f64_t const *c_row = c + i * n;
         nk_size_t remaining = n;
         for (nk_size_t vector_length; remaining > 0; remaining -= vector_length, c_row += vector_length) {
@@ -209,7 +209,7 @@ NK_PUBLIC void nk_mahalanobis_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_f
         }
         vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
         nk_f64_t inner_val = __riscv_vfmv_f_s_f64m1_f64(
-            __riscv_vfredusum_vs_f64m4_f64m1(inner_f64m4, zero_f64m1, vlmax));
+            __riscv_vfredusum_vs_f64m4_f64m1(inner_f64m4, zero_f64m1, max_vector_length));
         nk_f64_t product_outer = diff_i * inner_val;
         nk_f64_t old_sum = __riscv_vfmv_f_s_f64m1_f64(sum_f64m1);
         nk_f64_t new_sum = old_sum + product_outer;
@@ -224,7 +224,7 @@ NK_PUBLIC void nk_mahalanobis_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_f
 NK_PUBLIC void nk_mahalanobis_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n,
                                       nk_f32_t *result) {
-    nk_size_t vlmax = __riscv_vsetvlmax_e32m2();
+    nk_size_t max_vector_length = __riscv_vsetvlmax_e32m2();
     vfloat32m1_t sum_f32m1 = __riscv_vfmv_v_f_f32m1(0.0f, 1);
     for (nk_size_t i = 0; i < n; ++i) {
         nk_f32_t a_i, b_i;
@@ -232,7 +232,7 @@ NK_PUBLIC void nk_mahalanobis_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_f
         nk_f16_to_f32_serial(b + i, &b_i);
         nk_f32_t diff_i = a_i - b_i;
-        vfloat32m2_t inner_f32m2 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
+        vfloat32m2_t inner_f32m2 = __riscv_vfmv_v_f_f32m2(0.0f, max_vector_length);
         nk_f16_t const *c_row = c + i * n;
         nk_size_t remaining = n;
         for (nk_size_t vector_length; remaining > 0; remaining -= vector_length, c_row += vector_length) {
@@ -249,7 +249,7 @@ NK_PUBLIC void nk_mahalanobis_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_f
         }
         vfloat32m1_t zero_f32m1 = __riscv_vfmv_v_f_f32m1(0.0f, 1);
         nk_f32_t inner_val = __riscv_vfmv_f_s_f32m1_f32(
-            __riscv_vfredusum_vs_f32m2_f32m1(inner_f32m2, zero_f32m1, vlmax));
+            __riscv_vfredusum_vs_f32m2_f32m1(inner_f32m2, zero_f32m1, max_vector_length));
         sum_f32m1 = __riscv_vfmv_v_f_f32m1(__riscv_vfmv_f_s_f32m1_f32(sum_f32m1) + diff_i * inner_val, 1);
     }
     nk_f32_t quadratic_f16 = __riscv_vfmv_f_s_f32m1_f32(sum_f32m1);
@@ -258,7 +258,7 @@ NK_PUBLIC void nk_mahalanobis_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_f
 NK_PUBLIC void nk_mahalanobis_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_bf16_t const *c, nk_size_t n,
                                        nk_f32_t *result) {
-    nk_size_t vlmax = __riscv_vsetvlmax_e32m2();
+    nk_size_t max_vector_length = __riscv_vsetvlmax_e32m2();
     vfloat32m1_t sum_f32m1 = __riscv_vfmv_v_f_f32m1(0.0f, 1);
     for (nk_size_t i = 0; i < n; ++i) {
         nk_f32_t a_i, b_i;
@@ -266,7 +266,7 @@ NK_PUBLIC void nk_mahalanobis_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, n
         nk_bf16_to_f32_serial(b + i, &b_i);
         nk_f32_t diff_i = a_i - b_i;
-        vfloat32m2_t inner_f32m2 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
+        vfloat32m2_t inner_f32m2 = __riscv_vfmv_v_f_f32m2(0.0f, max_vector_length);
         nk_bf16_t const *c_row = c + i * n;
         nk_size_t remaining = n;
         for (nk_size_t vector_length; remaining > 0; remaining -= vector_length, c_row += vector_length) {
@@ -283,7 +283,7 @@ NK_PUBLIC void nk_mahalanobis_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, n
         }
         vfloat32m1_t zero_f32m1 = __riscv_vfmv_v_f_f32m1(0.0f, 1);
         nk_f32_t inner_val = __riscv_vfmv_f_s_f32m1_f32(
-            __riscv_vfredusum_vs_f32m2_f32m1(inner_f32m2, zero_f32m1, vlmax));
+            __riscv_vfredusum_vs_f32m2_f32m1(inner_f32m2, zero_f32m1, max_vector_length));
         sum_f32m1 = __riscv_vfmv_v_f_f32m1(__riscv_vfmv_f_s_f32m1_f32(sum_f32m1) + diff_i * inner_val, 1);
     }
     nk_f32_t quadratic_bf16 = __riscv_vfmv_f_s_f32m1_f32(sum_f32m1);