numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,714 @@
1
+ /**
2
+ * @brief SIMD-accelerated Dot Products for RISC-V.
3
+ * @file include/numkong/dot/rvv.h
4
+ * @author Ash Vardanian
5
+ * @date January 5, 2026
6
+ *
7
+ * @sa include/numkong/dot.h
8
+ *
9
+ * SpacemiT K1 and similar chips implement base RVV 1.0 without half-precision extensions.
10
+ * RVV uses vector length agnostic programming where:
11
+ * - `vsetvl_e*m*(n)` sets VL = min(n, VLMAX) and returns actual VL
12
+ * - Loads/stores with VL automatically handle partial vectors (tail elements)
13
+ * - No explicit masking needed for simple reductions
14
+ *
15
+ * This file contains base RVV 1.0 operations (i8, u8, f32, f64).
16
+ * For f16 (Zvfh) see rvvhalf.h, for bf16 (Zvfbfwma) see rvvbf16.h.
17
+ *
18
+ * Widening operations:
19
+ * - i8 ⨯ i8 → i16 via vwmul, then i16 reduction → i32 via vwredsum
20
+ * - f32 ⨯ f32 → f64 via vfwmul (for precision, like Skylake)
21
+ */
22
+ #ifndef NK_DOT_RVV_H
23
+ #define NK_DOT_RVV_H
24
+
25
+ #if NK_TARGET_RISCV_
26
+ #if NK_TARGET_RVV
27
+
28
+ #include "numkong/types.h"
29
+ #include "numkong/cast/rvv.h" // `nk_e4m3m1_to_f32m4_rvv_`
30
+ #include "numkong/set/rvv.h" // `nk_popcount_u8m4_rvv_`
31
+
32
+ #if defined(__clang__)
33
+ #pragma clang attribute push(__attribute__((target("arch=+v"))), apply_to = function)
34
+ #elif defined(__GNUC__)
35
+ #pragma GCC push_options
36
+ #pragma GCC target("arch=+v")
37
+ #endif
38
+
39
+ #if defined(__cplusplus)
40
+ extern "C" {
41
+ #endif
42
+
43
/** @brief Compensated horizontal sum of RVV f64m1 lanes via TwoSum tree reduction.
 *
 *  Inputs are a vector of partial sums and a vector of accumulated rounding errors
 *  (compensations), one pair per lane. The result is the compensated scalar total.
 *
 *  Uses vslidedown to extract the upper half at each tree level (same pattern as
 *  nk_reduce_vsaddu_u64m1_rvv_ in reduce/rvv.h). Tail lanes beyond vlmax are zero
 *  from the initial vfmv_v_f, so they are harmless in the reduction.
 *
 *  NOTE: the order of the floating-point operations below implements Knuth's TwoSum
 *  exactly; do not re-associate or "simplify" these expressions — the error terms
 *  would algebraically cancel but are numerically meaningful.
 */
NK_INTERNAL nk_f64_t nk_dot_stable_sum_f64m1_rvv_(vfloat64m1_t sum_f64m1, vfloat64m1_t compensation_f64m1) {
    nk_size_t vlmax = __riscv_vsetvlmax_e64m1();
    // Stage 0: TwoSum merge of sum + compensation.
    // tentative_sum = sum + compensation; accumulated_error recovers the exact
    // per-lane rounding error of that addition (Knuth TwoSum, 6 flops per lane).
    vfloat64m1_t tentative_sum_f64m1 = __riscv_vfadd_vv_f64m1(sum_f64m1, compensation_f64m1, vlmax);
    vfloat64m1_t virtual_addend_f64m1 = __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, sum_f64m1, vlmax);
    vfloat64m1_t accumulated_error_f64m1 = __riscv_vfadd_vv_f64m1(
        __riscv_vfsub_vv_f64m1(sum_f64m1, __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, virtual_addend_f64m1, vlmax),
                               vlmax),
        __riscv_vfsub_vv_f64m1(compensation_f64m1, virtual_addend_f64m1, vlmax), vlmax);
    // Tree reduction: TwoSum halving at each level.
    // Each pass folds the upper `half` lanes into the lower ones, carrying both the
    // running sums and the running error terms, plus the fresh rounding error of the fold.
    for (nk_size_t half = vlmax / 2; half > 0; half >>= 1) {
        vfloat64m1_t upper_sum_f64m1 = __riscv_vslidedown_vx_f64m1(tentative_sum_f64m1, half, vlmax);
        vfloat64m1_t upper_error_f64m1 = __riscv_vslidedown_vx_f64m1(accumulated_error_f64m1, half, vlmax);
        vfloat64m1_t halved_tentative_sum_f64m1 = __riscv_vfadd_vv_f64m1(tentative_sum_f64m1, upper_sum_f64m1, vlmax);
        vfloat64m1_t halved_virtual_addend_f64m1 = __riscv_vfsub_vv_f64m1(halved_tentative_sum_f64m1,
                                                                          tentative_sum_f64m1, vlmax);
        // TwoSum error of (tentative_sum + upper_sum), lane-wise.
        vfloat64m1_t rounding_error_f64m1 = __riscv_vfadd_vv_f64m1(
            __riscv_vfsub_vv_f64m1(
                tentative_sum_f64m1,
                __riscv_vfsub_vv_f64m1(halved_tentative_sum_f64m1, halved_virtual_addend_f64m1, vlmax), vlmax),
            __riscv_vfsub_vv_f64m1(upper_sum_f64m1, halved_virtual_addend_f64m1, vlmax), vlmax);
        tentative_sum_f64m1 = halved_tentative_sum_f64m1;
        // Errors are summed plainly: they are already tiny relative to the sums.
        accumulated_error_f64m1 = __riscv_vfadd_vv_f64m1(
            __riscv_vfadd_vv_f64m1(accumulated_error_f64m1, upper_error_f64m1, vlmax), rounding_error_f64m1, vlmax);
    }
    // Lane 0 now holds the full tree result; add the compensation last.
    return __riscv_vfmv_f_s_f64m1_f64(tentative_sum_f64m1) + __riscv_vfmv_f_s_f64m1_f64(accumulated_error_f64m1);
}
76
+
77
+ NK_PUBLIC void nk_dot_i8_rvv(nk_i8_t const *a_scalars, nk_i8_t const *b_scalars, nk_size_t count_scalars,
78
+ nk_i32_t *result) {
79
+ nk_size_t vlmax = __riscv_vsetvlmax_e32m4();
80
+ vint32m4_t sum_i32m4 = __riscv_vmv_v_x_i32m4(0, vlmax);
81
+ for (nk_size_t vector_length; count_scalars > 0;
82
+ count_scalars -= vector_length, a_scalars += vector_length, b_scalars += vector_length) {
83
+ vector_length = __riscv_vsetvl_e8m1(count_scalars);
84
+ vint8m1_t a_i8m1 = __riscv_vle8_v_i8m1(a_scalars, vector_length);
85
+ vint8m1_t b_i8m1 = __riscv_vle8_v_i8m1(b_scalars, vector_length);
86
+ // Widening multiply: i8 ⨯ i8 → i16
87
+ vint16m2_t ab_i16m2 = __riscv_vwmul_vv_i16m2(a_i8m1, b_i8m1, vector_length);
88
+ // Per-lane widening accumulate: i32 += i16
89
+ sum_i32m4 = __riscv_vwadd_wv_i32m4_tu(sum_i32m4, sum_i32m4, ab_i16m2, vector_length);
90
+ }
91
+ // Single horizontal reduction at the end
92
+ vint32m1_t zero_i32m1 = __riscv_vmv_v_x_i32m1(0, vlmax);
93
+ *result = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m4_i32m1(sum_i32m4, zero_i32m1, vlmax));
94
+ }
95
+
96
+ NK_PUBLIC void nk_dot_u8_rvv(nk_u8_t const *a_scalars, nk_u8_t const *b_scalars, nk_size_t count_scalars,
97
+ nk_u32_t *result) {
98
+ nk_size_t vlmax = __riscv_vsetvlmax_e32m4();
99
+ vuint32m4_t sum_u32m4 = __riscv_vmv_v_x_u32m4(0, vlmax);
100
+ for (nk_size_t vector_length; count_scalars > 0;
101
+ count_scalars -= vector_length, a_scalars += vector_length, b_scalars += vector_length) {
102
+ vector_length = __riscv_vsetvl_e8m1(count_scalars);
103
+ vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1(a_scalars, vector_length);
104
+ vuint8m1_t b_u8m1 = __riscv_vle8_v_u8m1(b_scalars, vector_length);
105
+ // Widening multiply: u8 ⨯ u8 → u16
106
+ vuint16m2_t ab_u16m2 = __riscv_vwmulu_vv_u16m2(a_u8m1, b_u8m1, vector_length);
107
+ // Per-lane widening accumulate: u32 += u16
108
+ sum_u32m4 = __riscv_vwaddu_wv_u32m4_tu(sum_u32m4, sum_u32m4, ab_u16m2, vector_length);
109
+ }
110
+ // Single horizontal reduction at the end
111
+ vuint32m1_t zero_u32m1 = __riscv_vmv_v_x_u32m1(0, vlmax);
112
+ *result = __riscv_vmv_x_s_u32m1_u32(__riscv_vredsum_vs_u32m4_u32m1(sum_u32m4, zero_u32m1, vlmax));
113
+ }
114
+
115
+ NK_PUBLIC void nk_dot_f32_rvv(nk_f32_t const *a_scalars, nk_f32_t const *b_scalars, nk_size_t count_scalars,
116
+ nk_f64_t *result) {
117
+ nk_size_t vlmax = __riscv_vsetvlmax_e64m2();
118
+ vfloat64m2_t sum_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
119
+ for (nk_size_t vector_length; count_scalars > 0;
120
+ count_scalars -= vector_length, a_scalars += vector_length, b_scalars += vector_length) {
121
+ vector_length = __riscv_vsetvl_e32m1(count_scalars);
122
+ vfloat32m1_t a_f32m1 = __riscv_vle32_v_f32m1(a_scalars, vector_length);
123
+ vfloat32m1_t b_f32m1 = __riscv_vle32_v_f32m1(b_scalars, vector_length);
124
+ // Widening FMA: f64 += f32 ⨯ f32, per-lane accumulation
125
+ sum_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(sum_f64m2, a_f32m1, b_f32m1, vector_length);
126
+ }
127
+ // Single horizontal reduction at the end
128
+ vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
129
+ *result = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_f64m2, zero_f64m1, vlmax));
130
+ }
131
+
132
/** @brief Dot product of two f64 vectors with compensated (Dot2) accumulation.
 *
 *  Implements the Ogita-Rump-Oishi Dot2 algorithm lane-wise: each a·b product is
 *  split into (product, product_error) via TwoProd, then folded into the running
 *  sum via TwoSum, with all error terms collected in a separate compensation vector.
 *  The exact operation order below is numerically load-bearing — do not re-associate.
 */
NK_PUBLIC void nk_dot_f64_rvv(nk_f64_t const *a_scalars, nk_f64_t const *b_scalars, nk_size_t count_scalars,
                              nk_f64_t *result) {
    // Dot2 (Ogita-Rump-Oishi) compensated accumulation via TwoProd + TwoSum
    nk_size_t vlmax = __riscv_vsetvlmax_e64m1();
    vfloat64m1_t sum_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
    vfloat64m1_t compensation_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
    for (nk_size_t vector_length; count_scalars > 0;
         count_scalars -= vector_length, a_scalars += vector_length, b_scalars += vector_length) {
        vector_length = __riscv_vsetvl_e64m1(count_scalars);
        vfloat64m1_t a_f64m1 = __riscv_vle64_v_f64m1(a_scalars, vector_length);
        vfloat64m1_t b_f64m1 = __riscv_vle64_v_f64m1(b_scalars, vector_length);
        // TwoProd: product = a*b, product_error = fma(a,b,-product) — the FMA recovers
        // the exact rounding error of the multiply in a single instruction.
        vfloat64m1_t product_f64m1 = __riscv_vfmul_vv_f64m1(a_f64m1, b_f64m1, vector_length);
        vfloat64m1_t product_error_f64m1 = __riscv_vfmsac_vv_f64m1(product_f64m1, a_f64m1, b_f64m1, vector_length);
        // TwoSum: tentative_sum = sum + product; sum_error recovers the exact
        // rounding error of that addition (Knuth's 6-flop branch-free TwoSum).
        vfloat64m1_t tentative_sum_f64m1 = __riscv_vfadd_vv_f64m1(sum_f64m1, product_f64m1, vector_length);
        vfloat64m1_t virtual_addend_f64m1 = __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, sum_f64m1, vector_length);
        vfloat64m1_t sum_error_f64m1 = __riscv_vfadd_vv_f64m1(
            __riscv_vfsub_vv_f64m1(sum_f64m1,
                                   __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, virtual_addend_f64m1, vector_length),
                                   vector_length),
            __riscv_vfsub_vv_f64m1(product_f64m1, virtual_addend_f64m1, vector_length), vector_length);
        // Tail-undisturbed updates: preserve zero tails across partial iterations.
        // The zero-offset vslideup is a tail-undisturbed "move" — lanes past
        // vector_length keep their previous values instead of becoming agnostic.
        sum_f64m1 = __riscv_vslideup_vx_f64m1_tu(sum_f64m1, tentative_sum_f64m1, 0, vector_length);
        vfloat64m1_t total_error_f64m1 = __riscv_vfadd_vv_f64m1(sum_error_f64m1, product_error_f64m1, vector_length);
        compensation_f64m1 = __riscv_vfadd_vv_f64m1_tu(compensation_f64m1, compensation_f64m1, total_error_f64m1,
                                                       vector_length);
    }
    // Compensated horizontal reduction of (sum, compensation) lane pairs
    *result = nk_dot_stable_sum_f64m1_rvv_(sum_f64m1, compensation_f64m1);
}
163
+
164
+ NK_PUBLIC void nk_dot_f16_rvv(nk_f16_t const *a_scalars, nk_f16_t const *b_scalars, nk_size_t count_scalars,
165
+ nk_f32_t *result) {
166
+ nk_size_t vlmax = __riscv_vsetvlmax_e32m2();
167
+ vfloat32m2_t sum_f32m2 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
168
+ for (nk_size_t vector_length; count_scalars > 0;
169
+ count_scalars -= vector_length, a_scalars += vector_length, b_scalars += vector_length) {
170
+ vector_length = __riscv_vsetvl_e16m1(count_scalars);
171
+
172
+ // Load f16 as u16 bits and convert to f32 via helper
173
+ vuint16m1_t a_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)a_scalars, vector_length);
174
+ vuint16m1_t b_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)b_scalars, vector_length);
175
+ vfloat32m2_t a_f32m2 = nk_f16m1_to_f32m2_rvv_(a_u16m1, vector_length);
176
+ vfloat32m2_t b_f32m2 = nk_f16m1_to_f32m2_rvv_(b_u16m1, vector_length);
177
+
178
+ // Per-lane FMA accumulation
179
+ sum_f32m2 = __riscv_vfmacc_vv_f32m2_tu(sum_f32m2, a_f32m2, b_f32m2, vector_length);
180
+ }
181
+ // Single horizontal reduction at the end
182
+ vfloat32m1_t zero_f32m1 = __riscv_vfmv_v_f_f32m1(0.0f, vlmax);
183
+ *result = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(sum_f32m2, zero_f32m1, vlmax));
184
+ }
185
+
186
+ NK_PUBLIC void nk_dot_bf16_rvv(nk_bf16_t const *a_scalars, nk_bf16_t const *b_scalars, nk_size_t count_scalars,
187
+ nk_f32_t *result) {
188
+ nk_size_t vlmax = __riscv_vsetvlmax_e32m2();
189
+ vfloat32m2_t sum_f32m2 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
190
+ for (nk_size_t vector_length; count_scalars > 0;
191
+ count_scalars -= vector_length, a_scalars += vector_length, b_scalars += vector_length) {
192
+ vector_length = __riscv_vsetvl_e16m1(count_scalars);
193
+
194
+ // Load bf16 as u16 and convert to f32 via helper
195
+ vuint16m1_t a_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)a_scalars, vector_length);
196
+ vuint16m1_t b_u16m1 = __riscv_vle16_v_u16m1((nk_u16_t const *)b_scalars, vector_length);
197
+ vfloat32m2_t a_f32m2 = nk_bf16m1_to_f32m2_rvv_(a_u16m1, vector_length);
198
+ vfloat32m2_t b_f32m2 = nk_bf16m1_to_f32m2_rvv_(b_u16m1, vector_length);
199
+
200
+ // Per-lane FMA accumulation
201
+ sum_f32m2 = __riscv_vfmacc_vv_f32m2_tu(sum_f32m2, a_f32m2, b_f32m2, vector_length);
202
+ }
203
+ // Single horizontal reduction at the end
204
+ vfloat32m1_t zero_f32m1 = __riscv_vfmv_v_f_f32m1(0.0f, vlmax);
205
+ *result = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(sum_f32m2, zero_f32m1, vlmax));
206
+ }
207
+
208
+ NK_PUBLIC void nk_dot_e4m3_rvv(nk_e4m3_t const *a_scalars, nk_e4m3_t const *b_scalars, nk_size_t count_scalars,
209
+ nk_f32_t *result) {
210
+ nk_size_t vlmax = __riscv_vsetvlmax_e32m4();
211
+ vfloat32m4_t sum_f32m4 = __riscv_vfmv_v_f_f32m4(0.0f, vlmax);
212
+ for (nk_size_t vector_length; count_scalars > 0;
213
+ count_scalars -= vector_length, a_scalars += vector_length, b_scalars += vector_length) {
214
+ vector_length = __riscv_vsetvl_e8m1(count_scalars);
215
+
216
+ // Load e4m3 as u8 and convert to f32 via helper
217
+ vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a_scalars, vector_length);
218
+ vuint8m1_t b_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)b_scalars, vector_length);
219
+ vfloat32m4_t a_f32m4 = nk_e4m3m1_to_f32m4_rvv_(a_u8m1, vector_length);
220
+ vfloat32m4_t b_f32m4 = nk_e4m3m1_to_f32m4_rvv_(b_u8m1, vector_length);
221
+
222
+ // Per-lane FMA accumulation
223
+ sum_f32m4 = __riscv_vfmacc_vv_f32m4_tu(sum_f32m4, a_f32m4, b_f32m4, vector_length);
224
+ }
225
+ // Single horizontal reduction at the end
226
+ vfloat32m1_t zero_f32m1 = __riscv_vfmv_v_f_f32m1(0.0f, vlmax);
227
+ *result = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m4_f32m1(sum_f32m4, zero_f32m1, vlmax));
228
+ }
229
+
230
+ NK_PUBLIC void nk_dot_e5m2_rvv(nk_e5m2_t const *a_scalars, nk_e5m2_t const *b_scalars, nk_size_t count_scalars,
231
+ nk_f32_t *result) {
232
+ nk_size_t vlmax = __riscv_vsetvlmax_e32m4();
233
+ vfloat32m4_t sum_f32m4 = __riscv_vfmv_v_f_f32m4(0.0f, vlmax);
234
+ for (nk_size_t vector_length; count_scalars > 0;
235
+ count_scalars -= vector_length, a_scalars += vector_length, b_scalars += vector_length) {
236
+ vector_length = __riscv_vsetvl_e8m1(count_scalars);
237
+
238
+ // Load e5m2 as u8 and convert to f32 via helper
239
+ vuint8m1_t a_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a_scalars, vector_length);
240
+ vuint8m1_t b_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)b_scalars, vector_length);
241
+ vfloat32m4_t a_f32m4 = nk_e5m2m1_to_f32m4_rvv_(a_u8m1, vector_length);
242
+ vfloat32m4_t b_f32m4 = nk_e5m2m1_to_f32m4_rvv_(b_u8m1, vector_length);
243
+
244
+ // Per-lane FMA accumulation
245
+ sum_f32m4 = __riscv_vfmacc_vv_f32m4_tu(sum_f32m4, a_f32m4, b_f32m4, vector_length);
246
+ }
247
+ // Single horizontal reduction at the end
248
+ vfloat32m1_t zero_f32m1 = __riscv_vfmv_v_f_f32m1(0.0f, vlmax);
249
+ *result = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m4_f32m1(sum_f32m4, zero_f32m1, vlmax));
250
+ }
251
+
252
/** @brief Dot product of two e2m3 (6-bit minifloat) vectors, computed exactly in integers.
 *
 *  Every e2m3 value scaled by 16 is an exact integer in [-120, +120], so the whole
 *  dot product can be done in i8/i16/i32 integer arithmetic with zero rounding error;
 *  the only float operation is the final division by 256 (= 16 × 16), which is exact.
 */
NK_PUBLIC void nk_dot_e2m3_rvv(nk_e2m3_t const *a_scalars, nk_e2m3_t const *b_scalars, nk_size_t count_scalars,
                               nk_f32_t *result) {
    // Integer dot product for e2m3 using byte gather LUT + widening multiply.
    // Every e2m3 value × 16 is an exact integer in [-120, +120].
    // Result = i32_dot / 256.0f (exact, no rounding error).
    // LUT maps the 5-bit magnitude field (exponent + mantissa) to |value| × 16.
    static nk_u8_t const lut_magnitude[32] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
                                              32, 36, 40, 44, 48, 52, 56, 60, 64, 72, 80, 88, 96, 104, 112, 120};

    nk_size_t vlmax = __riscv_vsetvlmax_e32m4();
    vint32m4_t sum_i32m4 = __riscv_vmv_v_x_i32m4(0, vlmax);
    for (nk_size_t vector_length; count_scalars > 0;
         count_scalars -= vector_length, a_scalars += vector_length, b_scalars += vector_length) {
        vector_length = __riscv_vsetvl_e8m1(count_scalars);
        vuint8m1_t a_e2m3_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a_scalars, vector_length);
        vuint8m1_t b_e2m3_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)b_scalars, vector_length);

        // Magnitude extraction (low 5 bits) + byte gather LUT → unsigned |value| × 16
        vuint8m1_t a_magnitude_u8m1 = __riscv_vand_vx_u8m1(a_e2m3_u8m1, 0x1F, vector_length);
        vuint8m1_t b_magnitude_u8m1 = __riscv_vand_vx_u8m1(b_e2m3_u8m1, 0x1F, vector_length);
        vuint8m1_t a_unsigned_u8m1 = __riscv_vluxei8_v_u8m1(lut_magnitude, a_magnitude_u8m1, vector_length);
        vuint8m1_t b_unsigned_u8m1 = __riscv_vluxei8_v_u8m1(lut_magnitude, b_magnitude_u8m1, vector_length);

        // Combined sign (XOR of the two sign bits, bit 0x20) + conditional negate.
        // Only `b` is negated — the product sign is carried entirely by one operand,
        // so `a` can stay non-negative.
        vuint8m1_t sign_combined_u8m1 = __riscv_vand_vx_u8m1(
            __riscv_vxor_vv_u8m1(a_e2m3_u8m1, b_e2m3_u8m1, vector_length), 0x20, vector_length);
        vbool8_t negate_mask_b8 = __riscv_vmsne_vx_u8m1_b8(sign_combined_u8m1, 0, vector_length);
        vint8m1_t b_positive_i8m1 = __riscv_vreinterpret_v_u8m1_i8m1(b_unsigned_u8m1);
        vint8m1_t b_negated_i8m1 = __riscv_vneg_v_i8m1(b_positive_i8m1, vector_length);
        vint8m1_t b_signed_i8m1 = __riscv_vmerge_vvm_i8m1(b_positive_i8m1, b_negated_i8m1, negate_mask_b8,
                                                          vector_length);

        // Widening multiply: i8×i8 → i16 (|products| ≤ 120² = 14400 fits i16),
        // then tail-undisturbed accumulate: i32 += i16
        vint8m1_t a_signed_i8m1 = __riscv_vreinterpret_v_u8m1_i8m1(a_unsigned_u8m1);
        vint16m2_t products_i16m2 = __riscv_vwmul_vv_i16m2(a_signed_i8m1, b_signed_i8m1, vector_length);
        sum_i32m4 = __riscv_vwadd_wv_i32m4_tu(sum_i32m4, sum_i32m4, products_i16m2, vector_length);
    }
    vint32m1_t zero_i32m1 = __riscv_vmv_v_x_i32m1(0, vlmax);
    nk_i32_t sum = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m4_i32m1(sum_i32m4, zero_i32m1, vlmax));
    // Undo the ×16 scaling on each operand: divide by 256 (exact in f32)
    *result = (nk_f32_t)sum / 256.0f;
}
292
+
293
+ NK_PUBLIC void nk_dot_e3m2_rvv(nk_e3m2_t const *a_scalars, nk_e3m2_t const *b_scalars, nk_size_t count_scalars,
294
+ nk_f32_t *result) {
295
+ // Integer dot product for e3m2 using i16 gather LUT + widening multiply.
296
+ // Every e3m2 value × 16 is an exact integer, but magnitudes reach 448, requiring i16.
297
+ // Result = i32_dot / 256.0f (exact, no rounding error).
298
+ static nk_u16_t const lut_magnitude[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28,
299
+ 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384, 448};
300
+
301
+ nk_size_t vlmax = __riscv_vsetvlmax_e32m4();
302
+ vint32m4_t sum_i32m4 = __riscv_vmv_v_x_i32m4(0, vlmax);
303
+ for (nk_size_t vector_length; count_scalars > 0;
304
+ count_scalars -= vector_length, a_scalars += vector_length, b_scalars += vector_length) {
305
+ vector_length = __riscv_vsetvl_e8m1(count_scalars);
306
+ vuint8m1_t a_e3m2_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a_scalars, vector_length);
307
+ vuint8m1_t b_e3m2_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)b_scalars, vector_length);
308
+
309
+ // Magnitude extraction: lower 5 bits as u16 byte offsets for gather
310
+ vuint8m1_t a_mag_u8m1 = __riscv_vand_vx_u8m1(a_e3m2_u8m1, 0x1F, vector_length);
311
+ vuint8m1_t b_mag_u8m1 = __riscv_vand_vx_u8m1(b_e3m2_u8m1, 0x1F, vector_length);
312
+ vuint16m2_t a_idx_u16m2 = __riscv_vzext_vf2_u16m2(a_mag_u8m1, vector_length);
313
+ vuint16m2_t b_idx_u16m2 = __riscv_vzext_vf2_u16m2(b_mag_u8m1, vector_length);
314
+
315
+ // Gather from i16 LUT: byte offsets = index × 2
316
+ vuint16m2_t a_byte_offsets_u16m2 = __riscv_vsll_vx_u16m2(a_idx_u16m2, 1, vector_length);
317
+ vuint16m2_t b_byte_offsets_u16m2 = __riscv_vsll_vx_u16m2(b_idx_u16m2, 1, vector_length);
318
+ vuint16m2_t a_unsigned_u16m2 = __riscv_vluxei16_v_u16m2(lut_magnitude, a_byte_offsets_u16m2, vector_length);
319
+ vuint16m2_t b_unsigned_u16m2 = __riscv_vluxei16_v_u16m2(lut_magnitude, b_byte_offsets_u16m2, vector_length);
320
+
321
+ // Extract sign bits and apply conditional negate
322
+ vuint8m1_t a_sign_u8m1 = __riscv_vand_vx_u8m1(a_e3m2_u8m1, 0x20, vector_length);
323
+ vuint8m1_t b_sign_u8m1 = __riscv_vand_vx_u8m1(b_e3m2_u8m1, 0x20, vector_length);
324
+ vbool8_t a_negate_b8 = __riscv_vmsne_vx_u8m1_b8(a_sign_u8m1, 0, vector_length);
325
+ vbool8_t b_negate_b8 = __riscv_vmsne_vx_u8m1_b8(b_sign_u8m1, 0, vector_length);
326
+
327
+ vint16m2_t a_signed_i16m2 = __riscv_vreinterpret_v_u16m2_i16m2(a_unsigned_u16m2);
328
+ a_signed_i16m2 = __riscv_vneg_v_i16m2_mu(a_negate_b8, a_signed_i16m2, a_signed_i16m2, vector_length);
329
+
330
+ vint16m2_t b_signed_i16m2 = __riscv_vreinterpret_v_u16m2_i16m2(b_unsigned_u16m2);
331
+ b_signed_i16m2 = __riscv_vneg_v_i16m2_mu(b_negate_b8, b_signed_i16m2, b_signed_i16m2, vector_length);
332
+
333
+ // Widening multiply-accumulate: i16×i16 → i32
334
+ sum_i32m4 = __riscv_vwmacc_vv_i32m4_tu(sum_i32m4, a_signed_i16m2, b_signed_i16m2, vector_length);
335
+ }
336
+ vint32m1_t zero_i32m1 = __riscv_vmv_v_x_i32m1(0, vlmax);
337
+ nk_i32_t sum = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m4_i32m1(sum_i32m4, zero_i32m1, vlmax));
338
+ *result = (nk_f32_t)sum / 256.0f;
339
+ }
340
+
341
+ NK_PUBLIC void nk_dot_i4_rvv(nk_i4x2_t const *a_scalars, nk_i4x2_t const *b_scalars, nk_size_t count_dimensions,
342
+ nk_i32_t *result) {
343
+ // count_dimensions = number of 4-bit values, not bytes
344
+ count_dimensions = nk_size_round_up_to_multiple_(count_dimensions, 2);
345
+ nk_size_t n_full_bytes = count_dimensions / 2;
346
+
347
+ nk_size_t vlmax = __riscv_vsetvlmax_e32m4();
348
+ vint32m4_t sum_i32m4 = __riscv_vmv_v_x_i32m4(0, vlmax);
349
+ for (nk_size_t vector_length; n_full_bytes > 0;
350
+ n_full_bytes -= vector_length, a_scalars += vector_length, b_scalars += vector_length) {
351
+ vector_length = __riscv_vsetvl_e8m1(n_full_bytes);
352
+
353
+ vuint8m1_t a_packed_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a_scalars, vector_length);
354
+ vuint8m1_t b_packed_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)b_scalars, vector_length);
355
+
356
+ vuint8m1_t a_high_u8m1 = __riscv_vsrl_vx_u8m1(a_packed_u8m1, 4, vector_length);
357
+ vuint8m1_t b_high_u8m1 = __riscv_vsrl_vx_u8m1(b_packed_u8m1, 4, vector_length);
358
+ vuint8m1_t a_low_u8m1 = __riscv_vand_vx_u8m1(a_packed_u8m1, 0x0F, vector_length);
359
+ vuint8m1_t b_low_u8m1 = __riscv_vand_vx_u8m1(b_packed_u8m1, 0x0F, vector_length);
360
+
361
+ // Sign extend 4-bit to 8-bit: (x ^ 8) - 8
362
+ vint8m1_t a_high_i8m1 = __riscv_vsub_vx_i8m1(
363
+ __riscv_vxor_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(a_high_u8m1), 8, vector_length), 8, vector_length);
364
+ vint8m1_t b_high_i8m1 = __riscv_vsub_vx_i8m1(
365
+ __riscv_vxor_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(b_high_u8m1), 8, vector_length), 8, vector_length);
366
+ vint8m1_t a_low_i8m1 = __riscv_vsub_vx_i8m1(
367
+ __riscv_vxor_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(a_low_u8m1), 8, vector_length), 8, vector_length);
368
+ vint8m1_t b_low_i8m1 = __riscv_vsub_vx_i8m1(
369
+ __riscv_vxor_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(b_low_u8m1), 8, vector_length), 8, vector_length);
370
+
371
+ // Widening multiply: i8 ⨯ i8 → i16
372
+ vint16m2_t ab_high_i16m2 = __riscv_vwmul_vv_i16m2(a_high_i8m1, b_high_i8m1, vector_length);
373
+ vint16m2_t ab_low_i16m2 = __riscv_vwmul_vv_i16m2(a_low_i8m1, b_low_i8m1, vector_length);
374
+
375
+ // Per-lane widening accumulate: i32 += i16
376
+ sum_i32m4 = __riscv_vwadd_wv_i32m4_tu(sum_i32m4, sum_i32m4, ab_high_i16m2, vector_length);
377
+ sum_i32m4 = __riscv_vwadd_wv_i32m4_tu(sum_i32m4, sum_i32m4, ab_low_i16m2, vector_length);
378
+ }
379
+ // Single horizontal reduction at the end
380
+ vint32m1_t zero_i32m1 = __riscv_vmv_v_x_i32m1(0, vlmax);
381
+ *result = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m4_i32m1(sum_i32m4, zero_i32m1, vlmax));
382
+ }
383
+
384
+ NK_PUBLIC void nk_dot_u4_rvv(nk_u4x2_t const *a_scalars, nk_u4x2_t const *b_scalars, nk_size_t count_dimensions,
385
+ nk_u32_t *result) {
386
+ // count_dimensions = number of 4-bit values, not bytes
387
+ count_dimensions = nk_size_round_up_to_multiple_(count_dimensions, 2);
388
+ nk_size_t n_full_bytes = count_dimensions / 2;
389
+
390
+ nk_size_t vlmax = __riscv_vsetvlmax_e32m4();
391
+ vuint32m4_t sum_u32m4 = __riscv_vmv_v_x_u32m4(0, vlmax);
392
+ for (nk_size_t vector_length; n_full_bytes > 0;
393
+ n_full_bytes -= vector_length, a_scalars += vector_length, b_scalars += vector_length) {
394
+ vector_length = __riscv_vsetvl_e8m1(n_full_bytes);
395
+
396
+ vuint8m1_t a_packed_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)a_scalars, vector_length);
397
+ vuint8m1_t b_packed_u8m1 = __riscv_vle8_v_u8m1((nk_u8_t const *)b_scalars, vector_length);
398
+
399
+ vuint8m1_t a_high_u8m1 = __riscv_vsrl_vx_u8m1(a_packed_u8m1, 4, vector_length);
400
+ vuint8m1_t b_high_u8m1 = __riscv_vsrl_vx_u8m1(b_packed_u8m1, 4, vector_length);
401
+ vuint8m1_t a_low_u8m1 = __riscv_vand_vx_u8m1(a_packed_u8m1, 0x0F, vector_length);
402
+ vuint8m1_t b_low_u8m1 = __riscv_vand_vx_u8m1(b_packed_u8m1, 0x0F, vector_length);
403
+
404
+ // Widening multiply: u8 ⨯ u8 → u16
405
+ vuint16m2_t ab_high_u16m2 = __riscv_vwmulu_vv_u16m2(a_high_u8m1, b_high_u8m1, vector_length);
406
+ vuint16m2_t ab_low_u16m2 = __riscv_vwmulu_vv_u16m2(a_low_u8m1, b_low_u8m1, vector_length);
407
+
408
+ // Per-lane widening accumulate: u32 += u16
409
+ sum_u32m4 = __riscv_vwaddu_wv_u32m4_tu(sum_u32m4, sum_u32m4, ab_high_u16m2, vector_length);
410
+ sum_u32m4 = __riscv_vwaddu_wv_u32m4_tu(sum_u32m4, sum_u32m4, ab_low_u16m2, vector_length);
411
+ }
412
+ // Single horizontal reduction at the end
413
+ vuint32m1_t zero_u32m1 = __riscv_vmv_v_x_u32m1(0, vlmax);
414
+ *result = __riscv_vmv_x_s_u32m1_u32(__riscv_vredsum_vs_u32m4_u32m1(sum_u32m4, zero_u32m1, vlmax));
415
+ }
416
+
417
+ NK_PUBLIC void nk_dot_u1_rvv(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n_bits, nk_u32_t *result) {
418
+ nk_size_t count_bytes = nk_size_divide_round_up_(n_bits, NK_BITS_PER_BYTE);
419
+
420
+ vuint32m1_t sum_u32m1 = __riscv_vmv_v_x_u32m1(0, 1);
421
+
422
+ nk_size_t i = 0;
423
+ for (nk_size_t vector_length; i + 1 <= count_bytes; i += vector_length) {
424
+ vector_length = __riscv_vsetvl_e8m4(count_bytes - i);
425
+
426
+ // Load and AND to find shared bits (dot product of binary vectors)
427
+ vuint8m4_t a_u8m4 = __riscv_vle8_v_u8m4(a + i, vector_length);
428
+ vuint8m4_t b_u8m4 = __riscv_vle8_v_u8m4(b + i, vector_length);
429
+ vuint8m4_t and_u8m4 = __riscv_vand_vv_u8m4(a_u8m4, b_u8m4, vector_length);
430
+
431
+ // Popcount each byte using arithmetic SWAR
432
+ vuint8m4_t popcount_u8m4 = nk_popcount_u8m4_rvv_(and_u8m4, vector_length);
433
+
434
+ // Widen to u16 and accumulate via widening reduction sum
435
+ vuint16m8_t popcount_u16m8 = __riscv_vwaddu_vx_u16m8(popcount_u8m4, 0, vector_length);
436
+ sum_u32m1 = __riscv_vwredsumu_vs_u16m8_u32m1(popcount_u16m8, sum_u32m1, vector_length);
437
+ }
438
+
439
+ *result = __riscv_vmv_x_s_u32m1_u32(sum_u32m1);
440
+ }
441
+
442
+ NK_PUBLIC void nk_dot_f32c_rvv(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
443
+ nk_f64c_t *results) {
444
+ nk_f32_t const *a_f32 = (nk_f32_t const *)a_pairs;
445
+ nk_f32_t const *b_f32 = (nk_f32_t const *)b_pairs;
446
+ nk_size_t vlmax = __riscv_vsetvlmax_e64m2();
447
+ vfloat64m2_t sum_real_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
448
+ vfloat64m2_t sum_imag_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
449
+ for (nk_size_t vector_length; count_pairs > 0;
450
+ count_pairs -= vector_length, a_f32 += vector_length * 2, b_f32 += vector_length * 2) {
451
+ vector_length = __riscv_vsetvl_e32m1(count_pairs);
452
+ vfloat32m1x2_t a_f32m1x2 = __riscv_vlseg2e32_v_f32m1x2(a_f32, vector_length);
453
+ vfloat32m1x2_t b_f32m1x2 = __riscv_vlseg2e32_v_f32m1x2(b_f32, vector_length);
454
+ vfloat32m1_t a_real_f32m1 = __riscv_vget_v_f32m1x2_f32m1(a_f32m1x2, 0);
455
+ vfloat32m1_t a_imag_f32m1 = __riscv_vget_v_f32m1x2_f32m1(a_f32m1x2, 1);
456
+ vfloat32m1_t b_real_f32m1 = __riscv_vget_v_f32m1x2_f32m1(b_f32m1x2, 0);
457
+ vfloat32m1_t b_imag_f32m1 = __riscv_vget_v_f32m1x2_f32m1(b_f32m1x2, 1);
458
+ // real += a_real * b_real - a_imag * b_imag
459
+ sum_real_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(sum_real_f64m2, a_real_f32m1, b_real_f32m1, vector_length);
460
+ sum_real_f64m2 = __riscv_vfwnmsac_vv_f64m2_tu(sum_real_f64m2, a_imag_f32m1, b_imag_f32m1, vector_length);
461
+ // imag += a_real * b_imag + a_imag * b_real
462
+ sum_imag_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(sum_imag_f64m2, a_real_f32m1, b_imag_f32m1, vector_length);
463
+ sum_imag_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(sum_imag_f64m2, a_imag_f32m1, b_real_f32m1, vector_length);
464
+ }
465
+ vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
466
+ results->real = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_real_f64m2, zero_f64m1, vlmax));
467
+ results->imag = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_imag_f64m2, zero_f64m1, vlmax));
468
+ }
469
+
470
+ NK_PUBLIC void nk_vdot_f32c_rvv(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
471
+ nk_f64c_t *results) {
472
+ nk_f32_t const *a_f32 = (nk_f32_t const *)a_pairs;
473
+ nk_f32_t const *b_f32 = (nk_f32_t const *)b_pairs;
474
+ nk_size_t vlmax = __riscv_vsetvlmax_e64m2();
475
+ vfloat64m2_t sum_real_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
476
+ vfloat64m2_t sum_imag_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
477
+ for (nk_size_t vector_length; count_pairs > 0;
478
+ count_pairs -= vector_length, a_f32 += vector_length * 2, b_f32 += vector_length * 2) {
479
+ vector_length = __riscv_vsetvl_e32m1(count_pairs);
480
+ vfloat32m1x2_t a_f32m1x2 = __riscv_vlseg2e32_v_f32m1x2(a_f32, vector_length);
481
+ vfloat32m1x2_t b_f32m1x2 = __riscv_vlseg2e32_v_f32m1x2(b_f32, vector_length);
482
+ vfloat32m1_t a_real_f32m1 = __riscv_vget_v_f32m1x2_f32m1(a_f32m1x2, 0);
483
+ vfloat32m1_t a_imag_f32m1 = __riscv_vget_v_f32m1x2_f32m1(a_f32m1x2, 1);
484
+ vfloat32m1_t b_real_f32m1 = __riscv_vget_v_f32m1x2_f32m1(b_f32m1x2, 0);
485
+ vfloat32m1_t b_imag_f32m1 = __riscv_vget_v_f32m1x2_f32m1(b_f32m1x2, 1);
486
+ // Conjugate dot: real += a_real * b_real + a_imag * b_imag
487
+ sum_real_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(sum_real_f64m2, a_real_f32m1, b_real_f32m1, vector_length);
488
+ sum_real_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(sum_real_f64m2, a_imag_f32m1, b_imag_f32m1, vector_length);
489
+ // Conjugate dot: imag += a_real * b_imag - a_imag * b_real
490
+ sum_imag_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(sum_imag_f64m2, a_real_f32m1, b_imag_f32m1, vector_length);
491
+ sum_imag_f64m2 = __riscv_vfwnmsac_vv_f64m2_tu(sum_imag_f64m2, a_imag_f32m1, b_real_f32m1, vector_length);
492
+ }
493
+ vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
494
+ results->real = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_real_f64m2, zero_f64m1, vlmax));
495
+ results->imag = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_imag_f64m2, zero_f64m1, vlmax));
496
+ }
497
+
498
NK_PUBLIC void nk_dot_f64c_rvv(nk_f64c_t const *a_pairs, nk_f64c_t const *b_pairs, nk_size_t count_pairs,
                               nk_f64c_t *results) {
    // Dot2 (Ogita-Rump-Oishi) compensated complex dot product
    //
    // Each of the four real-valued partial sums (ar·br, ai·bi, ar·bi, ai·br) is
    // accumulated with error-free transformations: TwoProd recovers the exact
    // rounding error of every multiplication (via FMA), and Knuth's TwoSum recovers
    // the exact rounding error of every addition. The errors are collected in
    // per-lane compensation registers and folded back in at the final reduction,
    // giving roughly twice the working precision of a naive f64 dot product.
    nk_f64_t const *a_f64 = (nk_f64_t const *)a_pairs;
    nk_f64_t const *b_f64 = (nk_f64_t const *)b_pairs;
    nk_size_t vlmax = __riscv_vsetvlmax_e64m1();
    vfloat64m1_t sum_real_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
    vfloat64m1_t comp_real_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax); // rounding-error carry, real part
    vfloat64m1_t sum_imag_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
    vfloat64m1_t comp_imag_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax); // rounding-error carry, imaginary part
    for (nk_size_t vector_length; count_pairs > 0;
         count_pairs -= vector_length, a_f64 += vector_length * 2, b_f64 += vector_length * 2) {
        vector_length = __riscv_vsetvl_e64m1(count_pairs);
        // Segmented loads deinterleave the (real, imag) pairs into two registers each.
        vfloat64m1x2_t a_f64m1x2 = __riscv_vlseg2e64_v_f64m1x2(a_f64, vector_length);
        vfloat64m1x2_t b_f64m1x2 = __riscv_vlseg2e64_v_f64m1x2(b_f64, vector_length);
        vfloat64m1_t a_real_f64m1 = __riscv_vget_v_f64m1x2_f64m1(a_f64m1x2, 0);
        vfloat64m1_t a_imag_f64m1 = __riscv_vget_v_f64m1x2_f64m1(a_f64m1x2, 1);
        vfloat64m1_t b_real_f64m1 = __riscv_vget_v_f64m1x2_f64m1(b_f64m1x2, 0);
        vfloat64m1_t b_imag_f64m1 = __riscv_vget_v_f64m1x2_f64m1(b_f64m1x2, 1);
        // TwoProd+TwoSum: sum_real += a_real * b_real
        {
            vfloat64m1_t product_f64m1 = __riscv_vfmul_vv_f64m1(a_real_f64m1, b_real_f64m1, vector_length);
            // TwoProd: vfmsac computes fma(a, b, -product), i.e. the exact multiply error.
            vfloat64m1_t product_error_f64m1 = __riscv_vfmsac_vv_f64m1(product_f64m1, a_real_f64m1, b_real_f64m1,
                                                                       vector_length);
            vfloat64m1_t tentative_sum_f64m1 = __riscv_vfadd_vv_f64m1(sum_real_f64m1, product_f64m1, vector_length);
            vfloat64m1_t virtual_addend_f64m1 = __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, sum_real_f64m1,
                                                                       vector_length);
            // Knuth TwoSum: (sum - (tentative - virtual)) + (product - virtual) = exact addition error.
            vfloat64m1_t sum_error_f64m1 = __riscv_vfadd_vv_f64m1(
                __riscv_vfsub_vv_f64m1(sum_real_f64m1,
                                       __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, virtual_addend_f64m1, vector_length),
                                       vector_length),
                __riscv_vfsub_vv_f64m1(product_f64m1, virtual_addend_f64m1, vector_length), vector_length);
            // vslideup by 0 with tail-undisturbed = copy the first vector_length lanes, keep the tail.
            sum_real_f64m1 = __riscv_vslideup_vx_f64m1_tu(sum_real_f64m1, tentative_sum_f64m1, 0, vector_length);
            vfloat64m1_t total_error_f64m1 = __riscv_vfadd_vv_f64m1(sum_error_f64m1, product_error_f64m1,
                                                                    vector_length);
            comp_real_f64m1 = __riscv_vfadd_vv_f64m1_tu(comp_real_f64m1, comp_real_f64m1, total_error_f64m1,
                                                        vector_length);
        }
        // TwoProd+TwoSum: sum_real -= a_imag * b_imag
        {
            vfloat64m1_t product_f64m1 = __riscv_vfmul_vv_f64m1(a_imag_f64m1, b_imag_f64m1, vector_length);
            vfloat64m1_t product_error_f64m1 = __riscv_vfmsac_vv_f64m1(product_f64m1, a_imag_f64m1, b_imag_f64m1,
                                                                       vector_length);
            // Subtraction is handled by negating the product AND its error term (both exact).
            vfloat64m1_t neg_product_f64m1 = __riscv_vfneg_v_f64m1(product_f64m1, vector_length);
            vfloat64m1_t neg_product_error_f64m1 = __riscv_vfneg_v_f64m1(product_error_f64m1, vector_length);
            vfloat64m1_t tentative_sum_f64m1 = __riscv_vfadd_vv_f64m1(sum_real_f64m1, neg_product_f64m1, vector_length);
            vfloat64m1_t virtual_addend_f64m1 = __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, sum_real_f64m1,
                                                                       vector_length);
            vfloat64m1_t sum_error_f64m1 = __riscv_vfadd_vv_f64m1(
                __riscv_vfsub_vv_f64m1(sum_real_f64m1,
                                       __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, virtual_addend_f64m1, vector_length),
                                       vector_length),
                __riscv_vfsub_vv_f64m1(neg_product_f64m1, virtual_addend_f64m1, vector_length), vector_length);
            sum_real_f64m1 = __riscv_vslideup_vx_f64m1_tu(sum_real_f64m1, tentative_sum_f64m1, 0, vector_length);
            vfloat64m1_t total_error_f64m1 = __riscv_vfadd_vv_f64m1(sum_error_f64m1, neg_product_error_f64m1,
                                                                    vector_length);
            comp_real_f64m1 = __riscv_vfadd_vv_f64m1_tu(comp_real_f64m1, comp_real_f64m1, total_error_f64m1,
                                                        vector_length);
        }
        // TwoProd+TwoSum: sum_imag += a_real * b_imag
        {
            vfloat64m1_t product_f64m1 = __riscv_vfmul_vv_f64m1(a_real_f64m1, b_imag_f64m1, vector_length);
            vfloat64m1_t product_error_f64m1 = __riscv_vfmsac_vv_f64m1(product_f64m1, a_real_f64m1, b_imag_f64m1,
                                                                       vector_length);
            vfloat64m1_t tentative_sum_f64m1 = __riscv_vfadd_vv_f64m1(sum_imag_f64m1, product_f64m1, vector_length);
            vfloat64m1_t virtual_addend_f64m1 = __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, sum_imag_f64m1,
                                                                       vector_length);
            vfloat64m1_t sum_error_f64m1 = __riscv_vfadd_vv_f64m1(
                __riscv_vfsub_vv_f64m1(sum_imag_f64m1,
                                       __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, virtual_addend_f64m1, vector_length),
                                       vector_length),
                __riscv_vfsub_vv_f64m1(product_f64m1, virtual_addend_f64m1, vector_length), vector_length);
            sum_imag_f64m1 = __riscv_vslideup_vx_f64m1_tu(sum_imag_f64m1, tentative_sum_f64m1, 0, vector_length);
            vfloat64m1_t total_error_f64m1 = __riscv_vfadd_vv_f64m1(sum_error_f64m1, product_error_f64m1,
                                                                    vector_length);
            comp_imag_f64m1 = __riscv_vfadd_vv_f64m1_tu(comp_imag_f64m1, comp_imag_f64m1, total_error_f64m1,
                                                        vector_length);
        }
        // TwoProd+TwoSum: sum_imag += a_imag * b_real
        {
            vfloat64m1_t product_f64m1 = __riscv_vfmul_vv_f64m1(a_imag_f64m1, b_real_f64m1, vector_length);
            vfloat64m1_t product_error_f64m1 = __riscv_vfmsac_vv_f64m1(product_f64m1, a_imag_f64m1, b_real_f64m1,
                                                                       vector_length);
            vfloat64m1_t tentative_sum_f64m1 = __riscv_vfadd_vv_f64m1(sum_imag_f64m1, product_f64m1, vector_length);
            vfloat64m1_t virtual_addend_f64m1 = __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, sum_imag_f64m1,
                                                                       vector_length);
            vfloat64m1_t sum_error_f64m1 = __riscv_vfadd_vv_f64m1(
                __riscv_vfsub_vv_f64m1(sum_imag_f64m1,
                                       __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, virtual_addend_f64m1, vector_length),
                                       vector_length),
                __riscv_vfsub_vv_f64m1(product_f64m1, virtual_addend_f64m1, vector_length), vector_length);
            sum_imag_f64m1 = __riscv_vslideup_vx_f64m1_tu(sum_imag_f64m1, tentative_sum_f64m1, 0, vector_length);
            vfloat64m1_t total_error_f64m1 = __riscv_vfadd_vv_f64m1(sum_error_f64m1, product_error_f64m1,
                                                                    vector_length);
            comp_imag_f64m1 = __riscv_vfadd_vv_f64m1_tu(comp_imag_f64m1, comp_imag_f64m1, total_error_f64m1,
                                                        vector_length);
        }
    }
    // Final reduction folds each compensation register back into its sum.
    results->real = nk_dot_stable_sum_f64m1_rvv_(sum_real_f64m1, comp_real_f64m1);
    results->imag = nk_dot_stable_sum_f64m1_rvv_(sum_imag_f64m1, comp_imag_f64m1);
}
599
+
600
NK_PUBLIC void nk_vdot_f64c_rvv(nk_f64c_t const *a_pairs, nk_f64c_t const *b_pairs, nk_size_t count_pairs,
                                nk_f64c_t *results) {
    // Dot2 (Ogita-Rump-Oishi) compensated conjugate complex dot product
    //
    // Computes Σ conj(a[i])·b[i]. Identical machinery to nk_dot_f64c_rvv — TwoProd
    // (FMA-based exact multiply error) plus Knuth TwoSum (exact addition error),
    // with per-lane compensation registers — but with the conjugate sign pattern:
    // real = ar·br + ai·bi, imag = ar·bi - ai·br.
    nk_f64_t const *a_f64 = (nk_f64_t const *)a_pairs;
    nk_f64_t const *b_f64 = (nk_f64_t const *)b_pairs;
    nk_size_t vlmax = __riscv_vsetvlmax_e64m1();
    vfloat64m1_t sum_real_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
    vfloat64m1_t comp_real_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax); // rounding-error carry, real part
    vfloat64m1_t sum_imag_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
    vfloat64m1_t comp_imag_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax); // rounding-error carry, imaginary part
    for (nk_size_t vector_length; count_pairs > 0;
         count_pairs -= vector_length, a_f64 += vector_length * 2, b_f64 += vector_length * 2) {
        vector_length = __riscv_vsetvl_e64m1(count_pairs);
        // Segmented loads deinterleave the (real, imag) pairs into two registers each.
        vfloat64m1x2_t a_f64m1x2 = __riscv_vlseg2e64_v_f64m1x2(a_f64, vector_length);
        vfloat64m1x2_t b_f64m1x2 = __riscv_vlseg2e64_v_f64m1x2(b_f64, vector_length);
        vfloat64m1_t a_real_f64m1 = __riscv_vget_v_f64m1x2_f64m1(a_f64m1x2, 0);
        vfloat64m1_t a_imag_f64m1 = __riscv_vget_v_f64m1x2_f64m1(a_f64m1x2, 1);
        vfloat64m1_t b_real_f64m1 = __riscv_vget_v_f64m1x2_f64m1(b_f64m1x2, 0);
        vfloat64m1_t b_imag_f64m1 = __riscv_vget_v_f64m1x2_f64m1(b_f64m1x2, 1);
        // TwoProd+TwoSum: sum_real += a_real * b_real
        {
            vfloat64m1_t product_f64m1 = __riscv_vfmul_vv_f64m1(a_real_f64m1, b_real_f64m1, vector_length);
            // TwoProd: vfmsac computes fma(a, b, -product), i.e. the exact multiply error.
            vfloat64m1_t product_error_f64m1 = __riscv_vfmsac_vv_f64m1(product_f64m1, a_real_f64m1, b_real_f64m1,
                                                                       vector_length);
            vfloat64m1_t tentative_sum_f64m1 = __riscv_vfadd_vv_f64m1(sum_real_f64m1, product_f64m1, vector_length);
            vfloat64m1_t virtual_addend_f64m1 = __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, sum_real_f64m1,
                                                                       vector_length);
            // Knuth TwoSum: (sum - (tentative - virtual)) + (product - virtual) = exact addition error.
            vfloat64m1_t sum_error_f64m1 = __riscv_vfadd_vv_f64m1(
                __riscv_vfsub_vv_f64m1(sum_real_f64m1,
                                       __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, virtual_addend_f64m1, vector_length),
                                       vector_length),
                __riscv_vfsub_vv_f64m1(product_f64m1, virtual_addend_f64m1, vector_length), vector_length);
            // vslideup by 0 with tail-undisturbed = copy the first vector_length lanes, keep the tail.
            sum_real_f64m1 = __riscv_vslideup_vx_f64m1_tu(sum_real_f64m1, tentative_sum_f64m1, 0, vector_length);
            vfloat64m1_t total_error_f64m1 = __riscv_vfadd_vv_f64m1(sum_error_f64m1, product_error_f64m1,
                                                                    vector_length);
            comp_real_f64m1 = __riscv_vfadd_vv_f64m1_tu(comp_real_f64m1, comp_real_f64m1, total_error_f64m1,
                                                        vector_length);
        }
        // TwoProd+TwoSum: sum_real += a_imag * b_imag (conjugate: + instead of -)
        {
            vfloat64m1_t product_f64m1 = __riscv_vfmul_vv_f64m1(a_imag_f64m1, b_imag_f64m1, vector_length);
            vfloat64m1_t product_error_f64m1 = __riscv_vfmsac_vv_f64m1(product_f64m1, a_imag_f64m1, b_imag_f64m1,
                                                                       vector_length);
            vfloat64m1_t tentative_sum_f64m1 = __riscv_vfadd_vv_f64m1(sum_real_f64m1, product_f64m1, vector_length);
            vfloat64m1_t virtual_addend_f64m1 = __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, sum_real_f64m1,
                                                                       vector_length);
            vfloat64m1_t sum_error_f64m1 = __riscv_vfadd_vv_f64m1(
                __riscv_vfsub_vv_f64m1(sum_real_f64m1,
                                       __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, virtual_addend_f64m1, vector_length),
                                       vector_length),
                __riscv_vfsub_vv_f64m1(product_f64m1, virtual_addend_f64m1, vector_length), vector_length);
            sum_real_f64m1 = __riscv_vslideup_vx_f64m1_tu(sum_real_f64m1, tentative_sum_f64m1, 0, vector_length);
            vfloat64m1_t total_error_f64m1 = __riscv_vfadd_vv_f64m1(sum_error_f64m1, product_error_f64m1,
                                                                    vector_length);
            comp_real_f64m1 = __riscv_vfadd_vv_f64m1_tu(comp_real_f64m1, comp_real_f64m1, total_error_f64m1,
                                                        vector_length);
        }
        // TwoProd+TwoSum: sum_imag += a_real * b_imag
        {
            vfloat64m1_t product_f64m1 = __riscv_vfmul_vv_f64m1(a_real_f64m1, b_imag_f64m1, vector_length);
            vfloat64m1_t product_error_f64m1 = __riscv_vfmsac_vv_f64m1(product_f64m1, a_real_f64m1, b_imag_f64m1,
                                                                       vector_length);
            vfloat64m1_t tentative_sum_f64m1 = __riscv_vfadd_vv_f64m1(sum_imag_f64m1, product_f64m1, vector_length);
            vfloat64m1_t virtual_addend_f64m1 = __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, sum_imag_f64m1,
                                                                       vector_length);
            vfloat64m1_t sum_error_f64m1 = __riscv_vfadd_vv_f64m1(
                __riscv_vfsub_vv_f64m1(sum_imag_f64m1,
                                       __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, virtual_addend_f64m1, vector_length),
                                       vector_length),
                __riscv_vfsub_vv_f64m1(product_f64m1, virtual_addend_f64m1, vector_length), vector_length);
            sum_imag_f64m1 = __riscv_vslideup_vx_f64m1_tu(sum_imag_f64m1, tentative_sum_f64m1, 0, vector_length);
            vfloat64m1_t total_error_f64m1 = __riscv_vfadd_vv_f64m1(sum_error_f64m1, product_error_f64m1,
                                                                    vector_length);
            comp_imag_f64m1 = __riscv_vfadd_vv_f64m1_tu(comp_imag_f64m1, comp_imag_f64m1, total_error_f64m1,
                                                        vector_length);
        }
        // TwoProd+TwoSum: sum_imag -= a_imag * b_real (conjugate: - instead of +)
        {
            vfloat64m1_t product_f64m1 = __riscv_vfmul_vv_f64m1(a_imag_f64m1, b_real_f64m1, vector_length);
            vfloat64m1_t product_error_f64m1 = __riscv_vfmsac_vv_f64m1(product_f64m1, a_imag_f64m1, b_real_f64m1,
                                                                       vector_length);
            // Subtraction is handled by negating the product AND its error term (both exact).
            vfloat64m1_t neg_product_f64m1 = __riscv_vfneg_v_f64m1(product_f64m1, vector_length);
            vfloat64m1_t neg_product_error_f64m1 = __riscv_vfneg_v_f64m1(product_error_f64m1, vector_length);
            vfloat64m1_t tentative_sum_f64m1 = __riscv_vfadd_vv_f64m1(sum_imag_f64m1, neg_product_f64m1, vector_length);
            vfloat64m1_t virtual_addend_f64m1 = __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, sum_imag_f64m1,
                                                                       vector_length);
            vfloat64m1_t sum_error_f64m1 = __riscv_vfadd_vv_f64m1(
                __riscv_vfsub_vv_f64m1(sum_imag_f64m1,
                                       __riscv_vfsub_vv_f64m1(tentative_sum_f64m1, virtual_addend_f64m1, vector_length),
                                       vector_length),
                __riscv_vfsub_vv_f64m1(neg_product_f64m1, virtual_addend_f64m1, vector_length), vector_length);
            sum_imag_f64m1 = __riscv_vslideup_vx_f64m1_tu(sum_imag_f64m1, tentative_sum_f64m1, 0, vector_length);
            vfloat64m1_t total_error_f64m1 = __riscv_vfadd_vv_f64m1(sum_error_f64m1, neg_product_error_f64m1,
                                                                    vector_length);
            comp_imag_f64m1 = __riscv_vfadd_vv_f64m1_tu(comp_imag_f64m1, comp_imag_f64m1, total_error_f64m1,
                                                        vector_length);
        }
    }
    // Final reduction folds each compensation register back into its sum.
    results->real = nk_dot_stable_sum_f64m1_rvv_(sum_real_f64m1, comp_real_f64m1);
    results->imag = nk_dot_stable_sum_f64m1_rvv_(sum_imag_f64m1, comp_imag_f64m1);
}
701
+
702
+ #if defined(__cplusplus)
703
+ } // extern "C"
704
+ #endif
705
+
706
+ #if defined(__clang__)
707
+ #pragma clang attribute pop
708
+ #elif defined(__GNUC__)
709
+ #pragma GCC pop_options
710
+ #endif
711
+
712
+ #endif // NK_TARGET_RVV
713
+ #endif // NK_TARGET_RISCV_
714
+ #endif // NK_DOT_RVV_H