numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +239 -122
  2. package/binding.gyp +25 -491
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -0,0 +1,752 @@
1
+ /**
2
+ * @brief SIMD-accelerated Dot Products for POWER9 VSX.
3
+ * @file include/numkong/dot/powervsx.h
4
+ * @author Ash Vardanian
5
+ * @date March 23, 2026
6
+ *
7
+ * @sa include/numkong/dot.h
8
+ *
9
+ * @section dot_powervsx_instructions Power9 VSX Dot Product Instructions
10
+ *
11
+ * Key Power9 VSX instructions for dot products:
12
+ *
13
+ * Intrinsic Instruction POWER9
14
+ * vec_madd(a, b, c) XVMADDADP/XVMADDASP 5cy FMA: a×b+c
15
+ * vec_msub(a, b, c) XVMSUBADP/XVMSUBASP 5cy FMS: a×b−c
16
+ * vec_msum(a, b, c) VMSUMUBM/VMSUMMBM 5cy i8/u8 widening multiply-sum → i32/u32
17
+ * vec_msum(a, b, c) VMSUMSHM/VMSUMUHM 5cy i16/u16 widening multiply-sum → i32/u32
18
+ * vec_doublee(a) XVCVSPDP 3cy Widen even f32 lanes → f64x2
19
+ * vec_doubleo(a) XVCVSPDP (odd) 3cy Widen odd f32 lanes → f64x2
20
+ * vec_unpackh(a) VUPKHSB/VUPKHSH 2cy Sign-extend high half (i8→i16 or i16→i32)
21
+ * vec_unpackl(a) VUPKLSB/VUPKLSH 2cy Sign-extend low half (i8→i16 or i16→i32)
22
+ * vec_xor(a, b) VXOR/XXLXOR 1cy Bitwise XOR
23
+ * vec_xl(off, ptr) LXV 5cy Aligned 16-byte load
24
+ * vec_xl_len(ptr, len) LXVL 5cy Partial load (Power9), zero-fills tail
25
+ * vec_extract_fp32_from_shorth XVCVHPSP (high) 5cy f16x4 → f32x4 from high half
26
+ * vec_extract_fp32_from_shortl XVCVHPSP (low) 5cy f16x4 → f32x4 from low half
27
+ * vec_popcnt(a) VPOPCNTB/H/W/D 2cy Per-element popcount
28
+ * vec_sum4s(a, b) VSUM4UBS/VSUM4SBS 5cy Sum groups of 4 bytes → i32/u32
29
+ * vec_sums(a, b) VSUMSWS 5cy Signed i32x4 horizontal → i32 (lane 3)
30
+ *
31
+ * Power9 (POWER ISA 3.0) provides `vec_xl_len` for partial loads that zero-fill unused bytes,
32
+ * enabling branchless tail handling: zero × anything = zero, so partial vectors contribute
33
+ * no spurious terms to dot-product accumulators.
34
+ *
35
+ * @section dot_powervsx_stateful Stateful Streaming Logic
36
+ *
37
+ * For memory-optimal tiled algorithms, this file defines state structures and force-inlined
38
+ * `NK_INTERNAL` functions:
39
+ *
40
+ * - nk_dot_f32x2 state for f32 inputs with double-precision accumulation,
41
+ * - nk_dot_f64x2 state with Dot2 stable dot-products for f64 inputs,
42
+ * - nk_dot_bf16x8 state for bf16 inputs with f32 accumulation,
43
+ * - nk_dot_f16x8 state for f16 inputs with f32 accumulation,
44
+ * - nk_dot_i8x16 state for i8 inputs with i32 accumulation,
45
+ * - nk_dot_u8x16 state for u8 inputs with u32 accumulation,
46
+ * - nk_dot_u1x128 state for binary inputs with u64 popcount accumulation.
47
+ */
48
+ #ifndef NK_DOT_POWERVSX_H
49
+ #define NK_DOT_POWERVSX_H
50
+
51
+ #if NK_TARGET_POWERVSX
52
+
53
+ #if defined(__cplusplus)
54
+ extern "C" {
55
+ #endif
56
+
57
+ #if defined(__clang__)
58
+ #pragma clang attribute push(__attribute__((target("power9-vector"))), apply_to = function)
59
+ #elif defined(__GNUC__)
60
+ #pragma GCC push_options
61
+ #pragma GCC target("power9-vector")
62
+ #endif
63
+
64
+ /** @brief Horizontal sum of 4 f32 lanes → scalar f32. */
65
+ NK_INTERNAL nk_f32_t nk_hsum_f32x4_powervsx_(nk_vf32x4_t values_f32x4) {
66
+ // Rotate by 8 bytes (2 floats) and add → {v[0]+v[2], v[1]+v[3], ...}
67
+ nk_vf32x4_t rotated_f32x4 = vec_sld(values_f32x4, values_f32x4, 8);
68
+ nk_vf32x4_t partial_f32x4 = vec_add(values_f32x4, rotated_f32x4);
69
+ // Rotate by 4 bytes (1 float) and add → {v[0]+v[1]+v[2]+v[3], ...}
70
+ nk_vf32x4_t shifted_f32x4 = vec_sld(partial_f32x4, partial_f32x4, 4);
71
+ nk_vf32x4_t total_f32x4 = vec_add(partial_f32x4, shifted_f32x4);
72
+ return vec_extract(total_f32x4, 0);
73
+ }
74
+
75
+ /** @brief Horizontal sum of 2 f64 lanes → scalar f64 via xxpermdi (1 domain crossing). */
76
+ NK_INTERNAL nk_f64_t nk_hsum_f64x2_powervsx_(nk_vf64x2_t values_f64x2) {
77
+ nk_vf64x2_t swapped_f64x2 = vec_xxpermdi(values_f64x2, values_f64x2, 2);
78
+ nk_vf64x2_t sum_f64x2 = vec_add(values_f64x2, swapped_f64x2);
79
+ return vec_extract(sum_f64x2, 0);
80
+ }
81
+
82
+ /** @brief Horizontal sum of 4 signed i32 lanes → scalar i32. */
83
+ NK_INTERNAL nk_i32_t nk_hsum_i32x4_powervsx_(nk_vi32x4_t values_i32x4) {
84
+ // vec_sums reduces i32x4 → i32 in lane 3 of the result
85
+ nk_vi32x4_t zero_i32x4 = vec_splats((nk_i32_t)0);
86
+ nk_vi32x4_t sums_i32x4 = vec_sums(values_i32x4, zero_i32x4);
87
+ return vec_extract(sums_i32x4, 3);
88
+ }
89
+
90
+ /** @brief Horizontal sum of 4 unsigned u32 lanes → scalar u32. */
91
+ NK_INTERNAL nk_u32_t nk_hsum_u32x4_powervsx_(nk_vu32x4_t values_u32x4) {
92
+ // Rotate by 8 bytes (2 ints) and add → {v[0]+v[2], v[1]+v[3], ...}
93
+ nk_vu32x4_t rotated_u32x4 = vec_sld(values_u32x4, values_u32x4, 8);
94
+ nk_vu32x4_t partial_u32x4 = vec_add(values_u32x4, rotated_u32x4);
95
+ // Rotate by 4 bytes (1 int) and add → {v[0]+v[1]+v[2]+v[3], ...}
96
+ nk_vu32x4_t shifted_u32x4 = vec_sld(partial_u32x4, partial_u32x4, 4);
97
+ nk_vu32x4_t total_u32x4 = vec_add(partial_u32x4, shifted_u32x4);
98
+ return vec_extract(total_u32x4, 0);
99
+ }
100
+
101
+ /** @brief Horizontal sum of 2 unsigned u64 lanes → scalar u64 via xxpermdi. */
102
+ NK_INTERNAL nk_u64_t nk_hsum_u64x2_powervsx_(nk_vu64x2_t values_u64x2) {
103
+ nk_vu64x2_t swapped_u64x2 = vec_xxpermdi(values_u64x2, values_u64x2, 2);
104
+ nk_vu64x2_t sum_u64x2 = vec_add(values_u64x2, swapped_u64x2);
105
+ return vec_extract(sum_u64x2, 0);
106
+ }
107
+
108
+ /** @brief Compensated horizontal sum of 2 f64 lanes via TwoSum. */
109
+ NK_INTERNAL nk_f64_t nk_dot_stable_sum_f64x2_powervsx_(nk_vf64x2_t sum_f64x2, nk_vf64x2_t compensation_f64x2) {
110
+ // TwoSum merge of sum + compensation (2-wide)
111
+ nk_vf64x2_t tentative_sum_f64x2 = vec_add(sum_f64x2, compensation_f64x2);
112
+ nk_vf64x2_t virtual_addend_f64x2 = vec_sub(tentative_sum_f64x2, sum_f64x2);
113
+ nk_vf64x2_t rounding_error_f64x2 = vec_add(vec_sub(sum_f64x2, vec_sub(tentative_sum_f64x2, virtual_addend_f64x2)),
114
+ vec_sub(compensation_f64x2, virtual_addend_f64x2));
115
+ // Scalar TwoSum 2 → 1
116
+ nk_f64_t lower_sum = vec_extract(tentative_sum_f64x2, 0);
117
+ nk_f64_t upper_sum = vec_extract(tentative_sum_f64x2, 1);
118
+ nk_f64_t lower_error = vec_extract(rounding_error_f64x2, 0);
119
+ nk_f64_t upper_error = vec_extract(rounding_error_f64x2, 1);
120
+ nk_f64_t tentative_sum = lower_sum + upper_sum;
121
+ nk_f64_t virtual_addend = tentative_sum - lower_sum;
122
+ nk_f64_t rounding_error = (lower_sum - (tentative_sum - virtual_addend)) + (upper_sum - virtual_addend);
123
+ return tentative_sum + (lower_error + upper_error + rounding_error);
124
+ }
125
+
126
+ #pragma region F32 and F64 Floats
127
+
128
+ NK_PUBLIC void nk_dot_f32_powervsx(nk_f32_t const *a_scalars, nk_f32_t const *b_scalars, nk_size_t count_scalars,
129
+ nk_f64_t *result) {
130
+ // Upcast f32 → f64 for accumulation via vec_doublee (even lanes) and vec_doubleo (odd lanes)
131
+ nk_vf64x2_t sum_even_f64x2 = vec_splats((nk_f64_t)0);
132
+ nk_vf64x2_t sum_odd_f64x2 = vec_splats((nk_f64_t)0);
133
+ nk_vf32x4_t a_f32x4, b_f32x4;
134
+ nk_size_t tail_bytes;
135
+
136
+ nk_dot_f32_powervsx_cycle:
137
+ if (count_scalars < 4) {
138
+ tail_bytes = count_scalars * sizeof(nk_f32_t);
139
+ a_f32x4 = vec_xl_len((nk_f32_t *)a_scalars, tail_bytes);
140
+ b_f32x4 = vec_xl_len((nk_f32_t *)b_scalars, tail_bytes);
141
+ count_scalars = 0;
142
+ }
143
+ else {
144
+ a_f32x4 = vec_xl(0, a_scalars);
145
+ b_f32x4 = vec_xl(0, b_scalars);
146
+ a_scalars += 4, b_scalars += 4, count_scalars -= 4;
147
+ }
148
+
149
+ // Widen even/odd f32 lanes → f64x2, then FMA
150
+ nk_vf64x2_t a_even_f64x2 = vec_doublee(a_f32x4);
151
+ nk_vf64x2_t b_even_f64x2 = vec_doublee(b_f32x4);
152
+ nk_vf64x2_t a_odd_f64x2 = vec_doubleo(a_f32x4);
153
+ nk_vf64x2_t b_odd_f64x2 = vec_doubleo(b_f32x4);
154
+ sum_even_f64x2 = vec_madd(a_even_f64x2, b_even_f64x2, sum_even_f64x2);
155
+ sum_odd_f64x2 = vec_madd(a_odd_f64x2, b_odd_f64x2, sum_odd_f64x2);
156
+
157
+ if (count_scalars) goto nk_dot_f32_powervsx_cycle;
158
+ // Combine even and odd accumulators → final scalar
159
+ nk_vf64x2_t total_f64x2 = vec_add(sum_even_f64x2, sum_odd_f64x2);
160
+ *result = nk_hsum_f64x2_powervsx_(total_f64x2);
161
+ }
162
+
163
+ NK_PUBLIC void nk_dot_f64_powervsx(nk_f64_t const *a_scalars, nk_f64_t const *b_scalars, nk_size_t count_scalars,
164
+ nk_f64_t *result) {
165
+ // Dot2 algorithm (Ogita-Rump-Oishi 2005) for compensated dot product
166
+ nk_vf64x2_t sum_f64x2 = vec_splats((nk_f64_t)0);
167
+ nk_vf64x2_t compensation_f64x2 = vec_splats((nk_f64_t)0);
168
+ nk_vf64x2_t a_f64x2, b_f64x2;
169
+ nk_size_t tail_bytes;
170
+
171
+ nk_dot_f64_powervsx_cycle:
172
+ if (count_scalars < 2) {
173
+ tail_bytes = count_scalars * sizeof(nk_f64_t);
174
+ a_f64x2 = vec_xl_len((nk_f64_t *)a_scalars, tail_bytes);
175
+ b_f64x2 = vec_xl_len((nk_f64_t *)b_scalars, tail_bytes);
176
+ count_scalars = 0;
177
+ }
178
+ else {
179
+ a_f64x2 = vec_xl(0, a_scalars);
180
+ b_f64x2 = vec_xl(0, b_scalars);
181
+ a_scalars += 2, b_scalars += 2, count_scalars -= 2;
182
+ }
183
+
184
+ // TwoProd: product = a * b, error = msub(a, b, product) captures rounding error
185
+ nk_vf64x2_t product_f64x2 = vec_mul(a_f64x2, b_f64x2);
186
+ nk_vf64x2_t product_error_f64x2 = vec_msub(a_f64x2, b_f64x2, product_f64x2);
187
+ // TwoSum: (t, q) = TwoSum(sum, product) where t = sum + product rounded, q = error
188
+ nk_vf64x2_t tentative_sum_f64x2 = vec_add(sum_f64x2, product_f64x2);
189
+ nk_vf64x2_t virtual_addend_f64x2 = vec_sub(tentative_sum_f64x2, sum_f64x2);
190
+ nk_vf64x2_t sum_error_f64x2 = vec_add(vec_sub(sum_f64x2, vec_sub(tentative_sum_f64x2, virtual_addend_f64x2)),
191
+ vec_sub(product_f64x2, virtual_addend_f64x2));
192
+ // Update: sum = t, compensation += q + r
193
+ sum_f64x2 = tentative_sum_f64x2;
194
+ compensation_f64x2 = vec_add(compensation_f64x2, vec_add(sum_error_f64x2, product_error_f64x2));
195
+
196
+ if (count_scalars) goto nk_dot_f64_powervsx_cycle;
197
+ // Compensated horizontal reduction preserving Dot2 error tracking
198
+ *result = nk_dot_stable_sum_f64x2_powervsx_(sum_f64x2, compensation_f64x2);
199
+ }
200
+
201
+ #pragma endregion F32 and F64 Floats
202
+ #pragma region F16 and BF16 Floats
203
+
204
+ NK_PUBLIC void nk_dot_bf16_powervsx(nk_bf16_t const *a_scalars, nk_bf16_t const *b_scalars, nk_size_t count_scalars,
205
+ nk_f32_t *result) {
206
+ // bf16 → f32 via mergeh/mergel with zero: shift 16 bits into f32 upper half
207
+ nk_vu16x8_t zero_u16x8 = vec_splats((nk_u16_t)0);
208
+ nk_vf32x4_t sum_f32x4 = vec_splats((nk_f32_t)0);
209
+ nk_vu16x8_t a_u16x8, b_u16x8;
210
+ nk_size_t tail_bytes;
211
+
212
+ nk_dot_bf16_powervsx_cycle:
213
+ if (count_scalars < 8) {
214
+ tail_bytes = count_scalars * sizeof(nk_bf16_t);
215
+ a_u16x8 = vec_xl_len((nk_u16_t *)a_scalars, tail_bytes);
216
+ b_u16x8 = vec_xl_len((nk_u16_t *)b_scalars, tail_bytes);
217
+ count_scalars = 0;
218
+ }
219
+ else {
220
+ a_u16x8 = vec_xl(0, (nk_u16_t const *)a_scalars);
221
+ b_u16x8 = vec_xl(0, (nk_u16_t const *)b_scalars);
222
+ a_scalars += 8, b_scalars += 8, count_scalars -= 8;
223
+ }
224
+
225
+ // Convert bf16 → f32: merge with zero puts bf16 bits in upper 16 of each f32
226
+ nk_vf32x4_t a_high_f32x4 = (nk_vf32x4_t)vec_mergeh(zero_u16x8, a_u16x8);
227
+ nk_vf32x4_t a_low_f32x4 = (nk_vf32x4_t)vec_mergel(zero_u16x8, a_u16x8);
228
+ nk_vf32x4_t b_high_f32x4 = (nk_vf32x4_t)vec_mergeh(zero_u16x8, b_u16x8);
229
+ nk_vf32x4_t b_low_f32x4 = (nk_vf32x4_t)vec_mergel(zero_u16x8, b_u16x8);
230
+ sum_f32x4 = vec_madd(a_high_f32x4, b_high_f32x4, sum_f32x4);
231
+ sum_f32x4 = vec_madd(a_low_f32x4, b_low_f32x4, sum_f32x4);
232
+
233
+ if (count_scalars) goto nk_dot_bf16_powervsx_cycle;
234
+ *result = nk_hsum_f32x4_powervsx_(sum_f32x4);
235
+ }
236
+
237
+ NK_PUBLIC void nk_dot_f16_powervsx(nk_f16_t const *a_scalars, nk_f16_t const *b_scalars, nk_size_t count_scalars,
238
+ nk_f32_t *result) {
239
+ // f16 → f32 via vec_extract_fp32_from_shorth/shortl (Power9 XVCVHPSP)
240
+ nk_vf32x4_t sum_f32x4 = vec_splats((nk_f32_t)0);
241
+ nk_vu16x8_t a_u16x8, b_u16x8;
242
+ nk_size_t tail_bytes;
243
+
244
+ nk_dot_f16_powervsx_cycle:
245
+ if (count_scalars < 8) {
246
+ tail_bytes = count_scalars * sizeof(nk_f16_t);
247
+ a_u16x8 = vec_xl_len((nk_u16_t *)a_scalars, tail_bytes);
248
+ b_u16x8 = vec_xl_len((nk_u16_t *)b_scalars, tail_bytes);
249
+ count_scalars = 0;
250
+ }
251
+ else {
252
+ a_u16x8 = vec_xl(0, (nk_u16_t const *)a_scalars);
253
+ b_u16x8 = vec_xl(0, (nk_u16_t const *)b_scalars);
254
+ a_scalars += 8, b_scalars += 8, count_scalars -= 8;
255
+ }
256
+
257
+ // Convert f16 → f32 via hardware XVCVHPSP
258
+ nk_vf32x4_t a_high_f32x4 = vec_extract_fp32_from_shorth(a_u16x8);
259
+ nk_vf32x4_t a_low_f32x4 = vec_extract_fp32_from_shortl(a_u16x8);
260
+ nk_vf32x4_t b_high_f32x4 = vec_extract_fp32_from_shorth(b_u16x8);
261
+ nk_vf32x4_t b_low_f32x4 = vec_extract_fp32_from_shortl(b_u16x8);
262
+ sum_f32x4 = vec_madd(a_high_f32x4, b_high_f32x4, sum_f32x4);
263
+ sum_f32x4 = vec_madd(a_low_f32x4, b_low_f32x4, sum_f32x4);
264
+
265
+ if (count_scalars) goto nk_dot_f16_powervsx_cycle;
266
+ *result = nk_hsum_f32x4_powervsx_(sum_f32x4);
267
+ }
268
+
269
+ #pragma endregion F16 and BF16 Floats
270
+ #pragma region I8 and U8 Integers
271
+
272
+ NK_PUBLIC void nk_dot_i8_powervsx(nk_i8_t const *a_scalars, nk_i8_t const *b_scalars, nk_size_t count_scalars,
273
+ nk_i32_t *result) {
274
+ // Algebraic transform for i8×i8 using VMSUMMBM (i8×u8 → i32):
275
+ // b' = b ⊕ 0x80 (reinterpret signed as unsigned)
276
+ // a·b = a·b' − 128·Σa
277
+ // Σ(a+128) accumulated via VSUM4UBS; correction applied after loop.
278
+ // Tail handling is free: vec_xl_len zero-fills unused lanes.
279
+ // - Product: 0 × (0⊕0x80) = 0 → no spurious contribution
280
+ // - Correction: (0⊕0x80) = 128 in sum_a_biased, compensated by count_padded
281
+ nk_vu8x16_t const bias_u8x16 = vec_splats((nk_u8_t)0x80);
282
+ nk_vi32x4_t accumulator_i32x4 = vec_splats((nk_i32_t)0);
283
+ nk_vu32x4_t sum_a_biased_u32x4 = vec_splats((nk_u32_t)0);
284
+ nk_size_t count_padded = ((count_scalars + 15) / 16) * 16;
285
+ nk_vi8x16_t a_i8x16;
286
+ nk_vu8x16_t b_biased_u8x16;
287
+ nk_size_t tail_bytes;
288
+
289
+ nk_dot_i8_powervsx_cycle:
290
+ if (count_scalars < 16) {
291
+ tail_bytes = count_scalars * sizeof(nk_i8_t);
292
+ a_i8x16 = vec_xl_len((nk_i8_t *)a_scalars, tail_bytes);
293
+ b_biased_u8x16 = vec_xor(vec_xl_len((nk_u8_t *)b_scalars, tail_bytes), bias_u8x16);
294
+ count_scalars = 0;
295
+ }
296
+ else {
297
+ a_i8x16 = vec_xl(0, a_scalars);
298
+ b_biased_u8x16 = vec_xor(vec_xl(0, (nk_u8_t *)b_scalars), bias_u8x16);
299
+ a_scalars += 16, b_scalars += 16, count_scalars -= 16;
300
+ }
301
+
302
+ // VMSUMMBM: i8 × u8 → i32 (16 products per instruction)
303
+ accumulator_i32x4 = vec_msum(a_i8x16, b_biased_u8x16, accumulator_i32x4);
304
+ // VSUM4UBS: accumulate Σ(a+128) as unsigned (independent chain, good ILP)
305
+ sum_a_biased_u32x4 = vec_sum4s(vec_xor((nk_vu8x16_t)a_i8x16, bias_u8x16), sum_a_biased_u32x4);
306
+
307
+ if (count_scalars) goto nk_dot_i8_powervsx_cycle;
308
+
309
+ // Correction: a·b = biased_dot − 128·Σa = biased_dot − 128·(Σ(a+128) − 128·count_padded)
310
+ nk_i32_t biased_dot = nk_hsum_i32x4_powervsx_(accumulator_i32x4);
311
+ nk_i64_t correction = 128LL * (nk_i64_t)nk_hsum_u32x4_powervsx_(sum_a_biased_u32x4) -
312
+ 16384LL * (nk_i64_t)count_padded;
313
+ *result = (nk_i32_t)((nk_i64_t)biased_dot - correction);
314
+ }
315
+
316
+ NK_PUBLIC void nk_dot_u8_powervsx(nk_u8_t const *a_scalars, nk_u8_t const *b_scalars, nk_size_t count_scalars,
317
+ nk_u32_t *result) {
318
+ // vec_msum: multiply u8×u8 pairs and accumulate 16 products → 4 u32 lanes per call
319
+ nk_vu32x4_t accumulator_u32x4 = vec_splats((nk_u32_t)0);
320
+ nk_vu8x16_t a_u8x16, b_u8x16;
321
+ nk_size_t tail_bytes;
322
+
323
+ nk_dot_u8_powervsx_cycle:
324
+ if (count_scalars < 16) {
325
+ tail_bytes = count_scalars * sizeof(nk_u8_t);
326
+ a_u8x16 = vec_xl_len((nk_u8_t *)a_scalars, tail_bytes);
327
+ b_u8x16 = vec_xl_len((nk_u8_t *)b_scalars, tail_bytes);
328
+ count_scalars = 0;
329
+ }
330
+ else {
331
+ a_u8x16 = vec_xl(0, a_scalars);
332
+ b_u8x16 = vec_xl(0, b_scalars);
333
+ a_scalars += 16, b_scalars += 16, count_scalars -= 16;
334
+ }
335
+
336
+ // Unsigned × unsigned multiply-sum: 16 u8 products accumulated into 4 u32 lanes
337
+ accumulator_u32x4 = vec_msum(a_u8x16, b_u8x16, accumulator_u32x4);
338
+
339
+ if (count_scalars) goto nk_dot_u8_powervsx_cycle;
340
+ *result = nk_hsum_u32x4_powervsx_(accumulator_u32x4);
341
+ }
342
+
343
+ #pragma endregion I8 and U8 Integers
344
+ #pragma region Binary
345
+
346
+ NK_PUBLIC void nk_dot_u1_powervsx(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n_bits, nk_u32_t *result) {
347
+ nk_size_t n_bytes = nk_size_divide_round_up_(n_bits, NK_BITS_PER_BYTE);
348
+ nk_vu64x2_t accumulator_u64x2 = vec_splats((nk_u64_t)0);
349
+ nk_vu8x16_t a_u8x16, b_u8x16;
350
+
351
+ nk_dot_u1_powervsx_cycle:
352
+ if (n_bytes < 16) {
353
+ a_u8x16 = vec_xl_len((nk_u8_t *)a, n_bytes);
354
+ b_u8x16 = vec_xl_len((nk_u8_t *)b, n_bytes);
355
+ n_bytes = 0;
356
+ }
357
+ else {
358
+ a_u8x16 = vec_xl(0, (nk_u8_t const *)a);
359
+ b_u8x16 = vec_xl(0, (nk_u8_t const *)b);
360
+ a += 16, b += 16, n_bytes -= 16;
361
+ }
362
+
363
+ // AND → doubleword popcount (vpopcntd) → accumulate u64 lanes
364
+ nk_vu8x16_t and_u8x16 = vec_and(a_u8x16, b_u8x16);
365
+ nk_vu64x2_t popcnt_u64x2 = vec_popcnt((nk_vu64x2_t)and_u8x16);
366
+ accumulator_u64x2 = vec_add(accumulator_u64x2, popcnt_u64x2);
367
+
368
+ if (n_bytes) goto nk_dot_u1_powervsx_cycle;
369
+ *result = (nk_u32_t)nk_hsum_u64x2_powervsx_(accumulator_u64x2);
370
+ }
371
+
372
+ #pragma endregion Binary
373
+
374
+ /**
375
+ * @brief Running state for 128-bit dot accumulation over f32 scalars on Power VSX.
376
+ *
377
+ * Processes 2 f32 values at a time, upcasting to f64 for accumulation to avoid
378
+ * catastrophic cancellation in long reductions.
379
+ */
380
+ typedef struct nk_dot_f32x2_state_powervsx_t {
381
+ nk_vf64x2_t sum_f64x2;
382
+ } nk_dot_f32x2_state_powervsx_t;
383
+
384
+ NK_INTERNAL void nk_dot_f32x2_init_powervsx(nk_dot_f32x2_state_powervsx_t *state) {
385
+ state->sum_f64x2 = vec_splats((nk_f64_t)0);
386
+ }
387
+
388
+ NK_INTERNAL void nk_dot_f32x2_update_powervsx(nk_dot_f32x2_state_powervsx_t *state, nk_b64_vec_t a, nk_b64_vec_t b,
389
+ nk_size_t depth_offset, nk_size_t active_dimensions) {
390
+ nk_unused_(depth_offset);
391
+ nk_unused_(active_dimensions);
392
+ // Load 8 bytes (2 f32s) into a vector register, zero-filling the upper 8 bytes
393
+ nk_vf32x4_t a_f32x4 = vec_xl_len((nk_f32_t *)a.f32s, 8);
394
+ nk_vf32x4_t b_f32x4 = vec_xl_len((nk_f32_t *)b.f32s, 8);
395
+ // Widen even lanes (the two f32 values) → f64x2
396
+ nk_vf64x2_t a_f64x2 = vec_doublee(a_f32x4);
397
+ nk_vf64x2_t b_f64x2 = vec_doublee(b_f32x4);
398
+ // Permute to get {lane0, lane2} → {a[0], a[1]} as f64x2
399
+ a_f64x2 = vec_xxpermdi(a_f64x2, vec_doubleo(a_f32x4), 0);
400
+ b_f64x2 = vec_xxpermdi(b_f64x2, vec_doubleo(b_f32x4), 0);
401
+ state->sum_f64x2 = vec_madd(a_f64x2, b_f64x2, state->sum_f64x2);
402
+ }
403
+
404
+ NK_INTERNAL void nk_dot_f32x2_finalize_powervsx( //
405
+ nk_dot_f32x2_state_powervsx_t const *state_a, nk_dot_f32x2_state_powervsx_t const *state_b, //
406
+ nk_dot_f32x2_state_powervsx_t const *state_c, nk_dot_f32x2_state_powervsx_t const *state_d, //
407
+ nk_size_t total_dimensions, nk_b256_vec_t *result) {
408
+ nk_unused_(total_dimensions);
409
+ nk_vf64x2_t sum_a_f64x2 = vec_add(state_a->sum_f64x2, vec_xxpermdi(state_a->sum_f64x2, state_a->sum_f64x2, 2));
410
+ nk_vf64x2_t sum_b_f64x2 = vec_add(state_b->sum_f64x2, vec_xxpermdi(state_b->sum_f64x2, state_b->sum_f64x2, 2));
411
+ nk_vf64x2_t sum_c_f64x2 = vec_add(state_c->sum_f64x2, vec_xxpermdi(state_c->sum_f64x2, state_c->sum_f64x2, 2));
412
+ nk_vf64x2_t sum_d_f64x2 = vec_add(state_d->sum_f64x2, vec_xxpermdi(state_d->sum_f64x2, state_d->sum_f64x2, 2));
413
+ result->vf64x2s[0] = vec_xxpermdi(sum_a_f64x2, sum_b_f64x2, 0);
414
+ result->vf64x2s[1] = vec_xxpermdi(sum_c_f64x2, sum_d_f64x2, 0);
415
+ }
416
+
417
+ /**
418
+ * @brief Running state for 128-bit dot accumulation over f64 scalars on Power VSX.
419
+ *
420
+ * Uses the Dot2 algorithm (Ogita-Rump-Oishi 2005) for compensated dot product.
421
+ */
422
+ typedef struct nk_dot_f64x2_state_powervsx_t {
423
+ nk_vf64x2_t sum_f64x2;
424
+ nk_vf64x2_t compensation_f64x2;
425
+ } nk_dot_f64x2_state_powervsx_t;
426
+
427
+ NK_INTERNAL void nk_dot_f64x2_init_powervsx(nk_dot_f64x2_state_powervsx_t *state) {
428
+ state->sum_f64x2 = vec_splats((nk_f64_t)0);
429
+ state->compensation_f64x2 = vec_splats((nk_f64_t)0);
430
+ }
431
+
432
+ NK_INTERNAL void nk_dot_f64x2_update_powervsx(nk_dot_f64x2_state_powervsx_t *state, nk_b128_vec_t a, nk_b128_vec_t b,
433
+ nk_size_t depth_offset, nk_size_t active_dimensions) {
434
+ nk_unused_(depth_offset);
435
+ nk_unused_(active_dimensions);
436
+ nk_vf64x2_t sum_f64x2 = state->sum_f64x2;
437
+ nk_vf64x2_t compensation_f64x2 = state->compensation_f64x2;
438
+ nk_vf64x2_t a_f64x2 = a.vf64x2;
439
+ nk_vf64x2_t b_f64x2 = b.vf64x2;
440
+
441
+ // TwoProd: product = a × b, error = msub(a, b, product) captures rounding error
442
+ nk_vf64x2_t product_f64x2 = vec_mul(a_f64x2, b_f64x2);
443
+ nk_vf64x2_t product_error_f64x2 = vec_msub(a_f64x2, b_f64x2, product_f64x2);
444
+
445
+ // TwoSum: (t, q) = TwoSum(sum, product) where t = sum + product rounded, q = error
446
+ nk_vf64x2_t tentative_sum_f64x2 = vec_add(sum_f64x2, product_f64x2);
447
+ nk_vf64x2_t virtual_addend_f64x2 = vec_sub(tentative_sum_f64x2, sum_f64x2);
448
+ nk_vf64x2_t sum_error_f64x2 = vec_add(vec_sub(sum_f64x2, vec_sub(tentative_sum_f64x2, virtual_addend_f64x2)),
449
+ vec_sub(product_f64x2, virtual_addend_f64x2));
450
+
451
+ // Update: sum = t, compensation += q + r
452
+ state->sum_f64x2 = tentative_sum_f64x2;
453
+ state->compensation_f64x2 = vec_add(compensation_f64x2, vec_add(sum_error_f64x2, product_error_f64x2));
454
+ }
455
+
456
+ NK_INTERNAL void nk_dot_f64x2_finalize_powervsx( //
457
+ nk_dot_f64x2_state_powervsx_t const *state_a, nk_dot_f64x2_state_powervsx_t const *state_b, //
458
+ nk_dot_f64x2_state_powervsx_t const *state_c, nk_dot_f64x2_state_powervsx_t const *state_d, //
459
+ nk_size_t total_dimensions, nk_b256_vec_t *result) {
460
+ nk_unused_(total_dimensions);
461
+ // Compensated horizontal reduction preserving Dot2 error tracking per state
462
+ result->f64s[0] = nk_dot_stable_sum_f64x2_powervsx_(state_a->sum_f64x2, state_a->compensation_f64x2);
463
+ result->f64s[1] = nk_dot_stable_sum_f64x2_powervsx_(state_b->sum_f64x2, state_b->compensation_f64x2);
464
+ result->f64s[2] = nk_dot_stable_sum_f64x2_powervsx_(state_c->sum_f64x2, state_c->compensation_f64x2);
465
+ result->f64s[3] = nk_dot_stable_sum_f64x2_powervsx_(state_d->sum_f64x2, state_d->compensation_f64x2);
466
+ }
467
+
468
+ /**
469
+ * @brief Running state for 128-bit dot accumulation over bf16 scalars on Power VSX.
470
+ *
471
+ * Processes 8 bf16 values at a time (128 bits), converting to f32 via vec_mergeh/mergel
472
+ * with zero for accumulation.
473
+ */
474
+ typedef struct nk_dot_bf16x8_state_powervsx_t {
475
+ nk_vf32x4_t sum_f32x4;
476
+ } nk_dot_bf16x8_state_powervsx_t;
477
+
478
+ NK_INTERNAL void nk_dot_bf16x8_init_powervsx(nk_dot_bf16x8_state_powervsx_t *state) {
479
+ state->sum_f32x4 = vec_splats((nk_f32_t)0);
480
+ }
481
+
482
+ NK_INTERNAL void nk_dot_bf16x8_update_powervsx(nk_dot_bf16x8_state_powervsx_t *state, nk_b128_vec_t a, nk_b128_vec_t b,
483
+ nk_size_t depth_offset, nk_size_t active_dimensions) {
484
+ nk_unused_(depth_offset);
485
+ nk_unused_(active_dimensions);
486
+ // Convert bf16 → f32 inline: merge with zero puts bf16 bits in upper 16 of each f32
487
+ nk_vu16x8_t zero_u16x8 = vec_splats((nk_u16_t)0);
488
+ nk_vu16x8_t a_u16x8 = a.vu16x8;
489
+ nk_vu16x8_t b_u16x8 = b.vu16x8;
490
+ nk_vf32x4_t a_high_f32x4 = (nk_vf32x4_t)vec_mergeh(zero_u16x8, a_u16x8);
491
+ nk_vf32x4_t a_low_f32x4 = (nk_vf32x4_t)vec_mergel(zero_u16x8, a_u16x8);
492
+ nk_vf32x4_t b_high_f32x4 = (nk_vf32x4_t)vec_mergeh(zero_u16x8, b_u16x8);
493
+ nk_vf32x4_t b_low_f32x4 = (nk_vf32x4_t)vec_mergel(zero_u16x8, b_u16x8);
494
+ state->sum_f32x4 = vec_madd(a_high_f32x4, b_high_f32x4, state->sum_f32x4);
495
+ state->sum_f32x4 = vec_madd(a_low_f32x4, b_low_f32x4, state->sum_f32x4);
496
+ }
497
+
498
+ NK_INTERNAL void nk_dot_bf16x8_finalize_powervsx( //
499
+ nk_dot_bf16x8_state_powervsx_t const *state_a, nk_dot_bf16x8_state_powervsx_t const *state_b, //
500
+ nk_dot_bf16x8_state_powervsx_t const *state_c, nk_dot_bf16x8_state_powervsx_t const *state_d, //
501
+ nk_size_t total_dimensions, nk_b128_vec_t *result) {
502
+ nk_unused_(total_dimensions);
503
+ nk_vf32x4_t a_f32x4 = state_a->sum_f32x4, b_f32x4 = state_b->sum_f32x4, c_f32x4 = state_c->sum_f32x4,
504
+ d_f32x4 = state_d->sum_f32x4;
505
+ nk_vf32x4_t transpose_ab_low_f32x4 = vec_mergeh(a_f32x4, b_f32x4);
506
+ nk_vf32x4_t transpose_cd_low_f32x4 = vec_mergeh(c_f32x4, d_f32x4);
507
+ nk_vf32x4_t transpose_ab_high_f32x4 = vec_mergel(a_f32x4, b_f32x4);
508
+ nk_vf32x4_t transpose_cd_high_f32x4 = vec_mergel(c_f32x4, d_f32x4);
509
+ nk_vf32x4_t sum_lane0_f32x4 = (nk_vf32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_f32x4,
510
+ (nk_vu64x2_t)transpose_cd_low_f32x4, 0);
511
+ nk_vf32x4_t sum_lane1_f32x4 = (nk_vf32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_f32x4,
512
+ (nk_vu64x2_t)transpose_cd_low_f32x4, 3);
513
+ nk_vf32x4_t sum_lane2_f32x4 = (nk_vf32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_f32x4,
514
+ (nk_vu64x2_t)transpose_cd_high_f32x4, 0);
515
+ nk_vf32x4_t sum_lane3_f32x4 = (nk_vf32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_f32x4,
516
+ (nk_vu64x2_t)transpose_cd_high_f32x4, 3);
517
+ result->vf32x4 = vec_add(vec_add(sum_lane0_f32x4, sum_lane1_f32x4), vec_add(sum_lane2_f32x4, sum_lane3_f32x4));
518
+ }
519
+
520
+ /**
521
+ * @brief Running state for 128-bit dot accumulation over f16 scalars on Power VSX.
522
+ *
523
+ * Processes 8 f16 values at a time (128 bits), converting to f32 via
524
+ * vec_extract_fp32_from_shorth/shortl for accumulation.
525
+ */
526
+ typedef struct nk_dot_f16x8_state_powervsx_t {
527
+ nk_vf32x4_t sum_f32x4;
528
+ } nk_dot_f16x8_state_powervsx_t;
529
+
530
+ NK_INTERNAL void nk_dot_f16x8_init_powervsx(nk_dot_f16x8_state_powervsx_t *state) {
531
+ state->sum_f32x4 = vec_splats((nk_f32_t)0);
532
+ }
533
+
534
+ NK_INTERNAL void nk_dot_f16x8_update_powervsx(nk_dot_f16x8_state_powervsx_t *state, nk_b128_vec_t a, nk_b128_vec_t b,
535
+ nk_size_t depth_offset, nk_size_t active_dimensions) {
536
+ nk_unused_(depth_offset);
537
+ nk_unused_(active_dimensions);
538
+ // Convert f16 → f32 via hardware XVCVHPSP
539
+ nk_vu16x8_t a_u16x8 = a.vu16x8;
540
+ nk_vu16x8_t b_u16x8 = b.vu16x8;
541
+ nk_vf32x4_t a_high_f32x4 = vec_extract_fp32_from_shorth(a_u16x8);
542
+ nk_vf32x4_t a_low_f32x4 = vec_extract_fp32_from_shortl(a_u16x8);
543
+ nk_vf32x4_t b_high_f32x4 = vec_extract_fp32_from_shorth(b_u16x8);
544
+ nk_vf32x4_t b_low_f32x4 = vec_extract_fp32_from_shortl(b_u16x8);
545
+ state->sum_f32x4 = vec_madd(a_high_f32x4, b_high_f32x4, state->sum_f32x4);
546
+ state->sum_f32x4 = vec_madd(a_low_f32x4, b_low_f32x4, state->sum_f32x4);
547
+ }
548
+
549
+ NK_INTERNAL void nk_dot_f16x8_finalize_powervsx( //
550
+ nk_dot_f16x8_state_powervsx_t const *state_a, nk_dot_f16x8_state_powervsx_t const *state_b, //
551
+ nk_dot_f16x8_state_powervsx_t const *state_c, nk_dot_f16x8_state_powervsx_t const *state_d, //
552
+ nk_size_t total_dimensions, nk_b128_vec_t *result) {
553
+ nk_unused_(total_dimensions);
554
+ nk_vf32x4_t a_f32x4 = state_a->sum_f32x4, b_f32x4 = state_b->sum_f32x4, c_f32x4 = state_c->sum_f32x4,
555
+ d_f32x4 = state_d->sum_f32x4;
556
+ nk_vf32x4_t transpose_ab_low_f32x4 = vec_mergeh(a_f32x4, b_f32x4);
557
+ nk_vf32x4_t transpose_cd_low_f32x4 = vec_mergeh(c_f32x4, d_f32x4);
558
+ nk_vf32x4_t transpose_ab_high_f32x4 = vec_mergel(a_f32x4, b_f32x4);
559
+ nk_vf32x4_t transpose_cd_high_f32x4 = vec_mergel(c_f32x4, d_f32x4);
560
+ nk_vf32x4_t sum_lane0_f32x4 = (nk_vf32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_f32x4,
561
+ (nk_vu64x2_t)transpose_cd_low_f32x4, 0);
562
+ nk_vf32x4_t sum_lane1_f32x4 = (nk_vf32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_f32x4,
563
+ (nk_vu64x2_t)transpose_cd_low_f32x4, 3);
564
+ nk_vf32x4_t sum_lane2_f32x4 = (nk_vf32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_f32x4,
565
+ (nk_vu64x2_t)transpose_cd_high_f32x4, 0);
566
+ nk_vf32x4_t sum_lane3_f32x4 = (nk_vf32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_f32x4,
567
+ (nk_vu64x2_t)transpose_cd_high_f32x4, 3);
568
+ result->vf32x4 = vec_add(vec_add(sum_lane0_f32x4, sum_lane1_f32x4), vec_add(sum_lane2_f32x4, sum_lane3_f32x4));
569
+ }
570
+
571
+ /**
572
+ * @brief Running state for 128-bit dot accumulation over i8 scalars on Power VSX.
573
+ *
574
+ * Algebraic transform: a·b = a·(b⊕0x80) − 128·Σa. Uses VMSUMMBM (i8×u8 → i32) for the biased
575
+ * product. Correction is applied at finalize using precomputed column sums from the compensated
576
+ * macro infrastructure.
577
+ */
578
+ typedef struct nk_dot_i8x16_state_powervsx_t {
579
+ nk_vi32x4_t biased_sum_i32x4;
580
+ } nk_dot_i8x16_state_powervsx_t;
581
+
582
+ NK_INTERNAL void nk_dot_i8x16_init_powervsx(nk_dot_i8x16_state_powervsx_t *state) {
583
+ state->biased_sum_i32x4 = vec_splats((nk_i32_t)0);
584
+ }
585
+
586
+ NK_INTERNAL void nk_dot_i8x16_update_powervsx(nk_dot_i8x16_state_powervsx_t *state, nk_b128_vec_t a, nk_b128_vec_t b,
587
+ nk_size_t depth_offset, nk_size_t active_dimensions) {
588
+ nk_unused_(depth_offset);
589
+ nk_unused_(active_dimensions);
590
+ // VMSUMMBM(b, a⊕0x80) = Σ(b_i · (a_i+128)) = a·b + 128·Σb
591
+ // Swapping operands: b in signed slot, biased a in unsigned slot.
592
+ // Correction −128·Σb uses precomputed B column sums from the compensated macro.
593
+ nk_vu8x16_t const bias_u8x16 = vec_splats((nk_u8_t)0x80);
594
+ nk_vu8x16_t a_biased_u8x16 = vec_xor(a.vu8x16, bias_u8x16);
595
+ state->biased_sum_i32x4 = vec_msum(b.vi8x16, a_biased_u8x16, state->biased_sum_i32x4);
596
+ }
597
+
598
+ NK_INTERNAL void nk_dot_i8x16_finalize_powervsx( //
599
+ nk_dot_i8x16_state_powervsx_t const *state_a, nk_dot_i8x16_state_powervsx_t const *state_b, //
600
+ nk_dot_i8x16_state_powervsx_t const *state_c, nk_dot_i8x16_state_powervsx_t const *state_d, //
601
+ nk_size_t total_dimensions, //
602
+ nk_i32_t a_sum, nk_b128_vec_t b_sums, nk_b128_vec_t *result) {
603
+ nk_unused_(total_dimensions);
604
+ nk_unused_(a_sum);
605
+
606
+ // Transpose-reduce biased products across 4 accumulators → one i32x4
607
+ nk_vi32x4_t a_i32x4 = state_a->biased_sum_i32x4, b_i32x4 = state_b->biased_sum_i32x4,
608
+ c_i32x4 = state_c->biased_sum_i32x4, d_i32x4 = state_d->biased_sum_i32x4;
609
+ nk_vi32x4_t transpose_ab_low_i32x4 = vec_mergeh(a_i32x4, b_i32x4);
610
+ nk_vi32x4_t transpose_cd_low_i32x4 = vec_mergeh(c_i32x4, d_i32x4);
611
+ nk_vi32x4_t transpose_ab_high_i32x4 = vec_mergel(a_i32x4, b_i32x4);
612
+ nk_vi32x4_t transpose_cd_high_i32x4 = vec_mergel(c_i32x4, d_i32x4);
613
+ nk_vi32x4_t sum_lane0_i32x4 = (nk_vi32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_i32x4,
614
+ (nk_vu64x2_t)transpose_cd_low_i32x4, 0);
615
+ nk_vi32x4_t sum_lane1_i32x4 = (nk_vi32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_i32x4,
616
+ (nk_vu64x2_t)transpose_cd_low_i32x4, 3);
617
+ nk_vi32x4_t sum_lane2_i32x4 = (nk_vi32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_i32x4,
618
+ (nk_vu64x2_t)transpose_cd_high_i32x4, 0);
619
+ nk_vi32x4_t sum_lane3_i32x4 = (nk_vi32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_i32x4,
620
+ (nk_vu64x2_t)transpose_cd_high_i32x4, 3);
621
+ nk_vi32x4_t biased_i32x4 = vec_add(vec_add(sum_lane0_i32x4, sum_lane1_i32x4),
622
+ vec_add(sum_lane2_i32x4, sum_lane3_i32x4));
623
+
624
+ // Correction: VMSUMMBM(b, a⊕0x80) = Σ(b_i·(a_i+128)) = a·b + 128·Σb
625
+ // So a·b = biased − 128·Σb. B column sums are precomputed during packing.
626
+ nk_vu32x4_t shift_u32x4 = vec_splats((nk_u32_t)7);
627
+ nk_vi32x4_t correction_i32x4 = (nk_vi32x4_t)vec_sl((nk_vu32x4_t)b_sums.vi32x4, shift_u32x4);
628
+ result->vi32x4 = vec_sub(biased_i32x4, correction_i32x4);
629
+ }
630
+
631
+ /** @brief Running state for i8 column sum precomputation on Power VSX. */
632
+ typedef struct nk_sum_i8x16_state_powervsx_t {
633
+ nk_vu32x4_t biased_sum_u32x4;
634
+ } nk_sum_i8x16_state_powervsx_t;
635
+
636
+ NK_INTERNAL void nk_sum_i8x16_init_powervsx(nk_sum_i8x16_state_powervsx_t *state) {
637
+ state->biased_sum_u32x4 = vec_splats((nk_u32_t)0);
638
+ }
639
+
640
+ NK_INTERNAL void nk_sum_i8x16_update_powervsx(nk_sum_i8x16_state_powervsx_t *state, nk_b128_vec_t values_vec) {
641
+ nk_vu8x16_t const bias_u8x16 = vec_splats((nk_u8_t)0x80);
642
+ nk_vu8x16_t biased_u8x16 = vec_xor(values_vec.vu8x16, bias_u8x16);
643
+ state->biased_sum_u32x4 = vec_sum4s(biased_u8x16, state->biased_sum_u32x4);
644
+ }
645
+
646
+ NK_INTERNAL nk_i32_t nk_sum_i8x16_finalize_powervsx(nk_sum_i8x16_state_powervsx_t const *state, nk_size_t count) {
647
+ nk_u32_t biased_sum = nk_hsum_u32x4_powervsx_(state->biased_sum_u32x4);
648
+ return (nk_i32_t)((nk_i64_t)biased_sum - 128 * (nk_i64_t)count);
649
+ }
650
+
651
+ /**
652
+ * @brief Running state for 128-bit dot accumulation over u8 scalars on Power VSX.
653
+ *
654
+ * Processes 16 u8 values at a time via vec_msum, accumulating into 4 u32 lanes.
655
+ */
656
+ typedef struct nk_dot_u8x16_state_powervsx_t {
657
+ nk_vu32x4_t sum_u32x4;
658
+ } nk_dot_u8x16_state_powervsx_t;
659
+
660
+ NK_INTERNAL void nk_dot_u8x16_init_powervsx(nk_dot_u8x16_state_powervsx_t *state) {
661
+ state->sum_u32x4 = vec_splats((nk_u32_t)0);
662
+ }
663
+
664
+ NK_INTERNAL void nk_dot_u8x16_update_powervsx(nk_dot_u8x16_state_powervsx_t *state, nk_b128_vec_t a, nk_b128_vec_t b,
665
+ nk_size_t depth_offset, nk_size_t active_dimensions) {
666
+ nk_unused_(depth_offset);
667
+ nk_unused_(active_dimensions);
668
+ // Unsigned × unsigned multiply-sum: 16 u8 products accumulated into 4 u32 lanes
669
+ nk_vu8x16_t a_u8x16 = a.vu8x16;
670
+ nk_vu8x16_t b_u8x16 = b.vu8x16;
671
+ state->sum_u32x4 = vec_msum(a_u8x16, b_u8x16, state->sum_u32x4);
672
+ }
673
+
674
+ NK_INTERNAL void nk_dot_u8x16_finalize_powervsx( //
675
+ nk_dot_u8x16_state_powervsx_t const *state_a, nk_dot_u8x16_state_powervsx_t const *state_b, //
676
+ nk_dot_u8x16_state_powervsx_t const *state_c, nk_dot_u8x16_state_powervsx_t const *state_d, //
677
+ nk_size_t total_dimensions, nk_b128_vec_t *result) {
678
+ nk_unused_(total_dimensions);
679
+ nk_vu32x4_t a_u32x4 = state_a->sum_u32x4, b_u32x4 = state_b->sum_u32x4, c_u32x4 = state_c->sum_u32x4,
680
+ d_u32x4 = state_d->sum_u32x4;
681
+ nk_vu32x4_t transpose_ab_low_u32x4 = vec_mergeh(a_u32x4, b_u32x4);
682
+ nk_vu32x4_t transpose_cd_low_u32x4 = vec_mergeh(c_u32x4, d_u32x4);
683
+ nk_vu32x4_t transpose_ab_high_u32x4 = vec_mergel(a_u32x4, b_u32x4);
684
+ nk_vu32x4_t transpose_cd_high_u32x4 = vec_mergel(c_u32x4, d_u32x4);
685
+ nk_vu32x4_t sum_lane0_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_u32x4,
686
+ (nk_vu64x2_t)transpose_cd_low_u32x4, 0);
687
+ nk_vu32x4_t sum_lane1_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_low_u32x4,
688
+ (nk_vu64x2_t)transpose_cd_low_u32x4, 3);
689
+ nk_vu32x4_t sum_lane2_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_u32x4,
690
+ (nk_vu64x2_t)transpose_cd_high_u32x4, 0);
691
+ nk_vu32x4_t sum_lane3_u32x4 = (nk_vu32x4_t)vec_xxpermdi((nk_vu64x2_t)transpose_ab_high_u32x4,
692
+ (nk_vu64x2_t)transpose_cd_high_u32x4, 3);
693
+ result->vu32x4 = vec_add(vec_add(sum_lane0_u32x4, sum_lane1_u32x4), vec_add(sum_lane2_u32x4, sum_lane3_u32x4));
694
+ }
695
+
696
+ /**
697
+ * @brief Running state for 128-bit binary dot accumulation on Power VSX.
698
+ *
699
+ * Processes 128 bits (16 bytes) at a time via AND + doubleword popcount (vpopcntd),
700
+ * accumulating bit-match counts into 2 u64 lanes.
701
+ */
702
+ typedef struct nk_dot_u1x128_state_powervsx_t {
703
+ nk_vu64x2_t dot_count_u64x2;
704
+ } nk_dot_u1x128_state_powervsx_t;
705
+
706
+ NK_INTERNAL void nk_dot_u1x128_init_powervsx(nk_dot_u1x128_state_powervsx_t *state) {
707
+ state->dot_count_u64x2 = vec_splats((nk_u64_t)0);
708
+ }
709
+
710
+ NK_INTERNAL void nk_dot_u1x128_update_powervsx(nk_dot_u1x128_state_powervsx_t *state, nk_b128_vec_t a, nk_b128_vec_t b,
711
+ nk_size_t depth_offset, nk_size_t active_dimensions) {
712
+ nk_unused_(depth_offset);
713
+ nk_unused_(active_dimensions);
714
+ // AND → doubleword popcount (vpopcntd, 3cy ALU) → vec_add (7cy DP)
715
+ // Simpler data flow than vpopcntb + vec_sum4s, and u64 accumulator holds larger counts
716
+ nk_vu8x16_t a_u8x16 = a.vu8x16;
717
+ nk_vu8x16_t b_u8x16 = b.vu8x16;
718
+ nk_vu8x16_t and_u8x16 = vec_and(a_u8x16, b_u8x16);
719
+ nk_vu64x2_t popcnt_u64x2 = vec_popcnt((nk_vu64x2_t)and_u8x16);
720
+ state->dot_count_u64x2 = vec_add(state->dot_count_u64x2, popcnt_u64x2);
721
+ }
722
+
723
+ NK_INTERNAL void nk_dot_u1x128_finalize_powervsx( //
724
+ nk_dot_u1x128_state_powervsx_t const *state_a, nk_dot_u1x128_state_powervsx_t const *state_b, //
725
+ nk_dot_u1x128_state_powervsx_t const *state_c, nk_dot_u1x128_state_powervsx_t const *state_d, //
726
+ nk_size_t total_dimensions, nk_b128_vec_t *result) {
727
+ nk_unused_(total_dimensions);
728
+ nk_vu64x2_t sum_a_u64x2 = vec_add(state_a->dot_count_u64x2,
729
+ vec_xxpermdi(state_a->dot_count_u64x2, state_a->dot_count_u64x2, 2));
730
+ nk_vu64x2_t sum_b_u64x2 = vec_add(state_b->dot_count_u64x2,
731
+ vec_xxpermdi(state_b->dot_count_u64x2, state_b->dot_count_u64x2, 2));
732
+ nk_vu64x2_t sum_c_u64x2 = vec_add(state_c->dot_count_u64x2,
733
+ vec_xxpermdi(state_c->dot_count_u64x2, state_c->dot_count_u64x2, 2));
734
+ nk_vu64x2_t sum_d_u64x2 = vec_add(state_d->dot_count_u64x2,
735
+ vec_xxpermdi(state_d->dot_count_u64x2, state_d->dot_count_u64x2, 2));
736
+ nk_vu64x2_t ab_u64x2 = vec_xxpermdi(sum_a_u64x2, sum_b_u64x2, 0);
737
+ nk_vu64x2_t cd_u64x2 = vec_xxpermdi(sum_c_u64x2, sum_d_u64x2, 0);
738
+ result->vu32x4 = vec_pack(ab_u64x2, cd_u64x2);
739
+ }
740
+
741
+ #if defined(__clang__)
742
+ #pragma clang attribute pop
743
+ #elif defined(__GNUC__)
744
+ #pragma GCC pop_options
745
+ #endif
746
+
747
+ #if defined(__cplusplus)
748
+ } // extern "C"
749
+ #endif
750
+
751
+ #endif // NK_TARGET_POWERVSX
752
+ #endif // NK_DOT_POWERVSX_H