npm - numkong - Versions diffs - 7.0.0 → 7.4.2 - Mend

numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (315) hide show

package/README.md +197 -124
package/binding.gyp +34 -484
package/c/dispatch_bf16.c +59 -1
package/c/dispatch_e2m3.c +41 -8
package/c/dispatch_e3m2.c +49 -8
package/c/dispatch_e4m3.c +51 -9
package/c/dispatch_e5m2.c +45 -1
package/c/dispatch_f16.c +79 -26
package/c/dispatch_f16c.c +5 -5
package/c/dispatch_f32.c +56 -0
package/c/dispatch_f64.c +52 -0
package/c/dispatch_i4.c +3 -0
package/c/dispatch_i8.c +62 -3
package/c/dispatch_other.c +18 -0
package/c/dispatch_u1.c +54 -9
package/c/dispatch_u4.c +3 -0
package/c/dispatch_u8.c +64 -3
package/c/numkong.c +3 -0
package/include/README.md +79 -9
package/include/numkong/attention/sapphireamx.h +278 -276
package/include/numkong/attention/sme.h +983 -977
package/include/numkong/attention.h +1 -1
package/include/numkong/capabilities.h +289 -94
package/include/numkong/cast/README.md +40 -40
package/include/numkong/cast/diamond.h +64 -0
package/include/numkong/cast/haswell.h +42 -194
package/include/numkong/cast/icelake.h +42 -37
package/include/numkong/cast/loongsonasx.h +252 -0
package/include/numkong/cast/neon.h +216 -249
package/include/numkong/cast/powervsx.h +449 -0
package/include/numkong/cast/rvv.h +223 -274
package/include/numkong/cast/sapphire.h +18 -18
package/include/numkong/cast/serial.h +1018 -944
package/include/numkong/cast/skylake.h +82 -23
package/include/numkong/cast/v128relaxed.h +462 -105
package/include/numkong/cast.h +24 -0
package/include/numkong/cast.hpp +44 -0
package/include/numkong/curved/README.md +17 -17
package/include/numkong/curved/neon.h +131 -7
package/include/numkong/curved/neonbfdot.h +6 -7
package/include/numkong/curved/rvv.h +26 -26
package/include/numkong/curved/smef64.h +186 -182
package/include/numkong/curved.h +14 -18
package/include/numkong/dot/README.md +154 -137
package/include/numkong/dot/alder.h +43 -43
package/include/numkong/dot/diamond.h +158 -0
package/include/numkong/dot/genoa.h +4 -30
package/include/numkong/dot/haswell.h +215 -180
package/include/numkong/dot/icelake.h +190 -76
package/include/numkong/dot/loongsonasx.h +671 -0
package/include/numkong/dot/neon.h +124 -73
package/include/numkong/dot/neonbfdot.h +11 -12
package/include/numkong/dot/neonfhm.h +44 -46
package/include/numkong/dot/neonfp8.h +323 -0
package/include/numkong/dot/neonsdot.h +190 -76
package/include/numkong/dot/powervsx.h +752 -0
package/include/numkong/dot/rvv.h +92 -84
package/include/numkong/dot/rvvbf16.h +12 -12
package/include/numkong/dot/rvvhalf.h +12 -12
package/include/numkong/dot/sapphire.h +4 -4
package/include/numkong/dot/serial.h +66 -30
package/include/numkong/dot/sierra.h +31 -31
package/include/numkong/dot/skylake.h +142 -110
package/include/numkong/dot/sve.h +217 -177
package/include/numkong/dot/svebfdot.h +10 -10
package/include/numkong/dot/svehalf.h +85 -41
package/include/numkong/dot/svesdot.h +89 -0
package/include/numkong/dot/v128relaxed.h +124 -89
package/include/numkong/dot.h +114 -48
package/include/numkong/dots/README.md +203 -203
package/include/numkong/dots/alder.h +12 -9
package/include/numkong/dots/diamond.h +86 -0
package/include/numkong/dots/genoa.h +10 -4
package/include/numkong/dots/haswell.h +63 -48
package/include/numkong/dots/icelake.h +27 -18
package/include/numkong/dots/loongsonasx.h +176 -0
package/include/numkong/dots/neon.h +14 -11
package/include/numkong/dots/neonbfdot.h +4 -3
package/include/numkong/dots/neonfhm.h +11 -9
package/include/numkong/dots/neonfp8.h +99 -0
package/include/numkong/dots/neonsdot.h +48 -12
package/include/numkong/dots/powervsx.h +194 -0
package/include/numkong/dots/rvv.h +451 -344
package/include/numkong/dots/sapphireamx.h +1028 -984
package/include/numkong/dots/serial.h +213 -197
package/include/numkong/dots/sierra.h +10 -7
package/include/numkong/dots/skylake.h +47 -36
package/include/numkong/dots/sme.h +2001 -2364
package/include/numkong/dots/smebi32.h +175 -162
package/include/numkong/dots/smef64.h +328 -323
package/include/numkong/dots/v128relaxed.h +64 -41
package/include/numkong/dots.h +573 -293
package/include/numkong/dots.hpp +45 -43
package/include/numkong/each/README.md +133 -137
package/include/numkong/each/haswell.h +6 -6
package/include/numkong/each/icelake.h +7 -7
package/include/numkong/each/neon.h +76 -42
package/include/numkong/each/neonbfdot.h +11 -12
package/include/numkong/each/neonhalf.h +24 -116
package/include/numkong/each/rvv.h +28 -28
package/include/numkong/each/sapphire.h +27 -161
package/include/numkong/each/serial.h +6 -6
package/include/numkong/each/skylake.h +7 -7
package/include/numkong/each/v128relaxed.h +562 -0
package/include/numkong/each.h +148 -62
package/include/numkong/each.hpp +2 -2
package/include/numkong/geospatial/README.md +18 -18
package/include/numkong/geospatial/haswell.h +365 -325
package/include/numkong/geospatial/neon.h +350 -306
package/include/numkong/geospatial/rvv.h +4 -4
package/include/numkong/geospatial/skylake.h +376 -340
package/include/numkong/geospatial/v128relaxed.h +366 -327
package/include/numkong/geospatial.h +17 -17
package/include/numkong/matrix.hpp +4 -4
package/include/numkong/maxsim/README.md +14 -14
package/include/numkong/maxsim/alder.h +6 -6
package/include/numkong/maxsim/genoa.h +4 -4
package/include/numkong/maxsim/haswell.h +6 -6
package/include/numkong/maxsim/icelake.h +18 -18
package/include/numkong/maxsim/neonsdot.h +21 -21
package/include/numkong/maxsim/sapphireamx.h +14 -14
package/include/numkong/maxsim/serial.h +6 -6
package/include/numkong/maxsim/sme.h +221 -196
package/include/numkong/maxsim/v128relaxed.h +6 -6
package/include/numkong/mesh/README.md +62 -56
package/include/numkong/mesh/haswell.h +339 -464
package/include/numkong/mesh/neon.h +1100 -519
package/include/numkong/mesh/neonbfdot.h +36 -68
package/include/numkong/mesh/rvv.h +530 -435
package/include/numkong/mesh/serial.h +75 -91
package/include/numkong/mesh/skylake.h +1627 -302
package/include/numkong/mesh/v128relaxed.h +443 -330
package/include/numkong/mesh.h +63 -49
package/include/numkong/mesh.hpp +4 -4
package/include/numkong/numkong.h +3 -3
package/include/numkong/numkong.hpp +1 -0
package/include/numkong/probability/README.md +23 -19
package/include/numkong/probability/neon.h +82 -52
package/include/numkong/probability/rvv.h +28 -23
package/include/numkong/probability/serial.h +51 -39
package/include/numkong/probability.h +20 -23
package/include/numkong/random.h +1 -1
package/include/numkong/reduce/README.md +143 -138
package/include/numkong/reduce/alder.h +81 -77
package/include/numkong/reduce/haswell.h +222 -220
package/include/numkong/reduce/neon.h +629 -519
package/include/numkong/reduce/neonbfdot.h +7 -218
package/include/numkong/reduce/neonfhm.h +9 -381
package/include/numkong/reduce/neonsdot.h +9 -9
package/include/numkong/reduce/rvv.h +928 -802
package/include/numkong/reduce/serial.h +23 -27
package/include/numkong/reduce/sierra.h +20 -20
package/include/numkong/reduce/skylake.h +326 -324
package/include/numkong/reduce/v128relaxed.h +52 -52
package/include/numkong/reduce.h +4 -23
package/include/numkong/reduce.hpp +156 -11
package/include/numkong/scalar/README.md +6 -6
package/include/numkong/scalar/haswell.h +26 -17
package/include/numkong/scalar/loongsonasx.h +74 -0
package/include/numkong/scalar/neon.h +9 -9
package/include/numkong/scalar/powervsx.h +96 -0
package/include/numkong/scalar/rvv.h +2 -2
package/include/numkong/scalar/sapphire.h +21 -10
package/include/numkong/scalar/serial.h +21 -21
package/include/numkong/scalar.h +13 -0
package/include/numkong/set/README.md +28 -28
package/include/numkong/set/haswell.h +12 -12
package/include/numkong/set/icelake.h +14 -14
package/include/numkong/set/loongsonasx.h +181 -0
package/include/numkong/set/neon.h +17 -18
package/include/numkong/set/powervsx.h +326 -0
package/include/numkong/set/rvv.h +4 -4
package/include/numkong/set/serial.h +6 -6
package/include/numkong/set/sve.h +60 -59
package/include/numkong/set/v128relaxed.h +6 -6
package/include/numkong/set.h +21 -7
package/include/numkong/sets/README.md +26 -26
package/include/numkong/sets/loongsonasx.h +52 -0
package/include/numkong/sets/powervsx.h +65 -0
package/include/numkong/sets/smebi32.h +395 -364
package/include/numkong/sets.h +83 -40
package/include/numkong/sparse/README.md +4 -4
package/include/numkong/sparse/icelake.h +101 -101
package/include/numkong/sparse/serial.h +1 -1
package/include/numkong/sparse/sve2.h +137 -141
package/include/numkong/sparse/turin.h +12 -12
package/include/numkong/sparse.h +10 -10
package/include/numkong/spatial/README.md +230 -226
package/include/numkong/spatial/alder.h +113 -116
package/include/numkong/spatial/diamond.h +240 -0
package/include/numkong/spatial/genoa.h +0 -68
package/include/numkong/spatial/haswell.h +74 -55
package/include/numkong/spatial/icelake.h +539 -58
package/include/numkong/spatial/loongsonasx.h +483 -0
package/include/numkong/spatial/neon.h +125 -52
package/include/numkong/spatial/neonbfdot.h +8 -9
package/include/numkong/spatial/neonfp8.h +258 -0
package/include/numkong/spatial/neonsdot.h +180 -12
package/include/numkong/spatial/powervsx.h +738 -0
package/include/numkong/spatial/rvv.h +146 -139
package/include/numkong/spatial/rvvbf16.h +17 -12
package/include/numkong/spatial/rvvhalf.h +13 -10
package/include/numkong/spatial/serial.h +13 -12
package/include/numkong/spatial/sierra.h +232 -39
package/include/numkong/spatial/skylake.h +73 -74
package/include/numkong/spatial/sve.h +93 -72
package/include/numkong/spatial/svebfdot.h +29 -29
package/include/numkong/spatial/svehalf.h +52 -26
package/include/numkong/spatial/svesdot.h +142 -0
package/include/numkong/spatial/v128relaxed.h +293 -41
package/include/numkong/spatial.h +338 -82
package/include/numkong/spatials/README.md +194 -194
package/include/numkong/spatials/diamond.h +82 -0
package/include/numkong/spatials/haswell.h +2 -2
package/include/numkong/spatials/loongsonasx.h +153 -0
package/include/numkong/spatials/neonfp8.h +111 -0
package/include/numkong/spatials/neonsdot.h +34 -0
package/include/numkong/spatials/powervsx.h +153 -0
package/include/numkong/spatials/rvv.h +259 -243
package/include/numkong/spatials/sapphireamx.h +173 -173
package/include/numkong/spatials/serial.h +2 -2
package/include/numkong/spatials/skylake.h +2 -2
package/include/numkong/spatials/sme.h +590 -605
package/include/numkong/spatials/smef64.h +139 -130
package/include/numkong/spatials/v128relaxed.h +2 -2
package/include/numkong/spatials.h +820 -500
package/include/numkong/spatials.hpp +49 -48
package/include/numkong/tensor.hpp +406 -17
package/include/numkong/trigonometry/README.md +19 -19
package/include/numkong/trigonometry/haswell.h +402 -401
package/include/numkong/trigonometry/neon.h +386 -387
package/include/numkong/trigonometry/rvv.h +52 -51
package/include/numkong/trigonometry/serial.h +13 -13
package/include/numkong/trigonometry/skylake.h +373 -369
package/include/numkong/trigonometry/v128relaxed.h +375 -374
package/include/numkong/trigonometry.h +13 -13
package/include/numkong/trigonometry.hpp +2 -2
package/include/numkong/types.h +287 -49
package/include/numkong/types.hpp +436 -12
package/include/numkong/vector.hpp +82 -14
package/javascript/dist/cjs/numkong-wasm.js +6 -12
package/javascript/dist/cjs/numkong.d.ts +7 -1
package/javascript/dist/cjs/numkong.js +37 -11
package/javascript/dist/cjs/types.d.ts +9 -0
package/javascript/dist/cjs/types.js +96 -0
package/javascript/dist/esm/numkong-browser.d.ts +14 -0
package/javascript/dist/esm/numkong-browser.js +23 -0
package/javascript/dist/esm/numkong-wasm.js +6 -12
package/javascript/dist/esm/numkong.d.ts +7 -1
package/javascript/dist/esm/numkong.js +37 -11
package/javascript/dist/esm/types.d.ts +9 -0
package/javascript/dist/esm/types.js +96 -0
package/javascript/node-gyp-build.d.ts +4 -1
package/javascript/numkong-browser.ts +40 -0
package/javascript/numkong-wasm.ts +7 -13
package/javascript/numkong.c +5 -26
package/javascript/numkong.ts +36 -11
package/javascript/tsconfig-base.json +1 -0
package/javascript/tsconfig-cjs.json +6 -1
package/javascript/types.ts +110 -0
package/numkong.gypi +101 -0
package/package.json +34 -13
package/probes/arm_neon.c +8 -0
package/probes/arm_neon_bfdot.c +9 -0
package/probes/arm_neon_fhm.c +9 -0
package/probes/arm_neon_half.c +8 -0
package/probes/arm_neon_sdot.c +9 -0
package/probes/arm_neonfp8.c +9 -0
package/probes/arm_sme.c +16 -0
package/probes/arm_sme2.c +16 -0
package/probes/arm_sme2p1.c +16 -0
package/probes/arm_sme_bf16.c +16 -0
package/probes/arm_sme_bi32.c +16 -0
package/probes/arm_sme_f64.c +16 -0
package/probes/arm_sme_fa64.c +14 -0
package/probes/arm_sme_half.c +16 -0
package/probes/arm_sme_lut2.c +15 -0
package/probes/arm_sve.c +18 -0
package/probes/arm_sve2.c +20 -0
package/probes/arm_sve2p1.c +18 -0
package/probes/arm_sve_bfdot.c +20 -0
package/probes/arm_sve_half.c +18 -0
package/probes/arm_sve_sdot.c +21 -0
package/probes/loongarch_lasx.c +12 -0
package/probes/power_vsx.c +12 -0
package/probes/probe.js +127 -0
package/probes/riscv_rvv.c +14 -0
package/probes/riscv_rvv_bb.c +15 -0
package/probes/riscv_rvv_bf16.c +17 -0
package/probes/riscv_rvv_half.c +14 -0
package/probes/wasm_v128relaxed.c +11 -0
package/probes/x86_alder.c +17 -0
package/probes/x86_diamond.c +17 -0
package/probes/x86_genoa.c +17 -0
package/probes/x86_graniteamx.c +19 -0
package/probes/x86_haswell.c +11 -0
package/probes/x86_icelake.c +17 -0
package/probes/x86_sapphire.c +16 -0
package/probes/x86_sapphireamx.c +18 -0
package/probes/x86_sierra.c +17 -0
package/probes/x86_skylake.c +15 -0
package/probes/x86_turin.c +17 -0
package/wasm/numkong-emscripten.js +2 -0
package/wasm/numkong.d.ts +14 -0
package/wasm/numkong.js +1124 -0
package/wasm/numkong.wasm +0 -0
package/include/numkong/curved/neonhalf.h +0 -212
package/include/numkong/dot/neonhalf.h +0 -198
package/include/numkong/dots/neonhalf.h +0 -57
package/include/numkong/mesh/neonhalf.h +0 -616
package/include/numkong/reduce/neonhalf.h +0 -157
package/include/numkong/spatial/neonhalf.h +0 -118
package/include/numkong/spatial/sapphire.h +0 -343
package/include/numkong/spatials/neonhalf.h +0 -58
package/javascript/README.md +0 -246

package/include/numkong/cast/powervsx.h ADDED Viewed

@@ -0,0 +1,449 @@
+/**
+ *  @brief SIMD-accelerated Type Conversions for Power VSX.
+ *  @file include/numkong/cast/powervsx.h
+ *  @author Ash Vardanian
+ *  @date March 23, 2026
+ *
+ *  @sa include/numkong/cast.h
+ *
+ *  @section powervsx_cast_instructions Power VSX Conversion Instructions (POWER9+)
+ *
+ *  Float16 hardware conversion (POWER9+):
+ *
+ *      Intrinsic                        Instruction       Notes
+ *      vec_extract_fp32_from_shorth     xvcvhpsp          High 4 f16 → f32x4 (1 instruction!)
+ *      vec_extract_fp32_from_shortl     xvcvhpsp          Low 4 f16 → f32x4 (1 instruction!)
+ *
+ *  Scalar f16 ↔ f32 (POWER9 inline asm):
+ *
+ *      Instruction   Notes
+ *      lxsihzx       Load f16 → VSR (zero-extended)
+ *      xscvhpdp      Convert half → double precision
+ *      xscvdphp      Convert double → half precision
+ *      stxsihx       Store f16 from VSR
+ *
+ *  Scalar sqrt (POWER9 inline asm):
+ *
+ *      Instruction   Notes
+ *      xssqrtsp      Scalar single-precision sqrt
+ *      xssqrtdp      Scalar double-precision sqrt
+ *
+ *  Float ↔ integer conversions:
+ *
+ *      Intrinsic   Instruction   Notes
+ *      vec_cts     xvcvspsxws    f32x4 → i32x4 (truncation)
+ *      vec_ctu     xvcvspuxws    f32x4 → u32x4 (truncation)
+ *      vec_ctf     xvcvsxwsp     i32x4 → f32x4
+ *      vec_ctf     xvcvuxwsp     u32x4 → f32x4
+ *
+ *  Integer narrowing/widening:
+ *
+ *      Intrinsic     Instruction   Notes
+ *      vec_pack      vpkuwum       u32x4 → u16x8 (modular)
+ *      vec_packs     vpkswss       i32x4 → i16x8 (signed saturation)
+ *      vec_packsu    vpkswus       i32x4 → u16x8 (unsigned saturation from signed)
+ *      vec_unpackh   vupkhsh       i16x8 → i32x4 (sign-extend high half)
+ *      vec_mergeh    vmrghh        Interleave high halves (zero-extend via merge with zero)
+ *
+ *  Partial-length load:
+ *
+ *      Intrinsic     Instruction   Notes
+ *      vec_xl_len    lxvl          Load up to 16 bytes with runtime length (POWER9)
+ *
+ *  Load/store:
+ *
+ *      Intrinsic   Instruction   Notes
+ *      vec_xl      lxvd2x        Aligned/unaligned load
+ *      vec_xst     stxvd2x       Aligned/unaligned store
+ *
+ *  BF16 conversions use bit manipulation (no hardware support):
+ *  - bf16 → f32: zero-extend u16 → u32 via vec_mergeh with zero, reinterpret
+ *  - f32 → bf16: RNE rounding + vec_sr by 16 + vec_pack
+ *
+ *  FP8 (E4M3/E5M2) and FP6 (E2M3/E3M2) mini-float types have no Power hardware support.
+ *  The serial fallback in cast/serial.h is used for those formats.
+ */
+#ifndef NK_CAST_POWERVSX_H
+#define NK_CAST_POWERVSX_H
+#if NK_TARGET_POWER_
+#if NK_TARGET_POWERVSX
+#include "numkong/types.h"
+#include "numkong/cast/serial.h"   // `nk_cast_serial`, `nk_dtype_bits`
+#include "numkong/reduce/serial.h" // `nk_reduce_moments_f32_serial`
+// Power VSX vector typedefs — wrapping altivec built-in vector types.
+// These may move to `numkong/types.h` in the future.
+#ifndef NK_POWERVSX_TYPES_DEFINED_
+#define NK_POWERVSX_TYPES_DEFINED_
+#endif // NK_POWERVSX_TYPES_DEFINED_
+#if defined(__cplusplus)
+extern "C" {
+#endif
+#if defined(__clang__)
+#pragma clang attribute push(__attribute__((target("power9-vector"))), apply_to = function)
+#elif defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC target("power9-vector")
+#endif
+/** @brief Widen one f16 value to f32 by routing it through the POWER9 vector converter (xvcvhpsp).
+ *  A length-limited load reads only the 2 bytes at `source`; lane 0 of the converted vector is written out. */
+NK_PUBLIC void nk_f16_to_f32_powervsx(nk_f16_t const *source, nk_f32_t *destination) {
+    nk_vu16x8_t half_bits_u16x8 = (nk_vu16x8_t)vec_xl_len((nk_u8_t *)source, 2);
+    nk_vf32x4_t widened_f32x4 = vec_extract_fp32_from_shorth(half_bits_u16x8);
+    *destination = vec_extract(widened_f32x4, 0);
+}
+/** @brief Narrow one f32 value to f16 by routing it through the POWER9 vector converter (xvcvsphp).
+ *  The scalar is broadcast to all lanes, packed to half precision, and lane 0 is written out. */
+NK_PUBLIC void nk_f32_to_f16_powervsx(nk_f32_t const *source, nk_f16_t *destination) {
+    nk_vf32x4_t broadcast_f32x4 = vec_splats(*source);
+    nk_vu16x8_t halves_u16x8 = vec_pack_to_short_fp32(broadcast_f32x4, broadcast_f32x4);
+    *destination = vec_extract(halves_u16x8, 0);
+}
+/** @brief Load a full 16 bytes from `source` into a 128-bit vector, regardless of element type (Power VSX). */
+NK_INTERNAL void nk_load_b128_powervsx_(void const *source, nk_b128_vec_t *destination) {
+    nk_u8_t const *source_bytes = (nk_u8_t const *)source;
+    destination->vu8x16 = vec_xl(0, source_bytes);
+}
+/** @brief Load a full 32 bytes from `source` into a 256-bit vector as two 16-byte halves (Power VSX). */
+NK_INTERNAL void nk_load_b256_powervsx_(void const *source, nk_b256_vec_t *destination) {
+    nk_u8_t const *source_bytes = (nk_u8_t const *)source;
+    destination->vu8x16s[0] = vec_xl(0, source_bytes);  // bytes [0, 16)
+    destination->vu8x16s[1] = vec_xl(16, source_bytes); // bytes [16, 32)
+}
+/** @brief Store a full 128-bit vector as 16 bytes at `destination`, regardless of element type (Power VSX). */
+NK_INTERNAL void nk_store_b128_powervsx_(nk_b128_vec_t const *source, void *destination) {
+    nk_u8_t *destination_bytes = (nk_u8_t *)destination;
+    vec_xst(source->vu8x16, 0, destination_bytes);
+}
+/** @brief Store a full 256-bit vector as two consecutive 16-byte halves at `destination` (Power VSX). */
+NK_INTERNAL void nk_store_b256_powervsx_(nk_b256_vec_t const *source, void *destination) {
+    nk_u8_t *destination_bytes = (nk_u8_t *)destination;
+    vec_xst(source->vu8x16s[0], 0, destination_bytes);  // bytes [0, 16)
+    vec_xst(source->vu8x16s[1], 16, destination_bytes); // bytes [16, 32)
+}
+/** @brief Type-agnostic 64-bit load (Power VSX).
+ *  Copies exactly 8 bytes from `source` into `destination->u64`. A byte copy is used instead of
+ *  dereferencing `source` as `nk_u64_t`, so the load stays well-defined when `source` points at
+ *  narrower element types or is not 8-byte aligned. */
+NK_INTERNAL void nk_load_b64_powervsx_(void const *source, nk_b64_vec_t *destination) {
+    nk_copy_bytes_(&destination->u64, source, sizeof(nk_u64_t));
+}
+/** @brief Load `n` 64-bit elements (at most 4) into a 256-bit vector, zero-filling the rest.
+ *  The n×8 bytes are split across two length-limited `vec_xl_len` loads, one per 16-byte half.
+ *  A zero-length `vec_xl_len` yields a zero vector, so no branch is needed around the second load. */
+NK_INTERNAL void nk_partial_load_b64x4_powervsx_(void const *source, nk_b256_vec_t *destination, nk_size_t n) {
+    nk_u8_t *source_bytes = (nk_u8_t *)source;
+    nk_size_t total_bytes = n * 8;
+    nk_size_t low_bytes = total_bytes;
+    nk_size_t high_bytes = 0;
+    if (total_bytes > 16) {
+        low_bytes = 16;
+        high_bytes = total_bytes - 16;
+    }
+    destination->vu8x16s[0] = vec_xl_len(source_bytes, low_bytes);
+    destination->vu8x16s[1] = vec_xl_len(source_bytes + 16, high_bytes);
+}
+/** @brief Load `n` 64-bit elements (at most 2) into a 128-bit vector via a length-limited load of n×8 bytes. */
+NK_INTERNAL void nk_partial_load_b64x2_powervsx_(void const *source, nk_b128_vec_t *destination, nk_size_t n) {
+    nk_size_t byte_count = n * 8;
+    destination->vu8x16 = vec_xl_len((nk_u8_t *)source, byte_count);
+}
+/** @brief Load `n` 32-bit elements (at most 4) into a 128-bit vector via a length-limited load of n×4 bytes. */
+NK_INTERNAL void nk_partial_load_b32x4_powervsx_(void const *source, nk_b128_vec_t *destination, nk_size_t n) {
+    nk_size_t byte_count = n * 4;
+    destination->vu8x16 = vec_xl_len((nk_u8_t *)source, byte_count);
+}
+/** @brief Partial load for 32-bit elements (n elements, max 2) into 64-bit vector.
+ *  Copies n×4 bytes from `source`; the remaining bytes of `destination` are zeroed, matching the
+ *  zero-fill behaviour of the `vec_xl_len`-based partial loads for wider vectors. */
+NK_INTERNAL void nk_partial_load_b32x2_powervsx_(void const *source, nk_b64_vec_t *destination, nk_size_t n) {
+    // Clear first: a plain byte copy leaves the unread tail indeterminate when n < 2.
+    destination->u64 = 0;
+    nk_copy_bytes_(destination, source, n * 4);
+}
+/** @brief Load `n` 16-bit elements (at most 8) into a 128-bit vector via a length-limited load of n×2 bytes. */
+NK_INTERNAL void nk_partial_load_b16x8_powervsx_(void const *source, nk_b128_vec_t *destination, nk_size_t n) {
+    nk_size_t byte_count = n * 2;
+    destination->vu8x16 = vec_xl_len((nk_u8_t *)source, byte_count);
+}
+/** @brief Load `n` 8-bit elements (at most 16) into a 128-bit vector via a length-limited load of n bytes. */
+NK_INTERNAL void nk_partial_load_b8x16_powervsx_(void const *source, nk_b128_vec_t *destination, nk_size_t n) {
+    nk_u8_t *source_bytes = (nk_u8_t *)source;
+    destination->vu8x16 = vec_xl_len(source_bytes, n);
+}
+/** @brief Load `n_bits` 1-bit elements (at most 128) into a 128-bit vector.
+ *  The bit count is rounded up to whole bytes before the length-limited load. */
+NK_INTERNAL void nk_partial_load_b1x128_powervsx_(void const *source, nk_b128_vec_t *destination, nk_size_t n_bits) {
+    nk_size_t byte_count = nk_size_divide_round_up_(n_bits, 8);
+    destination->vu8x16 = vec_xl_len((nk_u8_t *)source, byte_count);
+}
+/** @brief Store the first `n` 64-bit elements (at most 4) of a 256-bit vector to `destination`.
+ *  The n×8 bytes are split across two length-limited `vec_xst_len` stores, one per 16-byte half.
+ *  A zero-length `vec_xst_len` writes nothing, so no branch is needed around the second store. */
+NK_INTERNAL void nk_partial_store_b64x4_powervsx_(nk_b256_vec_t const *source, void *destination, nk_size_t n) {
+    nk_u8_t *destination_bytes = (nk_u8_t *)destination;
+    nk_size_t total_bytes = n * 8;
+    nk_size_t low_bytes = total_bytes;
+    nk_size_t high_bytes = 0;
+    if (total_bytes > 16) {
+        low_bytes = 16;
+        high_bytes = total_bytes - 16;
+    }
+    vec_xst_len(source->vu8x16s[0], destination_bytes, low_bytes);
+    vec_xst_len(source->vu8x16s[1], destination_bytes + 16, high_bytes);
+}
+/** @brief Store the first `n` 32-bit elements (at most 4) of a 128-bit vector via a length-limited store. */
+NK_INTERNAL void nk_partial_store_b32x4_powervsx_(nk_b128_vec_t const *source, void *destination, nk_size_t n) {
+    nk_size_t byte_count = n * 4;
+    vec_xst_len(source->vu8x16, (nk_u8_t *)destination, byte_count);
+}
+/** @brief Widen four consecutive f16 values to an f32x4 using POWER9 hardware conversion (xvcvhpsp).
+ *  Exactly 8 bytes are read from `source` into a u16x8 register, then `vec_extract_fp32_from_shorth`
+ *  converts the four loaded halves. */
+NK_INTERNAL nk_vf32x4_t nk_f16x4_to_f32x4_powervsx_(nk_f16_t const *source) {
+    nk_vu16x8_t half_bits_u16x8 = (nk_vu16x8_t)vec_xl_len((nk_u8_t *)source, 8);
+    nk_vf32x4_t widened_f32x4 = vec_extract_fp32_from_shorth(half_bits_u16x8);
+    return widened_f32x4;
+}
+/** @brief Narrow an f32x4 to four f16 values using POWER9 hardware conversion (xvcvsphp).
+ *  `vec_pack_to_short_fp32` is fed the same vector twice; the first 64-bit lane of the packed
+ *  result is returned in a `nk_b64_vec_t`. */
+NK_INTERNAL nk_b64_vec_t nk_f32x4_to_f16x4_powervsx_(nk_vf32x4_t values_f32x4) {
+    nk_b64_vec_t result_vec;
+    nk_vu16x8_t halves_u16x8 = vec_pack_to_short_fp32(values_f32x4, values_f32x4);
+    nk_vu64x2_t halves_u64x2 = (nk_vu64x2_t)halves_u16x8;
+    result_vec.u64 = vec_extract(halves_u64x2, 0);
+    return result_vec;
+}
+/** @brief Widen four consecutive bf16 values to an f32x4 with pure bit manipulation (Power VSX).
+ *  A bf16 is the upper 16 bits of an f32, so each value is interleaved with a zero half-word
+ *  via `vec_mergeh` and the resulting 32-bit lanes are reinterpreted as floats. */
+NK_INTERNAL nk_vf32x4_t nk_bf16x4_to_f32x4_powervsx_(nk_bf16_t const *source) {
+    nk_vu16x8_t const zeros_u16x8 = vec_splats((nk_u16_t)0);
+    nk_vu16x8_t brain_bits_u16x8 = (nk_vu16x8_t)vec_xl_len((nk_u8_t *)source, 8);
+    nk_vu16x8_t interleaved_u16x8 = vec_mergeh(zeros_u16x8, brain_bits_u16x8);
+    return (nk_vf32x4_t)(nk_vu32x4_t)interleaved_u16x8;
+}
+/** @brief Narrow an f32x4 to bf16 with round-to-nearest-even, returned packed in a u16x8 (Power VSX).
+ *  Per lane: take bit 16 of the f32 pattern as the tie-breaker, add `0x7FFF + that bit`, then shift
+ *  right by 16. `vec_pack` narrows u32x4 → u16x8 using the same vector for both inputs; callers
+ *  read the four results from the low lanes. */
+NK_INTERNAL nk_vu16x8_t nk_f32x4_to_bf16_pack_powervsx_(nk_vf32x4_t values_f32x4) {
+    nk_vu32x4_t const sixteen_u32x4 = vec_splats((nk_u32_t)16);
+    nk_vu32x4_t raw_u32x4 = (nk_vu32x4_t)values_f32x4;
+    // Lowest bit that survives truncation; adding it makes exact ties round to even.
+    nk_vu32x4_t kept_lsb_u32x4 = vec_and(vec_sr(raw_u32x4, sixteen_u32x4), vec_splats((nk_u32_t)1));
+    nk_vu32x4_t bias_u32x4 = vec_add(vec_splats((nk_u32_t)0x7FFF), kept_lsb_u32x4);
+    nk_vu32x4_t rounded_u32x4 = vec_sr(vec_add(raw_u32x4, bias_u32x4), sixteen_u32x4);
+    return vec_pack(rounded_u32x4, rounded_u32x4);
+}
+/** @brief Narrow an f32x4 to four bf16 values with RNE rounding (Power VSX).
+ *  Delegates to `nk_f32x4_to_bf16_pack_powervsx_` and returns the first 64-bit lane as `nk_b64_vec_t`. */
+NK_INTERNAL nk_b64_vec_t nk_f32x4_to_bf16x4_powervsx_(nk_vf32x4_t values_f32x4) {
+    nk_vu16x8_t packed_u16x8 = nk_f32x4_to_bf16_pack_powervsx_(values_f32x4);
+    nk_b64_vec_t result_vec;
+    result_vec.u64 = vec_extract((nk_vu64x2_t)packed_u16x8, 0);
+    return result_vec;
+}
+/** @brief Widen four consecutive i16 values to an f32x4 (Power VSX).
+ *  Reads 8 bytes, sign-extends to i32x4 with `vec_unpackh`, then converts with `vec_ctf` (scale 0). */
+NK_INTERNAL nk_vf32x4_t nk_i16x4_to_f32x4_powervsx_(nk_i16_t const *source) {
+    nk_vi16x8_t narrow_i16x8 = (nk_vi16x8_t)vec_xl_len((nk_u8_t *)source, 8);
+    nk_vi32x4_t wide_i32x4 = vec_unpackh(narrow_i16x8);
+    nk_vf32x4_t result_f32x4 = vec_ctf(wide_i32x4, 0);
+    return result_f32x4;
+}
+/** @brief Widen four consecutive u16 values to an f32x4 (Power VSX).
+ *  Reads 8 bytes, zero-extends to u32x4 by interleaving with zeros via `vec_mergeh`,
+ *  then converts with `vec_ctf` (scale 0). */
+NK_INTERNAL nk_vf32x4_t nk_u16x4_to_f32x4_powervsx_(nk_u16_t const *source) {
+    nk_vu16x8_t const zeros_u16x8 = vec_splats((nk_u16_t)0);
+    nk_vu16x8_t narrow_u16x8 = (nk_vu16x8_t)vec_xl_len((nk_u8_t *)source, 8);
+    nk_vu32x4_t wide_u32x4 = (nk_vu32x4_t)vec_mergeh(narrow_u16x8, zeros_u16x8);
+    return vec_ctf(wide_u32x4, 0);
+}
+/** @brief Widen four consecutive i8 values to an f32x4 (Power VSX).
+ *  Reads 4 bytes, sign-extends twice with `vec_unpackh` (i8 → i16 → i32), then converts with `vec_ctf`. */
+NK_INTERNAL nk_vf32x4_t nk_i8x4_to_f32x4_powervsx_(void const *source) {
+    nk_vi8x16_t bytes_i8x16 = (nk_vi8x16_t)vec_xl_len((nk_u8_t *)source, 4);
+    nk_vi32x4_t wide_i32x4 = vec_unpackh(vec_unpackh(bytes_i8x16));
+    return vec_ctf(wide_i32x4, 0);
+}
+/** @brief Widen four consecutive u8 values to an f32x4 (Power VSX).
+ *  Reads 4 bytes, zero-extends twice by interleaving with zeros via `vec_mergeh`
+ *  (u8 → u16 → u32), then converts with `vec_ctf`. */
+NK_INTERNAL nk_vf32x4_t nk_u8x4_to_f32x4_powervsx_(void const *source) {
+    nk_vu8x16_t const zeros_u8x16 = vec_splats((nk_u8_t)0);
+    nk_vu16x8_t const zeros_u16x8 = vec_splats((nk_u16_t)0);
+    nk_vu8x16_t bytes_u8x16 = (nk_vu8x16_t)vec_xl_len((nk_u8_t *)source, 4);
+    nk_vu16x8_t shorts_u16x8 = (nk_vu16x8_t)vec_mergeh(bytes_u8x16, zeros_u8x16);
+    nk_vu32x4_t words_u32x4 = (nk_vu32x4_t)vec_mergeh(shorts_u16x8, zeros_u16x8);
+    return vec_ctf(words_u32x4, 0);
+}
+/** @brief Narrow an f32x4 to four i16 values with saturation (Power VSX).
+ *  Rounds with `vec_round`, converts with `vec_cts`, clamps to [-32768, 32767] via `vec_max`/`vec_min`,
+ *  narrows with the saturating `vec_packs`, and returns the first 64-bit lane. */
+NK_INTERNAL nk_b64_vec_t nk_f32x4_to_i16x4_powervsx_(nk_vf32x4_t values_f32x4) {
+    nk_vi32x4_t const lower_bound_i32x4 = vec_splats((nk_i32_t)-32768);
+    nk_vi32x4_t const upper_bound_i32x4 = vec_splats((nk_i32_t)32767);
+    nk_vi32x4_t rounded_i32x4 = vec_cts(vec_round(values_f32x4), 0);
+    nk_vi32x4_t clamped_i32x4 = vec_min(vec_max(rounded_i32x4, lower_bound_i32x4), upper_bound_i32x4);
+    // i32x4 → i16x8 with the same vector in both halves; only the first 8 bytes are kept
+    nk_vi16x8_t narrowed_i16x8 = vec_packs(clamped_i32x4, clamped_i32x4);
+    nk_b64_vec_t result_vec;
+    result_vec.u64 = vec_extract((nk_vu64x2_t)narrowed_i16x8, 0);
+    return result_vec;
+}
+/** @brief Narrow an f32x4 to four u16 values with saturation (Power VSX).
+ *  Clamps negatives to zero with a float `vec_max`, rounds with `vec_round`, converts with `vec_ctu`,
+ *  caps at 65535 with `vec_min`, narrows with `vec_pack`, and returns the first 64-bit lane. */
+NK_INTERNAL nk_b64_vec_t nk_f32x4_to_u16x4_powervsx_(nk_vf32x4_t values_f32x4) {
+    nk_vu32x4_t const upper_bound_u32x4 = vec_splats((nk_u32_t)65535);
+    nk_vf32x4_t non_negative_f32x4 = vec_max(values_f32x4, vec_splats(0.0f));
+    nk_vu32x4_t rounded_u32x4 = vec_ctu(vec_round(non_negative_f32x4), 0);
+    nk_vu32x4_t clamped_u32x4 = vec_min(rounded_u32x4, upper_bound_u32x4);
+    // u32x4 → u16x8 with the same vector in both halves; only the first 8 bytes are kept
+    nk_vu16x8_t narrowed_u16x8 = vec_pack(clamped_u32x4, clamped_u32x4);
+    nk_b64_vec_t result_vec;
+    result_vec.u64 = vec_extract((nk_vu64x2_t)narrowed_u16x8, 0);
+    return result_vec;
+}
+/** @brief Narrow an f32x4 to four i8 values with saturation (Power VSX).
+ *  Rounds with `vec_round`, converts with `vec_cts`, clamps to [-128, 127] via `vec_max`/`vec_min`,
+ *  narrows twice with the saturating `vec_packs`, and returns the first 32-bit lane. */
+NK_INTERNAL nk_b32_vec_t nk_f32x4_to_i8x4_powervsx_(nk_vf32x4_t values_f32x4) {
+    nk_vi32x4_t const lower_bound_i32x4 = vec_splats((nk_i32_t)-128);
+    nk_vi32x4_t const upper_bound_i32x4 = vec_splats((nk_i32_t)127);
+    nk_vi32x4_t rounded_i32x4 = vec_cts(vec_round(values_f32x4), 0);
+    nk_vi32x4_t clamped_i32x4 = vec_min(vec_max(rounded_i32x4, lower_bound_i32x4), upper_bound_i32x4);
+    // i32x4 → i16x8 → i8x16; only the first 4 bytes are kept
+    nk_vi16x8_t shorts_i16x8 = vec_packs(clamped_i32x4, clamped_i32x4);
+    nk_vi8x16_t bytes_i8x16 = vec_packs(shorts_i16x8, shorts_i16x8);
+    nk_b32_vec_t result_vec;
+    result_vec.u32 = vec_extract((nk_vu32x4_t)bytes_i8x16, 0);
+    return result_vec;
+}
+/** @brief Narrow an f32x4 to four u8 values with saturation (Power VSX).
+ *  Clamps negatives to zero with a float `vec_max`, rounds with `vec_round`, converts with `vec_ctu`,
+ *  caps at 255 with `vec_min`, narrows twice with `vec_pack`, and returns the first 32-bit lane. */
+NK_INTERNAL nk_b32_vec_t nk_f32x4_to_u8x4_powervsx_(nk_vf32x4_t values_f32x4) {
+    nk_vu32x4_t const upper_bound_u32x4 = vec_splats((nk_u32_t)255);
+    nk_vf32x4_t non_negative_f32x4 = vec_max(values_f32x4, vec_splats(0.0f));
+    nk_vu32x4_t rounded_u32x4 = vec_ctu(vec_round(non_negative_f32x4), 0);
+    nk_vu32x4_t clamped_u32x4 = vec_min(rounded_u32x4, upper_bound_u32x4);
+    // u32x4 → u16x8 → u8x16; only the first 4 bytes are kept
+    nk_vu16x8_t shorts_u16x8 = vec_pack(clamped_u32x4, clamped_u32x4);
+    nk_vu8x16_t bytes_u8x16 = vec_pack(shorts_u16x8, shorts_u16x8);
+    nk_b32_vec_t result_vec;
+    result_vec.u32 = vec_extract((nk_vu32x4_t)bytes_u8x16, 0);
+    return result_vec;
+}
+/** @brief Convert `n` elements from `from_type` to `to_type` (Power VSX).
+ *
+ *  Identical types are byte-copied. Pairs drawn from f32, f16, bf16, i8, u8, i16, u16, i32, u32 are
+ *  converted four elements at a time through an f32x4 intermediate, using length-limited loads and
+ *  stores so the final partial group only touches the bytes of the remaining elements. Every other
+ *  pair, plus i32 ↔ u32 (which would lose precision through f32), is delegated to `nk_cast_serial`.
+ *
+ *  Float → integer conversions round with `vec_round`. Narrowing to 8- and 16-bit integers saturates:
+ *  signed targets via `vec_packs`, unsigned targets via an explicit `vec_min` clamp before the
+ *  modular `vec_pack`.
+ *
+ *  @param[in] from       Source buffer holding `n` elements of `from_type`.
+ *  @param[in] from_type  Element type of the source buffer.
+ *  @param[in] n          Number of elements to convert.
+ *  @param[out] to        Destination buffer with room for `n` elements of `to_type`.
+ *  @param[in] to_type    Element type of the destination buffer.
+ */
+NK_PUBLIC void nk_cast_powervsx(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type) {
+    // Same-type fast path
+    if (from_type == to_type) {
+        nk_size_t size_bits = nk_dtype_bits(from_type);
+        if (size_bits > 0) nk_copy_bytes_(to, from, nk_size_divide_round_up_(n * size_bits, 8));
+        return;
+    }
+    // Validate supported types (f32 and smaller, no FP8 vectorization on Power)
+    int from_ok = (from_type == nk_f32_k || from_type == nk_f16_k || from_type == nk_bf16_k || from_type == nk_i8_k ||
+                   from_type == nk_u8_k || from_type == nk_i16_k || from_type == nk_u16_k || from_type == nk_i32_k ||
+                   from_type == nk_u32_k);
+    int to_ok = (to_type == nk_f32_k || to_type == nk_f16_k || to_type == nk_bf16_k || to_type == nk_i8_k ||
+                 to_type == nk_u8_k || to_type == nk_i16_k || to_type == nk_u16_k || to_type == nk_i32_k ||
+                 to_type == nk_u32_k);
+    // Fall back to serial for unsupported types or i32 ↔ u32 (loses precision through f32)
+    if (!from_ok || !to_ok || (from_type == nk_i32_k && to_type == nk_u32_k) ||
+        (from_type == nk_u32_k && to_type == nk_i32_k)) {
+        nk_cast_serial(from, from_type, n, to, to_type);
+        return;
+    }
+    // F32 hub with predicated loads/stores — the tail is handled by the same loop
+    nk_size_t from_element_bytes = nk_dtype_bits(from_type) / 8;
+    nk_size_t to_element_bytes = nk_dtype_bits(to_type) / 8;
+    nk_u8_t const *from_ptr = (nk_u8_t const *)from;
+    nk_u8_t *to_ptr = (nk_u8_t *)to;
+    for (nk_size_t index = 0; index < n; index += 4) {
+        nk_size_t remaining = n - index < 4 ? n - index : 4;
+        nk_size_t from_bytes = remaining * from_element_bytes;
+        nk_size_t to_bytes = remaining * to_element_bytes;
+        // Predicated load → upcast to f32x4 hub
+        nk_vu8x16_t raw_u8x16 = vec_xl_len((nk_u8_t *)from_ptr, from_bytes);
+        nk_vf32x4_t hub_f32x4;
+        switch (from_type) {
+        case nk_f32_k: hub_f32x4 = (nk_vf32x4_t)raw_u8x16; break;
+        case nk_f16_k: hub_f32x4 = vec_extract_fp32_from_shorth((nk_vu16x8_t)raw_u8x16); break;
+        case nk_bf16_k: hub_f32x4 = (nk_vf32x4_t)vec_mergeh(vec_splats((nk_u16_t)0), (nk_vu16x8_t)raw_u8x16); break;
+        case nk_i32_k: hub_f32x4 = vec_ctf((nk_vi32x4_t)raw_u8x16, 0); break;
+        case nk_u32_k: hub_f32x4 = vec_ctf((nk_vu32x4_t)raw_u8x16, 0); break;
+        case nk_i16_k: hub_f32x4 = vec_ctf(vec_unpackh((nk_vi16x8_t)raw_u8x16), 0); break;
+        case nk_u16_k:
+            hub_f32x4 = vec_ctf((nk_vu32x4_t)vec_mergeh((nk_vu16x8_t)raw_u8x16, vec_splats((nk_u16_t)0)), 0);
+            break;
+        case nk_i8_k: hub_f32x4 = vec_ctf(vec_unpackh(vec_unpackh((nk_vi8x16_t)raw_u8x16)), 0); break;
+        case nk_u8_k:
+            hub_f32x4 = vec_ctf((nk_vu32x4_t)vec_mergeh((nk_vu16x8_t)vec_mergeh(raw_u8x16, vec_splats((nk_u8_t)0)),
+                                                        vec_splats((nk_u16_t)0)),
+                                0);
+            break;
+        default: hub_f32x4 = vec_splats(0.0f); break;
+        }
+        // Downcast from f32x4 hub → predicated store
+        switch (to_type) {
+        case nk_f32_k: vec_xst_len(hub_f32x4, (nk_f32_t *)to_ptr, to_bytes); break;
+        case nk_f16_k:
+            vec_xst_len((nk_vu8x16_t)vec_pack_to_short_fp32(hub_f32x4, hub_f32x4), (nk_u8_t *)to_ptr, to_bytes);
+            break;
+        case nk_bf16_k:
+            vec_xst_len((nk_vu8x16_t)nk_f32x4_to_bf16_pack_powervsx_(hub_f32x4), (nk_u8_t *)to_ptr, to_bytes);
+            break;
+        case nk_i32_k: vec_xst_len(vec_cts(vec_round(hub_f32x4), 0), (nk_i32_t *)to_ptr, to_bytes); break;
+        case nk_u32_k: vec_xst_len(vec_ctu(vec_round(hub_f32x4), 0), (nk_u32_t *)to_ptr, to_bytes); break;
+        case nk_i16_k: {
+            // vec_packs saturates i32 → i16, so no explicit clamp is required
+            nk_vi32x4_t rounded_i32x4 = vec_cts(vec_round(hub_f32x4), 0);
+            vec_xst_len((nk_vu8x16_t)vec_packs(rounded_i32x4, rounded_i32x4), (nk_u8_t *)to_ptr, to_bytes);
+        } break;
+        case nk_u16_k: {
+            // vec_pack is modular: clamp to 65535 first so out-of-range values saturate instead of wrapping
+            nk_vu32x4_t clamped_u32x4 = vec_min(vec_ctu(vec_round(hub_f32x4), 0), vec_splats((nk_u32_t)65535));
+            vec_xst_len((nk_vu8x16_t)vec_pack(clamped_u32x4, clamped_u32x4), (nk_u8_t *)to_ptr, to_bytes);
+        } break;
+        case nk_i8_k: {
+            // Two saturating packs: i32 → i16 → i8
+            nk_vi32x4_t rounded_i32x4 = vec_cts(vec_round(hub_f32x4), 0);
+            nk_vi16x8_t packed_i16x8 = vec_packs(rounded_i32x4, rounded_i32x4);
+            vec_xst_len((nk_vu8x16_t)vec_packs(packed_i16x8, packed_i16x8), (nk_u8_t *)to_ptr, to_bytes);
+        } break;
+        case nk_u8_k: {
+            // vec_pack is modular: clamp to 255 first so out-of-range values saturate instead of wrapping
+            nk_vu32x4_t clamped_u32x4 = vec_min(vec_ctu(vec_round(hub_f32x4), 0), vec_splats((nk_u32_t)255));
+            nk_vu16x8_t packed_u16x8 = vec_pack(clamped_u32x4, clamped_u32x4);
+            vec_xst_len((nk_vu8x16_t)vec_pack(packed_u16x8, packed_u16x8), (nk_u8_t *)to_ptr, to_bytes);
+        } break;
+        default: break;
+        }
+        from_ptr += from_bytes;
+        to_ptr += to_bytes;
+    }
+}
+#if defined(__clang__)
+#pragma clang attribute pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+#endif // NK_TARGET_POWERVSX
+#endif // NK_TARGET_POWER_
+#endif // NK_CAST_POWERVSX_H