numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -0,0 +1,449 @@
1
+ /**
2
+ * @brief SIMD-accelerated Type Conversions for Power VSX.
3
+ * @file include/numkong/cast/powervsx.h
4
+ * @author Ash Vardanian
5
+ * @date March 23, 2026
6
+ *
7
+ * @sa include/numkong/cast.h
8
+ *
9
+ * @section powervsx_cast_instructions Power VSX Conversion Instructions (POWER9+)
10
+ *
11
+ * Float16 hardware conversion (POWER9+):
12
+ *
13
+ * Intrinsic Instruction Notes
14
+ * vec_extract_fp32_from_shorth xvcvhpsp High 4 f16 → f32x4 (1 instruction!)
15
+ * vec_extract_fp32_from_shortl xvcvhpsp Low 4 f16 → f32x4 (1 instruction!)
16
+ *
17
+ * Scalar f16 ↔ f32 (POWER9 inline asm):
18
+ *
19
+ * Instruction Notes
20
+ * lxsihzx Load f16 → VSR (zero-extended)
21
+ * xscvhpdp Convert half → double precision
22
+ * xscvdphp Convert double → half precision
23
+ * stxsihx Store f16 from VSR
24
+ *
25
+ * Scalar sqrt (POWER9 inline asm):
26
+ *
27
+ * Instruction Notes
28
+ * xssqrtsp Scalar single-precision sqrt
29
+ * xssqrtdp Scalar double-precision sqrt
30
+ *
31
+ * Float ↔ integer conversions:
32
+ *
33
+ * Intrinsic Instruction Notes
34
+ * vec_cts xvcvspsxws f32x4 → i32x4 (truncation)
35
+ * vec_ctu xvcvspuxws f32x4 → u32x4 (truncation)
36
+ * vec_ctf xvcvsxwsp i32x4 → f32x4
37
+ * vec_ctf xvcvuxwsp u32x4 → f32x4
38
+ *
39
+ * Integer narrowing/widening:
40
+ *
41
+ * Intrinsic Instruction Notes
42
+ * vec_pack vpkuwum u32x4 → u16x8 (modular)
43
+ * vec_packs vpkswss i32x4 → i16x8 (signed saturation)
44
+ * vec_packsu vpkswus i32x4 → u16x8 (unsigned saturation from signed)
45
+ * vec_unpackh vupkhsh i16x8 → i32x4 (sign-extend high half)
46
+ * vec_mergeh vmrghh Interleave high halves (zero-extend via merge with zero)
47
+ *
48
+ * Partial-length load:
49
+ *
50
+ * Intrinsic Instruction Notes
51
+ * vec_xl_len lxvl Load up to 16 bytes with runtime length (POWER9)
52
+ *
53
+ * Load/store:
54
+ *
55
+ * Intrinsic Instruction Notes
56
+ * vec_xl lxvd2x Aligned/unaligned load
57
+ * vec_xst stxvd2x Aligned/unaligned store
58
+ *
59
+ * BF16 conversions use bit manipulation (no hardware support):
60
+ * - bf16 → f32: interleave each u16 with a zero half-word via vec_mergeh to form u32 lanes, reinterpret as f32
61
+ * - f32 → bf16: RNE rounding + vec_sr by 16 + vec_pack
62
+ *
63
+ * FP8 (E4M3/E5M2/E2M3/E3M2) types have no Power hardware support.
64
+ * Serial fallback via cast/serial.h is used for those formats.
65
+ */
66
+ #ifndef NK_CAST_POWERVSX_H
67
+ #define NK_CAST_POWERVSX_H
68
+
69
+ #if NK_TARGET_POWER_
70
+ #if NK_TARGET_POWERVSX
71
+
72
+ #include "numkong/types.h"
73
+ #include "numkong/cast/serial.h" // `nk_cast_serial`, `nk_dtype_bits`
74
+ #include "numkong/reduce/serial.h" // `nk_reduce_moments_f32_serial`
75
+
76
+ // Guard for the Power VSX vector typedefs that wrap altivec built-in vector types.
77
+ // NOTE(review): the guarded region below is empty, so the typedefs presumably already live in `numkong/types.h` — confirm.
78
+ #ifndef NK_POWERVSX_TYPES_DEFINED_
79
+ #define NK_POWERVSX_TYPES_DEFINED_
80
+ #endif // NK_POWERVSX_TYPES_DEFINED_
81
+
82
+ #if defined(__cplusplus)
83
+ extern "C" {
84
+ #endif
85
+
86
+ #if defined(__clang__)
87
+ #pragma clang attribute push(__attribute__((target("power9-vector"))), apply_to = function)
88
+ #elif defined(__GNUC__)
89
+ #pragma GCC push_options
90
+ #pragma GCC target("power9-vector")
91
+ #endif
92
+
93
+ /** @brief Convert one scalar f16 at `source` into an f32 written to `destination`, via the POWER9 vector path (xvcvhpsp). */
94
+ NK_PUBLIC void nk_f16_to_f32_powervsx(nk_f16_t const *source, nk_f32_t *destination) {
95
+ nk_vu16x8_t values_u16x8 = (nk_vu16x8_t)vec_xl_len((nk_u8_t *)source, 2); // length-limited load: only the 2 bytes of the f16 are requested
96
+ *destination = vec_extract(vec_extract_fp32_from_shorth(values_u16x8), 0); // widen to f32x4, keep lane 0 only
97
+ }
98
+
99
+ /** @brief Convert one scalar f32 at `source` into an f16 written to `destination`, via the POWER9 vector path (xvcvsphp). */
100
+ NK_PUBLIC void nk_f32_to_f16_powervsx(nk_f32_t const *source, nk_f16_t *destination) {
101
+ nk_vu16x8_t packed_u16x8 = vec_pack_to_short_fp32(vec_splats(*source), vec_splats(*source)); // same broadcast scalar in both operands
102
+ *destination = vec_extract(packed_u16x8, 0); // store lane 0 of the packed result
103
+ }
104
+
105
+ /** @brief Type-agnostic 128-bit full load (Power VSX): reads one vector from `source` at byte offset 0 into `destination->vu8x16`. */
106
+ NK_INTERNAL void nk_load_b128_powervsx_(void const *source, nk_b128_vec_t *destination) {
107
+ destination->vu8x16 = vec_xl(0, (nk_u8_t const *)source);
108
+ }
109
+
110
+ /** @brief Type-agnostic 256-bit full load (Power VSX): two vector loads from `source`, at byte offsets 0 and 16. */
111
+ NK_INTERNAL void nk_load_b256_powervsx_(void const *source, nk_b256_vec_t *destination) {
112
+ destination->vu8x16s[0] = vec_xl(0, (nk_u8_t const *)source); // first half
113
+ destination->vu8x16s[1] = vec_xl(16, (nk_u8_t const *)source); // second half, 16 bytes further
114
+ }
115
+
116
+ /** @brief Type-agnostic 128-bit full store (Power VSX): writes `source->vu8x16` to `destination` at byte offset 0. */
117
+ NK_INTERNAL void nk_store_b128_powervsx_(nk_b128_vec_t const *source, void *destination) {
118
+ vec_xst(source->vu8x16, 0, (nk_u8_t *)destination);
119
+ }
120
+
121
+ /** @brief Type-agnostic 256-bit full store (Power VSX): two vector stores to `destination`, at byte offsets 0 and 16. */
122
+ NK_INTERNAL void nk_store_b256_powervsx_(nk_b256_vec_t const *source, void *destination) {
123
+ vec_xst(source->vu8x16s[0], 0, (nk_u8_t *)destination); // first half
124
+ vec_xst(source->vu8x16s[1], 16, (nk_u8_t *)destination); // second half, 16 bytes further
125
+ }
126
+
127
+ /** @brief Type-agnostic 64-bit load (Power VSX): copies 8 bytes from `source` into `destination->u64` with a plain scalar read. */
128
+ NK_INTERNAL void nk_load_b64_powervsx_(void const *source, nk_b64_vec_t *destination) {
129
+ destination->u64 = *(nk_u64_t const *)source; // NOTE(review): typed dereference — assumes `source` is suitably aligned for nk_u64_t; confirm
130
+ }
131
+
132
+ /** @brief Partial load for 64-bit elements (n elements, max 4) into 256-bit vector.
133
+ * Uses vec_xl_len to load exactly n×8 bytes, zero-filling the remainder; the byte count is split across the two 16-byte halves.
134
+ * vec_xl_len with length=0 produces a zero vector (no branch needed), so the second half is all zeros when n ≤ 2. */
135
+ NK_INTERNAL void nk_partial_load_b64x4_powervsx_(void const *source, nk_b256_vec_t *destination, nk_size_t n) {
136
+ nk_size_t bytes = n * 8;
137
+ nk_size_t first_half = bytes < 16 ? bytes : 16; // bytes taken by the first vector, capped at 16
138
+ nk_size_t second_half = bytes > 16 ? bytes - 16 : 0; // whatever is left for the second vector
139
+ destination->vu8x16s[0] = vec_xl_len((nk_u8_t *)source, first_half);
140
+ destination->vu8x16s[1] = vec_xl_len((nk_u8_t *)source + 16, second_half);
141
+ }
142
+
143
+ /** @brief Partial load for 64-bit elements (n elements, max 2) into 128-bit vector, requesting n×8 bytes from vec_xl_len. */
144
+ NK_INTERNAL void nk_partial_load_b64x2_powervsx_(void const *source, nk_b128_vec_t *destination, nk_size_t n) {
145
+ destination->vu8x16 = vec_xl_len((nk_u8_t *)source, n * 8);
146
+ }
147
+
148
+ /** @brief Partial load for 32-bit elements (n elements, max 4) into 128-bit vector, requesting n×4 bytes from vec_xl_len. */
149
+ NK_INTERNAL void nk_partial_load_b32x4_powervsx_(void const *source, nk_b128_vec_t *destination, nk_size_t n) {
150
+ destination->vu8x16 = vec_xl_len((nk_u8_t *)source, n * 4);
151
+ }
152
+
153
+ /** @brief Partial load for 32-bit elements (n elements, max 2) into 64-bit vector; bytes past n×4 are zeroed, matching the zero-filling vec_xl_len-based partial loads. */
154
+ NK_INTERNAL void nk_partial_load_b32x2_powervsx_(void const *source, nk_b64_vec_t *destination, nk_size_t n) {
155
+ destination->u64 = 0; nk_copy_bytes_(destination, source, n * 4); // clear first: the copy is only asked for n×4 bytes, the rest must not stay stale
156
+ }
157
+
158
+ /** @brief Partial load for 16-bit elements (n elements, max 8) into 128-bit vector, requesting n×2 bytes from vec_xl_len. */
159
+ NK_INTERNAL void nk_partial_load_b16x8_powervsx_(void const *source, nk_b128_vec_t *destination, nk_size_t n) {
160
+ destination->vu8x16 = vec_xl_len((nk_u8_t *)source, n * 2);
161
+ }
162
+
163
+ /** @brief Partial load for 8-bit elements (n elements, max 16) into 128-bit vector, requesting n bytes from vec_xl_len. */
164
+ NK_INTERNAL void nk_partial_load_b8x16_powervsx_(void const *source, nk_b128_vec_t *destination, nk_size_t n) {
165
+ destination->vu8x16 = vec_xl_len((nk_u8_t *)source, n);
166
+ }
167
+
168
+ /** @brief Partial load for 1-bit elements (n bits, max 128) into 128-bit vector; the bit count is rounded up to whole bytes before loading. */
169
+ NK_INTERNAL void nk_partial_load_b1x128_powervsx_(void const *source, nk_b128_vec_t *destination, nk_size_t n_bits) {
170
+ destination->vu8x16 = vec_xl_len((nk_u8_t *)source, nk_size_divide_round_up_(n_bits, 8)); // NOTE(review): surplus bits of the last byte are loaded as-is, not masked here
171
+ }
172
+
173
+ /** @brief Partial store for 64-bit elements (n elements, max 4) from 256-bit vector; the n×8 bytes are split across the two 16-byte halves.
174
+ * vec_xst_len with length=0 stores nothing (no branch needed), so the second store is a no-op when n ≤ 2. */
175
+ NK_INTERNAL void nk_partial_store_b64x4_powervsx_(nk_b256_vec_t const *source, void *destination, nk_size_t n) {
176
+ nk_size_t bytes = n * 8;
177
+ nk_size_t first_half = bytes < 16 ? bytes : 16; // bytes written from the first vector, capped at 16
178
+ nk_size_t second_half = bytes > 16 ? bytes - 16 : 0; // whatever is left for the second vector
179
+ vec_xst_len(source->vu8x16s[0], (nk_u8_t *)destination, first_half);
180
+ vec_xst_len(source->vu8x16s[1], (nk_u8_t *)destination + 16, second_half);
181
+ }
182
+
183
+ /** @brief Partial store for 32-bit elements (n elements, max 4) from 128-bit vector, writing n×4 bytes via vec_xst_len. */
184
+ NK_INTERNAL void nk_partial_store_b32x4_powervsx_(nk_b128_vec_t const *source, void *destination, nk_size_t n) {
185
+ vec_xst_len(source->vu8x16, (nk_u8_t *)destination, n * 4);
186
+ }
187
+
188
+ /** @brief Convert 4x f16 → f32x4 via POWER9 hardware (xvcvhpsp, 1 instruction!).
189
+ * Loads 4 f16 values (8 bytes) into a u16x8 register and uses `vec_extract_fp32_from_shorth`. */
190
+ NK_INTERNAL nk_vf32x4_t nk_f16x4_to_f32x4_powervsx_(nk_f16_t const *source) {
191
+ nk_vu16x8_t values_u16x8 = (nk_vu16x8_t)vec_xl_len((nk_u8_t *)source, 8); // length-limited load: only 8 bytes are requested
192
+ return vec_extract_fp32_from_shorth(values_u16x8);
193
+ }
194
+
195
+ /** @brief Convert f32x4 → 4x f16 via POWER9 hardware (xvcvsphp, 1 instruction!).
196
+ * Uses `vec_pack_to_short_fp32` with the same input in both operands, then returns the first 64 bits of the packed vector. */
197
+ NK_INTERNAL nk_b64_vec_t nk_f32x4_to_f16x4_powervsx_(nk_vf32x4_t values_f32x4) {
198
+ nk_vu16x8_t packed_u16x8 = vec_pack_to_short_fp32(values_f32x4, values_f32x4);
199
+ nk_b64_vec_t result_vec;
200
+ result_vec.u64 = vec_extract((nk_vu64x2_t)packed_u16x8, 0); // element 0 of the u64x2 view
201
+ return result_vec;
202
+ }
203
+
204
+ /** @brief Convert 4x bf16 → f32x4 via branchless bit manipulation (Power VSX).
205
+ * BF16 format: upper 16 bits of f32. Conversion interleaves a zero half-word with each bf16 via vec_mergeh, then reinterprets the u32 lanes as f32. */
206
+ NK_INTERNAL nk_vf32x4_t nk_bf16x4_to_f32x4_powervsx_(nk_bf16_t const *source) {
207
+ nk_vu16x8_t values_u16x8 = (nk_vu16x8_t)vec_xl_len((nk_u8_t *)source, 8); // length-limited load: only 8 bytes are requested
208
+ nk_vu16x8_t zero_u16x8 = vec_splats((nk_u16_t)0);
209
+ nk_vu32x4_t bits_u32x4 = (nk_vu32x4_t)vec_mergeh(zero_u16x8, values_u16x8); // NOTE(review): zero is the FIRST operand here, presumably to land the bf16 bits in the upper half on little-endian — confirm
210
+ return (nk_vf32x4_t)bits_u32x4;
211
+ }
212
+
213
+ /** @brief Convert f32x4 → bf16 packed in u16x8 with RNE rounding (Power VSX).
214
+ * Round-to-nearest-even: add (0x7FFF + lsb) before truncation, where lsb is bit 16 of the f32 bit pattern.
215
+ * Uses vec_sr by 16, then vec_pack to narrow u32x4 → u16x8 (the same vector is passed as both operands).
216
+ * Result is in low 4 lanes of the returned u16x8. NOTE(review): no NaN/Inf special-casing is visible here — confirm that is intended. */
217
+ NK_INTERNAL nk_vu16x8_t nk_f32x4_to_bf16_pack_powervsx_(nk_vf32x4_t values_f32x4) {
218
+ nk_vu32x4_t shift_u32x4 = vec_splats((nk_u32_t)16);
219
+ nk_vu32x4_t one_u32x4 = vec_splats((nk_u32_t)1);
220
+ nk_vu32x4_t rounding_base_u32x4 = vec_splats((nk_u32_t)0x7FFF);
221
+
222
+ nk_vu32x4_t bits_u32x4 = (nk_vu32x4_t)values_f32x4; // reinterpret the float lanes as raw 32-bit patterns
223
+
224
+ // RNE rounding: lsb = (bits >> 16) & 1; bits += 0x7FFF + lsb
225
+ nk_vu32x4_t lsb_u32x4 = vec_and(vec_sr(bits_u32x4, shift_u32x4), one_u32x4);
226
+ nk_vu32x4_t rounding_u32x4 = vec_add(rounding_base_u32x4, lsb_u32x4);
227
+ bits_u32x4 = vec_add(bits_u32x4, rounding_u32x4);
228
+ bits_u32x4 = vec_sr(bits_u32x4, shift_u32x4); // keep the upper 16 bits of each lane
229
+ return vec_pack(bits_u32x4, bits_u32x4);
230
+ }
231
+
232
+ /** @brief Convert f32x4 → 4x bf16 with RNE rounding (Power VSX). Returns nk_b64_vec_t holding the first 64 bits of `nk_f32x4_to_bf16_pack_powervsx_`'s result. */
233
+ NK_INTERNAL nk_b64_vec_t nk_f32x4_to_bf16x4_powervsx_(nk_vf32x4_t values_f32x4) {
234
+ nk_b64_vec_t result_vec;
235
+ result_vec.u64 = vec_extract((nk_vu64x2_t)nk_f32x4_to_bf16_pack_powervsx_(values_f32x4), 0); // element 0 of the u64x2 view
236
+ return result_vec;
237
+ }
238
+
239
+ /** @brief Convert 4x i16 → f32x4 (Power VSX). Loads 8 bytes, sign-extends via vec_unpackh, then converts with vec_ctf (scale 0). */
240
+ NK_INTERNAL nk_vf32x4_t nk_i16x4_to_f32x4_powervsx_(nk_i16_t const *source) {
241
+ nk_vi16x8_t values_i16x8 = (nk_vi16x8_t)vec_xl_len((nk_u8_t *)source, 8); // length-limited load: only 8 bytes are requested
242
+ nk_vi32x4_t values_i32x4 = vec_unpackh(values_i16x8);
243
+ return vec_ctf(values_i32x4, 0);
244
+ }
245
+
246
+ /** @brief Convert 4x u16 → f32x4 (Power VSX). Loads 8 bytes, zero-extends via vec_mergeh with zero, then converts with vec_ctf (scale 0). */
247
+ NK_INTERNAL nk_vf32x4_t nk_u16x4_to_f32x4_powervsx_(nk_u16_t const *source) {
248
+ nk_vu16x8_t values_u16x8 = (nk_vu16x8_t)vec_xl_len((nk_u8_t *)source, 8); // length-limited load: only 8 bytes are requested
249
+ nk_vu16x8_t zero_u16x8 = vec_splats((nk_u16_t)0);
250
+ nk_vu32x4_t values_u32x4 = (nk_vu32x4_t)vec_mergeh(values_u16x8, zero_u16x8); // values first, zero second
251
+ return vec_ctf(values_u32x4, 0);
252
+ }
253
+
254
+ /** @brief Convert 4x i8 → f32x4 (Power VSX). Loads 4 bytes, double unpack via vec_unpackh (i8 → i16 → i32), then vec_ctf (scale 0). */
255
+ NK_INTERNAL nk_vf32x4_t nk_i8x4_to_f32x4_powervsx_(void const *source) {
256
+ nk_vi8x16_t values_i8x16 = (nk_vi8x16_t)vec_xl_len((nk_u8_t *)source, 4); // length-limited load: only 4 bytes are requested
257
+ nk_vi16x8_t values_i16x8 = vec_unpackh(values_i8x16);
258
+ nk_vi32x4_t values_i32x4 = vec_unpackh(values_i16x8);
259
+ return vec_ctf(values_i32x4, 0);
260
+ }
261
+
262
+ /** @brief Convert 4x u8 → f32x4 (Power VSX). Loads 4 bytes, double merge with zero (u8 → u16 → u32), then vec_ctf (scale 0). */
263
+ NK_INTERNAL nk_vf32x4_t nk_u8x4_to_f32x4_powervsx_(void const *source) {
264
+ nk_vu8x16_t values_u8x16 = (nk_vu8x16_t)vec_xl_len((nk_u8_t *)source, 4); // length-limited load: only 4 bytes are requested
265
+ nk_vu8x16_t zero_u8x16 = vec_splats((nk_u8_t)0);
266
+ nk_vu16x8_t values_u16x8 = (nk_vu16x8_t)vec_mergeh(values_u8x16, zero_u8x16); // first widening step: u8 → u16
267
+ nk_vu16x8_t zero_u16x8 = vec_splats((nk_u16_t)0);
268
+ nk_vu32x4_t values_u32x4 = (nk_vu32x4_t)vec_mergeh(values_u16x8, zero_u16x8); // second widening step: u16 → u32
269
+ return vec_ctf(values_u32x4, 0);
270
+ }
271
+
272
+ /** @brief Convert f32x4 → 4x i16 with vector saturation (Power VSX).
273
+ * Rounds with vec_round, converts with vec_cts, clamps to [-32768, 32767] with vec_max/vec_min, then vec_packs to narrow. */
274
+ NK_INTERNAL nk_b64_vec_t nk_f32x4_to_i16x4_powervsx_(nk_vf32x4_t values_f32x4) {
275
+ nk_vi32x4_t min_i32x4 = vec_splats((nk_i32_t)-32768);
276
+ nk_vi32x4_t max_i32x4 = vec_splats((nk_i32_t)32767);
277
+
278
+ nk_vi32x4_t values_i32x4 = vec_cts(vec_round(values_f32x4), 0);
279
+ values_i32x4 = vec_max(values_i32x4, min_i32x4); // lower clamp
280
+ values_i32x4 = vec_min(values_i32x4, max_i32x4); // upper clamp
281
+
282
+ // Signed saturating pack: i32x4 → i16x8, extract low 8 bytes
283
+ nk_vi16x8_t packed_i16x8 = vec_packs(values_i32x4, values_i32x4);
284
+ nk_b64_vec_t result_vec;
285
+ result_vec.u64 = vec_extract((nk_vu64x2_t)packed_i16x8, 0);
286
+ return result_vec;
287
+ }
288
+
289
+ /** @brief Convert f32x4 → 4x u16 with vector saturation (Power VSX).
290
+ * Clamps negatives to zero with vec_max (in float), rounds with vec_round, converts with vec_ctu, clamps to 65535 with vec_min, then vec_pack to narrow. */
291
+ NK_INTERNAL nk_b64_vec_t nk_f32x4_to_u16x4_powervsx_(nk_vf32x4_t values_f32x4) {
292
+ nk_vf32x4_t zero_f32x4 = vec_splats(0.0f);
293
+ nk_vu32x4_t max_u32x4 = vec_splats((nk_u32_t)65535);
294
+
295
+ values_f32x4 = vec_max(values_f32x4, zero_f32x4); // lower clamp is applied before the unsigned conversion
296
+ nk_vu32x4_t values_u32x4 = vec_ctu(vec_round(values_f32x4), 0);
297
+ values_u32x4 = vec_min(values_u32x4, max_u32x4); // upper clamp is applied after it
298
+
299
+ // Pack u32x4 → u16x8, extract low 8 bytes
300
+ nk_vu16x8_t packed_u16x8 = vec_pack(values_u32x4, values_u32x4);
301
+ nk_b64_vec_t result_vec;
302
+ result_vec.u64 = vec_extract((nk_vu64x2_t)packed_u16x8, 0);
303
+ return result_vec;
304
+ }
305
+
306
+ /** @brief Convert f32x4 → 4x i8 with vector saturation (Power VSX).
307
+ * Rounds with vec_round, converts with vec_cts, clamps to [-128, 127] with vec_max/vec_min, then vec_packs twice to narrow. */
308
+ NK_INTERNAL nk_b32_vec_t nk_f32x4_to_i8x4_powervsx_(nk_vf32x4_t values_f32x4) {
309
+ nk_vi32x4_t min_i32x4 = vec_splats((nk_i32_t)-128);
310
+ nk_vi32x4_t max_i32x4 = vec_splats((nk_i32_t)127);
311
+
312
+ nk_vi32x4_t values_i32x4 = vec_cts(vec_round(values_f32x4), 0);
313
+ values_i32x4 = vec_max(values_i32x4, min_i32x4); // lower clamp
314
+ values_i32x4 = vec_min(values_i32x4, max_i32x4); // upper clamp
315
+
316
+ // Narrow: i32x4 → i16x8 → i8x16, extract low 4 bytes
317
+ nk_vi16x8_t packed_i16x8 = vec_packs(values_i32x4, values_i32x4);
318
+ nk_vi8x16_t packed_i8x16 = vec_packs(packed_i16x8, packed_i16x8);
319
+ nk_b32_vec_t result_vec;
320
+ result_vec.u32 = vec_extract((nk_vu32x4_t)packed_i8x16, 0);
321
+ return result_vec;
322
+ }
323
+
324
+ /** @brief Convert f32x4 → 4x u8 with vector saturation (Power VSX).
325
+ * Uses vec_ctu + vec_min/vec_max for clamping, then vec_pack twice to narrow. */
326
+ NK_INTERNAL nk_b32_vec_t nk_f32x4_to_u8x4_powervsx_(nk_vf32x4_t values_f32x4) {
327
+ nk_vf32x4_t zero_f32x4 = vec_splats(0.0f);
328
+ nk_vu32x4_t max_u32x4 = vec_splats((nk_u32_t)255);
329
+
330
+ values_f32x4 = vec_max(values_f32x4, zero_f32x4);
331
+ nk_vu32x4_t values_u32x4 = vec_ctu(vec_round(values_f32x4), 0);
332
+ values_u32x4 = vec_min(values_u32x4, max_u32x4);
333
+
334
+ // Narrow: u32x4 → u16x8 → u8x16, extract low 4 bytes
335
+ nk_vu16x8_t packed_u16x8 = vec_pack(values_u32x4, values_u32x4);
336
+ nk_vu8x16_t packed_u8x16 = vec_pack(packed_u16x8, packed_u16x8);
337
+ nk_b32_vec_t result_vec;
338
+ result_vec.u32 = vec_extract((nk_vu32x4_t)packed_u8x16, 0);
339
+ return result_vec;
340
+ }
341
+
342
+ NK_PUBLIC void nk_cast_powervsx(void const *from, nk_dtype_t from_type, nk_size_t n, void *to, nk_dtype_t to_type) {
343
+ // Same-type fast path
344
+ if (from_type == to_type) {
345
+ nk_size_t size_bits = nk_dtype_bits(from_type);
346
+ if (size_bits > 0) nk_copy_bytes_(to, from, nk_size_divide_round_up_(n * size_bits, 8));
347
+ return;
348
+ }
349
+
350
+ // Validate supported types (f32 and smaller, no FP8 vectorization on Power)
351
+ int from_ok = (from_type == nk_f32_k || from_type == nk_f16_k || from_type == nk_bf16_k || from_type == nk_i8_k ||
352
+ from_type == nk_u8_k || from_type == nk_i16_k || from_type == nk_u16_k || from_type == nk_i32_k ||
353
+ from_type == nk_u32_k);
354
+ int to_ok = (to_type == nk_f32_k || to_type == nk_f16_k || to_type == nk_bf16_k || to_type == nk_i8_k ||
355
+ to_type == nk_u8_k || to_type == nk_i16_k || to_type == nk_u16_k || to_type == nk_i32_k ||
356
+ to_type == nk_u32_k);
357
+
358
+ // Fall back to serial for unsupported types or i32 ↔ u32 (loses precision through f32)
359
+ if (!from_ok || !to_ok || (from_type == nk_i32_k && to_type == nk_u32_k) ||
360
+ (from_type == nk_u32_k && to_type == nk_i32_k)) {
361
+ nk_cast_serial(from, from_type, n, to, to_type);
362
+ return;
363
+ }
364
+
365
+ // F32 hub with predicated loads/stores — no serial fallback needed
366
+ nk_size_t from_element_bytes = nk_dtype_bits(from_type) / 8;
367
+ nk_size_t to_element_bytes = nk_dtype_bits(to_type) / 8;
368
+ nk_u8_t const *from_ptr = (nk_u8_t const *)from;
369
+ nk_u8_t *to_ptr = (nk_u8_t *)to;
370
+
371
+ for (nk_size_t index = 0; index < n; index += 4) {
372
+ nk_size_t remaining = n - index < 4 ? n - index : 4;
373
+ nk_size_t from_bytes = remaining * from_element_bytes;
374
+ nk_size_t to_bytes = remaining * to_element_bytes;
375
+
376
+ // Predicated load → upcast to f32x4 hub
377
+ nk_vu8x16_t raw_u8x16 = vec_xl_len((nk_u8_t *)from_ptr, from_bytes);
378
+ nk_vf32x4_t hub_f32x4;
379
+ switch (from_type) {
380
+ case nk_f32_k: hub_f32x4 = (nk_vf32x4_t)raw_u8x16; break;
381
+ case nk_f16_k: hub_f32x4 = vec_extract_fp32_from_shorth((nk_vu16x8_t)raw_u8x16); break;
382
+ case nk_bf16_k: hub_f32x4 = (nk_vf32x4_t)vec_mergeh(vec_splats((nk_u16_t)0), (nk_vu16x8_t)raw_u8x16); break;
383
+ case nk_i32_k: hub_f32x4 = vec_ctf((nk_vi32x4_t)raw_u8x16, 0); break;
384
+ case nk_u32_k: hub_f32x4 = vec_ctf((nk_vu32x4_t)raw_u8x16, 0); break;
385
+ case nk_i16_k: hub_f32x4 = vec_ctf(vec_unpackh((nk_vi16x8_t)raw_u8x16), 0); break;
386
+ case nk_u16_k:
387
+ hub_f32x4 = vec_ctf((nk_vu32x4_t)vec_mergeh((nk_vu16x8_t)raw_u8x16, vec_splats((nk_u16_t)0)), 0);
388
+ break;
389
+ case nk_i8_k: hub_f32x4 = vec_ctf(vec_unpackh(vec_unpackh((nk_vi8x16_t)raw_u8x16)), 0); break;
390
+ case nk_u8_k:
391
+ hub_f32x4 = vec_ctf((nk_vu32x4_t)vec_mergeh((nk_vu16x8_t)vec_mergeh(raw_u8x16, vec_splats((nk_u8_t)0)),
392
+ vec_splats((nk_u16_t)0)),
393
+ 0);
394
+ break;
395
+ default: hub_f32x4 = vec_splats(0.0f); break;
396
+ }
397
+
398
+ // Downcast from f32x4 hub → predicated store
399
+ switch (to_type) {
400
+ case nk_f32_k: vec_xst_len(hub_f32x4, (nk_f32_t *)to_ptr, to_bytes); break;
401
+ case nk_f16_k:
402
+ vec_xst_len((nk_vu8x16_t)vec_pack_to_short_fp32(hub_f32x4, hub_f32x4), (nk_u8_t *)to_ptr, to_bytes);
403
+ break;
404
+ case nk_bf16_k:
405
+ vec_xst_len((nk_vu8x16_t)nk_f32x4_to_bf16_pack_powervsx_(hub_f32x4), (nk_u8_t *)to_ptr, to_bytes);
406
+ break;
407
+ case nk_i32_k: vec_xst_len(vec_cts(vec_round(hub_f32x4), 0), (nk_i32_t *)to_ptr, to_bytes); break;
408
+ case nk_u32_k: vec_xst_len(vec_ctu(vec_round(hub_f32x4), 0), (nk_u32_t *)to_ptr, to_bytes); break;
409
+ case nk_i16_k:
410
+ vec_xst_len((nk_vu8x16_t)vec_packs(vec_cts(vec_round(hub_f32x4), 0), vec_cts(vec_round(hub_f32x4), 0)),
411
+ (nk_u8_t *)to_ptr, to_bytes);
412
+ break;
413
+ case nk_u16_k:
414
+ vec_xst_len((nk_vu8x16_t)vec_pack(vec_ctu(vec_round(hub_f32x4), 0), vec_ctu(vec_round(hub_f32x4), 0)),
415
+ (nk_u8_t *)to_ptr, to_bytes);
416
+ break;
417
+ case nk_i8_k:
418
+ vec_xst_len(
419
+ (nk_vu8x16_t)vec_packs(vec_packs(vec_cts(vec_round(hub_f32x4), 0), vec_cts(vec_round(hub_f32x4), 0)),
420
+ vec_packs(vec_cts(vec_round(hub_f32x4), 0), vec_cts(vec_round(hub_f32x4), 0))),
421
+ (nk_u8_t *)to_ptr, to_bytes);
422
+ break;
423
+ case nk_u8_k:
424
+ vec_xst_len(
425
+ (nk_vu8x16_t)vec_pack(vec_pack(vec_ctu(vec_round(hub_f32x4), 0), vec_ctu(vec_round(hub_f32x4), 0)),
426
+ vec_pack(vec_ctu(vec_round(hub_f32x4), 0), vec_ctu(vec_round(hub_f32x4), 0))),
427
+ (nk_u8_t *)to_ptr, to_bytes);
428
+ break;
429
+ default: break;
430
+ }
431
+
432
+ from_ptr += from_bytes;
433
+ to_ptr += to_bytes;
434
+ }
435
+ }
436
+
437
+ #if defined(__clang__)
438
+ #pragma clang attribute pop
439
+ #elif defined(__GNUC__)
440
+ #pragma GCC pop_options
441
+ #endif
442
+
443
+ #if defined(__cplusplus)
444
+ } // extern "C"
445
+ #endif
446
+
447
+ #endif // NK_TARGET_POWERVSX
448
+ #endif // NK_TARGET_POWER_
449
+ #endif // NK_CAST_POWERVSX_H