local-llm-rn 1.0.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (626):
  1. package/cpp/CMakeLists.txt +285 -0
  2. package/cpp/common/CMakeLists.txt +149 -0
  3. package/cpp/common/arg.cpp +3799 -0
  4. package/cpp/common/arg.h +131 -0
  5. package/cpp/common/base64.hpp +392 -0
  6. package/cpp/common/build-info.cpp.in +4 -0
  7. package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  8. package/cpp/common/chat-parser-xml-toolcall.h +45 -0
  9. package/cpp/common/chat-parser.cpp +1649 -0
  10. package/cpp/common/chat-parser.h +133 -0
  11. package/cpp/common/chat-peg-parser.cpp +124 -0
  12. package/cpp/common/chat-peg-parser.h +105 -0
  13. package/cpp/common/chat.cpp +3355 -0
  14. package/cpp/common/chat.h +252 -0
  15. package/cpp/common/common.cpp +1824 -0
  16. package/cpp/common/common.h +930 -0
  17. package/cpp/common/console.cpp +1137 -0
  18. package/cpp/common/console.h +41 -0
  19. package/cpp/common/debug.cpp +167 -0
  20. package/cpp/common/debug.h +43 -0
  21. package/cpp/common/download.cpp +792 -0
  22. package/cpp/common/download.h +84 -0
  23. package/cpp/common/http.h +84 -0
  24. package/cpp/common/jinja/README.md +88 -0
  25. package/cpp/common/jinja/caps.cpp +285 -0
  26. package/cpp/common/jinja/caps.h +30 -0
  27. package/cpp/common/jinja/lexer.cpp +341 -0
  28. package/cpp/common/jinja/lexer.h +157 -0
  29. package/cpp/common/jinja/parser.cpp +591 -0
  30. package/cpp/common/jinja/parser.h +21 -0
  31. package/cpp/common/jinja/runtime.cpp +867 -0
  32. package/cpp/common/jinja/runtime.h +638 -0
  33. package/cpp/common/jinja/string.cpp +213 -0
  34. package/cpp/common/jinja/string.h +61 -0
  35. package/cpp/common/jinja/utils.h +149 -0
  36. package/cpp/common/jinja/value.cpp +1393 -0
  37. package/cpp/common/jinja/value.h +756 -0
  38. package/cpp/common/json-partial.cpp +324 -0
  39. package/cpp/common/json-partial.h +39 -0
  40. package/cpp/common/json-schema-to-grammar.cpp +1153 -0
  41. package/cpp/common/json-schema-to-grammar.h +43 -0
  42. package/cpp/common/llguidance.cpp +258 -0
  43. package/cpp/common/log.cpp +446 -0
  44. package/cpp/common/log.h +119 -0
  45. package/cpp/common/ngram-cache.cpp +285 -0
  46. package/cpp/common/ngram-cache.h +101 -0
  47. package/cpp/common/ngram-map.cpp +530 -0
  48. package/cpp/common/ngram-map.h +115 -0
  49. package/cpp/common/ngram-mod.cpp +60 -0
  50. package/cpp/common/ngram-mod.h +38 -0
  51. package/cpp/common/peg-parser.cpp +1712 -0
  52. package/cpp/common/peg-parser.h +459 -0
  53. package/cpp/common/preset.cpp +483 -0
  54. package/cpp/common/preset.h +83 -0
  55. package/cpp/common/regex-partial.cpp +204 -0
  56. package/cpp/common/regex-partial.h +56 -0
  57. package/cpp/common/sampling.cpp +745 -0
  58. package/cpp/common/sampling.h +119 -0
  59. package/cpp/common/speculative.cpp +1074 -0
  60. package/cpp/common/speculative.h +41 -0
  61. package/cpp/common/unicode.cpp +64 -0
  62. package/cpp/common/unicode.h +22 -0
  63. package/cpp/ggml/CMakeLists.txt +494 -0
  64. package/cpp/ggml/cmake/GitVars.cmake +22 -0
  65. package/cpp/ggml/cmake/common.cmake +50 -0
  66. package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
  67. package/cpp/ggml/include/ggml-alloc.h +85 -0
  68. package/cpp/ggml/include/ggml-backend.h +373 -0
  69. package/cpp/ggml/include/ggml-blas.h +25 -0
  70. package/cpp/ggml/include/ggml-cann.h +123 -0
  71. package/cpp/ggml/include/ggml-cpp.h +39 -0
  72. package/cpp/ggml/include/ggml-cpu.h +151 -0
  73. package/cpp/ggml/include/ggml-cuda.h +47 -0
  74. package/cpp/ggml/include/ggml-hexagon.h +19 -0
  75. package/cpp/ggml/include/ggml-metal.h +61 -0
  76. package/cpp/ggml/include/ggml-opencl.h +26 -0
  77. package/cpp/ggml/include/ggml-opt.h +256 -0
  78. package/cpp/ggml/include/ggml-rpc.h +30 -0
  79. package/cpp/ggml/include/ggml-sycl.h +49 -0
  80. package/cpp/ggml/include/ggml-virtgpu.h +14 -0
  81. package/cpp/ggml/include/ggml-vulkan.h +29 -0
  82. package/cpp/ggml/include/ggml-webgpu.h +19 -0
  83. package/cpp/ggml/include/ggml-zdnn.h +17 -0
  84. package/cpp/ggml/include/ggml-zendnn.h +22 -0
  85. package/cpp/ggml/include/ggml.h +2753 -0
  86. package/cpp/ggml/include/gguf.h +204 -0
  87. package/cpp/ggml/src/CMakeLists.txt +492 -0
  88. package/cpp/ggml/src/ggml-alloc.c +1244 -0
  89. package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
  90. package/cpp/ggml/src/ggml-backend-dl.h +45 -0
  91. package/cpp/ggml/src/ggml-backend-impl.h +255 -0
  92. package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
  93. package/cpp/ggml/src/ggml-backend.cpp +2270 -0
  94. package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  95. package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
  96. package/cpp/ggml/src/ggml-common.h +1878 -0
  97. package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
  98. package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
  99. package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  100. package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  101. package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  102. package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  103. package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  104. package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
  105. package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
  106. package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
  107. package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  108. package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  109. package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  110. package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
  111. package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  112. package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  113. package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  114. package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  115. package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  116. package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  117. package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  118. package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
  119. package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  120. package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  121. package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  122. package/cpp/ggml/src/ggml-cpu/common.h +95 -0
  123. package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
  124. package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
  125. package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
  126. package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  127. package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  128. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
  129. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  130. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
  131. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  132. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
  133. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  134. package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
  135. package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
  136. package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
  137. package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
  138. package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
  139. package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
  140. package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  141. package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
  142. package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  143. package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  144. package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  145. package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  146. package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  147. package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
  148. package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  149. package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
  150. package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
  151. package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
  152. package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  153. package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
  154. package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
  155. package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
  156. package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  157. package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
  158. package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  159. package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
  160. package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
  161. package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
  162. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  163. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  164. package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  165. package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  166. package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  167. package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  168. package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
  169. package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
  170. package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  171. package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
  172. package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
  173. package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  174. package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
  175. package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  176. package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  177. package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  178. package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
  179. package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
  180. package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  181. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  182. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  183. package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  184. package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
  185. package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
  186. package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
  187. package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
  188. package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
  189. package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
  190. package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
  191. package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
  192. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  193. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  194. package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  195. package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  196. package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
  197. package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  198. package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  199. package/cpp/ggml/src/ggml-impl.h +724 -0
  200. package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  201. package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  202. package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  203. package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  204. package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
  205. package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
  206. package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
  207. package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
  208. package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
  209. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
  210. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  211. package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
  212. package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
  213. package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
  214. package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  215. package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  216. package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
  217. package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
  218. package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  219. package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  220. package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  221. package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  222. package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  223. package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  224. package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  225. package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  226. package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
  227. package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  228. package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  229. package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  230. package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  231. package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  232. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  233. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  234. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  235. package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  236. package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  237. package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  238. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  239. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  240. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  241. package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  242. package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  243. package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  244. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  245. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  246. package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  247. package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  248. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  249. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  250. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  251. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  252. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  253. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  254. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  255. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  256. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  257. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  258. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  259. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  260. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  261. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  262. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  263. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  264. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  265. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  266. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  267. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  268. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  269. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  270. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  271. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  272. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  273. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  274. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  275. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  276. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  277. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  278. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  279. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  280. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  281. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  282. package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  283. package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  284. package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  285. package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  286. package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  287. package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  288. package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  289. package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  290. package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  291. package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  292. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  293. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  294. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  295. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  296. package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  297. package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  298. package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  299. package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  300. package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  301. package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  302. package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  303. package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  304. package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
  305. package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  306. package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  307. package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  308. package/cpp/ggml/src/ggml-opt.cpp +1093 -0
  309. package/cpp/ggml/src/ggml-quants.c +5325 -0
  310. package/cpp/ggml/src/ggml-quants.h +106 -0
  311. package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  312. package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
  313. package/cpp/ggml/src/ggml-threading.cpp +12 -0
  314. package/cpp/ggml/src/ggml-threading.h +14 -0
  315. package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  316. package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  317. package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  318. package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  319. package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  320. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  321. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  322. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  323. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  324. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  325. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  326. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  327. package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  328. package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  329. package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  330. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  331. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  332. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  333. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  334. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  335. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  336. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  337. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  338. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  339. package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  340. package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  341. package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  342. package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  343. package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  344. package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  345. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  346. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  347. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  348. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  349. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  350. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  351. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  352. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  353. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  354. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  355. package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  356. package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  357. package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  358. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
  359. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
  360. package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  361. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  362. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  363. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  364. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
  365. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
  366. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  367. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  368. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
  369. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  370. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
  371. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  372. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  373. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
  374. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
  375. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
  376. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
  377. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
  378. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  379. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  380. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  381. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  382. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  383. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  384. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  385. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  386. package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  387. package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  388. package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
  389. package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  390. package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  391. package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  392. package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  393. package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  394. package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  395. package/cpp/ggml/src/ggml.c +7669 -0
  396. package/cpp/ggml/src/ggml.cpp +26 -0
  397. package/cpp/ggml/src/gguf.cpp +1699 -0
  398. package/cpp/include/llama-cpp.h +32 -0
  399. package/cpp/include/llama.h +1568 -0
  400. package/cpp/mtmd/CMakeLists.txt +98 -0
  401. package/cpp/mtmd/README.md +63 -0
  402. package/cpp/mtmd/clip-graph.h +117 -0
  403. package/cpp/mtmd/clip-impl.h +586 -0
  404. package/cpp/mtmd/clip-model.h +390 -0
  405. package/cpp/mtmd/clip.cpp +4154 -0
  406. package/cpp/mtmd/clip.h +121 -0
  407. package/cpp/mtmd/deprecation-warning.cpp +22 -0
  408. package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
  409. package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
  410. package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
  411. package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
  412. package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
  413. package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
  414. package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
  415. package/cpp/mtmd/models/cogvlm.cpp +98 -0
  416. package/cpp/mtmd/models/conformer.cpp +216 -0
  417. package/cpp/mtmd/models/glm4v.cpp +122 -0
  418. package/cpp/mtmd/models/internvl.cpp +69 -0
  419. package/cpp/mtmd/models/kimik25.cpp +101 -0
  420. package/cpp/mtmd/models/kimivl.cpp +63 -0
  421. package/cpp/mtmd/models/llama4.cpp +96 -0
  422. package/cpp/mtmd/models/llava.cpp +374 -0
  423. package/cpp/mtmd/models/minicpmv.cpp +114 -0
  424. package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
  425. package/cpp/mtmd/models/models.h +128 -0
  426. package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
  427. package/cpp/mtmd/models/paddleocr.cpp +52 -0
  428. package/cpp/mtmd/models/pixtral.cpp +86 -0
  429. package/cpp/mtmd/models/qwen2vl.cpp +183 -0
  430. package/cpp/mtmd/models/qwen3vl.cpp +193 -0
  431. package/cpp/mtmd/models/siglip.cpp +86 -0
  432. package/cpp/mtmd/models/whisper-enc.cpp +115 -0
  433. package/cpp/mtmd/models/youtuvl.cpp +179 -0
  434. package/cpp/mtmd/mtmd-audio.cpp +730 -0
  435. package/cpp/mtmd/mtmd-audio.h +113 -0
  436. package/cpp/mtmd/mtmd-cli.cpp +437 -0
  437. package/cpp/mtmd/mtmd-helper.cpp +521 -0
  438. package/cpp/mtmd/mtmd-helper.h +96 -0
  439. package/cpp/mtmd/mtmd.cpp +1156 -0
  440. package/cpp/mtmd/mtmd.h +319 -0
  441. package/cpp/mtmd/requirements.txt +5 -0
  442. package/cpp/mtmd/test-1.jpeg +0 -0
  443. package/cpp/mtmd/test-2.mp3 +0 -0
  444. package/cpp/mtmd/tests.sh +192 -0
  445. package/cpp/src/CMakeLists.txt +169 -0
  446. package/cpp/src/llama-adapter.cpp +488 -0
  447. package/cpp/src/llama-adapter.h +89 -0
  448. package/cpp/src/llama-arch.cpp +2855 -0
  449. package/cpp/src/llama-arch.h +619 -0
  450. package/cpp/src/llama-batch.cpp +917 -0
  451. package/cpp/src/llama-batch.h +173 -0
  452. package/cpp/src/llama-chat.cpp +896 -0
  453. package/cpp/src/llama-chat.h +71 -0
  454. package/cpp/src/llama-context.cpp +3512 -0
  455. package/cpp/src/llama-context.h +359 -0
  456. package/cpp/src/llama-cparams.cpp +5 -0
  457. package/cpp/src/llama-cparams.h +44 -0
  458. package/cpp/src/llama-grammar.cpp +1464 -0
  459. package/cpp/src/llama-grammar.h +194 -0
  460. package/cpp/src/llama-graph.cpp +2685 -0
  461. package/cpp/src/llama-graph.h +1026 -0
  462. package/cpp/src/llama-hparams.cpp +234 -0
  463. package/cpp/src/llama-hparams.h +339 -0
  464. package/cpp/src/llama-impl.cpp +171 -0
  465. package/cpp/src/llama-impl.h +73 -0
  466. package/cpp/src/llama-io.cpp +15 -0
  467. package/cpp/src/llama-io.h +35 -0
  468. package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
  469. package/cpp/src/llama-kv-cache-iswa.h +137 -0
  470. package/cpp/src/llama-kv-cache.cpp +2271 -0
  471. package/cpp/src/llama-kv-cache.h +388 -0
  472. package/cpp/src/llama-kv-cells.h +533 -0
  473. package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
  474. package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
  475. package/cpp/src/llama-memory-hybrid.cpp +268 -0
  476. package/cpp/src/llama-memory-hybrid.h +139 -0
  477. package/cpp/src/llama-memory-recurrent.cpp +1165 -0
  478. package/cpp/src/llama-memory-recurrent.h +182 -0
  479. package/cpp/src/llama-memory.cpp +59 -0
  480. package/cpp/src/llama-memory.h +122 -0
  481. package/cpp/src/llama-mmap.cpp +785 -0
  482. package/cpp/src/llama-mmap.h +92 -0
  483. package/cpp/src/llama-model-loader.cpp +1414 -0
  484. package/cpp/src/llama-model-loader.h +203 -0
  485. package/cpp/src/llama-model-saver.cpp +286 -0
  486. package/cpp/src/llama-model-saver.h +37 -0
  487. package/cpp/src/llama-model.cpp +9253 -0
  488. package/cpp/src/llama-model.h +576 -0
  489. package/cpp/src/llama-quant.cpp +1119 -0
  490. package/cpp/src/llama-quant.h +1 -0
  491. package/cpp/src/llama-sampler.cpp +3885 -0
  492. package/cpp/src/llama-sampler.h +42 -0
  493. package/cpp/src/llama-vocab.cpp +3970 -0
  494. package/cpp/src/llama-vocab.h +187 -0
  495. package/cpp/src/llama.cpp +1313 -0
  496. package/cpp/src/models/afmoe.cpp +191 -0
  497. package/cpp/src/models/apertus.cpp +125 -0
  498. package/cpp/src/models/arcee.cpp +135 -0
  499. package/cpp/src/models/arctic.cpp +138 -0
  500. package/cpp/src/models/arwkv7.cpp +86 -0
  501. package/cpp/src/models/baichuan.cpp +122 -0
  502. package/cpp/src/models/bailingmoe.cpp +144 -0
  503. package/cpp/src/models/bailingmoe2.cpp +135 -0
  504. package/cpp/src/models/bert.cpp +178 -0
  505. package/cpp/src/models/bitnet.cpp +160 -0
  506. package/cpp/src/models/bloom.cpp +101 -0
  507. package/cpp/src/models/chameleon.cpp +178 -0
  508. package/cpp/src/models/chatglm.cpp +132 -0
  509. package/cpp/src/models/codeshell.cpp +111 -0
  510. package/cpp/src/models/cogvlm.cpp +102 -0
  511. package/cpp/src/models/cohere2-iswa.cpp +134 -0
  512. package/cpp/src/models/command-r.cpp +122 -0
  513. package/cpp/src/models/dbrx.cpp +123 -0
  514. package/cpp/src/models/deci.cpp +135 -0
  515. package/cpp/src/models/deepseek.cpp +144 -0
  516. package/cpp/src/models/deepseek2.cpp +262 -0
  517. package/cpp/src/models/delta-net-base.cpp +376 -0
  518. package/cpp/src/models/dots1.cpp +134 -0
  519. package/cpp/src/models/dream.cpp +105 -0
  520. package/cpp/src/models/ernie4-5-moe.cpp +150 -0
  521. package/cpp/src/models/ernie4-5.cpp +110 -0
  522. package/cpp/src/models/eurobert.cpp +97 -0
  523. package/cpp/src/models/exaone-moe.cpp +146 -0
  524. package/cpp/src/models/exaone.cpp +114 -0
  525. package/cpp/src/models/exaone4.cpp +123 -0
  526. package/cpp/src/models/falcon-h1.cpp +111 -0
  527. package/cpp/src/models/falcon.cpp +120 -0
  528. package/cpp/src/models/gemma-embedding.cpp +116 -0
  529. package/cpp/src/models/gemma.cpp +112 -0
  530. package/cpp/src/models/gemma2-iswa.cpp +128 -0
  531. package/cpp/src/models/gemma3.cpp +155 -0
  532. package/cpp/src/models/gemma3n-iswa.cpp +384 -0
  533. package/cpp/src/models/glm4-moe.cpp +170 -0
  534. package/cpp/src/models/glm4.cpp +157 -0
  535. package/cpp/src/models/gpt2.cpp +105 -0
  536. package/cpp/src/models/gptneox.cpp +144 -0
  537. package/cpp/src/models/granite-hybrid.cpp +196 -0
  538. package/cpp/src/models/granite.cpp +211 -0
  539. package/cpp/src/models/grok.cpp +159 -0
  540. package/cpp/src/models/grovemoe.cpp +141 -0
  541. package/cpp/src/models/hunyuan-dense.cpp +132 -0
  542. package/cpp/src/models/hunyuan-moe.cpp +154 -0
  543. package/cpp/src/models/internlm2.cpp +120 -0
  544. package/cpp/src/models/jais.cpp +86 -0
  545. package/cpp/src/models/jais2.cpp +123 -0
  546. package/cpp/src/models/jamba.cpp +106 -0
  547. package/cpp/src/models/kimi-linear.cpp +392 -0
  548. package/cpp/src/models/lfm2.cpp +190 -0
  549. package/cpp/src/models/llada-moe.cpp +122 -0
  550. package/cpp/src/models/llada.cpp +99 -0
  551. package/cpp/src/models/llama-iswa.cpp +178 -0
  552. package/cpp/src/models/llama.cpp +168 -0
  553. package/cpp/src/models/maincoder.cpp +117 -0
  554. package/cpp/src/models/mamba-base.cpp +285 -0
  555. package/cpp/src/models/mamba.cpp +54 -0
  556. package/cpp/src/models/mimo2-iswa.cpp +123 -0
  557. package/cpp/src/models/minicpm3.cpp +200 -0
  558. package/cpp/src/models/minimax-m2.cpp +124 -0
  559. package/cpp/src/models/mistral3.cpp +160 -0
  560. package/cpp/src/models/models.h +684 -0
  561. package/cpp/src/models/modern-bert.cpp +109 -0
  562. package/cpp/src/models/mpt.cpp +126 -0
  563. package/cpp/src/models/nemotron-h.cpp +148 -0
  564. package/cpp/src/models/nemotron.cpp +122 -0
  565. package/cpp/src/models/neo-bert.cpp +104 -0
  566. package/cpp/src/models/olmo.cpp +121 -0
  567. package/cpp/src/models/olmo2.cpp +150 -0
  568. package/cpp/src/models/olmoe.cpp +124 -0
  569. package/cpp/src/models/openai-moe-iswa.cpp +127 -0
  570. package/cpp/src/models/openelm.cpp +124 -0
  571. package/cpp/src/models/orion.cpp +123 -0
  572. package/cpp/src/models/paddleocr.cpp +122 -0
  573. package/cpp/src/models/pangu-embedded.cpp +121 -0
  574. package/cpp/src/models/phi2.cpp +121 -0
  575. package/cpp/src/models/phi3.cpp +152 -0
  576. package/cpp/src/models/plamo.cpp +110 -0
  577. package/cpp/src/models/plamo2.cpp +318 -0
  578. package/cpp/src/models/plamo3.cpp +128 -0
  579. package/cpp/src/models/plm.cpp +169 -0
  580. package/cpp/src/models/qwen.cpp +108 -0
  581. package/cpp/src/models/qwen2.cpp +126 -0
  582. package/cpp/src/models/qwen2moe.cpp +151 -0
  583. package/cpp/src/models/qwen2vl.cpp +117 -0
  584. package/cpp/src/models/qwen3.cpp +117 -0
  585. package/cpp/src/models/qwen35.cpp +386 -0
  586. package/cpp/src/models/qwen35moe.cpp +420 -0
  587. package/cpp/src/models/qwen3moe.cpp +124 -0
  588. package/cpp/src/models/qwen3next.cpp +525 -0
  589. package/cpp/src/models/qwen3vl-moe.cpp +140 -0
  590. package/cpp/src/models/qwen3vl.cpp +132 -0
  591. package/cpp/src/models/refact.cpp +94 -0
  592. package/cpp/src/models/rnd1.cpp +126 -0
  593. package/cpp/src/models/rwkv6-base.cpp +164 -0
  594. package/cpp/src/models/rwkv6.cpp +94 -0
  595. package/cpp/src/models/rwkv6qwen2.cpp +86 -0
  596. package/cpp/src/models/rwkv7-base.cpp +137 -0
  597. package/cpp/src/models/rwkv7.cpp +90 -0
  598. package/cpp/src/models/seed-oss.cpp +124 -0
  599. package/cpp/src/models/smallthinker.cpp +126 -0
  600. package/cpp/src/models/smollm3.cpp +128 -0
  601. package/cpp/src/models/stablelm.cpp +146 -0
  602. package/cpp/src/models/starcoder.cpp +100 -0
  603. package/cpp/src/models/starcoder2.cpp +121 -0
  604. package/cpp/src/models/step35-iswa.cpp +168 -0
  605. package/cpp/src/models/t5-dec.cpp +166 -0
  606. package/cpp/src/models/t5-enc.cpp +96 -0
  607. package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
  608. package/cpp/src/models/xverse.cpp +108 -0
  609. package/cpp/src/unicode-data.cpp +7034 -0
  610. package/cpp/src/unicode-data.h +20 -0
  611. package/cpp/src/unicode.cpp +1103 -0
  612. package/cpp/src/unicode.h +111 -0
  613. package/cpp/vendor/nlohmann/json.hpp +25526 -0
  614. package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  615. package/cpp/vendor/stb/stb_image.h +7988 -0
  616. package/ios/LocalLLM-Bridging-Header.h +2 -0
  617. package/ios/LocalLLM.h +5 -0
  618. package/ios/LocalLLM.mm +1267 -0
  619. package/local-llm-rn.podspec +60 -0
  620. package/package.json +35 -0
  621. package/src/NativeLocalLLM.ts +73 -0
  622. package/src/device.ts +50 -0
  623. package/src/download-adapter.ts +17 -0
  624. package/src/index.ts +21 -0
  625. package/src/native-bridge.ts +142 -0
  626. package/src/rn-downloader.ts +37 -0
@@ -0,0 +1,202 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #define QK8_0 32
22
+ typedef struct {
23
+ half d; // delta
24
+ char qs[QK8_0]; // quants
25
+ } block_q8_0;
26
+
27
+ #define NB_Q8_0 8
28
+
29
+ #ifdef INTEL_GPU
30
+ #define N_R0_Q8_0 4 // number of rows each subgroup works on
31
+ #define N_SG_Q8_0 2 // number of subgroups in a work group
32
+ #define N_SIMDWIDTH 16 // subgroup size
33
+ #elif defined (ADRENO_GPU)
34
+ #define N_R0_Q8_0 4
35
+ #define N_SG_Q8_0 2
36
+ #define N_SIMDWIDTH 64
37
+ #endif
38
+
// Dot product of 8 consecutive int8 quants with 8 activations.
// The products are accumulated strictly in lane order (s0..s7), starting from
// zero, so the rounding is that of a plain sequential sum.
inline float block_q8_0_flat_dot_y(global const char * qs, float8 yl) {
    const float8 qv = convert_float8(vload8(0, qs));

    float acc = 0.f;
    acc += qv.s0*yl.s0;
    acc += qv.s1*yl.s1;
    acc += qv.s2*yl.s2;
    acc += qv.s3*yl.s3;
    acc += qv.s4*yl.s4;
    acc += qv.s5*yl.s5;
    acc += qv.s6*yl.s6;
    acc += qv.s7*yl.s7;
    return acc;
}

// Matrix-vector product: q8_0 weights (src0, "flat" layout) times f32 src1.
//
// "Flat" means the quants and the per-block scales live in two separate
// buffers: src0_q holds QK8_0 chars per block, src0_d holds one half per block.
// The byte strides nb01/nb02/nb03 are still expressed in units of the packed
// block (sizeof(half) + QK8_0 = 34 bytes), so a byte offset is first converted
// to a block index and then applied to each buffer.
//
// Work distribution:
//   - group id 0 selects a band of N_SG_Q8_0 * 4 src0 rows, group id 1 selects
//     the src1 row (r1), group id 2 selects the batch index (im);
//   - each subgroup produces 4 consecutive dst values starting at first_row;
//   - within a subgroup, 4 lanes share one block (8 quants per lane).
//
// Parameters:
//   src0_q, src0_d        quants / scales of src0
//   src1, offset1         f32 activations and byte offset into the buffer
//   dst, offsetd          f32 output and byte offset into the buffer
//   ne00                  row length of src0 in elements (ne00/QK8_0 blocks)
//   ne01                  number of src0 rows; stores past it are skipped
//   nb01, nb02, nb03      src0 byte strides (packed-block units, see above)
//   ne12                  used to split the batch index into (i12, i13)
//   nb11, nb12, nb13      src1 byte strides
//   ne0, ne1              dst extents used to address the output
//   r2, r3                divisors mapping src1 batch indices to src0 ones
//
// NOTE(review): only the stores are guarded by ne01; rows at or beyond ne01 are
// still read. Presumably the host pads or sizes src0 accordingly - confirm.
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_q8_0_f32_flat(
        global char * src0_q,
        global half * src0_d,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne12,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    src1 = src1 + offset1;
    dst  = dst  + offsetd;

    const int nb = ne00/QK8_0;

    const int r0 = get_group_id(0);
    const int r1 = get_group_id(1);
    const int im = get_group_id(2);

    const int first_row = (r0*N_SG_Q8_0 + get_sub_group_id()) * N_R0_Q8_0;

    const uint i12 = im%ne12;
    const uint i13 = im/ne12;

    const ulong offset_src1 = r1*nb11 + i12*nb12 + i13*nb13;
    global float * y = (global float *) (src1 + offset_src1);

    // Byte offset of the first row in the packed layout. Kept in 64 bits: the
    // strides are ulong and a 32-bit offset would wrap for large tensors.
    const ulong offset_src0_base = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;

    // Size of one packed q8_0 block: one half scale followed by QK8_0 quants.
    const ulong block_bytes = sizeof(half) + QK8_0*sizeof(char);

    // Index of the first block of each of the 4 rows handled by this subgroup.
    const ulong blk0 = (offset_src0_base + 0*nb01)/block_bytes;
    const ulong blk1 = (offset_src0_base + 1*nb01)/block_bytes;
    const ulong blk2 = (offset_src0_base + 2*nb01)/block_bytes;
    const ulong blk3 = (offset_src0_base + 3*nb01)/block_bytes;

    // pointers to src0 rows (quants and scales)
    global char * ax0 = src0_q + blk0*sizeof(char)*QK8_0;
    global char * ax1 = src0_q + blk1*sizeof(char)*QK8_0;
    global char * ax2 = src0_q + blk2*sizeof(char)*QK8_0;
    global char * ax3 = src0_q + blk3*sizeof(char)*QK8_0;

    global half * ad0 = src0_d + blk0;
    global half * ad1 = src0_d + blk1;
    global half * ad2 = src0_d + blk2;
    global half * ad3 = src0_d + blk3;

    const short ix = get_sub_group_local_id()/4; // block handled by this lane
    const short il = get_sub_group_local_id()%4; // 8-quant slice inside it

    global float * yb = y + ix*QK8_0 + il*NB_Q8_0;

    float4 sumf = 0.f;

    // each thread handles NB_Q8_0 quants at a time
    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/4) {
        const float8 yl = vload8(0, yb);

        // offset of this lane's slice inside a row of quants
        const size_t q_off = ib*sizeof(char)*QK8_0 + il*NB_Q8_0;

        sumf.s0 += block_q8_0_flat_dot_y(ax0 + q_off, yl)*ad0[ib];
        sumf.s1 += block_q8_0_flat_dot_y(ax1 + q_off, yl)*ad1[ib];
        sumf.s2 += block_q8_0_flat_dot_y(ax2 + q_off, yl)*ad2[ib];
        sumf.s3 += block_q8_0_flat_dot_y(ax3 + q_off, yl)*ad3[ib];

        yb += N_SIMDWIDTH*NB_Q8_0;
    }

    global float * dst_f32 = (global float *) dst + (ulong)im*ne0*ne1 + (ulong)r1*ne0;

    // combine the per-lane partial sums of each row across the subgroup
    const float4 tot = (float4)(
        sub_group_reduce_add(sumf.s0),
        sub_group_reduce_add(sumf.s1),
        sub_group_reduce_add(sumf.s2),
        sub_group_reduce_add(sumf.s3)
    );

    // one lane per subgroup writes the results, skipping rows past ne01
    if (get_sub_group_local_id() == 0) {
        if (first_row + 0 < ne01) {
            dst_f32[first_row + 0] = tot.s0;
        }
        if (first_row + 1 < ne01) {
            dst_f32[first_row + 1] = tot.s1;
        }
        if (first_row + 2 < ne01) {
            dst_f32[first_row + 2] = tot.s2;
        }
        if (first_row + 3 < ne01) {
            dst_f32[first_row + 3] = tot.s3;
        }
    }
}
@@ -0,0 +1,161 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_required_subgroup_size
4
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
5
+ #define INTEL_GPU 1
6
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
7
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
8
+ #elif defined(cl_qcom_reqd_sub_group_size)
9
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
10
+ #define ADRENO_GPU 1
11
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
12
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
13
+ #endif
14
+
//------------------------------------------------------------------------------
// norm
//------------------------------------------------------------------------------
// Normalizes one f32 row per work-group: y = (x - mean) / sqrt(variance + eps).
//
// The row is selected by the group ids (dim 0 -> i01, dim 1 -> i02,
// dim 2 -> i03). src0 is addressed with the byte strides nb01/nb02/nb03, dst is
// addressed as a dense ne00 x ne01 x ne02 x ne03 array of floats.
// `sum` is local scratch with one float per work-item; the halving reduction
// below folds it down to sum[0].
kernel void kernel_norm(
        global void * src0,
        ulong offset0,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne03,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        float eps,
        local float * sum
) {
    global char * src_base = (global char *)src0 + offset0;
    dst = (global float *)((global char *)dst + offsetd);

    const int i03 = get_group_id(2);
    const int i02 = get_group_id(1);
    const int i01 = get_group_id(0);

    global float * x = (global float *)(src_base + i03*nb03 + i02*nb02 + i01*nb01);
    global float * y = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

    // MEAN: every work-item sums a strided slice of the row ...
    float partial = 0.0f;
    for (int col = get_local_id(0); col < ne00; col += get_local_size(0)) {
        partial += x[col];
    }
    sum[get_local_id(0)] = partial;

    // ... and the slices are folded pairwise in local memory
    barrier(CLK_LOCAL_MEM_FENCE);
    for (uint stride = get_local_size(0)/2; stride > 0; stride /= 2) {
        if (get_local_id(0) < stride) {
            sum[get_local_id(0)] += sum[get_local_id(0) + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    const float mean = sum[0] / ne00;

    // everyone must have read sum[0] before the scratch is reused
    barrier(CLK_LOCAL_MEM_FENCE);

    // VARIANCE: store the centered values and sum their squares
    partial = 0.0f;
    for (int col = get_local_id(0); col < ne00; col += get_local_size(0)) {
        const float centered = x[col] - mean;
        y[col] = centered;
        partial += centered * centered;
    }
    sum[get_local_id(0)] = partial;

    barrier(CLK_LOCAL_MEM_FENCE);
    for (uint stride = get_local_size(0)/2; stride > 0; stride /= 2) {
        if (get_local_id(0) < stride) {
            sum[get_local_id(0)] += sum[get_local_id(0) + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    const float variance = sum[0] / ne00;

    // scale the centered values already stored in dst
    const float scale = 1.0f/sqrt(variance + eps);
    for (int col = get_local_id(0); col < ne00; col += get_local_size(0)) {
        y[col] = y[col] * scale;
    }
}
82
+
//------------------------------------------------------------------------------
// norm_mul_add
//------------------------------------------------------------------------------
// Fused normalization, multiply and add for one row per work-group:
//     y = ((x - mean) * rsqrt(variance + eps)) * w + b
//
// The row is processed as ne00/4 float4 chunks; mean and variance are derived
// from the sum and the sum of squares of those chunks, divided by ne00.
// src1 (w) and src2 (b) rows are selected modulo their own extents
// (ne11..ne13, ne21..ne23); along the row they are indexed per chunk when
// ne10 / ne20 is greater than 1 and at chunk 0 otherwise.
// `sums` is local scratch with one float2 per subgroup; slot 0 is reused to
// publish (mean, scale) to the whole work-group.
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_norm_mul_add(
        global char * src0_ptr, ulong src0_offset,
        global char * src1_ptr, ulong src1_offset,
        global char * src2_ptr, ulong src2_offset,
        global char * dst_ptr,  ulong dst_offset,
        int ne00, int ne01, int ne02, int ne03,
        ulong nb01, ulong nb02, ulong nb03,
        int ne10, int ne11, int ne12, int ne13,
        ulong nb11, ulong nb12, ulong nb13,
        int ne20, int ne21, int ne22, int ne23,
        ulong nb21, ulong nb22, ulong nb23,
        ulong nbd1, ulong nbd2, ulong nbd3,
        float eps,
        local float2 * sums
) {
    const int i03 = get_group_id(2);
    const int i02 = get_group_id(1);
    const int i01 = get_group_id(0);

    global float4 * x = (global float4 *)(src0_ptr + src0_offset + i01*nb01 + i02*nb02 + i03*nb03);
    global float4 * w = (global float4 *)(src1_ptr + src1_offset + (i01%ne11)*nb11 + (i02%ne12)*nb12 + (i03%ne13)*nb13);
    global float4 * b = (global float4 *)(src2_ptr + src2_offset + (i01%ne21)*nb21 + (i02%ne22)*nb22 + (i03%ne23)*nb23);
    global float4 * y = (global float4 *)(dst_ptr  + dst_offset  + i01*nbd1 + i02*nbd2 + i03*nbd3);

    const int n_chunks = ne00 / 4;

    // .x accumulates the sum, .y the sum of squares
    float2 partial = (float2)(0.0f, 0.0f);
    for (int c = get_local_id(0); c < n_chunks; c += get_local_size(0)) {
        const float4 val = x[c];
        partial.x += val.x + val.y + val.z + val.w;
        partial.y += dot(val, val);
    }

    partial.x = sub_group_reduce_add(partial.x);
    partial.y = sub_group_reduce_add(partial.y);

    // one lane per subgroup publishes the subgroup totals
    if (get_sub_group_local_id() == 0) {
        sums[get_sub_group_id()] = partial;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // work-item 0 combines the subgroup totals and stores (mean, scale)
    if (get_local_id(0) == 0) {
        float2 total = (float2)(0.0f, 0.0f);
        for (uint sg = 0; sg < get_num_sub_groups(); ++sg) {
            total += sums[sg];
        }

        const float inv_ne00 = 1.0f / (float)ne00;
        const float mean     = total.x * inv_ne00;
        // variance = E[x^2] - mean^2
        const float variance = mad(-mean, mean, total.y * inv_ne00);

        sums[0] = (float2)(mean, rsqrt(variance + eps));
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    const float2 mean_scale     = sums[0];
    const float  scale          = mean_scale.y;
    const float  neg_mean_scale = -mean_scale.x * scale;

    const bool w_per_chunk = ne10 > 1;
    const bool b_per_chunk = ne20 > 1;

    for (int c = get_local_id(0); c < n_chunks; c += get_local_size(0)) {
        const int w_idx = w_per_chunk ? c : 0;
        const int b_idx = b_per_chunk ? c : 0;
        // (x - mean) * scale, written as x*scale + (-mean*scale)
        const float4 norm_x = mad(x[c], (float4)scale, (float4)neg_mean_scale);
        y[c] = mad(norm_x, w[w_idx], b[b_idx]);
    }
}
@@ -0,0 +1,39 @@
// Copies an f32 tensor into a larger destination, zero-filling the border.
//
// One work-item writes one destination element (i0, i1, i2, i3):
//   i0 = global id 0, i1 = group id 1, group id 2 = i3*ne2 + i2.
// lpN / rpN are the number of padding elements before / after the source data
// along dimension N: a destination index is inside the source when
// lpN <= iN < neN - rpN, and it then maps to source index iN - lpN.
//
// src0 is addressed with byte strides nb00..nb03, dst with nb0..nb3.
// ne00..ne03 are accepted but not referenced by the kernel body.
kernel void kernel_pad(
        global void * src0,
        ulong offset0,
        global void * dst,
        ulong offsetd,
        int ne00, int ne01, int ne02, int ne03,
        ulong nb00, ulong nb01, ulong nb02, ulong nb03,
        int ne0, int ne1, int ne2, int ne3,
        ulong nb0, ulong nb1, ulong nb2, ulong nb3,
        int lp0, int rp0,
        int lp1, int rp1,
        int lp2, int rp2,
        int lp3, int rp3
) {
    global const char * src_base = (global const char *)src0 + offset0;
    global char       * dst_base = (global char *)dst + offsetd;

    const int i0 = get_global_id(0);
    const int i1 = get_group_id(1);
    const int i2 = get_group_id(2) % ne2;
    const int i3 = get_group_id(2) / ne2;

    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
        return;
    }

    const bool in_src_bounds = (i0 >= lp0 && i0 < ne0 - rp0) &&
                               (i1 >= lp1 && i1 < ne1 - rp1) &&
                               (i2 >= lp2 && i2 < ne2 - rp2) &&
                               (i3 >= lp3 && i3 < ne3 - rp3);

    float value = 0.0f;
    if (in_src_bounds) {
        // Only form the source address for elements that exist: every
        // (iN - lpN) is non-negative here. The offset stays 64-bit because the
        // strides are ulong and a 32-bit offset would wrap for large tensors.
        const ulong src_off = (ulong)(i3 - lp3)*nb03 + (ulong)(i2 - lp2)*nb02 +
                              (ulong)(i1 - lp1)*nb01 + (ulong)(i0 - lp0)*nb00;
        value = *(global const float *)(src_base + src_off);
    }

    const ulong dst_off = (ulong)i3*nb3 + (ulong)i2*nb2 + (ulong)i1*nb1 + (ulong)i0*nb0;
    *(global float *)(dst_base + dst_off) = value;
}
@@ -0,0 +1,16 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
//------------------------------------------------------------------------------
// relu
//------------------------------------------------------------------------------
// Element-wise max(0, x): each work-item handles the element at its global id.
// offset0 / offsetd are byte offsets applied to src0 / dst before indexing.
// There is no bounds check, so the launch must not cover more work-items than
// there are elements.
kernel void kernel_relu(
        global float * src0,
        ulong offset0,
        global float * dst,
        ulong offsetd
) {
    global const float * in  = (global const float *)((global const char *)src0 + offset0);
    global float       * out = (global float *)((global char *)dst + offsetd);

    const size_t gid = get_global_id(0);
    out[gid] = fmax(0.0f, in[gid]);
}
@@ -0,0 +1,38 @@
// Tiles an f32 tensor into a larger destination.
//
// One work-group fills one destination row (i1, i2, i3 taken from group ids
// 0, 1, 2). The matching source row and the source column are found by
// wrapping each destination index modulo the source extent (ne00..ne03).
// Both tensors are addressed with byte strides (nb00..nb03 for src0,
// nb0..nb3 for dst); ne0 is the destination row length in elements.
kernel void kernel_repeat_f32(
        global const char * src0,
        ulong offset0,
        global char * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne03,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne0,
        ulong nb0,
        ulong nb1,
        ulong nb2,
        ulong nb3
) {
    const int i3 = get_group_id(2);
    const int i2 = get_group_id(1);
    const int i1 = get_group_id(0);

    // source row that this destination row repeats
    global const char * src_row = src0 + offset0
                                + (i3%ne03)*nb03
                                + (i2%ne02)*nb02
                                + (i1%ne01)*nb01;
    global char * dst_row = dst + offsetd + i3*nb3 + i2*nb2 + i1*nb1;

    // work-items stride across the destination row
    const int step = get_local_size(0);
    int i0 = get_local_id(0);
    while (i0 < ne0) {
        const int src_col = i0%ne00;
        const float v = *((global const float *)(src_row + src_col*nb00));
        *((global float *)(dst_row + i0*nb0)) = v;
        i0 += step;
    }
}
@@ -0,0 +1,190 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
//------------------------------------------------------------------------------
// rms_norm
//------------------------------------------------------------------------------
// RMS-normalizes one f32 row per work-group: y = x / sqrt(mean(x^2) + eps).
//
// The row is selected by the group ids (dim 0 -> i01, dim 1 -> i02,
// dim 2 -> i03). src0 is addressed with the byte strides nb01/nb02/nb03, dst is
// addressed as a dense ne00 x ne01 x ne02 x ne03 array of floats.
// The bulk of the row is processed as float4 chunks; the ne00 % 4 trailing
// elements are handled by work-item 0 through scalar pointers.
//
// `sum` is local scratch with one float per subgroup. The number of subgroups
// is taken as get_local_size(0) / get_max_sub_group_size() and is folded by
// repeated halving, so it is expected to be a power of two.
//
// This kernel depends on subgroup size.
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_rms_norm(
        global void * src0,
        ulong offset0,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne03,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        float eps,
        local float * sum // Note, the size depends on number of subgroups
) {
    src0 = (global void*)((global char*)src0 + offset0);
    dst = (global float*)((global char*)dst + offsetd);

    int i03 = get_group_id(2);
    int i02 = get_group_id(1);
    int i01 = get_group_id(0);

    global float4 * x = (global float4 *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
    global float * x_scalar = (global float *) x;
    float4 sumf = 0;
    float all_sum = 0;

    // parallel sum of squares over the float4 chunks
    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
        sumf += x[i00] * x[i00];
    }
    all_sum = sumf.s0 + sumf.s1 + sumf.s2 + sumf.s3;
    all_sum = sub_group_reduce_add(all_sum);
    if (get_sub_group_local_id() == 0) {
        sum[get_sub_group_id()] = all_sum;
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    // fold the per-subgroup sums down to sum[0]
    for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) {
        if (get_local_id(0) < i) {
            sum[get_local_id(0)] += sum[get_local_id(0) + i];
        }
        // Each step reads slots written by other work-items in the previous
        // step, so the steps must be separated by a barrier. The trip count is
        // the same for every work-item, so the barrier is reached uniformly.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (get_local_id(0) == 0) {
        // trailing elements that do not fill a float4: they contribute their
        // square, like every other element of the row
        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
            sum[0] += x_scalar[i] * x_scalar[i];
        }
        sum[0] /= ne00;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // sum[0] now holds mean(x^2) for the row
    const float mean  = sum[0];
    const float scale = 1.0f/sqrt(mean + eps);

    global float4 * y = (global float4 *) (dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
    global float * y_scalar = (global float *) y;
    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
        y[i00] = x[i00] * scale;
    }
    if (get_local_id(0) == 0) {
        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
            y_scalar[i00] = x_scalar[i00] * scale;
        }
    }
}
97
+
//------------------------------------------------------------------------------
// rms_norm_mul
//------------------------------------------------------------------------------
// Fused RMS normalization and element-wise multiply for one row per
// work-group: y = (x / sqrt(mean(x^2) + eps)) * f.
//
// The row is processed as ne00/4 float4 chunks. The src1 row (f) is selected
// modulo ne11/ne12/ne13 and indexed along the row modulo ne10/4 chunks.
// src0, src1 and dst are addressed with byte strides.
//
// `sum` is local scratch that must hold one float per subgroup lane: it is
// cleared by subgroup 0, each subgroup stores its partial sum at its subgroup
// id, and then every subgroup re-reduces the whole array. The number of
// subgroups per work-group therefore must not exceed the subgroup size.
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_rms_norm_mul(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne03,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne10,
        int ne11,
        int ne12,
        int ne13,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        ulong nb1,
        ulong nb2,
        ulong nb3,
        float eps,
        local float * sum
) {
    src0 += offset0;
    src1 += offset1;
    dst  += offsetd;

    // clear every slot that the final subgroup reduction will read
    if (get_sub_group_id() == 0) {
        sum[get_sub_group_local_id()] = 0.0f;
    }

    const int i03 = get_group_id(2);
    const int i02 = get_group_id(1);
    const int i01 = get_group_id(0);

    global float4 * x = (global float4 *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
    global float4 * f = (global float4 *) (src1 + (i03%ne13)*nb13 + (i02%ne12)*nb12 + (i01%ne11)*nb11);
    global float4 * y = (global float4 *) (dst + i03*nb3 + i02*nb2 + i01*nb1);

    const int n_chunks = ne00/4;

    // per-work-item sum of squares, then combined within the subgroup
    float partial = 0;
    for (int c = get_local_id(0); c < n_chunks; c += get_local_size(0)) {
        partial += dot(x[c], x[c]);
    }
    partial = sub_group_reduce_add(partial);

    // the clearing above must be complete before partial sums are stored
    barrier(CLK_LOCAL_MEM_FENCE);

    if (get_sub_group_local_id() == 0) {
        sum[get_sub_group_id()] = partial;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // every subgroup sums all slots, so each work-item gets the row total
    const float total = sub_group_reduce_add(sum[get_sub_group_local_id()]);

    const float mean  = total / ne00;
    const float scale = 1.0f/sqrt(mean + eps);

    const int n_f_chunks = ne10/4;
    for (int c = get_local_id(0); c < n_chunks; c += get_local_size(0)) {
        y[c] = (x[c] * scale) * f[c%n_f_chunks];
    }
}