local-llm-rn 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (626) hide show
  1. package/cpp/CMakeLists.txt +285 -0
  2. package/cpp/common/CMakeLists.txt +149 -0
  3. package/cpp/common/arg.cpp +3799 -0
  4. package/cpp/common/arg.h +131 -0
  5. package/cpp/common/base64.hpp +392 -0
  6. package/cpp/common/build-info.cpp.in +4 -0
  7. package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  8. package/cpp/common/chat-parser-xml-toolcall.h +45 -0
  9. package/cpp/common/chat-parser.cpp +1649 -0
  10. package/cpp/common/chat-parser.h +133 -0
  11. package/cpp/common/chat-peg-parser.cpp +124 -0
  12. package/cpp/common/chat-peg-parser.h +105 -0
  13. package/cpp/common/chat.cpp +3355 -0
  14. package/cpp/common/chat.h +252 -0
  15. package/cpp/common/common.cpp +1824 -0
  16. package/cpp/common/common.h +930 -0
  17. package/cpp/common/console.cpp +1137 -0
  18. package/cpp/common/console.h +41 -0
  19. package/cpp/common/debug.cpp +167 -0
  20. package/cpp/common/debug.h +43 -0
  21. package/cpp/common/download.cpp +792 -0
  22. package/cpp/common/download.h +84 -0
  23. package/cpp/common/http.h +84 -0
  24. package/cpp/common/jinja/README.md +88 -0
  25. package/cpp/common/jinja/caps.cpp +285 -0
  26. package/cpp/common/jinja/caps.h +30 -0
  27. package/cpp/common/jinja/lexer.cpp +341 -0
  28. package/cpp/common/jinja/lexer.h +157 -0
  29. package/cpp/common/jinja/parser.cpp +591 -0
  30. package/cpp/common/jinja/parser.h +21 -0
  31. package/cpp/common/jinja/runtime.cpp +867 -0
  32. package/cpp/common/jinja/runtime.h +638 -0
  33. package/cpp/common/jinja/string.cpp +213 -0
  34. package/cpp/common/jinja/string.h +61 -0
  35. package/cpp/common/jinja/utils.h +149 -0
  36. package/cpp/common/jinja/value.cpp +1393 -0
  37. package/cpp/common/jinja/value.h +756 -0
  38. package/cpp/common/json-partial.cpp +324 -0
  39. package/cpp/common/json-partial.h +39 -0
  40. package/cpp/common/json-schema-to-grammar.cpp +1153 -0
  41. package/cpp/common/json-schema-to-grammar.h +43 -0
  42. package/cpp/common/llguidance.cpp +258 -0
  43. package/cpp/common/log.cpp +446 -0
  44. package/cpp/common/log.h +119 -0
  45. package/cpp/common/ngram-cache.cpp +285 -0
  46. package/cpp/common/ngram-cache.h +101 -0
  47. package/cpp/common/ngram-map.cpp +530 -0
  48. package/cpp/common/ngram-map.h +115 -0
  49. package/cpp/common/ngram-mod.cpp +60 -0
  50. package/cpp/common/ngram-mod.h +38 -0
  51. package/cpp/common/peg-parser.cpp +1712 -0
  52. package/cpp/common/peg-parser.h +459 -0
  53. package/cpp/common/preset.cpp +483 -0
  54. package/cpp/common/preset.h +83 -0
  55. package/cpp/common/regex-partial.cpp +204 -0
  56. package/cpp/common/regex-partial.h +56 -0
  57. package/cpp/common/sampling.cpp +745 -0
  58. package/cpp/common/sampling.h +119 -0
  59. package/cpp/common/speculative.cpp +1074 -0
  60. package/cpp/common/speculative.h +41 -0
  61. package/cpp/common/unicode.cpp +64 -0
  62. package/cpp/common/unicode.h +22 -0
  63. package/cpp/ggml/CMakeLists.txt +494 -0
  64. package/cpp/ggml/cmake/GitVars.cmake +22 -0
  65. package/cpp/ggml/cmake/common.cmake +50 -0
  66. package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
  67. package/cpp/ggml/include/ggml-alloc.h +85 -0
  68. package/cpp/ggml/include/ggml-backend.h +373 -0
  69. package/cpp/ggml/include/ggml-blas.h +25 -0
  70. package/cpp/ggml/include/ggml-cann.h +123 -0
  71. package/cpp/ggml/include/ggml-cpp.h +39 -0
  72. package/cpp/ggml/include/ggml-cpu.h +151 -0
  73. package/cpp/ggml/include/ggml-cuda.h +47 -0
  74. package/cpp/ggml/include/ggml-hexagon.h +19 -0
  75. package/cpp/ggml/include/ggml-metal.h +61 -0
  76. package/cpp/ggml/include/ggml-opencl.h +26 -0
  77. package/cpp/ggml/include/ggml-opt.h +256 -0
  78. package/cpp/ggml/include/ggml-rpc.h +30 -0
  79. package/cpp/ggml/include/ggml-sycl.h +49 -0
  80. package/cpp/ggml/include/ggml-virtgpu.h +14 -0
  81. package/cpp/ggml/include/ggml-vulkan.h +29 -0
  82. package/cpp/ggml/include/ggml-webgpu.h +19 -0
  83. package/cpp/ggml/include/ggml-zdnn.h +17 -0
  84. package/cpp/ggml/include/ggml-zendnn.h +22 -0
  85. package/cpp/ggml/include/ggml.h +2753 -0
  86. package/cpp/ggml/include/gguf.h +204 -0
  87. package/cpp/ggml/src/CMakeLists.txt +492 -0
  88. package/cpp/ggml/src/ggml-alloc.c +1244 -0
  89. package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
  90. package/cpp/ggml/src/ggml-backend-dl.h +45 -0
  91. package/cpp/ggml/src/ggml-backend-impl.h +255 -0
  92. package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
  93. package/cpp/ggml/src/ggml-backend.cpp +2270 -0
  94. package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  95. package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
  96. package/cpp/ggml/src/ggml-common.h +1878 -0
  97. package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
  98. package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
  99. package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  100. package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  101. package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  102. package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  103. package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  104. package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
  105. package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
  106. package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
  107. package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  108. package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  109. package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  110. package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
  111. package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  112. package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  113. package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  114. package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  115. package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  116. package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  117. package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  118. package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
  119. package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  120. package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  121. package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  122. package/cpp/ggml/src/ggml-cpu/common.h +95 -0
  123. package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
  124. package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
  125. package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
  126. package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  127. package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  128. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
  129. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  130. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
  131. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  132. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
  133. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  134. package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
  135. package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
  136. package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
  137. package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
  138. package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
  139. package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
  140. package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  141. package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
  142. package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  143. package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  144. package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  145. package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  146. package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  147. package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
  148. package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  149. package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
  150. package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
  151. package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
  152. package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  153. package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
  154. package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
  155. package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
  156. package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  157. package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
  158. package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  159. package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
  160. package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
  161. package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
  162. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  163. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  164. package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  165. package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  166. package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  167. package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  168. package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
  169. package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
  170. package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  171. package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
  172. package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
  173. package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  174. package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
  175. package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  176. package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  177. package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  178. package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
  179. package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
  180. package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  181. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  182. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  183. package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  184. package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
  185. package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
  186. package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
  187. package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
  188. package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
  189. package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
  190. package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
  191. package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
  192. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  193. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  194. package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  195. package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  196. package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
  197. package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  198. package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  199. package/cpp/ggml/src/ggml-impl.h +724 -0
  200. package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  201. package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  202. package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  203. package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  204. package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
  205. package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
  206. package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
  207. package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
  208. package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
  209. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
  210. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  211. package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
  212. package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
  213. package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
  214. package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  215. package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  216. package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
  217. package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
  218. package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  219. package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  220. package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  221. package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  222. package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  223. package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  224. package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  225. package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  226. package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
  227. package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  228. package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  229. package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  230. package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  231. package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  232. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  233. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  234. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  235. package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  236. package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  237. package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  238. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  239. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  240. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  241. package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  242. package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  243. package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  244. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  245. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  246. package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  247. package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  248. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  249. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  250. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  251. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  252. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  253. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  254. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  255. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  256. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  257. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  258. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  259. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  260. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  261. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  262. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  263. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  264. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  265. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  266. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  267. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  268. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  269. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  270. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  271. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  272. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  273. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  274. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  275. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  276. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  277. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  278. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  279. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  280. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  281. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  282. package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  283. package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  284. package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  285. package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  286. package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  287. package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  288. package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  289. package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  290. package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  291. package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  292. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  293. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  294. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  295. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  296. package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  297. package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  298. package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  299. package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  300. package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  301. package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  302. package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  303. package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  304. package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
  305. package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  306. package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  307. package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  308. package/cpp/ggml/src/ggml-opt.cpp +1093 -0
  309. package/cpp/ggml/src/ggml-quants.c +5325 -0
  310. package/cpp/ggml/src/ggml-quants.h +106 -0
  311. package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  312. package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
  313. package/cpp/ggml/src/ggml-threading.cpp +12 -0
  314. package/cpp/ggml/src/ggml-threading.h +14 -0
  315. package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  316. package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  317. package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  318. package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  319. package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  320. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  321. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  322. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  323. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  324. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  325. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  326. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  327. package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  328. package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  329. package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  330. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  331. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  332. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  333. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  334. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  335. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  336. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  337. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  338. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  339. package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  340. package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  341. package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  342. package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  343. package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  344. package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  345. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  346. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  347. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  348. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  349. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  350. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  351. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  352. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  353. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  354. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  355. package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  356. package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  357. package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  358. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
  359. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
  360. package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  361. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  362. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  363. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  364. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
  365. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
  366. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  367. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  368. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
  369. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  370. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
  371. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  372. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  373. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
  374. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
  375. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
  376. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
  377. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
  378. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  379. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  380. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  381. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  382. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  383. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  384. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  385. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  386. package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  387. package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  388. package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
  389. package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  390. package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  391. package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  392. package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  393. package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  394. package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  395. package/cpp/ggml/src/ggml.c +7669 -0
  396. package/cpp/ggml/src/ggml.cpp +26 -0
  397. package/cpp/ggml/src/gguf.cpp +1699 -0
  398. package/cpp/include/llama-cpp.h +32 -0
  399. package/cpp/include/llama.h +1568 -0
  400. package/cpp/mtmd/CMakeLists.txt +98 -0
  401. package/cpp/mtmd/README.md +63 -0
  402. package/cpp/mtmd/clip-graph.h +117 -0
  403. package/cpp/mtmd/clip-impl.h +586 -0
  404. package/cpp/mtmd/clip-model.h +390 -0
  405. package/cpp/mtmd/clip.cpp +4154 -0
  406. package/cpp/mtmd/clip.h +121 -0
  407. package/cpp/mtmd/deprecation-warning.cpp +22 -0
  408. package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
  409. package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
  410. package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
  411. package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
  412. package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
  413. package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
  414. package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
  415. package/cpp/mtmd/models/cogvlm.cpp +98 -0
  416. package/cpp/mtmd/models/conformer.cpp +216 -0
  417. package/cpp/mtmd/models/glm4v.cpp +122 -0
  418. package/cpp/mtmd/models/internvl.cpp +69 -0
  419. package/cpp/mtmd/models/kimik25.cpp +101 -0
  420. package/cpp/mtmd/models/kimivl.cpp +63 -0
  421. package/cpp/mtmd/models/llama4.cpp +96 -0
  422. package/cpp/mtmd/models/llava.cpp +374 -0
  423. package/cpp/mtmd/models/minicpmv.cpp +114 -0
  424. package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
  425. package/cpp/mtmd/models/models.h +128 -0
  426. package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
  427. package/cpp/mtmd/models/paddleocr.cpp +52 -0
  428. package/cpp/mtmd/models/pixtral.cpp +86 -0
  429. package/cpp/mtmd/models/qwen2vl.cpp +183 -0
  430. package/cpp/mtmd/models/qwen3vl.cpp +193 -0
  431. package/cpp/mtmd/models/siglip.cpp +86 -0
  432. package/cpp/mtmd/models/whisper-enc.cpp +115 -0
  433. package/cpp/mtmd/models/youtuvl.cpp +179 -0
  434. package/cpp/mtmd/mtmd-audio.cpp +730 -0
  435. package/cpp/mtmd/mtmd-audio.h +113 -0
  436. package/cpp/mtmd/mtmd-cli.cpp +437 -0
  437. package/cpp/mtmd/mtmd-helper.cpp +521 -0
  438. package/cpp/mtmd/mtmd-helper.h +96 -0
  439. package/cpp/mtmd/mtmd.cpp +1156 -0
  440. package/cpp/mtmd/mtmd.h +319 -0
  441. package/cpp/mtmd/requirements.txt +5 -0
  442. package/cpp/mtmd/test-1.jpeg +0 -0
  443. package/cpp/mtmd/test-2.mp3 +0 -0
  444. package/cpp/mtmd/tests.sh +192 -0
  445. package/cpp/src/CMakeLists.txt +169 -0
  446. package/cpp/src/llama-adapter.cpp +488 -0
  447. package/cpp/src/llama-adapter.h +89 -0
  448. package/cpp/src/llama-arch.cpp +2855 -0
  449. package/cpp/src/llama-arch.h +619 -0
  450. package/cpp/src/llama-batch.cpp +917 -0
  451. package/cpp/src/llama-batch.h +173 -0
  452. package/cpp/src/llama-chat.cpp +896 -0
  453. package/cpp/src/llama-chat.h +71 -0
  454. package/cpp/src/llama-context.cpp +3512 -0
  455. package/cpp/src/llama-context.h +359 -0
  456. package/cpp/src/llama-cparams.cpp +5 -0
  457. package/cpp/src/llama-cparams.h +44 -0
  458. package/cpp/src/llama-grammar.cpp +1464 -0
  459. package/cpp/src/llama-grammar.h +194 -0
  460. package/cpp/src/llama-graph.cpp +2685 -0
  461. package/cpp/src/llama-graph.h +1026 -0
  462. package/cpp/src/llama-hparams.cpp +234 -0
  463. package/cpp/src/llama-hparams.h +339 -0
  464. package/cpp/src/llama-impl.cpp +171 -0
  465. package/cpp/src/llama-impl.h +73 -0
  466. package/cpp/src/llama-io.cpp +15 -0
  467. package/cpp/src/llama-io.h +35 -0
  468. package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
  469. package/cpp/src/llama-kv-cache-iswa.h +137 -0
  470. package/cpp/src/llama-kv-cache.cpp +2271 -0
  471. package/cpp/src/llama-kv-cache.h +388 -0
  472. package/cpp/src/llama-kv-cells.h +533 -0
  473. package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
  474. package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
  475. package/cpp/src/llama-memory-hybrid.cpp +268 -0
  476. package/cpp/src/llama-memory-hybrid.h +139 -0
  477. package/cpp/src/llama-memory-recurrent.cpp +1165 -0
  478. package/cpp/src/llama-memory-recurrent.h +182 -0
  479. package/cpp/src/llama-memory.cpp +59 -0
  480. package/cpp/src/llama-memory.h +122 -0
  481. package/cpp/src/llama-mmap.cpp +785 -0
  482. package/cpp/src/llama-mmap.h +92 -0
  483. package/cpp/src/llama-model-loader.cpp +1414 -0
  484. package/cpp/src/llama-model-loader.h +203 -0
  485. package/cpp/src/llama-model-saver.cpp +286 -0
  486. package/cpp/src/llama-model-saver.h +37 -0
  487. package/cpp/src/llama-model.cpp +9253 -0
  488. package/cpp/src/llama-model.h +576 -0
  489. package/cpp/src/llama-quant.cpp +1119 -0
  490. package/cpp/src/llama-quant.h +1 -0
  491. package/cpp/src/llama-sampler.cpp +3885 -0
  492. package/cpp/src/llama-sampler.h +42 -0
  493. package/cpp/src/llama-vocab.cpp +3970 -0
  494. package/cpp/src/llama-vocab.h +187 -0
  495. package/cpp/src/llama.cpp +1313 -0
  496. package/cpp/src/models/afmoe.cpp +191 -0
  497. package/cpp/src/models/apertus.cpp +125 -0
  498. package/cpp/src/models/arcee.cpp +135 -0
  499. package/cpp/src/models/arctic.cpp +138 -0
  500. package/cpp/src/models/arwkv7.cpp +86 -0
  501. package/cpp/src/models/baichuan.cpp +122 -0
  502. package/cpp/src/models/bailingmoe.cpp +144 -0
  503. package/cpp/src/models/bailingmoe2.cpp +135 -0
  504. package/cpp/src/models/bert.cpp +178 -0
  505. package/cpp/src/models/bitnet.cpp +160 -0
  506. package/cpp/src/models/bloom.cpp +101 -0
  507. package/cpp/src/models/chameleon.cpp +178 -0
  508. package/cpp/src/models/chatglm.cpp +132 -0
  509. package/cpp/src/models/codeshell.cpp +111 -0
  510. package/cpp/src/models/cogvlm.cpp +102 -0
  511. package/cpp/src/models/cohere2-iswa.cpp +134 -0
  512. package/cpp/src/models/command-r.cpp +122 -0
  513. package/cpp/src/models/dbrx.cpp +123 -0
  514. package/cpp/src/models/deci.cpp +135 -0
  515. package/cpp/src/models/deepseek.cpp +144 -0
  516. package/cpp/src/models/deepseek2.cpp +262 -0
  517. package/cpp/src/models/delta-net-base.cpp +376 -0
  518. package/cpp/src/models/dots1.cpp +134 -0
  519. package/cpp/src/models/dream.cpp +105 -0
  520. package/cpp/src/models/ernie4-5-moe.cpp +150 -0
  521. package/cpp/src/models/ernie4-5.cpp +110 -0
  522. package/cpp/src/models/eurobert.cpp +97 -0
  523. package/cpp/src/models/exaone-moe.cpp +146 -0
  524. package/cpp/src/models/exaone.cpp +114 -0
  525. package/cpp/src/models/exaone4.cpp +123 -0
  526. package/cpp/src/models/falcon-h1.cpp +111 -0
  527. package/cpp/src/models/falcon.cpp +120 -0
  528. package/cpp/src/models/gemma-embedding.cpp +116 -0
  529. package/cpp/src/models/gemma.cpp +112 -0
  530. package/cpp/src/models/gemma2-iswa.cpp +128 -0
  531. package/cpp/src/models/gemma3.cpp +155 -0
  532. package/cpp/src/models/gemma3n-iswa.cpp +384 -0
  533. package/cpp/src/models/glm4-moe.cpp +170 -0
  534. package/cpp/src/models/glm4.cpp +157 -0
  535. package/cpp/src/models/gpt2.cpp +105 -0
  536. package/cpp/src/models/gptneox.cpp +144 -0
  537. package/cpp/src/models/granite-hybrid.cpp +196 -0
  538. package/cpp/src/models/granite.cpp +211 -0
  539. package/cpp/src/models/grok.cpp +159 -0
  540. package/cpp/src/models/grovemoe.cpp +141 -0
  541. package/cpp/src/models/hunyuan-dense.cpp +132 -0
  542. package/cpp/src/models/hunyuan-moe.cpp +154 -0
  543. package/cpp/src/models/internlm2.cpp +120 -0
  544. package/cpp/src/models/jais.cpp +86 -0
  545. package/cpp/src/models/jais2.cpp +123 -0
  546. package/cpp/src/models/jamba.cpp +106 -0
  547. package/cpp/src/models/kimi-linear.cpp +392 -0
  548. package/cpp/src/models/lfm2.cpp +190 -0
  549. package/cpp/src/models/llada-moe.cpp +122 -0
  550. package/cpp/src/models/llada.cpp +99 -0
  551. package/cpp/src/models/llama-iswa.cpp +178 -0
  552. package/cpp/src/models/llama.cpp +168 -0
  553. package/cpp/src/models/maincoder.cpp +117 -0
  554. package/cpp/src/models/mamba-base.cpp +285 -0
  555. package/cpp/src/models/mamba.cpp +54 -0
  556. package/cpp/src/models/mimo2-iswa.cpp +123 -0
  557. package/cpp/src/models/minicpm3.cpp +200 -0
  558. package/cpp/src/models/minimax-m2.cpp +124 -0
  559. package/cpp/src/models/mistral3.cpp +160 -0
  560. package/cpp/src/models/models.h +684 -0
  561. package/cpp/src/models/modern-bert.cpp +109 -0
  562. package/cpp/src/models/mpt.cpp +126 -0
  563. package/cpp/src/models/nemotron-h.cpp +148 -0
  564. package/cpp/src/models/nemotron.cpp +122 -0
  565. package/cpp/src/models/neo-bert.cpp +104 -0
  566. package/cpp/src/models/olmo.cpp +121 -0
  567. package/cpp/src/models/olmo2.cpp +150 -0
  568. package/cpp/src/models/olmoe.cpp +124 -0
  569. package/cpp/src/models/openai-moe-iswa.cpp +127 -0
  570. package/cpp/src/models/openelm.cpp +124 -0
  571. package/cpp/src/models/orion.cpp +123 -0
  572. package/cpp/src/models/paddleocr.cpp +122 -0
  573. package/cpp/src/models/pangu-embedded.cpp +121 -0
  574. package/cpp/src/models/phi2.cpp +121 -0
  575. package/cpp/src/models/phi3.cpp +152 -0
  576. package/cpp/src/models/plamo.cpp +110 -0
  577. package/cpp/src/models/plamo2.cpp +318 -0
  578. package/cpp/src/models/plamo3.cpp +128 -0
  579. package/cpp/src/models/plm.cpp +169 -0
  580. package/cpp/src/models/qwen.cpp +108 -0
  581. package/cpp/src/models/qwen2.cpp +126 -0
  582. package/cpp/src/models/qwen2moe.cpp +151 -0
  583. package/cpp/src/models/qwen2vl.cpp +117 -0
  584. package/cpp/src/models/qwen3.cpp +117 -0
  585. package/cpp/src/models/qwen35.cpp +386 -0
  586. package/cpp/src/models/qwen35moe.cpp +420 -0
  587. package/cpp/src/models/qwen3moe.cpp +124 -0
  588. package/cpp/src/models/qwen3next.cpp +525 -0
  589. package/cpp/src/models/qwen3vl-moe.cpp +140 -0
  590. package/cpp/src/models/qwen3vl.cpp +132 -0
  591. package/cpp/src/models/refact.cpp +94 -0
  592. package/cpp/src/models/rnd1.cpp +126 -0
  593. package/cpp/src/models/rwkv6-base.cpp +164 -0
  594. package/cpp/src/models/rwkv6.cpp +94 -0
  595. package/cpp/src/models/rwkv6qwen2.cpp +86 -0
  596. package/cpp/src/models/rwkv7-base.cpp +137 -0
  597. package/cpp/src/models/rwkv7.cpp +90 -0
  598. package/cpp/src/models/seed-oss.cpp +124 -0
  599. package/cpp/src/models/smallthinker.cpp +126 -0
  600. package/cpp/src/models/smollm3.cpp +128 -0
  601. package/cpp/src/models/stablelm.cpp +146 -0
  602. package/cpp/src/models/starcoder.cpp +100 -0
  603. package/cpp/src/models/starcoder2.cpp +121 -0
  604. package/cpp/src/models/step35-iswa.cpp +168 -0
  605. package/cpp/src/models/t5-dec.cpp +166 -0
  606. package/cpp/src/models/t5-enc.cpp +96 -0
  607. package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
  608. package/cpp/src/models/xverse.cpp +108 -0
  609. package/cpp/src/unicode-data.cpp +7034 -0
  610. package/cpp/src/unicode-data.h +20 -0
  611. package/cpp/src/unicode.cpp +1103 -0
  612. package/cpp/src/unicode.h +111 -0
  613. package/cpp/vendor/nlohmann/json.hpp +25526 -0
  614. package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  615. package/cpp/vendor/stb/stb_image.h +7988 -0
  616. package/ios/LocalLLM-Bridging-Header.h +2 -0
  617. package/ios/LocalLLM.h +5 -0
  618. package/ios/LocalLLM.mm +1267 -0
  619. package/local-llm-rn.podspec +60 -0
  620. package/package.json +35 -0
  621. package/src/NativeLocalLLM.ts +73 -0
  622. package/src/device.ts +50 -0
  623. package/src/download-adapter.ts +17 -0
  624. package/src/index.ts +21 -0
  625. package/src/native-bridge.ts +142 -0
  626. package/src/rn-downloader.ts +37 -0
@@ -0,0 +1,254 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #define QK4_0 32
22
+ #define QR4_0 2
23
+ #define QK4_1 32
24
+ #define QR4_1 2
25
+ #define QK5_0 32
26
+ #define QR5_0 2
27
+ #define QK5_1 32
28
+ #define QR5_1 2
29
+ #define QK8_0 32
30
+ #define QR8_0 1
31
+ #define QK_K 256
32
+ #define K_QUANTS_PER_ITERATION 2
33
+
34
+ typedef char int8_t;
35
+ typedef uchar uint8_t;
36
+ typedef short int16_t;
37
+ typedef ushort uint16_t;
38
+ typedef int int32_t;
39
+ typedef uint uint32_t;
40
+
41
+ //------------------------------------------------------------------------------
42
+ // block_q4_0
43
+ //------------------------------------------------------------------------------
44
+ struct block_q4_0
45
+ {
46
+ half d;
47
+ uint8_t qs[QK4_0 / 2];
48
+ };
49
+
50
+ //
51
+ // This variant unrolls the loops and uses vector types instead of pointers.
52
+ // It improves performance on Adreno but not so much on Intel.
53
+ //
54
+ inline float block_q_4_0_dot_y_v(
55
+ global struct block_q4_0 * qb_curr,
56
+ float sumy,
57
+ float16 yl,
58
+ int il
59
+ ) {
60
+ float d = qb_curr->d;
61
+ float acc = 0.f;
62
+ global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);
63
+
64
+ acc += yl.s0 * (qs[0] & 0x000F);
65
+ acc += yl.s1 * (qs[0] & 0x0F00);
66
+ acc += yl.s8 * (qs[0] & 0x00F0);
67
+ acc += yl.s9 * (qs[0] & 0xF000);
68
+
69
+ acc += yl.s2 * (qs[1] & 0x000F);
70
+ acc += yl.s3 * (qs[1] & 0x0F00);
71
+ acc += yl.sa * (qs[1] & 0x00F0);
72
+ acc += yl.sb * (qs[1] & 0xF000);
73
+
74
+ acc += yl.s4 * (qs[2] & 0x000F);
75
+ acc += yl.s5 * (qs[2] & 0x0F00);
76
+ acc += yl.sc * (qs[2] & 0x00F0);
77
+ acc += yl.sd * (qs[2] & 0xF000);
78
+
79
+ acc += yl.s6 * (qs[3] & 0x000F);
80
+ acc += yl.s7 * (qs[3] & 0x0F00);
81
+ acc += yl.se * (qs[3] & 0x00F0);
82
+ acc += yl.sf * (qs[3] & 0xF000);
83
+
84
+ return d * (sumy * -8.f + acc);
85
+ }
86
+
87
+ #undef N_DST
88
+ #undef N_SIMDGROUP
89
+ #undef N_SIMDWIDTH
90
+
91
+ #ifdef INTEL_GPU
92
+ #define N_DST 4 // each SIMD group works on 4 rows
93
+ #define N_SIMDGROUP 1 // number of SIMD groups in a thread group
94
+ #define N_SIMDWIDTH 16 // assuming SIMD group size is 16
95
+ #elif defined (ADRENO_GPU)
96
+ #define N_DST 4
97
+ #define N_SIMDGROUP 1
98
+ #define N_SIMDWIDTH 64
99
+ #endif
100
+
101
+ inline void mul_vec_q_n_f32_v(
102
+ global void * src0,
103
+ global float * src1,
104
+ global float * dst,
105
+ int ne00,
106
+ int ne01,
107
+ int ne02,
108
+ int ne10,
109
+ int ne12,
110
+ int ne0,
111
+ int ne1,
112
+ int r2,
113
+ int r3
114
+ ) {
115
+ const ulong nb = ne00/QK4_0;
116
+
117
+ int r0 = get_group_id(0);
118
+ int r1 = get_group_id(1);
119
+ int im = get_group_id(2);
120
+
121
+ // (r0 * N_SIMDGROUP + get_sub_group_id()) is essenatially the linear global
122
+ // id of a SIMD group in the grid.
123
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
124
+
125
+ int i12 = im%ne12;
126
+ int i13 = im/ne12;
127
+
128
+ ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
129
+
130
+ global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
131
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
132
+
133
+ float16 yl; // src1 vector cache
134
+ float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
135
+
136
+ int ix = get_sub_group_local_id()/2;
137
+ int il = 8*(get_sub_group_local_id()%2);
138
+
139
+ global float * yb = y + ix * QK4_0 + il;
140
+
141
+ // each thread in a SIMD group deals with half a block.
142
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
143
+ float sumy = 0;
144
+
145
+ sumy += yb[0];
146
+ sumy += yb[1];
147
+ sumy += yb[2];
148
+ sumy += yb[3];
149
+ sumy += yb[4];
150
+ sumy += yb[5];
151
+ sumy += yb[6];
152
+ sumy += yb[7];
153
+
154
+ sumy += yb[16];
155
+ sumy += yb[17];
156
+ sumy += yb[18];
157
+ sumy += yb[19];
158
+ sumy += yb[20];
159
+ sumy += yb[21];
160
+ sumy += yb[22];
161
+ sumy += yb[23];
162
+
163
+
164
+ yl.s0 = yb[0];
165
+ yl.s1 = yb[1]/256.f;
166
+
167
+ yl.s2 = yb[2];
168
+ yl.s3 = yb[3]/256.f;
169
+
170
+ yl.s4 = yb[4];
171
+ yl.s5 = yb[5]/256.f;
172
+
173
+ yl.s6 = yb[6];
174
+ yl.s7 = yb[7]/256.f;
175
+
176
+ yl.s8 = yb[16]/16.f;
177
+ yl.s9 = yb[17]/4096.f;
178
+
179
+ yl.sa = yb[18]/16.f;
180
+ yl.sb = yb[19]/4096.f;
181
+
182
+ yl.sc = yb[20]/16.f;
183
+ yl.sd = yb[21]/4096.f;
184
+
185
+ yl.se = yb[22]/16.f;
186
+ yl.sf = yb[23]/4096.f;
187
+
188
+ sumf.s0 += block_q_4_0_dot_y_v(x+ib+0*nb, sumy, yl, il);
189
+ sumf.s1 += block_q_4_0_dot_y_v(x+ib+1*nb, sumy, yl, il);
190
+ sumf.s2 += block_q_4_0_dot_y_v(x+ib+2*nb, sumy, yl, il);
191
+ sumf.s3 += block_q_4_0_dot_y_v(x+ib+3*nb, sumy, yl, il);
192
+
193
+ // One thread in a SIMD group (i.e., subgroup) handles a half block,
194
+ // hence then entire SIMD group handles SIMDWIDTH/2 blocks.
195
+ // y points to the activation matrix (of type float). Therefore for
196
+ // one thread, the # of blocks y should advance is SIMDWIDTH/2 (because
197
+ // SIMDWIDTH/2 blocks are processed by a SIMD group) - in terms of
198
+ // floats, it is QK4_0 * (SIMDWIDTH/2), where QK4_0 is the block size.
199
+ yb += QK4_0 * (N_SIMDWIDTH/2);
200
+ }
201
+
202
+ // The above does not work for Adreno - it produces incorrect results for
203
+ // row = 1, 2, 3 and only row = 0 gives the correct result.
204
+ // If N_DST is changed, the below array must be initialized accordingly.
205
+ // This also seems to perform better on Intel.
206
+ float4 tot = (float4)(
207
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
208
+ sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
209
+ );
210
+
211
+ if (get_sub_group_local_id() == 0) {
212
+ if (first_row + 0 < ne01) {
213
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
214
+ }
215
+ if (first_row + 1 < ne01) {
216
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
217
+ }
218
+ if (first_row + 2 < ne01) {
219
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
220
+ }
221
+ if (first_row + 3 < ne01) {
222
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
223
+ }
224
+ }
225
+ }
226
+
227
+ #ifdef INTEL_GPU
228
+ REQD_SUBGROUP_SIZE_16
229
+ #elif defined (ADRENO_GPU)
230
+ REQD_SUBGROUP_SIZE_64
231
+ #endif
232
+ kernel void kernel_mul_mat_q4_0_f32_v(
233
+ global void * src0,
234
+ ulong offset0,
235
+ global float * src1,
236
+ ulong offset1,
237
+ global float * dst,
238
+ ulong offsetd,
239
+ int ne00,
240
+ int ne01,
241
+ int ne02,
242
+ int ne10,
243
+ int ne12,
244
+ int ne0,
245
+ int ne1,
246
+ int r2,
247
+ int r3
248
+ ) {
249
+ src0 = (global void*)((global char*)src0 + offset0);
250
+ src1 = (global float*)((global char*)src1 + offset1);
251
+ dst = (global float*)((global char*)dst + offsetd);
252
+
253
+ mul_vec_q_n_f32_v(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
254
+ }
@@ -0,0 +1,219 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #define QK4_1 32
22
+
23
+ struct block_q4_1 {
24
+ half d; // delta
25
+ half m; // min
26
+ uchar qs[QK4_1 / 2]; // nibbles / quants
27
+ };
28
+
29
+ inline float block_q4_1_dot_y(
30
+ global const struct block_q4_1 * qb_curr,
31
+ float sumy,
32
+ float16 yl,
33
+ int il
34
+ ) {
35
+ float d = qb_curr->d;
36
+ float m = qb_curr->m;
37
+
38
+ float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
39
+
40
+ global const ushort * qs = ((global const ushort *) qb_curr + 2 + il/2);
41
+
42
+ acc.s0 += yl.s0 * (qs[0] & 0x000F);
43
+ acc.s0 += yl.s1 * (qs[0] & 0x0F00);
44
+ acc.s0 += yl.s8 * (qs[0] & 0x00F0);
45
+ acc.s3 += yl.s9 * (qs[0] & 0xF000);
46
+
47
+ acc.s0 += yl.s2 * (qs[1] & 0x000F);
48
+ acc.s1 += yl.s3 * (qs[1] & 0x0F00);
49
+ acc.s2 += yl.sa * (qs[1] & 0x00F0);
50
+ acc.s3 += yl.sb * (qs[1] & 0xF000);
51
+
52
+ acc.s0 += yl.s4 * (qs[2] & 0x000F);
53
+ acc.s1 += yl.s5 * (qs[2] & 0x0F00);
54
+ acc.s2 += yl.sc * (qs[2] & 0x00F0);
55
+ acc.s3 += yl.sd * (qs[2] & 0xF000);
56
+
57
+ acc.s0 += yl.s6 * (qs[3] & 0x000F);
58
+ acc.s1 += yl.s7 * (qs[3] & 0x0F00);
59
+ acc.s2 += yl.se * (qs[3] & 0x00F0);
60
+ acc.s3 += yl.sf * (qs[3] & 0xF000);
61
+
62
+ return d * (acc.s0 + acc.s1 + acc.s2 + acc.s3) + sumy * m;
63
+ }
64
+
65
+ #undef N_DST
66
+ #undef N_SIMDGROUP
67
+ #undef N_SIMDWIDTH
68
+
69
+ #ifdef INTEL_GPU
70
+ #define N_DST 4 // each subgroup works on 4 rows
71
+ #define N_SIMDGROUP 1 // number of subgroups in a thread group
72
+ #define N_SIMDWIDTH 16 // assuming subgroup size is 16
73
+ #elif defined (ADRENO_GPU)
74
+ #define N_DST 4
75
+ #define N_SIMDGROUP 1
76
+ #define N_SIMDWIDTH 64
77
+ #endif
78
+
79
+ inline void mul_vec_q_n_f32(
80
+ global void * src0,
81
+ global float * src1,
82
+ global float * dst,
83
+ int ne00,
84
+ int ne01,
85
+ int ne02,
86
+ int ne10,
87
+ int ne12,
88
+ int ne0,
89
+ int ne1,
90
+ int r2,
91
+ int r3
92
+ ) {
93
+ const ulong nb = ne00/QK4_1;
94
+
95
+ int r0 = get_group_id(0);
96
+ int r1 = get_group_id(1);
97
+ int im = get_group_id(2);
98
+
99
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
100
+
101
+ int i12 = im%ne12;
102
+ int i13 = im/ne12;
103
+
104
+ ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
105
+
106
+ global struct block_q4_1 * x = (global struct block_q4_1 *) src0 + offset0;
107
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
108
+
109
+ float16 yl;
110
+ float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
111
+
112
+ int ix = get_sub_group_local_id()/2;
113
+ int il = 8*(get_sub_group_local_id()%2);
114
+
115
+ global float * yb = y + ix * QK4_1 + il;
116
+
117
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
118
+ float sumy = 0;
119
+
120
+ sumy += yb[0];
121
+ sumy += yb[1];
122
+ sumy += yb[2];
123
+ sumy += yb[3];
124
+ sumy += yb[4];
125
+ sumy += yb[5];
126
+ sumy += yb[6];
127
+ sumy += yb[7];
128
+
129
+ sumy += yb[16];
130
+ sumy += yb[17];
131
+ sumy += yb[18];
132
+ sumy += yb[19];
133
+ sumy += yb[20];
134
+ sumy += yb[21];
135
+ sumy += yb[22];
136
+ sumy += yb[23];
137
+
138
+
139
+ yl.s0 = yb[0];
140
+ yl.s1 = yb[1]/256.f;
141
+
142
+ yl.s2 = yb[2];
143
+ yl.s3 = yb[3]/256.f;
144
+
145
+ yl.s4 = yb[4];
146
+ yl.s5 = yb[5]/256.f;
147
+
148
+ yl.s6 = yb[6];
149
+ yl.s7 = yb[7]/256.f;
150
+
151
+ yl.s8 = yb[16]/16.f;
152
+ yl.s9 = yb[17]/4096.f;
153
+
154
+ yl.sa = yb[18]/16.f;
155
+ yl.sb = yb[19]/4096.f;
156
+
157
+ yl.sc = yb[20]/16.f;
158
+ yl.sd = yb[21]/4096.f;
159
+
160
+ yl.se = yb[22]/16.f;
161
+ yl.sf = yb[23]/4096.f;
162
+
163
+ sumf.s0 += block_q4_1_dot_y(x+ib+0*nb, sumy, yl, il);
164
+ sumf.s1 += block_q4_1_dot_y(x+ib+1*nb, sumy, yl, il);
165
+ sumf.s2 += block_q4_1_dot_y(x+ib+2*nb, sumy, yl, il);
166
+ sumf.s3 += block_q4_1_dot_y(x+ib+3*nb, sumy, yl, il);
167
+
168
+ yb += QK4_1 * (N_SIMDWIDTH/2);
169
+ }
170
+
171
+ float4 tot = (float4)(
172
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
173
+ sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
174
+ );
175
+
176
+ if (get_sub_group_local_id() == 0) {
177
+ if (first_row + 0 < ne01) {
178
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
179
+ }
180
+ if (first_row + 1 < ne01) {
181
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
182
+ }
183
+ if (first_row + 2 < ne01) {
184
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
185
+ }
186
+ if (first_row + 3 < ne01) {
187
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
188
+ }
189
+ }
190
+ }
191
+
192
+ #ifdef INTEL_GPU
193
+ REQD_SUBGROUP_SIZE_16
194
+ #elif defined (ADRENO_GPU)
195
+ REQD_SUBGROUP_SIZE_64
196
+ #endif
197
+ kernel void kernel_mul_mv_q4_1_f32(
198
+ global void * src0,
199
+ ulong offset0,
200
+ global float * src1,
201
+ ulong offset1,
202
+ global float * dst,
203
+ ulong offsetd,
204
+ int ne00,
205
+ int ne01,
206
+ int ne02,
207
+ int ne10,
208
+ int ne12,
209
+ int ne0,
210
+ int ne1,
211
+ int r2,
212
+ int r3
213
+ ) {
214
+ src0 = (global void*)((global char*)src0 + offset0);
215
+ src1 = (global float*)((global char*)src1 + offset1);
216
+ dst = (global float*)((global char*)dst + offsetd);
217
+
218
+ mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
219
+ }
@@ -0,0 +1,229 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ #ifdef cl_intel_subgroups
4
+ #pragma OPENCL EXTENSION cl_intel_subgroups : enable
5
+ #else
6
+ #pragma OPENCL EXTENSION cl_khr_subgroups : enable
7
+ #endif
8
+
9
+ #ifdef cl_intel_required_subgroup_size
10
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11
+ #define INTEL_GPU 1
12
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14
+ #elif defined(cl_qcom_reqd_sub_group_size)
15
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16
+ #define ADRENO_GPU 1
17
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19
+ #endif
20
+
21
+ #define QK4_1 32
22
+
23
+ struct block_q4_1 {
24
+ half d; // delta
25
+ half m; // min
26
+ uchar qs[QK4_1 / 2]; // nibbles / quants
27
+ };
28
+
29
+ inline float block_q4_1_dot_y_flat(
30
+ global const uchar * x,
31
+ global const half * dh,
32
+ global const half * mh,
33
+ float sumy,
34
+ float16 yl,
35
+ int il
36
+ ) {
37
+ float d = *dh;
38
+ float m = *mh;
39
+ global const ushort * qs = ((global const ushort *) x + il/2);
40
+
41
+ float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
42
+
43
+ acc.s0 += yl.s0 * (qs[0] & 0x000F);
44
+ acc.s0 += yl.s1 * (qs[0] & 0x0F00);
45
+ acc.s0 += yl.s8 * (qs[0] & 0x00F0);
46
+ acc.s3 += yl.s9 * (qs[0] & 0xF000);
47
+
48
+ acc.s0 += yl.s2 * (qs[1] & 0x000F);
49
+ acc.s1 += yl.s3 * (qs[1] & 0x0F00);
50
+ acc.s2 += yl.sa * (qs[1] & 0x00F0);
51
+ acc.s3 += yl.sb * (qs[1] & 0xF000);
52
+
53
+ acc.s0 += yl.s4 * (qs[2] & 0x000F);
54
+ acc.s1 += yl.s5 * (qs[2] & 0x0F00);
55
+ acc.s2 += yl.sc * (qs[2] & 0x00F0);
56
+ acc.s3 += yl.sd * (qs[2] & 0xF000);
57
+
58
+ acc.s0 += yl.s6 * (qs[3] & 0x000F);
59
+ acc.s1 += yl.s7 * (qs[3] & 0x0F00);
60
+ acc.s2 += yl.se * (qs[3] & 0x00F0);
61
+ acc.s3 += yl.sf * (qs[3] & 0xF000);
62
+
63
+ return d * (acc.s0 + acc.s1 + acc.s2 + acc.s3) + sumy * m;
64
+ }
65
+
66
+ #undef N_DST
67
+ #undef N_SIMDGROUP
68
+ #undef N_SIMDWIDTH
69
+
70
+ #ifdef INTEL_GPU
71
+ #define N_DST 4 // each subgroup works on 4 rows
72
+ #define N_SIMDGROUP 1 // number of subgroups in a thread group
73
+ #define N_SIMDWIDTH 16 // assuming subgroup size is 16
74
+ #elif defined (ADRENO_GPU)
75
+ #define N_DST 4
76
+ #define N_SIMDGROUP 1
77
+ #define N_SIMDWIDTH 64
78
+ #endif
79
+
80
+ inline void mul_vec_q_n_f32_flat(
81
+ global void * src0_q,
82
+ global void * src0_d,
83
+ global void * src0_m,
84
+ global float * src1,
85
+ global float * dst,
86
+ int ne00,
87
+ int ne01,
88
+ int ne02,
89
+ int ne10,
90
+ int ne12,
91
+ int ne0,
92
+ int ne1,
93
+ int r2,
94
+ int r3
95
+ ) {
96
+ const ulong nb = ne00/QK4_1;
97
+
98
+ int r0 = get_group_id(0);
99
+ int r1 = get_group_id(1);
100
+ int im = get_group_id(2);
101
+
102
+ int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
103
+
104
+ int i12 = im%ne12;
105
+ int i13 = im/ne12;
106
+
107
+ ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
108
+
109
+ // The number of scales/mins is the same as the number of blocks.
110
+ ulong offset0_dm = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02));
111
+ // Each block contains QK4_1/2 uchars, hence offset for qs is as follows.
112
+ ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_1/2;
113
+
114
+ global uchar * x = (global uchar *) src0_q + offset0_q;
115
+ global half * d = (global half *) src0_d + offset0_dm;
116
+ global half * m = (global half *) src0_m + offset0_dm;
117
+ global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
118
+
119
+ float16 yl;
120
+ float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
121
+
122
+ int ix = get_sub_group_local_id()/2;
123
+ int il = 8*(get_sub_group_local_id()%2);
124
+
125
+ global float * yb = y + ix * QK4_1 + il;
126
+
127
+ for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
128
+ float sumy = 0;
129
+
130
+ sumy += yb[0];
131
+ sumy += yb[1];
132
+ sumy += yb[2];
133
+ sumy += yb[3];
134
+ sumy += yb[4];
135
+ sumy += yb[5];
136
+ sumy += yb[6];
137
+ sumy += yb[7];
138
+
139
+ sumy += yb[16];
140
+ sumy += yb[17];
141
+ sumy += yb[18];
142
+ sumy += yb[19];
143
+ sumy += yb[20];
144
+ sumy += yb[21];
145
+ sumy += yb[22];
146
+ sumy += yb[23];
147
+
148
+
149
+ yl.s0 = yb[0];
150
+ yl.s1 = yb[1]/256.f;
151
+
152
+ yl.s2 = yb[2];
153
+ yl.s3 = yb[3]/256.f;
154
+
155
+ yl.s4 = yb[4];
156
+ yl.s5 = yb[5]/256.f;
157
+
158
+ yl.s6 = yb[6];
159
+ yl.s7 = yb[7]/256.f;
160
+
161
+ yl.s8 = yb[16]/16.f;
162
+ yl.s9 = yb[17]/4096.f;
163
+
164
+ yl.sa = yb[18]/16.f;
165
+ yl.sb = yb[19]/4096.f;
166
+
167
+ yl.sc = yb[20]/16.f;
168
+ yl.sd = yb[21]/4096.f;
169
+
170
+ yl.se = yb[22]/16.f;
171
+ yl.sf = yb[23]/4096.f;
172
+
173
+ sumf.s0 += block_q4_1_dot_y_flat(x + ib*QK4_1/2 + 0*nb*QK4_1/2, d + ib + 0*nb, m + ib + 0*nb, sumy, yl, il);
174
+ sumf.s1 += block_q4_1_dot_y_flat(x + ib*QK4_1/2 + 1*nb*QK4_1/2, d + ib + 1*nb, m + ib + 1*nb, sumy, yl, il);
175
+ sumf.s2 += block_q4_1_dot_y_flat(x + ib*QK4_1/2 + 2*nb*QK4_1/2, d + ib + 2*nb, m + ib + 2*nb, sumy, yl, il);
176
+ sumf.s3 += block_q4_1_dot_y_flat(x + ib*QK4_1/2 + 3*nb*QK4_1/2, d + ib + 3*nb, m + ib + 3*nb, sumy, yl, il);
177
+
178
+ yb += QK4_1 * (N_SIMDWIDTH/2);
179
+ }
180
+
181
+ float4 tot = (float4)(
182
+ sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
183
+ sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
184
+ );
185
+
186
+ if (get_sub_group_local_id() == 0) {
187
+ if (first_row + 0 < ne01) {
188
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
189
+ }
190
+ if (first_row + 1 < ne01) {
191
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
192
+ }
193
+ if (first_row + 2 < ne01) {
194
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
195
+ }
196
+ if (first_row + 3 < ne01) {
197
+ dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
198
+ }
199
+ }
200
+ }
201
+
202
+ #ifdef INTEL_GPU
203
+ REQD_SUBGROUP_SIZE_16
204
+ #elif defined (ADRENO_GPU)
205
+ REQD_SUBGROUP_SIZE_64
206
+ #endif
207
+ kernel void kernel_mul_mv_q4_1_f32_flat(
208
+ global void * src0_q,
209
+ global void * src0_d,
210
+ global void * src0_m,
211
+ global float * src1,
212
+ ulong offset1,
213
+ global float * dst,
214
+ ulong offsetd,
215
+ int ne00,
216
+ int ne01,
217
+ int ne02,
218
+ int ne10,
219
+ int ne12,
220
+ int ne0,
221
+ int ne1,
222
+ int r2,
223
+ int r3
224
+ ) {
225
+ src1 = (global float*)((global char*)src1 + offset1);
226
+ dst = (global float*)((global char*)dst + offsetd);
227
+
228
+ mul_vec_q_n_f32_flat(src0_q, src0_d, src0_m, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
229
+ }