local-llm-rn 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (626)
  1. package/cpp/CMakeLists.txt +285 -0
  2. package/cpp/common/CMakeLists.txt +149 -0
  3. package/cpp/common/arg.cpp +3799 -0
  4. package/cpp/common/arg.h +131 -0
  5. package/cpp/common/base64.hpp +392 -0
  6. package/cpp/common/build-info.cpp.in +4 -0
  7. package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  8. package/cpp/common/chat-parser-xml-toolcall.h +45 -0
  9. package/cpp/common/chat-parser.cpp +1649 -0
  10. package/cpp/common/chat-parser.h +133 -0
  11. package/cpp/common/chat-peg-parser.cpp +124 -0
  12. package/cpp/common/chat-peg-parser.h +105 -0
  13. package/cpp/common/chat.cpp +3355 -0
  14. package/cpp/common/chat.h +252 -0
  15. package/cpp/common/common.cpp +1824 -0
  16. package/cpp/common/common.h +930 -0
  17. package/cpp/common/console.cpp +1137 -0
  18. package/cpp/common/console.h +41 -0
  19. package/cpp/common/debug.cpp +167 -0
  20. package/cpp/common/debug.h +43 -0
  21. package/cpp/common/download.cpp +792 -0
  22. package/cpp/common/download.h +84 -0
  23. package/cpp/common/http.h +84 -0
  24. package/cpp/common/jinja/README.md +88 -0
  25. package/cpp/common/jinja/caps.cpp +285 -0
  26. package/cpp/common/jinja/caps.h +30 -0
  27. package/cpp/common/jinja/lexer.cpp +341 -0
  28. package/cpp/common/jinja/lexer.h +157 -0
  29. package/cpp/common/jinja/parser.cpp +591 -0
  30. package/cpp/common/jinja/parser.h +21 -0
  31. package/cpp/common/jinja/runtime.cpp +867 -0
  32. package/cpp/common/jinja/runtime.h +638 -0
  33. package/cpp/common/jinja/string.cpp +213 -0
  34. package/cpp/common/jinja/string.h +61 -0
  35. package/cpp/common/jinja/utils.h +149 -0
  36. package/cpp/common/jinja/value.cpp +1393 -0
  37. package/cpp/common/jinja/value.h +756 -0
  38. package/cpp/common/json-partial.cpp +324 -0
  39. package/cpp/common/json-partial.h +39 -0
  40. package/cpp/common/json-schema-to-grammar.cpp +1153 -0
  41. package/cpp/common/json-schema-to-grammar.h +43 -0
  42. package/cpp/common/llguidance.cpp +258 -0
  43. package/cpp/common/log.cpp +446 -0
  44. package/cpp/common/log.h +119 -0
  45. package/cpp/common/ngram-cache.cpp +285 -0
  46. package/cpp/common/ngram-cache.h +101 -0
  47. package/cpp/common/ngram-map.cpp +530 -0
  48. package/cpp/common/ngram-map.h +115 -0
  49. package/cpp/common/ngram-mod.cpp +60 -0
  50. package/cpp/common/ngram-mod.h +38 -0
  51. package/cpp/common/peg-parser.cpp +1712 -0
  52. package/cpp/common/peg-parser.h +459 -0
  53. package/cpp/common/preset.cpp +483 -0
  54. package/cpp/common/preset.h +83 -0
  55. package/cpp/common/regex-partial.cpp +204 -0
  56. package/cpp/common/regex-partial.h +56 -0
  57. package/cpp/common/sampling.cpp +745 -0
  58. package/cpp/common/sampling.h +119 -0
  59. package/cpp/common/speculative.cpp +1074 -0
  60. package/cpp/common/speculative.h +41 -0
  61. package/cpp/common/unicode.cpp +64 -0
  62. package/cpp/common/unicode.h +22 -0
  63. package/cpp/ggml/CMakeLists.txt +494 -0
  64. package/cpp/ggml/cmake/GitVars.cmake +22 -0
  65. package/cpp/ggml/cmake/common.cmake +50 -0
  66. package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
  67. package/cpp/ggml/include/ggml-alloc.h +85 -0
  68. package/cpp/ggml/include/ggml-backend.h +373 -0
  69. package/cpp/ggml/include/ggml-blas.h +25 -0
  70. package/cpp/ggml/include/ggml-cann.h +123 -0
  71. package/cpp/ggml/include/ggml-cpp.h +39 -0
  72. package/cpp/ggml/include/ggml-cpu.h +151 -0
  73. package/cpp/ggml/include/ggml-cuda.h +47 -0
  74. package/cpp/ggml/include/ggml-hexagon.h +19 -0
  75. package/cpp/ggml/include/ggml-metal.h +61 -0
  76. package/cpp/ggml/include/ggml-opencl.h +26 -0
  77. package/cpp/ggml/include/ggml-opt.h +256 -0
  78. package/cpp/ggml/include/ggml-rpc.h +30 -0
  79. package/cpp/ggml/include/ggml-sycl.h +49 -0
  80. package/cpp/ggml/include/ggml-virtgpu.h +14 -0
  81. package/cpp/ggml/include/ggml-vulkan.h +29 -0
  82. package/cpp/ggml/include/ggml-webgpu.h +19 -0
  83. package/cpp/ggml/include/ggml-zdnn.h +17 -0
  84. package/cpp/ggml/include/ggml-zendnn.h +22 -0
  85. package/cpp/ggml/include/ggml.h +2753 -0
  86. package/cpp/ggml/include/gguf.h +204 -0
  87. package/cpp/ggml/src/CMakeLists.txt +492 -0
  88. package/cpp/ggml/src/ggml-alloc.c +1244 -0
  89. package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
  90. package/cpp/ggml/src/ggml-backend-dl.h +45 -0
  91. package/cpp/ggml/src/ggml-backend-impl.h +255 -0
  92. package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
  93. package/cpp/ggml/src/ggml-backend.cpp +2270 -0
  94. package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  95. package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
  96. package/cpp/ggml/src/ggml-common.h +1878 -0
  97. package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
  98. package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
  99. package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  100. package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  101. package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  102. package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  103. package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  104. package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
  105. package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
  106. package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
  107. package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  108. package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  109. package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  110. package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
  111. package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  112. package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  113. package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  114. package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  115. package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  116. package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  117. package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  118. package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
  119. package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  120. package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  121. package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  122. package/cpp/ggml/src/ggml-cpu/common.h +95 -0
  123. package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
  124. package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
  125. package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
  126. package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  127. package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  128. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
  129. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  130. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
  131. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  132. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
  133. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  134. package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
  135. package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
  136. package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
  137. package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
  138. package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
  139. package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
  140. package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  141. package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
  142. package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  143. package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  144. package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  145. package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  146. package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  147. package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
  148. package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  149. package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
  150. package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
  151. package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
  152. package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  153. package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
  154. package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
  155. package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
  156. package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  157. package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
  158. package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  159. package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
  160. package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
  161. package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
  162. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  163. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  164. package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  165. package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  166. package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  167. package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  168. package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
  169. package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
  170. package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  171. package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
  172. package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
  173. package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  174. package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
  175. package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  176. package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  177. package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  178. package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
  179. package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
  180. package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  181. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  182. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  183. package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  184. package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
  185. package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
  186. package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
  187. package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
  188. package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
  189. package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
  190. package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
  191. package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
  192. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  193. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  194. package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  195. package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  196. package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
  197. package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  198. package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  199. package/cpp/ggml/src/ggml-impl.h +724 -0
  200. package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  201. package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  202. package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  203. package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  204. package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
  205. package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
  206. package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
  207. package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
  208. package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
  209. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
  210. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  211. package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
  212. package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
  213. package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
  214. package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  215. package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  216. package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
  217. package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
  218. package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  219. package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  220. package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  221. package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  222. package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  223. package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  224. package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  225. package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  226. package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
  227. package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  228. package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  229. package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  230. package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  231. package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  232. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  233. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  234. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  235. package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  236. package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  237. package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  238. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  239. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  240. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  241. package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  242. package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  243. package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  244. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  245. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  246. package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  247. package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  248. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  249. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  250. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  251. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  252. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  253. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  254. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  255. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  256. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  257. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  258. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  259. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  260. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  261. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  262. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  263. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  264. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  265. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  266. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  267. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  268. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  269. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  270. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  271. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  272. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  273. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  274. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  275. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  276. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  277. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  278. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  279. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  280. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  281. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  282. package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  283. package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  284. package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  285. package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  286. package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  287. package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  288. package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  289. package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  290. package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  291. package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  292. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  293. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  294. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  295. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  296. package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  297. package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  298. package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  299. package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  300. package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  301. package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  302. package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  303. package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  304. package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
  305. package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  306. package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  307. package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  308. package/cpp/ggml/src/ggml-opt.cpp +1093 -0
  309. package/cpp/ggml/src/ggml-quants.c +5325 -0
  310. package/cpp/ggml/src/ggml-quants.h +106 -0
  311. package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  312. package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
  313. package/cpp/ggml/src/ggml-threading.cpp +12 -0
  314. package/cpp/ggml/src/ggml-threading.h +14 -0
  315. package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  316. package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  317. package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  318. package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  319. package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  320. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  321. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  322. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  323. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  324. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  325. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  326. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  327. package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  328. package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  329. package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  330. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  331. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  332. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  333. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  334. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  335. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  336. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  337. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  338. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  339. package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  340. package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  341. package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  342. package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  343. package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  344. package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  345. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  346. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  347. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  348. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  349. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  350. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  351. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  352. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  353. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  354. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  355. package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  356. package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  357. package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  358. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
  359. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
  360. package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  361. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  362. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  363. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  364. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
  365. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
  366. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  367. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  368. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
  369. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  370. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
  371. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  372. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  373. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
  374. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
  375. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
  376. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
  377. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
  378. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  379. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  380. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  381. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  382. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  383. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  384. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  385. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  386. package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  387. package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  388. package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
  389. package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  390. package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  391. package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  392. package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  393. package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  394. package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  395. package/cpp/ggml/src/ggml.c +7669 -0
  396. package/cpp/ggml/src/ggml.cpp +26 -0
  397. package/cpp/ggml/src/gguf.cpp +1699 -0
  398. package/cpp/include/llama-cpp.h +32 -0
  399. package/cpp/include/llama.h +1568 -0
  400. package/cpp/mtmd/CMakeLists.txt +98 -0
  401. package/cpp/mtmd/README.md +63 -0
  402. package/cpp/mtmd/clip-graph.h +117 -0
  403. package/cpp/mtmd/clip-impl.h +586 -0
  404. package/cpp/mtmd/clip-model.h +390 -0
  405. package/cpp/mtmd/clip.cpp +4154 -0
  406. package/cpp/mtmd/clip.h +121 -0
  407. package/cpp/mtmd/deprecation-warning.cpp +22 -0
  408. package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
  409. package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
  410. package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
  411. package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
  412. package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
  413. package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
  414. package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
  415. package/cpp/mtmd/models/cogvlm.cpp +98 -0
  416. package/cpp/mtmd/models/conformer.cpp +216 -0
  417. package/cpp/mtmd/models/glm4v.cpp +122 -0
  418. package/cpp/mtmd/models/internvl.cpp +69 -0
  419. package/cpp/mtmd/models/kimik25.cpp +101 -0
  420. package/cpp/mtmd/models/kimivl.cpp +63 -0
  421. package/cpp/mtmd/models/llama4.cpp +96 -0
  422. package/cpp/mtmd/models/llava.cpp +374 -0
  423. package/cpp/mtmd/models/minicpmv.cpp +114 -0
  424. package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
  425. package/cpp/mtmd/models/models.h +128 -0
  426. package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
  427. package/cpp/mtmd/models/paddleocr.cpp +52 -0
  428. package/cpp/mtmd/models/pixtral.cpp +86 -0
  429. package/cpp/mtmd/models/qwen2vl.cpp +183 -0
  430. package/cpp/mtmd/models/qwen3vl.cpp +193 -0
  431. package/cpp/mtmd/models/siglip.cpp +86 -0
  432. package/cpp/mtmd/models/whisper-enc.cpp +115 -0
  433. package/cpp/mtmd/models/youtuvl.cpp +179 -0
  434. package/cpp/mtmd/mtmd-audio.cpp +730 -0
  435. package/cpp/mtmd/mtmd-audio.h +113 -0
  436. package/cpp/mtmd/mtmd-cli.cpp +437 -0
  437. package/cpp/mtmd/mtmd-helper.cpp +521 -0
  438. package/cpp/mtmd/mtmd-helper.h +96 -0
  439. package/cpp/mtmd/mtmd.cpp +1156 -0
  440. package/cpp/mtmd/mtmd.h +319 -0
  441. package/cpp/mtmd/requirements.txt +5 -0
  442. package/cpp/mtmd/test-1.jpeg +0 -0
  443. package/cpp/mtmd/test-2.mp3 +0 -0
  444. package/cpp/mtmd/tests.sh +192 -0
  445. package/cpp/src/CMakeLists.txt +169 -0
  446. package/cpp/src/llama-adapter.cpp +488 -0
  447. package/cpp/src/llama-adapter.h +89 -0
  448. package/cpp/src/llama-arch.cpp +2855 -0
  449. package/cpp/src/llama-arch.h +619 -0
  450. package/cpp/src/llama-batch.cpp +917 -0
  451. package/cpp/src/llama-batch.h +173 -0
  452. package/cpp/src/llama-chat.cpp +896 -0
  453. package/cpp/src/llama-chat.h +71 -0
  454. package/cpp/src/llama-context.cpp +3512 -0
  455. package/cpp/src/llama-context.h +359 -0
  456. package/cpp/src/llama-cparams.cpp +5 -0
  457. package/cpp/src/llama-cparams.h +44 -0
  458. package/cpp/src/llama-grammar.cpp +1464 -0
  459. package/cpp/src/llama-grammar.h +194 -0
  460. package/cpp/src/llama-graph.cpp +2685 -0
  461. package/cpp/src/llama-graph.h +1026 -0
  462. package/cpp/src/llama-hparams.cpp +234 -0
  463. package/cpp/src/llama-hparams.h +339 -0
  464. package/cpp/src/llama-impl.cpp +171 -0
  465. package/cpp/src/llama-impl.h +73 -0
  466. package/cpp/src/llama-io.cpp +15 -0
  467. package/cpp/src/llama-io.h +35 -0
  468. package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
  469. package/cpp/src/llama-kv-cache-iswa.h +137 -0
  470. package/cpp/src/llama-kv-cache.cpp +2271 -0
  471. package/cpp/src/llama-kv-cache.h +388 -0
  472. package/cpp/src/llama-kv-cells.h +533 -0
  473. package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
  474. package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
  475. package/cpp/src/llama-memory-hybrid.cpp +268 -0
  476. package/cpp/src/llama-memory-hybrid.h +139 -0
  477. package/cpp/src/llama-memory-recurrent.cpp +1165 -0
  478. package/cpp/src/llama-memory-recurrent.h +182 -0
  479. package/cpp/src/llama-memory.cpp +59 -0
  480. package/cpp/src/llama-memory.h +122 -0
  481. package/cpp/src/llama-mmap.cpp +785 -0
  482. package/cpp/src/llama-mmap.h +92 -0
  483. package/cpp/src/llama-model-loader.cpp +1414 -0
  484. package/cpp/src/llama-model-loader.h +203 -0
  485. package/cpp/src/llama-model-saver.cpp +286 -0
  486. package/cpp/src/llama-model-saver.h +37 -0
  487. package/cpp/src/llama-model.cpp +9253 -0
  488. package/cpp/src/llama-model.h +576 -0
  489. package/cpp/src/llama-quant.cpp +1119 -0
  490. package/cpp/src/llama-quant.h +1 -0
  491. package/cpp/src/llama-sampler.cpp +3885 -0
  492. package/cpp/src/llama-sampler.h +42 -0
  493. package/cpp/src/llama-vocab.cpp +3970 -0
  494. package/cpp/src/llama-vocab.h +187 -0
  495. package/cpp/src/llama.cpp +1313 -0
  496. package/cpp/src/models/afmoe.cpp +191 -0
  497. package/cpp/src/models/apertus.cpp +125 -0
  498. package/cpp/src/models/arcee.cpp +135 -0
  499. package/cpp/src/models/arctic.cpp +138 -0
  500. package/cpp/src/models/arwkv7.cpp +86 -0
  501. package/cpp/src/models/baichuan.cpp +122 -0
  502. package/cpp/src/models/bailingmoe.cpp +144 -0
  503. package/cpp/src/models/bailingmoe2.cpp +135 -0
  504. package/cpp/src/models/bert.cpp +178 -0
  505. package/cpp/src/models/bitnet.cpp +160 -0
  506. package/cpp/src/models/bloom.cpp +101 -0
  507. package/cpp/src/models/chameleon.cpp +178 -0
  508. package/cpp/src/models/chatglm.cpp +132 -0
  509. package/cpp/src/models/codeshell.cpp +111 -0
  510. package/cpp/src/models/cogvlm.cpp +102 -0
  511. package/cpp/src/models/cohere2-iswa.cpp +134 -0
  512. package/cpp/src/models/command-r.cpp +122 -0
  513. package/cpp/src/models/dbrx.cpp +123 -0
  514. package/cpp/src/models/deci.cpp +135 -0
  515. package/cpp/src/models/deepseek.cpp +144 -0
  516. package/cpp/src/models/deepseek2.cpp +262 -0
  517. package/cpp/src/models/delta-net-base.cpp +376 -0
  518. package/cpp/src/models/dots1.cpp +134 -0
  519. package/cpp/src/models/dream.cpp +105 -0
  520. package/cpp/src/models/ernie4-5-moe.cpp +150 -0
  521. package/cpp/src/models/ernie4-5.cpp +110 -0
  522. package/cpp/src/models/eurobert.cpp +97 -0
  523. package/cpp/src/models/exaone-moe.cpp +146 -0
  524. package/cpp/src/models/exaone.cpp +114 -0
  525. package/cpp/src/models/exaone4.cpp +123 -0
  526. package/cpp/src/models/falcon-h1.cpp +111 -0
  527. package/cpp/src/models/falcon.cpp +120 -0
  528. package/cpp/src/models/gemma-embedding.cpp +116 -0
  529. package/cpp/src/models/gemma.cpp +112 -0
  530. package/cpp/src/models/gemma2-iswa.cpp +128 -0
  531. package/cpp/src/models/gemma3.cpp +155 -0
  532. package/cpp/src/models/gemma3n-iswa.cpp +384 -0
  533. package/cpp/src/models/glm4-moe.cpp +170 -0
  534. package/cpp/src/models/glm4.cpp +157 -0
  535. package/cpp/src/models/gpt2.cpp +105 -0
  536. package/cpp/src/models/gptneox.cpp +144 -0
  537. package/cpp/src/models/granite-hybrid.cpp +196 -0
  538. package/cpp/src/models/granite.cpp +211 -0
  539. package/cpp/src/models/grok.cpp +159 -0
  540. package/cpp/src/models/grovemoe.cpp +141 -0
  541. package/cpp/src/models/hunyuan-dense.cpp +132 -0
  542. package/cpp/src/models/hunyuan-moe.cpp +154 -0
  543. package/cpp/src/models/internlm2.cpp +120 -0
  544. package/cpp/src/models/jais.cpp +86 -0
  545. package/cpp/src/models/jais2.cpp +123 -0
  546. package/cpp/src/models/jamba.cpp +106 -0
  547. package/cpp/src/models/kimi-linear.cpp +392 -0
  548. package/cpp/src/models/lfm2.cpp +190 -0
  549. package/cpp/src/models/llada-moe.cpp +122 -0
  550. package/cpp/src/models/llada.cpp +99 -0
  551. package/cpp/src/models/llama-iswa.cpp +178 -0
  552. package/cpp/src/models/llama.cpp +168 -0
  553. package/cpp/src/models/maincoder.cpp +117 -0
  554. package/cpp/src/models/mamba-base.cpp +285 -0
  555. package/cpp/src/models/mamba.cpp +54 -0
  556. package/cpp/src/models/mimo2-iswa.cpp +123 -0
  557. package/cpp/src/models/minicpm3.cpp +200 -0
  558. package/cpp/src/models/minimax-m2.cpp +124 -0
  559. package/cpp/src/models/mistral3.cpp +160 -0
  560. package/cpp/src/models/models.h +684 -0
  561. package/cpp/src/models/modern-bert.cpp +109 -0
  562. package/cpp/src/models/mpt.cpp +126 -0
  563. package/cpp/src/models/nemotron-h.cpp +148 -0
  564. package/cpp/src/models/nemotron.cpp +122 -0
  565. package/cpp/src/models/neo-bert.cpp +104 -0
  566. package/cpp/src/models/olmo.cpp +121 -0
  567. package/cpp/src/models/olmo2.cpp +150 -0
  568. package/cpp/src/models/olmoe.cpp +124 -0
  569. package/cpp/src/models/openai-moe-iswa.cpp +127 -0
  570. package/cpp/src/models/openelm.cpp +124 -0
  571. package/cpp/src/models/orion.cpp +123 -0
  572. package/cpp/src/models/paddleocr.cpp +122 -0
  573. package/cpp/src/models/pangu-embedded.cpp +121 -0
  574. package/cpp/src/models/phi2.cpp +121 -0
  575. package/cpp/src/models/phi3.cpp +152 -0
  576. package/cpp/src/models/plamo.cpp +110 -0
  577. package/cpp/src/models/plamo2.cpp +318 -0
  578. package/cpp/src/models/plamo3.cpp +128 -0
  579. package/cpp/src/models/plm.cpp +169 -0
  580. package/cpp/src/models/qwen.cpp +108 -0
  581. package/cpp/src/models/qwen2.cpp +126 -0
  582. package/cpp/src/models/qwen2moe.cpp +151 -0
  583. package/cpp/src/models/qwen2vl.cpp +117 -0
  584. package/cpp/src/models/qwen3.cpp +117 -0
  585. package/cpp/src/models/qwen35.cpp +386 -0
  586. package/cpp/src/models/qwen35moe.cpp +420 -0
  587. package/cpp/src/models/qwen3moe.cpp +124 -0
  588. package/cpp/src/models/qwen3next.cpp +525 -0
  589. package/cpp/src/models/qwen3vl-moe.cpp +140 -0
  590. package/cpp/src/models/qwen3vl.cpp +132 -0
  591. package/cpp/src/models/refact.cpp +94 -0
  592. package/cpp/src/models/rnd1.cpp +126 -0
  593. package/cpp/src/models/rwkv6-base.cpp +164 -0
  594. package/cpp/src/models/rwkv6.cpp +94 -0
  595. package/cpp/src/models/rwkv6qwen2.cpp +86 -0
  596. package/cpp/src/models/rwkv7-base.cpp +137 -0
  597. package/cpp/src/models/rwkv7.cpp +90 -0
  598. package/cpp/src/models/seed-oss.cpp +124 -0
  599. package/cpp/src/models/smallthinker.cpp +126 -0
  600. package/cpp/src/models/smollm3.cpp +128 -0
  601. package/cpp/src/models/stablelm.cpp +146 -0
  602. package/cpp/src/models/starcoder.cpp +100 -0
  603. package/cpp/src/models/starcoder2.cpp +121 -0
  604. package/cpp/src/models/step35-iswa.cpp +168 -0
  605. package/cpp/src/models/t5-dec.cpp +166 -0
  606. package/cpp/src/models/t5-enc.cpp +96 -0
  607. package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
  608. package/cpp/src/models/xverse.cpp +108 -0
  609. package/cpp/src/unicode-data.cpp +7034 -0
  610. package/cpp/src/unicode-data.h +20 -0
  611. package/cpp/src/unicode.cpp +1103 -0
  612. package/cpp/src/unicode.h +111 -0
  613. package/cpp/vendor/nlohmann/json.hpp +25526 -0
  614. package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  615. package/cpp/vendor/stb/stb_image.h +7988 -0
  616. package/ios/LocalLLM-Bridging-Header.h +2 -0
  617. package/ios/LocalLLM.h +5 -0
  618. package/ios/LocalLLM.mm +1267 -0
  619. package/local-llm-rn.podspec +60 -0
  620. package/package.json +35 -0
  621. package/src/NativeLocalLLM.ts +73 -0
  622. package/src/device.ts +50 -0
  623. package/src/download-adapter.ts +17 -0
  624. package/src/index.ts +21 -0
  625. package/src/native-bridge.ts +142 -0
  626. package/src/rn-downloader.ts +37 -0
@@ -0,0 +1,702 @@
1
+ #import "ggml-metal-context.h"
2
+
3
+ #import "ggml-impl.h"
4
+ #import "ggml-backend-impl.h"
5
+
6
+ #import "ggml-metal-impl.h"
7
+ #import "ggml-metal-common.h"
8
+ #import "ggml-metal-ops.h"
9
+
10
+ #import <Foundation/Foundation.h>
11
+
12
+ #import <Metal/Metal.h>
13
+
14
// drop any previously defined MIN/MAX and use plain local versions instead
// note: these evaluate their arguments more than once - do not pass expressions with side effects
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// upper bound on the number of MTLCommandBuffer objects used to submit one graph for processing
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
21
+
22
// one slot of the per-graph command buffer table
struct ggml_metal_command_buffer {
    id<MTLCommandBuffer> obj; // nil while the slot is unused
};
25
+
26
// state of one Metal backend context (created by ggml_metal_init, destroyed by ggml_metal_free)
struct ggml_metal {
    char name[128]; // copied from the device properties

    ggml_metal_device_t  dev;
    ggml_metal_library_t lib;

    // event signaled by asynchronous tensor copies
    ggml_metal_event_t ev_cpy;

    // concurrent dispatch queue on which the encode callback is run
    dispatch_queue_t d_queue;

    // pipelines that are compiled at inference time, in addition to the library ones
    ggml_metal_pipelines_t pipelines_ext;

    // feature toggles (see the GGML_METAL_*_DISABLE environment variables)
    bool use_fusion;
    bool use_concurrency;
    bool use_graph_optimize;

    // debug levels (see GGML_METAL_GRAPH_DEBUG and GGML_METAL_FUSION_DEBUG)
    int debug_graph;
    int debug_fusion;

    // per-op counter of how many times the op was fused
    uint64_t fuse_cnt[GGML_OP_COUNT];

    // GPU capture state
    bool capture_next_compute; // capture was requested for the next graph compute
    bool capture_started;      // a capture is currently in progress

    id<MTLCaptureScope> capture_scope;

    // how the graph nodes are split between the command buffers
    int n_cb;           // number of extra threads used to submit the command buffers
    int n_nodes_0;      // number of nodes submitted by the main thread
    int n_nodes_1;      // remaining number of nodes submitted by the n_cb threads
    int n_nodes_per_cb; // number of nodes handled by each of the n_cb threads

    // the graph that is currently being encoded
    struct ggml_cgraph * gf;

    // the encode callback that is handed to the thread pool
    void (^encode_async)(size_t ith);

    // slots [0, n_cb) belong to the extra threads, slot [n_cb] belongs to the main thread
    struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1];

    // command buffers for auxiliary work: getting, setting and copying tensors, events
    NSMutableArray * cmd_bufs_ext;

    // most recent command buffer queued by this backend - waited on when synchronizing
    id<MTLCommandBuffer> cmd_buf_last;

    // when set and returning true, ggml_metal_graph_compute is aborted
    ggml_abort_callback abort_callback;
    void *              abort_callback_data;
};
79
+
80
// Creates a Metal backend context bound to the device `dev`.
//
// The context uses the command queue of the device and the device library. If the device has no
// library, one is compiled on the fly. The feature toggles and debug levels are read from the
// GGML_METAL_* environment variables.
//
// Returns the new context, or NULL if the allocation fails, the device has no command queue or
// the Metal library cannot be initialized. The context must be destroyed with ggml_metal_free().
ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
    GGML_LOG_INFO("%s: allocating\n", __func__);

#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
    // Show all the Metal device instances in the system
    NSArray * devices = MTLCopyAllDevices();
    for (id<MTLDevice> device in devices) {
        GGML_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
    }
    [devices release]; // since it was created by a *Copy* C method
#endif

    // init context (zero-initialized, so every field that is not set below starts as 0/nil/false)
    ggml_metal_t res = calloc(1, sizeof(struct ggml_metal));
    if (res == NULL) {
        GGML_LOG_ERROR("%s: error: failed to allocate the context\n", __func__);
        return NULL;
    }

    id<MTLDevice> device = ggml_metal_device_get_obj(dev);

    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

    // TODO: would it be better to have one queue for the backend and one queue for the device?
    //       the graph encoders and async ops would use the backend queue while the sync ops would use the device queue?
    //res->queue = [device newCommandQueue]; [TAG_QUEUE_PER_BACKEND]
    id<MTLCommandQueue> queue = ggml_metal_device_get_queue(dev);
    if (queue == nil) {
        GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);

        // do not leak the partially initialized context
        free(res);

        return NULL;
    }

    res->dev = dev;
    res->lib = ggml_metal_device_get_library(dev);
    if (res->lib == NULL) {
        GGML_LOG_WARN("%s: the device does not have a precompiled Metal library - this is unexpected\n", __func__);
        GGML_LOG_WARN("%s: will try to compile it on the fly\n", __func__);

        res->lib = ggml_metal_library_init(dev);
        if (res->lib == NULL) {
            GGML_LOG_ERROR("%s: error: failed to initialize the Metal library\n", __func__);

            free(res);

            return NULL;
        }
    }

    res->ev_cpy = ggml_metal_device_event_init(dev);

    const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);

    snprintf(res->name, sizeof(res->name), "%s", props_dev->name);

    res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

    // features are enabled unless the corresponding *_DISABLE variable is present (its value is ignored)
    res->use_fusion      = getenv("GGML_METAL_FUSION_DISABLE") == NULL;
    res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == NULL;

    {
        const char * val = getenv("GGML_METAL_GRAPH_DEBUG");
        res->debug_graph = val ? atoi(val) : 0;
    }

    {
        const char * val = getenv("GGML_METAL_FUSION_DEBUG");
        res->debug_fusion = val ? atoi(val) : 0;
    }

    res->use_graph_optimize = true;

    if (getenv("GGML_METAL_GRAPH_OPTIMIZE_DISABLE") != NULL) {
        res->use_graph_optimize = false;
    }

    memset(res->fuse_cnt, 0, sizeof(res->fuse_cnt));

    GGML_LOG_INFO("%s: use fusion         = %s\n", __func__, res->use_fusion         ? "true" : "false");
    GGML_LOG_INFO("%s: use concurrency    = %s\n", __func__, res->use_concurrency    ? "true" : "false");
    GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");

    res->capture_next_compute = false;
    res->capture_started = false;
    res->capture_scope = nil;

    res->gf = nil;
    res->encode_async = nil;

    // the array holds GGML_METAL_MAX_COMMAND_BUFFERS slots for the extra threads + 1 for the main thread
    for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS + 1; ++i) {
        res->cmd_bufs[i].obj = nil;
    }

    res->cmd_bufs_ext = [[NSMutableArray alloc] init];

    res->cmd_buf_last = nil;

    res->pipelines_ext = ggml_metal_pipelines_init();

    return res;
}
175
+
176
// Destroys a context created by ggml_metal_init() and releases everything it owns:
// the graph command buffers, the extra command buffers, the inference-time pipelines,
// the capture scope, the encode callback, the dispatch queue and the copy event.
//
// Passing NULL is a no-op. The context must not be used after this call.
void ggml_metal_free(ggml_metal_t ctx) {
    if (ctx == NULL) {
        return;
    }

    GGML_LOG_INFO("%s: deallocating\n", __func__);

    // the array has GGML_METAL_MAX_COMMAND_BUFFERS + 1 slots - the last one is used by the main thread
    // when n_cb == GGML_METAL_MAX_COMMAND_BUFFERS, so it has to be released as well
    for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS + 1; ++i) {
        if (ctx->cmd_bufs[i].obj) {
            [ctx->cmd_bufs[i].obj release];
            ctx->cmd_bufs[i].obj = nil;
        }
    }

    // each extra command buffer was explicitly retained when it was queued, in addition to the
    // reference held by the array - drop the explicit reference here, the array drops its own below
    for (id<MTLCommandBuffer> cmd_buf in ctx->cmd_bufs_ext) {
        [cmd_buf release];
    }

    [ctx->cmd_bufs_ext removeAllObjects];
    [ctx->cmd_bufs_ext release];

    if (ctx->pipelines_ext) {
        ggml_metal_pipelines_free(ctx->pipelines_ext);
        ctx->pipelines_ext = nil;
    }

    // the capture scope is obtained from a new* method, so it is owned by the context
    if (ctx->capture_scope) {
        [ctx->capture_scope release];
        ctx->capture_scope = nil;
    }

    if (ctx->debug_fusion > 0) {
        GGML_LOG_DEBUG("%s: fusion stats:\n", __func__);
        for (int i = 0; i < GGML_OP_COUNT; i++) {
            if (ctx->fuse_cnt[i] == 0) {
                continue;
            }

            // note: cannot use ggml_log here
            GGML_LOG_DEBUG("%s: - %s: %" PRIu64 "\n", __func__, ggml_op_name((enum ggml_op) i), ctx->fuse_cnt[i]);
        }
    }

    // the callback only exists if ggml_metal_set_n_cb() was called
    if (ctx->encode_async) {
        Block_release(ctx->encode_async);
        ctx->encode_async = nil;
    }

    //[ctx->queue release]; // [TAG_QUEUE_PER_BACKEND]

    dispatch_release(ctx->d_queue);

    ggml_metal_device_event_free(ctx->dev, ctx->ev_cpy);

    free(ctx);
}
221
+
222
// Returns the name stored in the context.
// The string lives inside the context, so it must not be used after the context is freed.
const char * ggml_metal_get_name(ggml_metal_t ctx) {
    const char * name = ctx->name;

    return name;
}
225
+
226
// Blocks until the last command buffer queued by this context has completed, then verifies
// that every graph command buffer and every extra command buffer reports a completed status.
//
// Any command buffer that is not completed is treated as a fatal error (the process is aborted).
// The extra command buffers are released and forgotten once they have been checked.
void ggml_metal_synchronize(ggml_metal_t ctx) {
    // wait for the most recently queued backend work
    id<MTLCommandBuffer> last = ctx->cmd_buf_last;
    if (last != nil) {
        [last waitUntilCompleted];
        ctx->cmd_buf_last = nil;
    }

    // graph command buffers: slots [0, n_cb) of the extra threads + slot [n_cb] of the main thread
    const int n_slots = ctx->n_cb + 1;

    for (int cb_idx = 0; cb_idx < n_slots; ++cb_idx) {
        id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[cb_idx].obj;
        if (cmd_buf == nil) {
            continue;
        }

        const MTLCommandBufferStatus status = [cmd_buf status];
        if (status == MTLCommandBufferStatusCompleted) {
            continue;
        }

        GGML_LOG_ERROR("%s: error: command buffer %d failed with status %d\n", __func__, cb_idx, (int) status);
        if (status == MTLCommandBufferStatusError) {
            GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
        }
        GGML_ABORT("fatal error");
    }

    // nothing else to do when no extra command buffers are pending
    if (ctx->cmd_bufs_ext.count == 0) {
        return;
    }

    // check and release the completed extra command buffers
    int ext_idx = 0;

    for (id<MTLCommandBuffer> cmd_buf in ctx->cmd_bufs_ext) {
        const MTLCommandBufferStatus status = [cmd_buf status];
        if (status != MTLCommandBufferStatusCompleted) {
            GGML_LOG_ERROR("%s: error: command buffer %d failed with status %d\n", __func__, ext_idx, (int) status);
            if (status == MTLCommandBufferStatusError) {
                GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
            }
            GGML_ABORT("fatal error");
        }

        // balances the explicit retain done when the buffer was queued
        [cmd_buf release];

        ++ext_idx;
    }

    [ctx->cmd_bufs_ext removeAllObjects];
}
274
+
275
// Looks up the Metal buffer id of the tensor `t`.
// For a view tensor the lookup goes through the backend buffer of the view source.
// A NULL tensor yields an empty id ({ nil, 0 }).
static struct ggml_metal_buffer_id ggml_metal_get_buffer_id(const struct ggml_tensor * t) {
    if (t == NULL) {
        struct ggml_metal_buffer_id empty = { nil, 0 };
        return empty;
    }

    // views do not own their storage - use the buffer of the tensor they point into
    ggml_backend_buffer_t buffer = t->buffer;
    if (t->view_src) {
        buffer = t->view_src->buffer;
    }

    return ggml_metal_buffer_get_id(buffer->context, t);
}
284
+
285
// Queues an asynchronous upload of `size` bytes from `data` into `tensor`, starting `offset` bytes
// past the position of the tensor in its Metal buffer.
//
// The call does not wait for the copy to finish. The command buffer is remembered in the context
// so that ggml_metal_synchronize() can wait for it and release it later.
// Aborts if no Metal buffer can be found for the tensor.
void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    @autoreleasepool {
        // stage the host bytes in a temporary shared Metal buffer
        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);

        id<MTLBuffer> staging = [device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];

        GGML_ASSERT(staging);

        // resolve the destination buffer and position
        struct ggml_metal_buffer_id dst = ggml_metal_get_buffer_id(tensor);
        if (dst.metal == nil) {
            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
        }

        dst.offs += offset;

        // the blit goes to the end of the device queue, i.e. after any GPU work that is already queued
        id<MTLCommandQueue>  queue   = ggml_metal_device_get_queue(ctx->dev);
        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];

        id<MTLBlitCommandEncoder> blit = [cmd_buf blitCommandEncoder];

        [blit copyFromBuffer:staging sourceOffset:0 toBuffer:dst.metal destinationOffset:dst.offs size:size];
        [blit endEncoding];

        [cmd_buf commit];

        // the local reference to the staging buffer is no longer needed
        [staging release];

        // no waiting here - keep the command buffer around so that it can be waited for later
        //[cmd_buf waitUntilCompleted];
        [ctx->cmd_bufs_ext addObject:cmd_buf];
        ctx->cmd_buf_last = cmd_buf;

        // extra reference, dropped when the extra command buffers are released
        [cmd_buf retain];
    }
}
328
+
329
// Queues an asynchronous download of `size` bytes from `tensor` (starting `offset` bytes past the
// position of the tensor in its Metal buffer) into the host memory at `data`.
//
// The host memory is wrapped in a Metal buffer without copying, so `data` has to stay valid until
// the copy has completed. The call does not wait: the command buffer is remembered in the context
// so that ggml_metal_synchronize() can wait for it and release it later.
// Aborts if no Metal buffer can be found for the tensor.
void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    @autoreleasepool {
        // wrap the destination host memory (no deallocator: the memory stays owned by the caller)
        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);

        id<MTLBuffer> wrapped = [device newBufferWithBytesNoCopy:data length:size options:MTLResourceStorageModeShared deallocator:nil];

        GGML_ASSERT(wrapped);

        // resolve the source buffer and position
        struct ggml_metal_buffer_id src = ggml_metal_get_buffer_id(tensor);
        if (src.metal == nil) {
            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
        }

        src.offs += offset;

        // the blit goes to the end of the device queue, i.e. after any GPU work that is already queued
        id<MTLCommandQueue>  queue   = ggml_metal_device_get_queue(ctx->dev);
        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];

        id<MTLBlitCommandEncoder> blit = [cmd_buf blitCommandEncoder];

        [blit copyFromBuffer:src.metal sourceOffset:src.offs toBuffer:wrapped destinationOffset:0 size:size];
        [blit endEncoding];

        [cmd_buf commit];

        // the local reference to the wrapper is no longer needed
        [wrapped release];

        // no waiting here - keep the command buffer around so that it can be waited for later
        //[cmd_buf waitUntilCompleted];
        [ctx->cmd_bufs_ext addObject:cmd_buf];
        ctx->cmd_buf_last = cmd_buf;

        // extra reference, dropped when the extra command buffers are released
        [cmd_buf retain];
    }
}
372
+
373
// Queues an asynchronous copy of ggml_nbytes(src) bytes from the tensor `src` to the tensor `dst`.
//
// The copy is submitted through the source context and signals its copy event; the destination
// context then queues a wait on that event. The call itself does not block.
//
// Returns false (without queueing anything) if either tensor has no Metal buffer, true otherwise.
bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    @autoreleasepool {
        const struct ggml_metal_buffer_id from = ggml_metal_get_buffer_id(src);
        const struct ggml_metal_buffer_id to   = ggml_metal_get_buffer_id(dst);

        const bool have_buffers = from.metal != nil && to.metal != nil;
        if (!have_buffers) {
            return false;
        }

        // the blit goes to the end of the device queue, i.e. after any GPU work that is already queued
        id<MTLCommandQueue>  queue   = ggml_metal_device_get_queue(ctx_src->dev);
        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];

        id<MTLBlitCommandEncoder> blit = [cmd_buf blitCommandEncoder];

        [blit copyFromBuffer:from.metal sourceOffset:from.offs toBuffer:to.metal destinationOffset:to.offs size:ggml_nbytes(src)];
        [blit endEncoding];

        // signal the copy event of the source context from the same command buffer
        ggml_metal_event_t ev_cpy = ctx_src->ev_cpy;
        ggml_metal_event_encode_signal(ev_cpy, cmd_buf);

        [cmd_buf commit];

        // no waiting here - keep the command buffer around so that it can be waited for later
        //[cmd_buf waitUntilCompleted];
        [ctx_src->cmd_bufs_ext addObject:cmd_buf];
        ctx_src->cmd_buf_last = cmd_buf;

        // extra reference, dropped when the extra command buffers are released
        [cmd_buf retain];

        // make the destination context wait for the copy event
        ggml_metal_event_wait(ctx_dst, ev_cpy);

        return true;
    }
}
415
+
416
// Encodes the compute graph `gf` into Metal command buffers and submits them to the GPU.
//
// The first n_nodes_0 nodes are encoded and committed directly by the calling thread. The remaining
// nodes are split between n_cb command buffers that are encoded in parallel on the dispatch queue
// of the context. When n_cb == 0 the calling thread encodes the whole graph.
//
// Outside of a GPU capture the function returns right after the work has been submitted - use
// ggml_metal_synchronize() to wait for the results.
//
// Returns:
//   GGML_STATUS_SUCCESS - the graph was submitted (or computed, when a capture is being finished)
//   GGML_STATUS_FAILED  - the context has no encode callback / a valid n_cb, or a command buffer failed
//   GGML_STATUS_ABORTED - the abort callback requested to stop
enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) {
    // number of nodes encoded by the main thread (empirically determined)
    const int n_main = MAX(64, 0.1*gf->n_nodes);

    // number of threads in addition to the main thread
    const int n_cb = ctx->n_cb;

    // the encode callback is created by ggml_metal_set_n_cb() - calling a nil block would crash,
    // and a negative n_cb would index outside of the command buffer array
    if (ctx->encode_async == nil || n_cb < 0) {
        GGML_LOG_ERROR("%s: error: the context is not ready to encode (n_cb = %d) - call ggml_metal_set_n_cb() first\n", __func__, n_cb);
        return GGML_STATUS_FAILED;
    }

    // keep the memory wired
    ggml_metal_device_rsets_keep_alive(ctx->dev);

    // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
    // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
    // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
    // each thread creates its own command buffer and enqueues the ops in parallel
    //
    // tests on M1 Pro and M2 Ultra using LLaMA models, show that optimal values for n_cb are 1 or 2

    @autoreleasepool {
        ctx->gf = gf;

        // without extra threads there is nobody to encode the remaining nodes,
        // so the main thread has to take the whole graph (this also avoids dividing by zero below)
        ctx->n_nodes_0 = n_cb > 0 ? MIN(n_main, gf->n_nodes) : gf->n_nodes;
        ctx->n_nodes_1 = gf->n_nodes - ctx->n_nodes_0;

        ctx->n_nodes_per_cb = n_cb > 0 ? (ctx->n_nodes_1 + n_cb - 1) / n_cb : 0;

        const bool use_capture = ctx->capture_next_compute;
        if (use_capture) {
            ctx->capture_next_compute = false;

            // make sure all previous computations have finished before starting the capture
            if (ctx->cmd_buf_last) {
                [ctx->cmd_buf_last waitUntilCompleted];
                ctx->cmd_buf_last = nil;
            }

            if (!ctx->capture_started) {
                // drop the scope of a previous capture (or of a previous failed attempt)
                if (ctx->capture_scope) {
                    [ctx->capture_scope release];
                    ctx->capture_scope = nil;
                }

                // create capture scope
                id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
                ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device];

                MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
                descriptor.captureObject = ctx->capture_scope;
                descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
                descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]];

                NSError * error = nil;
                if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
                    GGML_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
                } else {
                    [ctx->capture_scope beginScope];
                    ctx->capture_started = true;
                }

                // the descriptor was created with +new, so it is owned here
                [descriptor release];
            }
        }

        // short-hand
        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);

        // the main thread commits the first few commands immediately
        // cmd_buf[n_cb]
        {
            id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
            [cmd_buf retain];

            // replace the command buffer of the previous graph
            if (ctx->cmd_bufs[n_cb].obj) {
                [ctx->cmd_bufs[n_cb].obj release];
            }
            ctx->cmd_bufs[n_cb].obj = cmd_buf;

            [cmd_buf enqueue];

            ctx->encode_async(n_cb);
        }

        // remember the command buffer for the next iteration
        ctx->cmd_buf_last = ctx->cmd_bufs[n_cb].obj;

        // prepare the rest of the command buffers asynchronously (optional)
        // cmd_buf[0.. n_cb)
        for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
            id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
            [cmd_buf retain];

            if (ctx->cmd_bufs[cb_idx].obj) {
                [ctx->cmd_bufs[cb_idx].obj release];
            }
            ctx->cmd_bufs[cb_idx].obj = cmd_buf;

            // always enqueue the first two command buffers
            // enqueue all of the command buffers if we don't need to abort
            if (cb_idx < 2 || ctx->abort_callback == NULL) {
                [cmd_buf enqueue];

                // update the pointer to the last queued command buffer
                // this is needed to implement synchronize()
                ctx->cmd_buf_last = cmd_buf;
            }
        }

        dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async);

        // for debugging: block until graph is computed
        //[ctx->cmd_buf_last waitUntilCompleted];

        // enter here only when capturing in order to wait for all computation to finish
        // otherwise, we leave the graph to compute asynchronously
        if (!use_capture && ctx->capture_started) {
            // wait for completion and check status of each command buffer
            // needed to detect if the device ran out-of-memory for example (#1881)
            {
                id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[n_cb].obj;
                [cmd_buf waitUntilCompleted];

                MTLCommandBufferStatus status = [cmd_buf status];
                if (status != MTLCommandBufferStatusCompleted) {
                    GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, (unsigned long) status);
                    if (status == MTLCommandBufferStatusError) {
                        GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
                    }

                    return GGML_STATUS_FAILED;
                }
            }

            for (int i = 0; i < n_cb; ++i) {
                id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[i].obj;
                [cmd_buf waitUntilCompleted];

                MTLCommandBufferStatus status = [cmd_buf status];
                if (status != MTLCommandBufferStatusCompleted) {
                    GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, (unsigned long) status);
                    if (status == MTLCommandBufferStatusError) {
                        GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
                    }

                    return GGML_STATUS_FAILED;
                }

                id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil);
                if (!next_buffer) {
                    continue;
                }

                // the next command buffer is already queued - nothing to commit
                const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
                if (next_queued) {
                    continue;
                }

                // give the user a chance to stop before committing more work
                if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
                    GGML_LOG_INFO("%s: command buffer %d aborted\n", __func__, i);
                    return GGML_STATUS_ABORTED;
                }

                [next_buffer commit];
            }

            [ctx->capture_scope endScope];
            [[MTLCaptureManager sharedCaptureManager] stopCapture];
        }
    }

    return GGML_STATUS_SUCCESS;
}
579
+
580
// Runs ggml_graph_optimize() on `gf`, unless graph optimization is disabled for this context.
void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf) {
    //const int64_t t_start = ggml_time_us();

    const bool enabled = ctx->use_graph_optimize;
    if (enabled) {
        ggml_graph_optimize(gf);
    }

    //printf("%s: graph optimize took %.3f ms\n", __func__, (ggml_time_us() - t_start) / 1000.0);
}
589
+
590
// Queues a command buffer that encodes a signal of the event `ev` on the device queue.
// The command buffer is remembered in the context, so it is waited for and released
// together with the other extra command buffers.
void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev) {
    @autoreleasepool {
        id<MTLCommandBuffer> cmd_buf = [ggml_metal_device_get_queue(ctx->dev) commandBuffer];

        ggml_metal_event_encode_signal(ev, cmd_buf);

        [cmd_buf commit];

        // track the command buffer as the most recent backend work
        [ctx->cmd_bufs_ext addObject:cmd_buf];
        ctx->cmd_buf_last = cmd_buf;

        // extra reference, dropped when the extra command buffers are released
        [cmd_buf retain];
    }
}
605
+
606
// Queues a command buffer that encodes a wait for the event `ev` on the device queue.
// The call itself does not block. The command buffer is remembered in the context, so it is
// waited for and released together with the other extra command buffers.
void ggml_metal_event_wait(ggml_metal_t ctx, ggml_metal_event_t ev) {
    @autoreleasepool {
        id<MTLCommandBuffer> cmd_buf = [ggml_metal_device_get_queue(ctx->dev) commandBuffer];

        ggml_metal_event_encode_wait(ev, cmd_buf);

        [cmd_buf commit];

        // track the command buffer as the most recent backend work
        [ctx->cmd_bufs_ext addObject:cmd_buf];
        ctx->cmd_buf_last = cmd_buf;

        // extra reference, dropped when the extra command buffers are released
        [cmd_buf retain];
    }
}
621
+
622
// Returns the event that this context uses for asynchronous copies.
// The event stays owned by the context.
ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx) {
    ggml_metal_event_t ev = ctx->ev_cpy;

    return ev;
}
625
+
626
// Sets the number of extra threads (command buffers) used to encode a graph and (re)creates the
// encode callback of the context.
//
// `n_cb` is clamped to [0, GGML_METAL_MAX_COMMAND_BUFFERS]: the command buffer array only has
// GGML_METAL_MAX_COMMAND_BUFFERS + 1 slots and slot [n_cb] is the one used by the main thread,
// so a value outside of this range would index outside of the array.
//
// The callback receives a command buffer index:
//   - index == n_cb encodes the first n_nodes_0 nodes (the part of the main thread)
//   - index <  n_cb encodes the corresponding slice of the remaining n_nodes_1 nodes
void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
    if (ctx->n_cb != n_cb) {
        ctx->n_cb = MAX(0, MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS));

        if (ctx->n_cb > 2) {
            GGML_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade the performance in some cases\n", __func__, n_cb);
        }
    }

    // drop the previous callback before installing the new one
    if (ctx->encode_async) {
        Block_release(ctx->encode_async);
    }

    // note: the block reads everything from the context at the time it runs,
    //       so it always works with the values set by the current graph compute
    ctx->encode_async = Block_copy(^(size_t iter) {
        const int cb_idx = iter;
        const int n_cb_l = ctx->n_cb;

        const int n_nodes_0 = ctx->n_nodes_0;
        const int n_nodes_1 = ctx->n_nodes_1;

        const int n_nodes_per_cb = ctx->n_nodes_per_cb;

        // default: the range of the main thread
        int idx_start = 0;
        int idx_end   = n_nodes_0;

        if (cb_idx < n_cb_l) {
            // range of an extra thread - the last one takes whatever remains
            idx_start = n_nodes_0 + (                                         (cb_idx + 0) * n_nodes_per_cb);
            idx_end   = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1));
        }

        id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[cb_idx].obj;

        ggml_metal_op_t ctx_op = ggml_metal_op_init(
            ctx->dev,
            cmd_buf,
            ctx->gf,
            idx_start,
            idx_end,
            ctx->use_fusion,
            ctx->use_concurrency,
            ctx->capture_next_compute,
            ctx->debug_graph,
            ctx->debug_fusion);

        for (int idx = 0; idx < ggml_metal_op_n_nodes(ctx_op); ++idx) {
            // the encoder reports how many nodes it consumed - 0 stops the encoding
            const int res = ggml_metal_op_encode(ctx_op, idx);
            if (res == 0) {
                break;
            }

            // skip the nodes that were consumed together with the current one
            idx += res - 1;
        }

        ggml_metal_op_free(ctx_op);

        // the remaining command buffers are left uncommitted when an abort callback is installed
        if (cb_idx < 2 || ctx->abort_callback == NULL) {
            [cmd_buf commit];
        }
    });
}
686
+
687
// Installs the abort callback of the context together with the user data that is passed to it.
// Passing NULL as the callback removes it.
void ggml_metal_set_abort_callback(ggml_metal_t ctx, ggml_abort_callback abort_callback, void * user_data) {
    ctx->abort_callback_data = user_data;
    ctx->abort_callback      = abort_callback;
}
691
+
692
// Checks whether the device of the context supports the Apple GPU family number `family`,
// where 1 maps to MTLGPUFamilyApple1, 2 to the value after it and so on.
bool ggml_metal_supports_family(ggml_metal_t ctx, int family) {
    GGML_ASSERT(ctx->dev != nil);

    // translate the 1-based family number into the Metal enum value
    const MTLGPUFamily gpu_family = (MTLGPUFamily) (MTLGPUFamilyApple1 + family - 1);

    id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);

    return [device supportsFamily:gpu_family];
}
699
+
700
// Requests a GPU capture for the next call to ggml_metal_graph_compute().
// The flag is cleared by the graph compute once it has been consumed.
void ggml_metal_capture_next_compute(ggml_metal_t ctx) {
    ctx->capture_next_compute = true;
}