local-llm-rn 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (626)
  1. package/cpp/CMakeLists.txt +285 -0
  2. package/cpp/common/CMakeLists.txt +149 -0
  3. package/cpp/common/arg.cpp +3799 -0
  4. package/cpp/common/arg.h +131 -0
  5. package/cpp/common/base64.hpp +392 -0
  6. package/cpp/common/build-info.cpp.in +4 -0
  7. package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  8. package/cpp/common/chat-parser-xml-toolcall.h +45 -0
  9. package/cpp/common/chat-parser.cpp +1649 -0
  10. package/cpp/common/chat-parser.h +133 -0
  11. package/cpp/common/chat-peg-parser.cpp +124 -0
  12. package/cpp/common/chat-peg-parser.h +105 -0
  13. package/cpp/common/chat.cpp +3355 -0
  14. package/cpp/common/chat.h +252 -0
  15. package/cpp/common/common.cpp +1824 -0
  16. package/cpp/common/common.h +930 -0
  17. package/cpp/common/console.cpp +1137 -0
  18. package/cpp/common/console.h +41 -0
  19. package/cpp/common/debug.cpp +167 -0
  20. package/cpp/common/debug.h +43 -0
  21. package/cpp/common/download.cpp +792 -0
  22. package/cpp/common/download.h +84 -0
  23. package/cpp/common/http.h +84 -0
  24. package/cpp/common/jinja/README.md +88 -0
  25. package/cpp/common/jinja/caps.cpp +285 -0
  26. package/cpp/common/jinja/caps.h +30 -0
  27. package/cpp/common/jinja/lexer.cpp +341 -0
  28. package/cpp/common/jinja/lexer.h +157 -0
  29. package/cpp/common/jinja/parser.cpp +591 -0
  30. package/cpp/common/jinja/parser.h +21 -0
  31. package/cpp/common/jinja/runtime.cpp +867 -0
  32. package/cpp/common/jinja/runtime.h +638 -0
  33. package/cpp/common/jinja/string.cpp +213 -0
  34. package/cpp/common/jinja/string.h +61 -0
  35. package/cpp/common/jinja/utils.h +149 -0
  36. package/cpp/common/jinja/value.cpp +1393 -0
  37. package/cpp/common/jinja/value.h +756 -0
  38. package/cpp/common/json-partial.cpp +324 -0
  39. package/cpp/common/json-partial.h +39 -0
  40. package/cpp/common/json-schema-to-grammar.cpp +1153 -0
  41. package/cpp/common/json-schema-to-grammar.h +43 -0
  42. package/cpp/common/llguidance.cpp +258 -0
  43. package/cpp/common/log.cpp +446 -0
  44. package/cpp/common/log.h +119 -0
  45. package/cpp/common/ngram-cache.cpp +285 -0
  46. package/cpp/common/ngram-cache.h +101 -0
  47. package/cpp/common/ngram-map.cpp +530 -0
  48. package/cpp/common/ngram-map.h +115 -0
  49. package/cpp/common/ngram-mod.cpp +60 -0
  50. package/cpp/common/ngram-mod.h +38 -0
  51. package/cpp/common/peg-parser.cpp +1712 -0
  52. package/cpp/common/peg-parser.h +459 -0
  53. package/cpp/common/preset.cpp +483 -0
  54. package/cpp/common/preset.h +83 -0
  55. package/cpp/common/regex-partial.cpp +204 -0
  56. package/cpp/common/regex-partial.h +56 -0
  57. package/cpp/common/sampling.cpp +745 -0
  58. package/cpp/common/sampling.h +119 -0
  59. package/cpp/common/speculative.cpp +1074 -0
  60. package/cpp/common/speculative.h +41 -0
  61. package/cpp/common/unicode.cpp +64 -0
  62. package/cpp/common/unicode.h +22 -0
  63. package/cpp/ggml/CMakeLists.txt +494 -0
  64. package/cpp/ggml/cmake/GitVars.cmake +22 -0
  65. package/cpp/ggml/cmake/common.cmake +50 -0
  66. package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
  67. package/cpp/ggml/include/ggml-alloc.h +85 -0
  68. package/cpp/ggml/include/ggml-backend.h +373 -0
  69. package/cpp/ggml/include/ggml-blas.h +25 -0
  70. package/cpp/ggml/include/ggml-cann.h +123 -0
  71. package/cpp/ggml/include/ggml-cpp.h +39 -0
  72. package/cpp/ggml/include/ggml-cpu.h +151 -0
  73. package/cpp/ggml/include/ggml-cuda.h +47 -0
  74. package/cpp/ggml/include/ggml-hexagon.h +19 -0
  75. package/cpp/ggml/include/ggml-metal.h +61 -0
  76. package/cpp/ggml/include/ggml-opencl.h +26 -0
  77. package/cpp/ggml/include/ggml-opt.h +256 -0
  78. package/cpp/ggml/include/ggml-rpc.h +30 -0
  79. package/cpp/ggml/include/ggml-sycl.h +49 -0
  80. package/cpp/ggml/include/ggml-virtgpu.h +14 -0
  81. package/cpp/ggml/include/ggml-vulkan.h +29 -0
  82. package/cpp/ggml/include/ggml-webgpu.h +19 -0
  83. package/cpp/ggml/include/ggml-zdnn.h +17 -0
  84. package/cpp/ggml/include/ggml-zendnn.h +22 -0
  85. package/cpp/ggml/include/ggml.h +2753 -0
  86. package/cpp/ggml/include/gguf.h +204 -0
  87. package/cpp/ggml/src/CMakeLists.txt +492 -0
  88. package/cpp/ggml/src/ggml-alloc.c +1244 -0
  89. package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
  90. package/cpp/ggml/src/ggml-backend-dl.h +45 -0
  91. package/cpp/ggml/src/ggml-backend-impl.h +255 -0
  92. package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
  93. package/cpp/ggml/src/ggml-backend.cpp +2270 -0
  94. package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  95. package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
  96. package/cpp/ggml/src/ggml-common.h +1878 -0
  97. package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
  98. package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
  99. package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  100. package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  101. package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  102. package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  103. package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  104. package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
  105. package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
  106. package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
  107. package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  108. package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  109. package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  110. package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
  111. package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  112. package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  113. package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  114. package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  115. package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  116. package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  117. package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  118. package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
  119. package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  120. package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  121. package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  122. package/cpp/ggml/src/ggml-cpu/common.h +95 -0
  123. package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
  124. package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
  125. package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
  126. package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  127. package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  128. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
  129. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  130. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
  131. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  132. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
  133. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  134. package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
  135. package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
  136. package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
  137. package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
  138. package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
  139. package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
  140. package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  141. package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
  142. package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  143. package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  144. package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  145. package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  146. package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  147. package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
  148. package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  149. package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
  150. package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
  151. package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
  152. package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  153. package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
  154. package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
  155. package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
  156. package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  157. package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
  158. package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  159. package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
  160. package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
  161. package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
  162. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  163. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  164. package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  165. package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  166. package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  167. package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  168. package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
  169. package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
  170. package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  171. package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
  172. package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
  173. package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  174. package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
  175. package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  176. package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  177. package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  178. package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
  179. package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
  180. package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  181. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  182. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  183. package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  184. package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
  185. package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
  186. package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
  187. package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
  188. package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
  189. package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
  190. package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
  191. package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
  192. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  193. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  194. package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  195. package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  196. package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
  197. package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  198. package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  199. package/cpp/ggml/src/ggml-impl.h +724 -0
  200. package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  201. package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  202. package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  203. package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  204. package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
  205. package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
  206. package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
  207. package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
  208. package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
  209. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
  210. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  211. package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
  212. package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
  213. package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
  214. package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  215. package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  216. package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
  217. package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
  218. package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  219. package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  220. package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  221. package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  222. package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  223. package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  224. package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  225. package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  226. package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
  227. package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  228. package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  229. package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  230. package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  231. package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  232. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  233. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  234. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  235. package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  236. package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  237. package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  238. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  239. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  240. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  241. package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  242. package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  243. package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  244. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  245. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  246. package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  247. package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  248. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  249. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  250. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  251. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  252. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  253. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  254. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  255. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  256. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  257. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  258. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  259. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  260. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  261. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  262. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  263. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  264. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  265. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  266. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  267. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  268. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  269. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  270. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  271. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  272. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  273. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  274. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  275. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  276. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  277. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  278. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  279. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  280. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  281. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  282. package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  283. package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  284. package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  285. package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  286. package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  287. package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  288. package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  289. package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  290. package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  291. package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  292. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  293. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  294. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  295. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  296. package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  297. package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  298. package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  299. package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  300. package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  301. package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  302. package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  303. package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  304. package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
  305. package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  306. package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  307. package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  308. package/cpp/ggml/src/ggml-opt.cpp +1093 -0
  309. package/cpp/ggml/src/ggml-quants.c +5325 -0
  310. package/cpp/ggml/src/ggml-quants.h +106 -0
  311. package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  312. package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
  313. package/cpp/ggml/src/ggml-threading.cpp +12 -0
  314. package/cpp/ggml/src/ggml-threading.h +14 -0
  315. package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  316. package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  317. package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  318. package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  319. package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  320. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  321. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  322. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  323. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  324. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  325. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  326. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  327. package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  328. package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  329. package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  330. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  331. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  332. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  333. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  334. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  335. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  336. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  337. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  338. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  339. package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  340. package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  341. package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  342. package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  343. package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  344. package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  345. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  346. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  347. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  348. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  349. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  350. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  351. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  352. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  353. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  354. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  355. package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  356. package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  357. package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  358. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
  359. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
  360. package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  361. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  362. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  363. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  364. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
  365. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
  366. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  367. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  368. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
  369. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  370. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
  371. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  372. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  373. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
  374. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
  375. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
  376. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
  377. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
  378. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  379. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  380. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  381. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  382. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  383. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  384. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  385. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  386. package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  387. package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  388. package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
  389. package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  390. package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  391. package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  392. package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  393. package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  394. package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  395. package/cpp/ggml/src/ggml.c +7669 -0
  396. package/cpp/ggml/src/ggml.cpp +26 -0
  397. package/cpp/ggml/src/gguf.cpp +1699 -0
  398. package/cpp/include/llama-cpp.h +32 -0
  399. package/cpp/include/llama.h +1568 -0
  400. package/cpp/mtmd/CMakeLists.txt +98 -0
  401. package/cpp/mtmd/README.md +63 -0
  402. package/cpp/mtmd/clip-graph.h +117 -0
  403. package/cpp/mtmd/clip-impl.h +586 -0
  404. package/cpp/mtmd/clip-model.h +390 -0
  405. package/cpp/mtmd/clip.cpp +4154 -0
  406. package/cpp/mtmd/clip.h +121 -0
  407. package/cpp/mtmd/deprecation-warning.cpp +22 -0
  408. package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
  409. package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
  410. package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
  411. package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
  412. package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
  413. package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
  414. package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
  415. package/cpp/mtmd/models/cogvlm.cpp +98 -0
  416. package/cpp/mtmd/models/conformer.cpp +216 -0
  417. package/cpp/mtmd/models/glm4v.cpp +122 -0
  418. package/cpp/mtmd/models/internvl.cpp +69 -0
  419. package/cpp/mtmd/models/kimik25.cpp +101 -0
  420. package/cpp/mtmd/models/kimivl.cpp +63 -0
  421. package/cpp/mtmd/models/llama4.cpp +96 -0
  422. package/cpp/mtmd/models/llava.cpp +374 -0
  423. package/cpp/mtmd/models/minicpmv.cpp +114 -0
  424. package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
  425. package/cpp/mtmd/models/models.h +128 -0
  426. package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
  427. package/cpp/mtmd/models/paddleocr.cpp +52 -0
  428. package/cpp/mtmd/models/pixtral.cpp +86 -0
  429. package/cpp/mtmd/models/qwen2vl.cpp +183 -0
  430. package/cpp/mtmd/models/qwen3vl.cpp +193 -0
  431. package/cpp/mtmd/models/siglip.cpp +86 -0
  432. package/cpp/mtmd/models/whisper-enc.cpp +115 -0
  433. package/cpp/mtmd/models/youtuvl.cpp +179 -0
  434. package/cpp/mtmd/mtmd-audio.cpp +730 -0
  435. package/cpp/mtmd/mtmd-audio.h +113 -0
  436. package/cpp/mtmd/mtmd-cli.cpp +437 -0
  437. package/cpp/mtmd/mtmd-helper.cpp +521 -0
  438. package/cpp/mtmd/mtmd-helper.h +96 -0
  439. package/cpp/mtmd/mtmd.cpp +1156 -0
  440. package/cpp/mtmd/mtmd.h +319 -0
  441. package/cpp/mtmd/requirements.txt +5 -0
  442. package/cpp/mtmd/test-1.jpeg +0 -0
  443. package/cpp/mtmd/test-2.mp3 +0 -0
  444. package/cpp/mtmd/tests.sh +192 -0
  445. package/cpp/src/CMakeLists.txt +169 -0
  446. package/cpp/src/llama-adapter.cpp +488 -0
  447. package/cpp/src/llama-adapter.h +89 -0
  448. package/cpp/src/llama-arch.cpp +2855 -0
  449. package/cpp/src/llama-arch.h +619 -0
  450. package/cpp/src/llama-batch.cpp +917 -0
  451. package/cpp/src/llama-batch.h +173 -0
  452. package/cpp/src/llama-chat.cpp +896 -0
  453. package/cpp/src/llama-chat.h +71 -0
  454. package/cpp/src/llama-context.cpp +3512 -0
  455. package/cpp/src/llama-context.h +359 -0
  456. package/cpp/src/llama-cparams.cpp +5 -0
  457. package/cpp/src/llama-cparams.h +44 -0
  458. package/cpp/src/llama-grammar.cpp +1464 -0
  459. package/cpp/src/llama-grammar.h +194 -0
  460. package/cpp/src/llama-graph.cpp +2685 -0
  461. package/cpp/src/llama-graph.h +1026 -0
  462. package/cpp/src/llama-hparams.cpp +234 -0
  463. package/cpp/src/llama-hparams.h +339 -0
  464. package/cpp/src/llama-impl.cpp +171 -0
  465. package/cpp/src/llama-impl.h +73 -0
  466. package/cpp/src/llama-io.cpp +15 -0
  467. package/cpp/src/llama-io.h +35 -0
  468. package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
  469. package/cpp/src/llama-kv-cache-iswa.h +137 -0
  470. package/cpp/src/llama-kv-cache.cpp +2271 -0
  471. package/cpp/src/llama-kv-cache.h +388 -0
  472. package/cpp/src/llama-kv-cells.h +533 -0
  473. package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
  474. package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
  475. package/cpp/src/llama-memory-hybrid.cpp +268 -0
  476. package/cpp/src/llama-memory-hybrid.h +139 -0
  477. package/cpp/src/llama-memory-recurrent.cpp +1165 -0
  478. package/cpp/src/llama-memory-recurrent.h +182 -0
  479. package/cpp/src/llama-memory.cpp +59 -0
  480. package/cpp/src/llama-memory.h +122 -0
  481. package/cpp/src/llama-mmap.cpp +785 -0
  482. package/cpp/src/llama-mmap.h +92 -0
  483. package/cpp/src/llama-model-loader.cpp +1414 -0
  484. package/cpp/src/llama-model-loader.h +203 -0
  485. package/cpp/src/llama-model-saver.cpp +286 -0
  486. package/cpp/src/llama-model-saver.h +37 -0
  487. package/cpp/src/llama-model.cpp +9253 -0
  488. package/cpp/src/llama-model.h +576 -0
  489. package/cpp/src/llama-quant.cpp +1119 -0
  490. package/cpp/src/llama-quant.h +1 -0
  491. package/cpp/src/llama-sampler.cpp +3885 -0
  492. package/cpp/src/llama-sampler.h +42 -0
  493. package/cpp/src/llama-vocab.cpp +3970 -0
  494. package/cpp/src/llama-vocab.h +187 -0
  495. package/cpp/src/llama.cpp +1313 -0
  496. package/cpp/src/models/afmoe.cpp +191 -0
  497. package/cpp/src/models/apertus.cpp +125 -0
  498. package/cpp/src/models/arcee.cpp +135 -0
  499. package/cpp/src/models/arctic.cpp +138 -0
  500. package/cpp/src/models/arwkv7.cpp +86 -0
  501. package/cpp/src/models/baichuan.cpp +122 -0
  502. package/cpp/src/models/bailingmoe.cpp +144 -0
  503. package/cpp/src/models/bailingmoe2.cpp +135 -0
  504. package/cpp/src/models/bert.cpp +178 -0
  505. package/cpp/src/models/bitnet.cpp +160 -0
  506. package/cpp/src/models/bloom.cpp +101 -0
  507. package/cpp/src/models/chameleon.cpp +178 -0
  508. package/cpp/src/models/chatglm.cpp +132 -0
  509. package/cpp/src/models/codeshell.cpp +111 -0
  510. package/cpp/src/models/cogvlm.cpp +102 -0
  511. package/cpp/src/models/cohere2-iswa.cpp +134 -0
  512. package/cpp/src/models/command-r.cpp +122 -0
  513. package/cpp/src/models/dbrx.cpp +123 -0
  514. package/cpp/src/models/deci.cpp +135 -0
  515. package/cpp/src/models/deepseek.cpp +144 -0
  516. package/cpp/src/models/deepseek2.cpp +262 -0
  517. package/cpp/src/models/delta-net-base.cpp +376 -0
  518. package/cpp/src/models/dots1.cpp +134 -0
  519. package/cpp/src/models/dream.cpp +105 -0
  520. package/cpp/src/models/ernie4-5-moe.cpp +150 -0
  521. package/cpp/src/models/ernie4-5.cpp +110 -0
  522. package/cpp/src/models/eurobert.cpp +97 -0
  523. package/cpp/src/models/exaone-moe.cpp +146 -0
  524. package/cpp/src/models/exaone.cpp +114 -0
  525. package/cpp/src/models/exaone4.cpp +123 -0
  526. package/cpp/src/models/falcon-h1.cpp +111 -0
  527. package/cpp/src/models/falcon.cpp +120 -0
  528. package/cpp/src/models/gemma-embedding.cpp +116 -0
  529. package/cpp/src/models/gemma.cpp +112 -0
  530. package/cpp/src/models/gemma2-iswa.cpp +128 -0
  531. package/cpp/src/models/gemma3.cpp +155 -0
  532. package/cpp/src/models/gemma3n-iswa.cpp +384 -0
  533. package/cpp/src/models/glm4-moe.cpp +170 -0
  534. package/cpp/src/models/glm4.cpp +157 -0
  535. package/cpp/src/models/gpt2.cpp +105 -0
  536. package/cpp/src/models/gptneox.cpp +144 -0
  537. package/cpp/src/models/granite-hybrid.cpp +196 -0
  538. package/cpp/src/models/granite.cpp +211 -0
  539. package/cpp/src/models/grok.cpp +159 -0
  540. package/cpp/src/models/grovemoe.cpp +141 -0
  541. package/cpp/src/models/hunyuan-dense.cpp +132 -0
  542. package/cpp/src/models/hunyuan-moe.cpp +154 -0
  543. package/cpp/src/models/internlm2.cpp +120 -0
  544. package/cpp/src/models/jais.cpp +86 -0
  545. package/cpp/src/models/jais2.cpp +123 -0
  546. package/cpp/src/models/jamba.cpp +106 -0
  547. package/cpp/src/models/kimi-linear.cpp +392 -0
  548. package/cpp/src/models/lfm2.cpp +190 -0
  549. package/cpp/src/models/llada-moe.cpp +122 -0
  550. package/cpp/src/models/llada.cpp +99 -0
  551. package/cpp/src/models/llama-iswa.cpp +178 -0
  552. package/cpp/src/models/llama.cpp +168 -0
  553. package/cpp/src/models/maincoder.cpp +117 -0
  554. package/cpp/src/models/mamba-base.cpp +285 -0
  555. package/cpp/src/models/mamba.cpp +54 -0
  556. package/cpp/src/models/mimo2-iswa.cpp +123 -0
  557. package/cpp/src/models/minicpm3.cpp +200 -0
  558. package/cpp/src/models/minimax-m2.cpp +124 -0
  559. package/cpp/src/models/mistral3.cpp +160 -0
  560. package/cpp/src/models/models.h +684 -0
  561. package/cpp/src/models/modern-bert.cpp +109 -0
  562. package/cpp/src/models/mpt.cpp +126 -0
  563. package/cpp/src/models/nemotron-h.cpp +148 -0
  564. package/cpp/src/models/nemotron.cpp +122 -0
  565. package/cpp/src/models/neo-bert.cpp +104 -0
  566. package/cpp/src/models/olmo.cpp +121 -0
  567. package/cpp/src/models/olmo2.cpp +150 -0
  568. package/cpp/src/models/olmoe.cpp +124 -0
  569. package/cpp/src/models/openai-moe-iswa.cpp +127 -0
  570. package/cpp/src/models/openelm.cpp +124 -0
  571. package/cpp/src/models/orion.cpp +123 -0
  572. package/cpp/src/models/paddleocr.cpp +122 -0
  573. package/cpp/src/models/pangu-embedded.cpp +121 -0
  574. package/cpp/src/models/phi2.cpp +121 -0
  575. package/cpp/src/models/phi3.cpp +152 -0
  576. package/cpp/src/models/plamo.cpp +110 -0
  577. package/cpp/src/models/plamo2.cpp +318 -0
  578. package/cpp/src/models/plamo3.cpp +128 -0
  579. package/cpp/src/models/plm.cpp +169 -0
  580. package/cpp/src/models/qwen.cpp +108 -0
  581. package/cpp/src/models/qwen2.cpp +126 -0
  582. package/cpp/src/models/qwen2moe.cpp +151 -0
  583. package/cpp/src/models/qwen2vl.cpp +117 -0
  584. package/cpp/src/models/qwen3.cpp +117 -0
  585. package/cpp/src/models/qwen35.cpp +386 -0
  586. package/cpp/src/models/qwen35moe.cpp +420 -0
  587. package/cpp/src/models/qwen3moe.cpp +124 -0
  588. package/cpp/src/models/qwen3next.cpp +525 -0
  589. package/cpp/src/models/qwen3vl-moe.cpp +140 -0
  590. package/cpp/src/models/qwen3vl.cpp +132 -0
  591. package/cpp/src/models/refact.cpp +94 -0
  592. package/cpp/src/models/rnd1.cpp +126 -0
  593. package/cpp/src/models/rwkv6-base.cpp +164 -0
  594. package/cpp/src/models/rwkv6.cpp +94 -0
  595. package/cpp/src/models/rwkv6qwen2.cpp +86 -0
  596. package/cpp/src/models/rwkv7-base.cpp +137 -0
  597. package/cpp/src/models/rwkv7.cpp +90 -0
  598. package/cpp/src/models/seed-oss.cpp +124 -0
  599. package/cpp/src/models/smallthinker.cpp +126 -0
  600. package/cpp/src/models/smollm3.cpp +128 -0
  601. package/cpp/src/models/stablelm.cpp +146 -0
  602. package/cpp/src/models/starcoder.cpp +100 -0
  603. package/cpp/src/models/starcoder2.cpp +121 -0
  604. package/cpp/src/models/step35-iswa.cpp +168 -0
  605. package/cpp/src/models/t5-dec.cpp +166 -0
  606. package/cpp/src/models/t5-enc.cpp +96 -0
  607. package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
  608. package/cpp/src/models/xverse.cpp +108 -0
  609. package/cpp/src/unicode-data.cpp +7034 -0
  610. package/cpp/src/unicode-data.h +20 -0
  611. package/cpp/src/unicode.cpp +1103 -0
  612. package/cpp/src/unicode.h +111 -0
  613. package/cpp/vendor/nlohmann/json.hpp +25526 -0
  614. package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  615. package/cpp/vendor/stb/stb_image.h +7988 -0
  616. package/ios/LocalLLM-Bridging-Header.h +2 -0
  617. package/ios/LocalLLM.h +5 -0
  618. package/ios/LocalLLM.mm +1267 -0
  619. package/local-llm-rn.podspec +60 -0
  620. package/package.json +35 -0
  621. package/src/NativeLocalLLM.ts +73 -0
  622. package/src/device.ts +50 -0
  623. package/src/download-adapter.ts +17 -0
  624. package/src/index.ts +21 -0
  625. package/src/native-bridge.ts +142 -0
  626. package/src/rn-downloader.ts +37 -0
@@ -0,0 +1,1156 @@
1
+ #include "clip.h"
2
+ #include "clip-impl.h"
3
+ #include "mtmd.h"
4
+ #include "mtmd-audio.h"
5
+
6
+ #include "llama.h"
7
+
8
+ // fix problem with std::min and std::max
9
+ #if defined(_WIN32)
10
+ #define WIN32_LEAN_AND_MEAN
11
+ #ifndef NOMINMAX
12
+ # define NOMINMAX
13
+ #endif
14
+ #include <windows.h>
15
+ #endif
16
+
17
+ #include <algorithm>
18
+ #include <cerrno>
19
+ #include <cstdio>
20
+ #include <cstdlib>
21
+ #include <cstring>
22
+ #include <vector>
23
+
24
// Raw media buffer handed to the tokenizer.
// For images the layout is RGBRGBRGB... and the length of data must be nx * ny * 3.
// When is_audio is true, the tokenizer reinterprets data as an array of float samples.
struct mtmd_bitmap {
    // zero-initialized so that a default-constructed bitmap never exposes indeterminate dimensions
    uint32_t nx = 0;
    uint32_t ny = 0;
    std::vector<unsigned char> data;
    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
    bool is_audio = false; // true if the bitmap is audio
};
33
+
34
+ struct mtmd_image_tokens {
35
+ uint32_t nx; // number of tokens in x direction
36
+ uint32_t ny; // number of tokens in y direction
37
+ bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
38
+ uint32_t n_tokens() const { return nx * ny; }
39
+ clip_image_f32_batch batch_f32; // preprocessed image patches
40
+ std::string id; // optional user-defined ID, useful for KV cache tracking
41
+
42
+ mtmd_image_tokens clone() {
43
+ return mtmd_image_tokens{
44
+ nx,
45
+ ny,
46
+ use_mrope_pos,
47
+ batch_f32.clone(),
48
+ id
49
+ };
50
+ }
51
+ };
52
+ using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;
53
+
54
+ struct mtmd_audio_tokens {
55
+ uint32_t n_tokens; // number of tokens
56
+ clip_image_f32_batch batch_f32; // preprocessed image patches
57
+ std::string id; // optional user-defined ID, useful for KV cache tracking
58
+
59
+ mtmd_audio_tokens clone() {
60
+ return mtmd_audio_tokens{
61
+ n_tokens,
62
+ batch_f32.clone(),
63
+ id
64
+ };
65
+ }
66
+ };
67
+ using mtmd_audio_tokens_ptr = std::unique_ptr<mtmd_audio_tokens>;
68
+
69
+ struct mtmd_input_chunk {
70
+ mtmd_input_chunk_type type;
71
+ std::vector<llama_token> tokens_text;
72
+ mtmd_image_tokens_ptr tokens_image;
73
+ mtmd_audio_tokens_ptr tokens_audio;
74
+ };
75
+
76
+ struct mtmd_input_chunks {
77
+ std::vector<mtmd_input_chunk> entries;
78
+ };
79
+
80
// Slice template: selects how special tokens are placed around the embeddings of
// llava-uhd style models that split an image into an overview plus several slices/tiles.
// Models without a template (e.g. llava-1.6) process embeddings with no special tokens in-between.
enum mtmd_slice_tmpl {
    MTMD_SLICE_TMPL_NONE,         // no slicing-specific tokens
    MTMD_SLICE_TMPL_MINICPMV_2_5, // chosen when the minicpmv version is 2
    MTMD_SLICE_TMPL_MINICPMV_2_6, // chosen for the later supported minicpmv versions
    MTMD_SLICE_TMPL_LLAMA4,       // llama 4 tiles, overview image placed last
    MTMD_SLICE_TMPL_IDEFICS3,     // row/col templated slice markers
    MTMD_SLICE_TMPL_LFM2,         // row/col templated tile markers, thumbnail placed last
};
90
+
91
// Returns the default media marker: the placeholder string that stands for one media item in a prompt.
const char * mtmd_default_marker() {
    static const char marker[] = "<__media__>";
    return marker;
}
94
+
95
+ static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_type flash_attn_type) {
96
+ switch (flash_attn_type) {
97
+ case LLAMA_FLASH_ATTN_TYPE_AUTO: return CLIP_FLASH_ATTN_TYPE_AUTO;
98
+ case LLAMA_FLASH_ATTN_TYPE_DISABLED: return CLIP_FLASH_ATTN_TYPE_DISABLED;
99
+ case LLAMA_FLASH_ATTN_TYPE_ENABLED: return CLIP_FLASH_ATTN_TYPE_ENABLED;
100
+ }
101
+ return CLIP_FLASH_ATTN_TYPE_AUTO;
102
+ }
103
+
104
+ mtmd_context_params mtmd_context_params_default() {
105
+ mtmd_context_params params {
106
+ /* use_gpu */ true,
107
+ /* print_timings */ true,
108
+ /* n_threads */ 4,
109
+ /* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
110
+ /* media_marker */ mtmd_default_marker(),
111
+ /* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
112
+ /* warmup */ true,
113
+ /* image_min_tokens */ -1,
114
+ /* image_max_tokens */ -1,
115
+ /* cb_eval */ nullptr,
116
+ /* cb_eval_user_data */ nullptr,
117
+ };
118
+ return params;
119
+ }
120
+
121
+ struct mtmd_context {
122
+ struct clip_ctx * ctx_v; // vision
123
+ struct clip_ctx * ctx_a; // audio
124
+ const struct llama_model * text_model;
125
+ std::vector<float> image_embd_v; // image embedding vector
126
+
127
+ bool print_timings;
128
+ int n_threads;
129
+ std::string media_marker;
130
+ const int n_embd_text;
131
+
132
+ // these are not token, but strings used to mark the beginning and end of image/audio embeddings
133
+ std::string img_beg;
134
+ std::string img_end;
135
+ std::string aud_beg;
136
+ std::string aud_end;
137
+
138
+ // for llava-uhd style models, we need special tokens in-between slices
139
+ // minicpmv calls them "slices", llama 4 calls them "tiles"
140
+ mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
141
+ std::vector<llama_token> tok_ov_img_start; // overview image
142
+ std::vector<llama_token> tok_ov_img_end; // overview image
143
+ std::vector<llama_token> tok_slices_start; // start of all slices
144
+ std::vector<llama_token> tok_slices_end; // end of all slices
145
+ std::vector<llama_token> tok_sli_img_start; // single slice start
146
+ std::vector<llama_token> tok_sli_img_end; // single slice end
147
+ std::vector<llama_token> tok_sli_img_mid; // between 2 slices
148
+ std::vector<llama_token> tok_row_end; // end of row
149
+ bool tok_row_end_trail = false;
150
+ bool ov_img_first = false;
151
+
152
+ // string template for slice image delimiters with row/col (idefics3)
153
+ std::string sli_img_start_tmpl;
154
+
155
+ std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
156
+
157
+ // TODO @ngxson : add timings
158
+
159
+ mtmd_context(const char * mmproj_fname,
160
+ const llama_model * text_model,
161
+ const mtmd_context_params & ctx_params) :
162
+ text_model (text_model),
163
+ print_timings(ctx_params.print_timings),
164
+ n_threads (ctx_params.n_threads),
165
+ media_marker (ctx_params.media_marker),
166
+ n_embd_text (llama_model_n_embd_inp(text_model))
167
+ {
168
+ if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
169
+ throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
170
+ }
171
+
172
+ if (media_marker.empty()) {
173
+ throw std::runtime_error("media_marker must not be empty");
174
+ }
175
+
176
+ clip_context_params ctx_clip_params {
177
+ /* use_gpu */ ctx_params.use_gpu,
178
+ /* flash_attn_type */ mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type),
179
+ /* image_min_tokens */ ctx_params.image_min_tokens,
180
+ /* image_max_tokens */ ctx_params.image_max_tokens,
181
+ /* warmup */ ctx_params.warmup,
182
+ /* cb_eval */ ctx_params.cb_eval,
183
+ /* cb_eval_user_data */ ctx_params.cb_eval_user_data,
184
+ };
185
+
186
+ auto res = clip_init(mmproj_fname, ctx_clip_params);
187
+ ctx_v = res.ctx_v;
188
+ ctx_a = res.ctx_a;
189
+ if (!ctx_v && !ctx_a) {
190
+ throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
191
+ }
192
+
193
+ // if both vision and audio mmproj are present, we need to validate their n_embd
194
+ if (ctx_v && ctx_a) {
195
+ int n_embd_v = clip_n_mmproj_embd(ctx_v);
196
+ int n_embd_a = clip_n_mmproj_embd(ctx_a);
197
+ if (n_embd_v != n_embd_a) {
198
+ throw std::runtime_error(string_format(
199
+ "mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
200
+ n_embd_v, n_embd_a));
201
+ }
202
+ }
203
+
204
+ // since we already validate n_embd of vision and audio mmproj,
205
+ // we can safely assume that they are the same
206
+ int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
207
+ if (n_embd_text != n_embd_clip) {
208
+ throw std::runtime_error(string_format(
209
+ "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
210
+ "hint: you may be using wrong mmproj\n",
211
+ n_embd_text, n_embd_clip));
212
+ }
213
+ if (ctx_v) {
214
+ init_vision();
215
+ }
216
+ if (ctx_a) {
217
+ init_audio();
218
+ }
219
+ }
220
+
221
+ void init_vision() {
222
+ GGML_ASSERT(ctx_v != nullptr);
223
+
224
+ projector_type proj = clip_get_projector_type(ctx_v);
225
+ int minicpmv_version = clip_is_minicpmv(ctx_v);
226
+ if (minicpmv_version == 2) {
227
+ // minicpmv 2.5 format:
228
+ // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
229
+ slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
230
+ tok_ov_img_start = {lookup_token("<image>")};
231
+ tok_ov_img_end = {lookup_token("</image>")};
232
+ tok_slices_start = {lookup_token("<slice>")};
233
+ tok_slices_end = {lookup_token("</slice>")};
234
+ tok_sli_img_start = tok_ov_img_start;
235
+ tok_sli_img_end = tok_ov_img_end;
236
+ tok_row_end = {lookup_token("\n")};
237
+ tok_row_end_trail = false; // no trailing end-of-row token
238
+ ov_img_first = true;
239
+
240
+ } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
241
+ // minicpmv 2.6 format:
242
+ // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
243
+ slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
244
+ tok_ov_img_start = {lookup_token("<image>")};
245
+ tok_ov_img_end = {lookup_token("</image>")};
246
+ tok_sli_img_start = {lookup_token("<slice>")};
247
+ tok_sli_img_end = {lookup_token("</slice>")};
248
+ tok_row_end = {lookup_token("\n")};
249
+ tok_row_end_trail = false; // no trailing end-of-row token
250
+ ov_img_first = true;
251
+
252
+ } else if (minicpmv_version != 0) {
253
+ GGML_ASSERT(false && "unsupported minicpmv version");
254
+ } else if (proj == PROJECTOR_TYPE_LLAMA4) {
255
+ // llama 4 format:
256
+ // <|image_start|>
257
+ // (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
258
+ // (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
259
+ // ... <|tile_y_separator|> <-- trailing end-of-row token
260
+ // <|image|> (overview) <-- overview image is last
261
+ // <|image_end|>
262
+ slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
263
+ tok_ov_img_start = {lookup_token("<|image|>")};
264
+ tok_sli_img_mid = {lookup_token("<|tile_x_separator|>")};
265
+ tok_row_end = {lookup_token("<|tile_y_separator|>")};
266
+ tok_row_end_trail = true; // add trailing end-of-row token
267
+ ov_img_first = false; // overview image is last
268
+ }
269
+
270
+ // set boi/eoi
271
+ if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
272
+ // <start_of_image> ... (image embeddings) ... <end_of_image>
273
+ img_beg = "<start_of_image>";
274
+ img_end = "<end_of_image>";
275
+
276
+ } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
277
+ // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
278
+ slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
279
+ tok_ov_img_start = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
280
+ tok_ov_img_end = {lookup_token("<fake_token_around_image>")};
281
+ tok_row_end = {lookup_token("\n")};
282
+ sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
283
+
284
+ } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
285
+ // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
286
+ img_end = "[IMG_END]";
287
+
288
+ } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
289
+ // <|vision_start|> ... (image embeddings) ... <|vision_end|>
290
+ img_beg = "<|vision_start|>";
291
+ img_end = "<|vision_end|>";
292
+
293
+ } else if (proj == PROJECTOR_TYPE_LLAMA4) {
294
+ // (more details in mtmd_context constructor)
295
+ img_beg = "<|image_start|>";
296
+ img_end = "<|image_end|>";
297
+ LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
298
+ " https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
299
+
300
+ } else if (proj == PROJECTOR_TYPE_INTERNVL) {
301
+ // <img> ... (image embeddings) ... </img>
302
+ img_beg = "<img>";
303
+ img_end = "</img>";
304
+
305
+ } else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
306
+ // <|im_start|> ... (image embeddings) ... <|im_end|>
307
+ img_beg = "<|im_start|>";
308
+ img_end = "<|im_end|>";
309
+
310
+ } else if (proj == PROJECTOR_TYPE_LFM2) {
311
+ // multi-tile:
312
+ // <|image_start|>
313
+ // <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
314
+ // <|img_thumbnail|> (thumbnail)
315
+ // <|image_end|>
316
+ // single-tile:
317
+ // <|image_start|> (image) <|image_end|>
318
+ img_beg = "<|image_start|>";
319
+ img_end = "<|image_end|>";
320
+ slice_tmpl = MTMD_SLICE_TMPL_LFM2;
321
+ sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
322
+ tok_ov_img_start = {lookup_token("<|img_thumbnail|>")};
323
+ ov_img_first = false;
324
+ } else if (proj == PROJECTOR_TYPE_GLM4V) {
325
+ img_beg = "<|begin_of_image|>";
326
+ img_end = "<|end_of_image|>";
327
+
328
+ } else if (proj == PROJECTOR_TYPE_PADDLEOCR) {
329
+ // <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
330
+ img_beg = "<|IMAGE_START|>";
331
+ img_end = "<|IMAGE_END|>";
332
+ }
333
+ }
334
+
335
+ void init_audio() {
336
+ GGML_ASSERT(ctx_a != nullptr);
337
+ projector_type proj = clip_get_projector_type(ctx_a);
338
+
339
+ LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
340
+ " https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
341
+
342
+ // set preprocessor
343
+ switch (proj) {
344
+ case PROJECTOR_TYPE_QWEN2A:
345
+ case PROJECTOR_TYPE_QWEN25O:
346
+ case PROJECTOR_TYPE_ULTRAVOX:
347
+ case PROJECTOR_TYPE_VOXTRAL:
348
+ case PROJECTOR_TYPE_GLMA:
349
+ case PROJECTOR_TYPE_MUSIC_FLAMINGO:
350
+ audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
351
+ break;
352
+ case PROJECTOR_TYPE_LFM2A:
353
+ audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
354
+ break;
355
+ default:
356
+ GGML_ABORT("unsupported audio projector type");
357
+ }
358
+
359
+ // initialize audio preprocessor
360
+ audio_preproc->initialize();
361
+
362
+ // set special tokens
363
+ if (proj == PROJECTOR_TYPE_QWEN2A) {
364
+ // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
365
+ aud_beg = "<|audio_bos|>";
366
+ aud_end = "<|audio_eos|>";
367
+
368
+ } else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
369
+ // [BEGIN_AUDIO] ... (embeddings) ...
370
+ aud_beg = "[BEGIN_AUDIO]";
371
+
372
+ } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
373
+ // <sound> ... (embeddings) ...
374
+ aud_beg = "<sound>";
375
+ }
376
+ }
377
+
378
+ // get clip ctx based on chunk type
379
+ clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
380
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
381
+ return ctx_v;
382
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
383
+ return ctx_a;
384
+ }
385
+ GGML_ABORT("unknown chunk type");
386
+ }
387
+
388
+ projector_type proj_type_v() const {
389
+ return ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN;
390
+ }
391
+
392
+ projector_type proj_type_a() const {
393
+ return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
394
+ }
395
+
396
+ ~mtmd_context() {
397
+ clip_free(ctx_a);
398
+ clip_free(ctx_v);
399
+ }
400
+
401
+ private:
402
+ llama_token lookup_token(const std::string & token_text) {
403
+ const llama_vocab * vocab = llama_model_get_vocab(text_model);
404
+ const int n_vocab = llama_vocab_n_tokens(vocab);
405
+ for (int i = 0; i < n_vocab; i++) {
406
+ if (token_to_piece(vocab, i, true) == token_text) {
407
+ return i;
408
+ }
409
+ }
410
+ return LLAMA_TOKEN_NULL;
411
+ }
412
+
413
+ std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
414
+ std::string piece;
415
+ piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
416
+ const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
417
+ if (n_chars < 0) {
418
+ piece.resize(-n_chars);
419
+ int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
420
+ GGML_ASSERT(check == -n_chars);
421
+ } else {
422
+ piece.resize(n_chars);
423
+ }
424
+ return piece;
425
+ }
426
+ };
427
+
428
+ mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
429
+ const struct llama_model * text_model,
430
+ const struct mtmd_context_params ctx_params) {
431
+ try {
432
+ return new mtmd_context(mmproj_fname, text_model, ctx_params);
433
+ } catch (const std::exception & e) {
434
+ LOG_ERR("%s: error: %s\n", __func__, e.what());
435
+ return nullptr;
436
+ }
437
+ }
438
+
439
+ void mtmd_free(mtmd_context * ctx) {
440
+ delete ctx;
441
+ }
442
+
443
+ struct mtmd_tokenizer {
444
+ mtmd_context * ctx;
445
+ std::vector<const mtmd_bitmap *> bitmaps;
446
+
447
+ std::string input_text;
448
+ bool add_special;
449
+ bool parse_special;
450
+ const llama_vocab * vocab;
451
+
452
+ mtmd_input_chunks cur;
453
+
454
+ mtmd_tokenizer(mtmd_context * ctx,
455
+ const mtmd_input_text * text,
456
+ const mtmd_bitmap ** bitmaps,
457
+ size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
458
+ add_special = text->add_special;
459
+ parse_special = text->parse_special;
460
+ input_text = text->text;
461
+ vocab = llama_model_get_vocab(ctx->text_model);
462
+
463
+ // for compatibility, we convert image marker to media marker
464
+ string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
465
+ }
466
+
467
+ int32_t tokenize(mtmd_input_chunks * output) {
468
+ cur.entries.clear();
469
+ std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
470
+ size_t i_bm = 0; // index of the current bitmap
471
+ for (auto & part : parts) {
472
+ if (part == ctx->media_marker) {
473
+ // this is a marker, we should add the next bitmap
474
+ if (i_bm >= bitmaps.size()) {
475
+ LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
476
+ __func__, bitmaps.size(), parts.size() - 1);
477
+ return 1;
478
+ }
479
+ const mtmd_bitmap * bitmap = bitmaps[i_bm++];
480
+ int32_t res = add_media(bitmap);
481
+ if (res != 0) {
482
+ return res;
483
+ }
484
+ } else {
485
+ // this is a text part, we should add it as text
486
+ add_text(part, parse_special);
487
+ }
488
+ }
489
+
490
+ if (add_special && llama_vocab_get_add_bos(vocab)) {
491
+ // if first chunk is text, we add BOS token to first text chunk
492
+ // otherwise, create a new text chunk with BOS token
493
+ if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
494
+ // add BOS token to the beginning of first text chunk
495
+ cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
496
+ } else {
497
+ // create a new text chunk with BOS token at the beginning
498
+ mtmd_input_chunk bos_chunk{
499
+ MTMD_INPUT_CHUNK_TYPE_TEXT,
500
+ {llama_vocab_bos(vocab)},
501
+ nullptr, // image tokens
502
+ nullptr, // audio tokens
503
+ };
504
+ cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
505
+ }
506
+ }
507
+
508
+ if (add_special && llama_vocab_get_add_eos(vocab)) {
509
+ // if last chunk is text, we add EOS token to it
510
+ add_text({llama_vocab_eos(vocab)});
511
+ }
512
+
513
+ if (i_bm != bitmaps.size()) {
514
+ LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
515
+ __func__, bitmaps.size(), parts.size() - 1);
516
+ return 1;
517
+ }
518
+
519
+ *output = std::move(cur);
520
+
521
+ return 0;
522
+ }
523
+
524
+ void add_text(const std::string & txt, bool parse_special) {
525
+ LOG_DBG("%s: %s\n", __func__, txt.c_str());
526
+ auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
527
+ add_text(tokens);
528
+ }
529
+
530
+ void add_text(const std::vector<llama_token> & tokens) {
531
+ if (tokens.empty()) {
532
+ return;
533
+ }
534
+ // if last entry is also a text chunk, add tokens to it instead of creating new chunk
535
+ if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
536
+ cur.entries.back().tokens_text.insert(
537
+ cur.entries.back().tokens_text.end(),
538
+ tokens.begin(),
539
+ tokens.end());
540
+ } else {
541
+ mtmd_input_chunk chunk{
542
+ MTMD_INPUT_CHUNK_TYPE_TEXT,
543
+ tokens,
544
+ nullptr, // image tokens
545
+ nullptr, // audio tokens
546
+ };
547
+ cur.entries.emplace_back(std::move(chunk));
548
+ }
549
+ }
550
+
551
+ int32_t add_media(const mtmd_bitmap * bitmap) {
552
+ if (!bitmap->is_audio) {
553
+ // handle image
554
+
555
+ if (!ctx->ctx_v) {
556
+ LOG_ERR("%s: error: model does not support vision input\n", __func__);
557
+ return 2;
558
+ }
559
+
560
+ if (!ctx->img_beg.empty()) {
561
+ add_text(ctx->img_beg, true); // add image begin token
562
+ }
563
+
564
+ // convert mtmd_bitmap to clip_image_u8
565
+ clip_image_u8_ptr img_u8(clip_image_u8_init());
566
+ img_u8->nx = bitmap->nx;
567
+ img_u8->ny = bitmap->ny;
568
+ img_u8->buf.resize(bitmap->data.size());
569
+ std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
570
+
571
+ // preprocess image
572
+ clip_image_f32_batch batch_f32;
573
+ bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
574
+ if (!ok) {
575
+ LOG_ERR("Unable to preprocess image\n");
576
+ return 2;
577
+ }
578
+
579
+ // handle llava-uhd style preprocessing
580
+ const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
581
+ if (
582
+ ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
583
+ || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
584
+ || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
585
+ || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
586
+ || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
587
+ ) {
588
+ const int n_col = batch_f32.grid_x;
589
+ const int n_row = batch_f32.grid_y;
590
+ // split batch into chunks of single images
591
+ // NOTE: batch_f32 will be invalidated after this call
592
+ auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
593
+ GGML_ASSERT(chunks.size() > 0);
594
+
595
+ auto ov_chunk = std::move(chunks.front());
596
+ chunks.erase(chunks.begin());
597
+
598
+ // add overview image (first)
599
+ if (ctx->ov_img_first) {
600
+ add_text(ctx->tok_ov_img_start);
601
+ cur.entries.emplace_back(std::move(ov_chunk));
602
+ add_text(ctx->tok_ov_img_end);
603
+ }
604
+
605
+ // add slices (or tiles)
606
+ if (!chunks.empty()) {
607
+ GGML_ASSERT((int)chunks.size() == n_row * n_col);
608
+ add_text(ctx->tok_slices_start);
609
+ for (int y = 0; y < n_row; y++) {
610
+ for (int x = 0; x < n_col; x++) {
611
+ const bool is_last_in_row = (x == n_col - 1);
612
+ if (!ctx->tok_sli_img_start.empty()) {
613
+ add_text(ctx->tok_sli_img_start);
614
+ } else if (!ctx->sli_img_start_tmpl.empty()) {
615
+ // If using a template to preceed a slice image
616
+ const size_t sz = std::snprintf(nullptr, 0, ctx->sli_img_start_tmpl.c_str(), y+1, x+1) + 1;
617
+ std::unique_ptr<char[]> buf(new char[sz]);
618
+ std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
619
+ add_text(std::string(buf.get(), buf.get() + sz - 1), true);
620
+ }
621
+ cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
622
+ add_text(ctx->tok_sli_img_end);
623
+ if (!is_last_in_row) {
624
+ add_text(ctx->tok_sli_img_mid);
625
+ }
626
+ }
627
+ if ((y != n_row - 1 || ctx->tok_row_end_trail)) {
628
+ add_text(ctx->tok_row_end);
629
+ }
630
+ }
631
+ add_text(ctx->tok_slices_end);
632
+ }
633
+
634
+ // add overview image (last)
635
+ if (!ctx->ov_img_first) {
636
+ add_text(ctx->tok_ov_img_start);
637
+ cur.entries.emplace_back(std::move(ov_chunk));
638
+ add_text(ctx->tok_ov_img_end);
639
+ }
640
+
641
+ } else {
642
+ size_t n_tokens = 0;
643
+ for (const auto & entry : batch_f32.entries) {
644
+ n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
645
+ }
646
+
647
+ mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
648
+ if (mtmd_decode_use_mrope(ctx)) {
649
+ // for Qwen2VL, we need this information for M-RoPE decoding positions
650
+ image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
651
+ image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
652
+ image_tokens->use_mrope_pos = true;
653
+ } else {
654
+ // other models, we only need the total number of tokens
655
+ image_tokens->nx = n_tokens;
656
+ image_tokens->ny = 1;
657
+ }
658
+ image_tokens->batch_f32 = std::move(batch_f32);
659
+ image_tokens->id = bitmap->id; // optional
660
+
661
+ LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
662
+ LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
663
+ LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
664
+
665
+ mtmd_input_chunk chunk{
666
+ MTMD_INPUT_CHUNK_TYPE_IMAGE,
667
+ {}, // text tokens
668
+ std::move(image_tokens),
669
+ nullptr, // audio tokens
670
+ };
671
+ cur.entries.emplace_back(std::move(chunk));
672
+ }
673
+
674
+ if (!ctx->img_end.empty()) {
675
+ add_text(ctx->img_end, true); // add image end token
676
+ }
677
+
678
+ } else {
679
+ // handle audio
680
+
681
+ if (!ctx->ctx_a) {
682
+ LOG_ERR("%s: error: model does not support audio input\n", __func__);
683
+ return 2;
684
+ }
685
+
686
+ if (bitmap->data.size() == 0) {
687
+ LOG_ERR("%s: error: empty audio data\n", __func__);
688
+ return 2;
689
+ }
690
+
691
+ if (!ctx->aud_beg.empty()) {
692
+ add_text(ctx->aud_beg, true); // add audio begin token
693
+ }
694
+
695
+ // preprocess audio
696
+ std::vector<mtmd_audio_mel> mel_spec_chunks;
697
+ const float * samples = (const float *)bitmap->data.data();
698
+ size_t n_samples = bitmap->data.size() / sizeof(float);
699
+ bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
700
+ if (!ok) {
701
+ LOG_ERR("Unable to preprocess audio\n");
702
+ return 2;
703
+ }
704
+
705
+ // consider each mel_spec as a separate audio chunk
706
+ // TODO: maybe support batching, but this may come with memory cost
707
+ for (auto & mel_spec : mel_spec_chunks) {
708
+ clip_image_f32_ptr mel_f32(clip_image_f32_init());
709
+ mel_f32->nx = mel_spec.n_len;
710
+ mel_f32->ny = mel_spec.n_mel;
711
+ mel_f32->buf = std::move(mel_spec.data);
712
+ size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
713
+
714
+ clip_image_f32_batch batch_f32;
715
+ batch_f32.is_audio = true;
716
+ batch_f32.entries.push_back(std::move(mel_f32));
717
+
718
+ mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
719
+ audio_tokens->n_tokens = n_tokens;
720
+ audio_tokens->batch_f32 = std::move(batch_f32);
721
+ audio_tokens->id = bitmap->id; // optional
722
+
723
+ LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
724
+
725
+ mtmd_input_chunk chunk{
726
+ MTMD_INPUT_CHUNK_TYPE_AUDIO,
727
+ {}, // text tokens
728
+ nullptr, // image tokens
729
+ std::move(audio_tokens),
730
+ };
731
+ cur.entries.emplace_back(std::move(chunk));
732
+ }
733
+
734
+ if (!ctx->aud_end.empty()) {
735
+ add_text(ctx->aud_end, true); // add audio end token
736
+ }
737
+ }
738
+
739
+ return 0;
740
+ }
741
+
742
+ std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
743
+ std::vector<mtmd_input_chunk> chunks;
744
+
745
+ for (auto & entry : batch_f32.entries) {
746
+ mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
747
+ image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
748
+ image_tokens->ny = 1;
749
+ image_tokens->batch_f32.entries.push_back(std::move(entry));
750
+ image_tokens->id = id;
751
+
752
+ mtmd_input_chunk chunk{
753
+ MTMD_INPUT_CHUNK_TYPE_IMAGE,
754
+ {}, // text tokens
755
+ std::move(image_tokens),
756
+ nullptr, // audio tokens
757
+ };
758
+ chunks.emplace_back(std::move(chunk));
759
+ }
760
+
761
+ return chunks;
762
+ }
763
+
764
// Split `input` around every occurrence of `delimiter`. The delimiter itself is kept
// as a separate element and empty pieces are dropped; whitespace is preserved as-is.
// for example: "a <__media__> b <__media__> c" --> "a ", "<__media__>", " b ", "<__media__>", " c"
//
// An empty `delimiter` cannot be searched for meaningfully, so in that case the whole
// input is returned as a single element.
static std::vector<std::string> split_text(const std::string & input, const std::string & delimiter) {
    std::vector<std::string> result;
    if (input.empty()) {
        return result;
    }
    if (delimiter.empty()) {
        // find("") matches at `start` without ever advancing it, which would loop forever
        result.push_back(input);
        return result;
    }
    size_t start = 0;
    size_t pos   = 0;
    while ((pos = input.find(delimiter, start)) != std::string::npos) {
        if (pos > start) {
            // text between the previous delimiter (or the beginning) and this one
            result.push_back(input.substr(start, pos - start));
        }
        result.push_back(delimiter);
        start = pos + delimiter.length();
    }
    if (start < input.length()) {
        // trailing text after the last delimiter
        result.push_back(input.substr(start));
    }
    return result;
}
784
+
785
+ // copied from common_tokenize
786
+ static std::vector<llama_token> mtmd_tokenize_text_internal(
787
+ const struct llama_vocab * vocab,
788
+ const std::string & text,
789
+ bool add_special,
790
+ bool parse_special) {
791
+ // upper limit for the number of tokens
792
+ int n_tokens = text.length() + 2 * add_special;
793
+ std::vector<llama_token> result(n_tokens);
794
+ n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
795
+ if (n_tokens < 0) {
796
+ result.resize(-n_tokens);
797
+ int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
798
+ GGML_ASSERT(check == -n_tokens);
799
+ } else {
800
+ result.resize(n_tokens);
801
+ }
802
+ return result;
803
+ }
804
+ };
805
+
806
+ int32_t mtmd_tokenize(mtmd_context * ctx,
807
+ mtmd_input_chunks * output,
808
+ const mtmd_input_text * text,
809
+ const mtmd_bitmap ** bitmaps,
810
+ size_t n_bitmaps) {
811
+ mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
812
+ return tokenizer.tokenize(output);
813
+ }
814
+
815
+ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
816
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
817
+ LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
818
+ return 0;
819
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
820
+ if (!ctx->ctx_v) {
821
+ LOG_ERR("%s: model does not support vision input\n", __func__);
822
+ return 1;
823
+ }
824
+ return mtmd_encode(ctx, chunk->tokens_image.get());
825
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
826
+ if (!ctx->ctx_a) {
827
+ LOG_ERR("%s: model does not support audio input\n", __func__);
828
+ return 1;
829
+ }
830
+ int n_mmproj_embd = ctx->n_embd_text;
831
+ ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
832
+ bool ok = clip_image_batch_encode(
833
+ ctx->ctx_a,
834
+ ctx->n_threads,
835
+ &chunk->tokens_audio->batch_f32,
836
+ ctx->image_embd_v.data());
837
+ return ok ? 0 : 1;
838
+ }
839
+
840
+ LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
841
+ return 1;
842
+ }
843
+
844
+ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
845
+ clip_ctx * ctx_clip = ctx->ctx_v;
846
+ if (!ctx_clip) {
847
+ LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
848
+ return 1;
849
+ }
850
+ int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
851
+ ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
852
+ bool ok = false;
853
+
854
+ if (clip_is_llava(ctx_clip)
855
+ || clip_is_minicpmv(ctx_clip)
856
+ || clip_is_glm(ctx_clip)) {
857
+ // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
858
+ const auto & entries = image_tokens->batch_f32.entries;
859
+ for (size_t i = 0; i < entries.size(); i++) {
860
+ int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
861
+ ok = clip_image_encode(
862
+ ctx_clip,
863
+ ctx->n_threads,
864
+ entries[i].get(),
865
+ ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
866
+ }
867
+ } else {
868
+ ok = clip_image_batch_encode(
869
+ ctx_clip,
870
+ ctx->n_threads,
871
+ &image_tokens->batch_f32,
872
+ ctx->image_embd_v.data());
873
+ }
874
+
875
+ return ok ? 0 : 1;
876
+ }
877
+
878
+ float * mtmd_get_output_embd(mtmd_context * ctx) {
879
+ return ctx->image_embd_v.data();
880
+ }
881
+
882
+ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
883
+ switch (ctx->proj_type_v()) {
884
+ case PROJECTOR_TYPE_GEMMA3:
885
+ return true;
886
+ default:
887
+ return false;
888
+ }
889
+ }
890
+
891
+ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
892
+ switch (ctx->proj_type_v()) {
893
+ case PROJECTOR_TYPE_QWEN2VL:
894
+ case PROJECTOR_TYPE_QWEN25VL:
895
+ case PROJECTOR_TYPE_QWEN3VL:
896
+ case PROJECTOR_TYPE_GLM4V:
897
+ case PROJECTOR_TYPE_PADDLEOCR:
898
+ return true;
899
+ default:
900
+ return false;
901
+ }
902
+ }
903
+
904
+ bool mtmd_support_vision(mtmd_context * ctx) {
905
+ return ctx->ctx_v != nullptr;
906
+ }
907
+
908
+ bool mtmd_support_audio(mtmd_context * ctx) {
909
+ return ctx->ctx_a != nullptr;
910
+ }
911
+
912
+ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
913
+ if (!ctx->ctx_a) {
914
+ return -1;
915
+ }
916
+ return clip_get_hparams(ctx->ctx_a)->audio_sample_rate;
917
+ }
918
+
919
+ //
920
+ // public API functions
921
+ //
922
+
923
+ // mtmd_bitmap
924
+
925
+ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
926
+ uint32_t ny,
927
+ const unsigned char * data) {
928
+ mtmd_bitmap * bitmap = new mtmd_bitmap;
929
+ bitmap->nx = nx;
930
+ bitmap->ny = ny;
931
+ size_t data_size = (size_t)nx * ny * 3;
932
+ bitmap->data.resize(data_size);
933
+ std::memcpy(bitmap->data.data(), data, data_size);
934
+ return bitmap;
935
+ }
936
+
937
+ mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
938
+ const float * data) {
939
+ mtmd_bitmap * bitmap = new mtmd_bitmap;
940
+ bitmap->nx = n_samples;
941
+ bitmap->ny = 1;
942
+ bitmap->is_audio = true;
943
+ size_t data_size = n_samples * sizeof(float);
944
+ bitmap->data.resize(data_size);
945
+ std::memcpy(bitmap->data.data(), data, data_size);
946
+ return bitmap;
947
+ }
948
+
949
+ uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
950
+ return bitmap->nx;
951
+ }
952
+
953
+ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
954
+ return bitmap->ny;
955
+ }
956
+
957
+ const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
958
+ return bitmap->data.data();
959
+ }
960
+
961
+ size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
962
+ return bitmap->data.size();
963
+ }
964
+
965
+ bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
966
+ return bitmap->is_audio;
967
+ }
968
+
969
+ const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
970
+ return bitmap->id.c_str();
971
+ }
972
+
973
+ void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
974
+ if (id) {
975
+ bitmap->id = std::string(id);
976
+ } else {
977
+ bitmap->id.clear();
978
+ }
979
+ }
980
+
981
+ void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
982
+ if (bitmap) {
983
+ delete bitmap;
984
+ }
985
+ }
986
+
987
+ // mtmd_input_chunks
988
+
989
+ mtmd_input_chunks * mtmd_input_chunks_init() {
990
+ return new mtmd_input_chunks;
991
+ }
992
+
993
+ size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks) {
994
+ return chunks->entries.size();
995
+ }
996
+
997
+ const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx) {
998
+ if (idx >= chunks->entries.size()) {
999
+ return nullptr;
1000
+ }
1001
+ return &chunks->entries[idx];
1002
+ }
1003
+
1004
+ void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
1005
+ if (chunks) {
1006
+ delete chunks;
1007
+ }
1008
+ }
1009
+
1010
+ // mtmd_input_chunk
1011
+
1012
+ enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk) {
1013
+ return chunk->type;
1014
+ }
1015
+
1016
+ const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output) {
1017
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
1018
+ *n_tokens_output = chunk->tokens_text.size();
1019
+ return chunk->tokens_text.data();
1020
+ }
1021
+ *n_tokens_output = 0;
1022
+ return nullptr;
1023
+ }
1024
+
1025
+ const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk) {
1026
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
1027
+ return chunk->tokens_image.get();
1028
+ }
1029
+ return nullptr;
1030
+ }
1031
+
1032
+ size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk) {
1033
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
1034
+ return chunk->tokens_text.size();
1035
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
1036
+ return mtmd_image_tokens_get_n_tokens(chunk->tokens_image.get());
1037
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
1038
+ return chunk->tokens_audio->n_tokens;
1039
+ } else {
1040
+ GGML_ABORT("invalid chunk type");
1041
+ }
1042
+ }
1043
+
1044
+ llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
1045
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
1046
+ return chunk->tokens_text.size();
1047
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
1048
+ return mtmd_image_tokens_get_n_pos(chunk->tokens_image.get());
1049
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
1050
+ return chunk->tokens_audio->n_tokens;
1051
+ } else {
1052
+ GGML_ABORT("invalid chunk type");
1053
+ }
1054
+ }
1055
+
1056
+ const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
1057
+ if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
1058
+ return chunk->tokens_image->id.c_str();
1059
+ } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
1060
+ return chunk->tokens_audio->id.c_str();
1061
+ }
1062
+ return nullptr;
1063
+ }
1064
+
1065
+ mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
1066
+ mtmd_input_chunk * copy = new mtmd_input_chunk{
1067
+ chunk->type,
1068
+ chunk->tokens_text,
1069
+ nullptr,
1070
+ nullptr,
1071
+ };
1072
+ if (chunk->tokens_image) {
1073
+ // copy the image tokens
1074
+ copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens());
1075
+ *copy->tokens_image = chunk->tokens_image->clone();
1076
+ }
1077
+ if (chunk->tokens_audio) {
1078
+ // copy the audio tokens
1079
+ copy->tokens_audio = mtmd_audio_tokens_ptr(new mtmd_audio_tokens());
1080
+ *copy->tokens_audio = chunk->tokens_audio->clone();
1081
+ }
1082
+ return copy;
1083
+ }
1084
+
1085
+ void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
1086
+ if (chunk) {
1087
+ delete chunk;
1088
+ }
1089
+ }
1090
+
1091
+ // mtmd_image_tokens
1092
+
1093
+ size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
1094
+ return image_tokens->n_tokens();
1095
+ }
1096
+
1097
+ size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
1098
+ return image_tokens->nx;
1099
+ }
1100
+
1101
+ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
1102
+ return image_tokens->ny;
1103
+ }
1104
+
1105
+ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
1106
+ return image_tokens->id.c_str();
1107
+ }
1108
+
1109
+ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
1110
+ if (image_tokens->use_mrope_pos) {
1111
+ // for M-RoPE, temporal dimension = max(t,h,w)
1112
+ // t is omitted as we don't support video input
1113
+ return std::max(image_tokens->nx, image_tokens->ny);
1114
+ }
1115
+ return image_tokens->n_tokens();
1116
+ }
1117
+
1118
+ // test function
1119
+
1120
+ mtmd_input_chunks * mtmd_test_create_input_chunks() {
1121
+ mtmd_input_chunks * chunks = mtmd_input_chunks_init();
1122
+ if (!chunks) {
1123
+ return nullptr;
1124
+ }
1125
+
1126
+ // create a text chunk
1127
+ std::vector<llama_token> tokens_text = { 1, 2, 3, 4, 5 };
1128
+ mtmd_input_chunk chunk_text{
1129
+ MTMD_INPUT_CHUNK_TYPE_TEXT,
1130
+ std::move(tokens_text),
1131
+ nullptr, // image tokens
1132
+ nullptr, // audio tokens
1133
+ };
1134
+ chunks->entries.emplace_back(std::move(chunk_text));
1135
+
1136
+ // create an image chunk
1137
+ mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
1138
+ image_tokens->nx = 4;
1139
+ image_tokens->ny = 4;
1140
+ image_tokens->batch_f32.entries.resize(16);
1141
+ image_tokens->id = "image_1";
1142
+ mtmd_input_chunk chunk_image{
1143
+ MTMD_INPUT_CHUNK_TYPE_IMAGE,
1144
+ {}, // text tokens
1145
+ std::move(image_tokens),
1146
+ nullptr, // audio tokens
1147
+ };
1148
+ chunks->entries.emplace_back(std::move(chunk_image));
1149
+
1150
+ return chunks;
1151
+ }
1152
+
1153
+ void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
1154
+ g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default;
1155
+ g_logger_state.log_callback_user_data = user_data;
1156
+ }