local-llm-rn 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (626) hide show
  1. package/cpp/CMakeLists.txt +285 -0
  2. package/cpp/common/CMakeLists.txt +149 -0
  3. package/cpp/common/arg.cpp +3799 -0
  4. package/cpp/common/arg.h +131 -0
  5. package/cpp/common/base64.hpp +392 -0
  6. package/cpp/common/build-info.cpp.in +4 -0
  7. package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  8. package/cpp/common/chat-parser-xml-toolcall.h +45 -0
  9. package/cpp/common/chat-parser.cpp +1649 -0
  10. package/cpp/common/chat-parser.h +133 -0
  11. package/cpp/common/chat-peg-parser.cpp +124 -0
  12. package/cpp/common/chat-peg-parser.h +105 -0
  13. package/cpp/common/chat.cpp +3355 -0
  14. package/cpp/common/chat.h +252 -0
  15. package/cpp/common/common.cpp +1824 -0
  16. package/cpp/common/common.h +930 -0
  17. package/cpp/common/console.cpp +1137 -0
  18. package/cpp/common/console.h +41 -0
  19. package/cpp/common/debug.cpp +167 -0
  20. package/cpp/common/debug.h +43 -0
  21. package/cpp/common/download.cpp +792 -0
  22. package/cpp/common/download.h +84 -0
  23. package/cpp/common/http.h +84 -0
  24. package/cpp/common/jinja/README.md +88 -0
  25. package/cpp/common/jinja/caps.cpp +285 -0
  26. package/cpp/common/jinja/caps.h +30 -0
  27. package/cpp/common/jinja/lexer.cpp +341 -0
  28. package/cpp/common/jinja/lexer.h +157 -0
  29. package/cpp/common/jinja/parser.cpp +591 -0
  30. package/cpp/common/jinja/parser.h +21 -0
  31. package/cpp/common/jinja/runtime.cpp +867 -0
  32. package/cpp/common/jinja/runtime.h +638 -0
  33. package/cpp/common/jinja/string.cpp +213 -0
  34. package/cpp/common/jinja/string.h +61 -0
  35. package/cpp/common/jinja/utils.h +149 -0
  36. package/cpp/common/jinja/value.cpp +1393 -0
  37. package/cpp/common/jinja/value.h +756 -0
  38. package/cpp/common/json-partial.cpp +324 -0
  39. package/cpp/common/json-partial.h +39 -0
  40. package/cpp/common/json-schema-to-grammar.cpp +1153 -0
  41. package/cpp/common/json-schema-to-grammar.h +43 -0
  42. package/cpp/common/llguidance.cpp +258 -0
  43. package/cpp/common/log.cpp +446 -0
  44. package/cpp/common/log.h +119 -0
  45. package/cpp/common/ngram-cache.cpp +285 -0
  46. package/cpp/common/ngram-cache.h +101 -0
  47. package/cpp/common/ngram-map.cpp +530 -0
  48. package/cpp/common/ngram-map.h +115 -0
  49. package/cpp/common/ngram-mod.cpp +60 -0
  50. package/cpp/common/ngram-mod.h +38 -0
  51. package/cpp/common/peg-parser.cpp +1712 -0
  52. package/cpp/common/peg-parser.h +459 -0
  53. package/cpp/common/preset.cpp +483 -0
  54. package/cpp/common/preset.h +83 -0
  55. package/cpp/common/regex-partial.cpp +204 -0
  56. package/cpp/common/regex-partial.h +56 -0
  57. package/cpp/common/sampling.cpp +745 -0
  58. package/cpp/common/sampling.h +119 -0
  59. package/cpp/common/speculative.cpp +1074 -0
  60. package/cpp/common/speculative.h +41 -0
  61. package/cpp/common/unicode.cpp +64 -0
  62. package/cpp/common/unicode.h +22 -0
  63. package/cpp/ggml/CMakeLists.txt +494 -0
  64. package/cpp/ggml/cmake/GitVars.cmake +22 -0
  65. package/cpp/ggml/cmake/common.cmake +50 -0
  66. package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
  67. package/cpp/ggml/include/ggml-alloc.h +85 -0
  68. package/cpp/ggml/include/ggml-backend.h +373 -0
  69. package/cpp/ggml/include/ggml-blas.h +25 -0
  70. package/cpp/ggml/include/ggml-cann.h +123 -0
  71. package/cpp/ggml/include/ggml-cpp.h +39 -0
  72. package/cpp/ggml/include/ggml-cpu.h +151 -0
  73. package/cpp/ggml/include/ggml-cuda.h +47 -0
  74. package/cpp/ggml/include/ggml-hexagon.h +19 -0
  75. package/cpp/ggml/include/ggml-metal.h +61 -0
  76. package/cpp/ggml/include/ggml-opencl.h +26 -0
  77. package/cpp/ggml/include/ggml-opt.h +256 -0
  78. package/cpp/ggml/include/ggml-rpc.h +30 -0
  79. package/cpp/ggml/include/ggml-sycl.h +49 -0
  80. package/cpp/ggml/include/ggml-virtgpu.h +14 -0
  81. package/cpp/ggml/include/ggml-vulkan.h +29 -0
  82. package/cpp/ggml/include/ggml-webgpu.h +19 -0
  83. package/cpp/ggml/include/ggml-zdnn.h +17 -0
  84. package/cpp/ggml/include/ggml-zendnn.h +22 -0
  85. package/cpp/ggml/include/ggml.h +2753 -0
  86. package/cpp/ggml/include/gguf.h +204 -0
  87. package/cpp/ggml/src/CMakeLists.txt +492 -0
  88. package/cpp/ggml/src/ggml-alloc.c +1244 -0
  89. package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
  90. package/cpp/ggml/src/ggml-backend-dl.h +45 -0
  91. package/cpp/ggml/src/ggml-backend-impl.h +255 -0
  92. package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
  93. package/cpp/ggml/src/ggml-backend.cpp +2270 -0
  94. package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  95. package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
  96. package/cpp/ggml/src/ggml-common.h +1878 -0
  97. package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
  98. package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
  99. package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  100. package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  101. package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  102. package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  103. package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  104. package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
  105. package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
  106. package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
  107. package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  108. package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  109. package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  110. package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
  111. package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  112. package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  113. package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  114. package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  115. package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  116. package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  117. package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  118. package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
  119. package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  120. package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  121. package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  122. package/cpp/ggml/src/ggml-cpu/common.h +95 -0
  123. package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
  124. package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
  125. package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
  126. package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  127. package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  128. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
  129. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  130. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
  131. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  132. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
  133. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  134. package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
  135. package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
  136. package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
  137. package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
  138. package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
  139. package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
  140. package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  141. package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
  142. package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  143. package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  144. package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  145. package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  146. package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  147. package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
  148. package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  149. package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
  150. package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
  151. package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
  152. package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  153. package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
  154. package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
  155. package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
  156. package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  157. package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
  158. package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  159. package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
  160. package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
  161. package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
  162. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  163. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  164. package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  165. package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  166. package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  167. package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  168. package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
  169. package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
  170. package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  171. package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
  172. package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
  173. package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  174. package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
  175. package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  176. package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  177. package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  178. package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
  179. package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
  180. package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  181. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  182. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  183. package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  184. package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
  185. package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
  186. package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
  187. package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
  188. package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
  189. package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
  190. package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
  191. package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
  192. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  193. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  194. package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  195. package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  196. package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
  197. package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  198. package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  199. package/cpp/ggml/src/ggml-impl.h +724 -0
  200. package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  201. package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  202. package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  203. package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  204. package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
  205. package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
  206. package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
  207. package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
  208. package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
  209. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
  210. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  211. package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
  212. package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
  213. package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
  214. package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  215. package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  216. package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
  217. package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
  218. package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  219. package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  220. package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  221. package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  222. package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  223. package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  224. package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  225. package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  226. package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
  227. package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  228. package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  229. package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  230. package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  231. package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  232. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  233. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  234. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  235. package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  236. package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  237. package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  238. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  239. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  240. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  241. package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  242. package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  243. package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  244. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  245. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  246. package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  247. package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  248. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  249. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  250. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  251. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  252. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  253. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  254. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  255. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  256. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  257. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  258. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  259. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  260. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  261. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  262. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  263. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  264. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  265. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  266. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  267. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  268. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  269. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  270. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  271. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  272. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  273. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  274. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  275. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  276. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  277. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  278. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  279. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  280. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  281. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  282. package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  283. package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  284. package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  285. package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  286. package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  287. package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  288. package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  289. package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  290. package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  291. package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  292. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  293. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  294. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  295. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  296. package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  297. package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  298. package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  299. package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  300. package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  301. package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  302. package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  303. package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  304. package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
  305. package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  306. package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  307. package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  308. package/cpp/ggml/src/ggml-opt.cpp +1093 -0
  309. package/cpp/ggml/src/ggml-quants.c +5325 -0
  310. package/cpp/ggml/src/ggml-quants.h +106 -0
  311. package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  312. package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
  313. package/cpp/ggml/src/ggml-threading.cpp +12 -0
  314. package/cpp/ggml/src/ggml-threading.h +14 -0
  315. package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  316. package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  317. package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  318. package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  319. package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  320. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  321. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  322. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  323. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  324. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  325. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  326. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  327. package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  328. package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  329. package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  330. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  331. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  332. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  333. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  334. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  335. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  336. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  337. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  338. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  339. package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  340. package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  341. package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  342. package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  343. package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  344. package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  345. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  346. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  347. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  348. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  349. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  350. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  351. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  352. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  353. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  354. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  355. package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  356. package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  357. package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  358. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
  359. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
  360. package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  361. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  362. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  363. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  364. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
  365. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
  366. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  367. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  368. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
  369. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  370. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
  371. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  372. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  373. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
  374. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
  375. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
  376. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
  377. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
  378. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  379. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  380. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  381. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  382. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  383. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  384. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  385. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  386. package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  387. package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  388. package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
  389. package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  390. package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  391. package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  392. package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  393. package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  394. package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  395. package/cpp/ggml/src/ggml.c +7669 -0
  396. package/cpp/ggml/src/ggml.cpp +26 -0
  397. package/cpp/ggml/src/gguf.cpp +1699 -0
  398. package/cpp/include/llama-cpp.h +32 -0
  399. package/cpp/include/llama.h +1568 -0
  400. package/cpp/mtmd/CMakeLists.txt +98 -0
  401. package/cpp/mtmd/README.md +63 -0
  402. package/cpp/mtmd/clip-graph.h +117 -0
  403. package/cpp/mtmd/clip-impl.h +586 -0
  404. package/cpp/mtmd/clip-model.h +390 -0
  405. package/cpp/mtmd/clip.cpp +4154 -0
  406. package/cpp/mtmd/clip.h +121 -0
  407. package/cpp/mtmd/deprecation-warning.cpp +22 -0
  408. package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
  409. package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
  410. package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
  411. package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
  412. package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
  413. package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
  414. package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
  415. package/cpp/mtmd/models/cogvlm.cpp +98 -0
  416. package/cpp/mtmd/models/conformer.cpp +216 -0
  417. package/cpp/mtmd/models/glm4v.cpp +122 -0
  418. package/cpp/mtmd/models/internvl.cpp +69 -0
  419. package/cpp/mtmd/models/kimik25.cpp +101 -0
  420. package/cpp/mtmd/models/kimivl.cpp +63 -0
  421. package/cpp/mtmd/models/llama4.cpp +96 -0
  422. package/cpp/mtmd/models/llava.cpp +374 -0
  423. package/cpp/mtmd/models/minicpmv.cpp +114 -0
  424. package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
  425. package/cpp/mtmd/models/models.h +128 -0
  426. package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
  427. package/cpp/mtmd/models/paddleocr.cpp +52 -0
  428. package/cpp/mtmd/models/pixtral.cpp +86 -0
  429. package/cpp/mtmd/models/qwen2vl.cpp +183 -0
  430. package/cpp/mtmd/models/qwen3vl.cpp +193 -0
  431. package/cpp/mtmd/models/siglip.cpp +86 -0
  432. package/cpp/mtmd/models/whisper-enc.cpp +115 -0
  433. package/cpp/mtmd/models/youtuvl.cpp +179 -0
  434. package/cpp/mtmd/mtmd-audio.cpp +730 -0
  435. package/cpp/mtmd/mtmd-audio.h +113 -0
  436. package/cpp/mtmd/mtmd-cli.cpp +437 -0
  437. package/cpp/mtmd/mtmd-helper.cpp +521 -0
  438. package/cpp/mtmd/mtmd-helper.h +96 -0
  439. package/cpp/mtmd/mtmd.cpp +1156 -0
  440. package/cpp/mtmd/mtmd.h +319 -0
  441. package/cpp/mtmd/requirements.txt +5 -0
  442. package/cpp/mtmd/test-1.jpeg +0 -0
  443. package/cpp/mtmd/test-2.mp3 +0 -0
  444. package/cpp/mtmd/tests.sh +192 -0
  445. package/cpp/src/CMakeLists.txt +169 -0
  446. package/cpp/src/llama-adapter.cpp +488 -0
  447. package/cpp/src/llama-adapter.h +89 -0
  448. package/cpp/src/llama-arch.cpp +2855 -0
  449. package/cpp/src/llama-arch.h +619 -0
  450. package/cpp/src/llama-batch.cpp +917 -0
  451. package/cpp/src/llama-batch.h +173 -0
  452. package/cpp/src/llama-chat.cpp +896 -0
  453. package/cpp/src/llama-chat.h +71 -0
  454. package/cpp/src/llama-context.cpp +3512 -0
  455. package/cpp/src/llama-context.h +359 -0
  456. package/cpp/src/llama-cparams.cpp +5 -0
  457. package/cpp/src/llama-cparams.h +44 -0
  458. package/cpp/src/llama-grammar.cpp +1464 -0
  459. package/cpp/src/llama-grammar.h +194 -0
  460. package/cpp/src/llama-graph.cpp +2685 -0
  461. package/cpp/src/llama-graph.h +1026 -0
  462. package/cpp/src/llama-hparams.cpp +234 -0
  463. package/cpp/src/llama-hparams.h +339 -0
  464. package/cpp/src/llama-impl.cpp +171 -0
  465. package/cpp/src/llama-impl.h +73 -0
  466. package/cpp/src/llama-io.cpp +15 -0
  467. package/cpp/src/llama-io.h +35 -0
  468. package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
  469. package/cpp/src/llama-kv-cache-iswa.h +137 -0
  470. package/cpp/src/llama-kv-cache.cpp +2271 -0
  471. package/cpp/src/llama-kv-cache.h +388 -0
  472. package/cpp/src/llama-kv-cells.h +533 -0
  473. package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
  474. package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
  475. package/cpp/src/llama-memory-hybrid.cpp +268 -0
  476. package/cpp/src/llama-memory-hybrid.h +139 -0
  477. package/cpp/src/llama-memory-recurrent.cpp +1165 -0
  478. package/cpp/src/llama-memory-recurrent.h +182 -0
  479. package/cpp/src/llama-memory.cpp +59 -0
  480. package/cpp/src/llama-memory.h +122 -0
  481. package/cpp/src/llama-mmap.cpp +785 -0
  482. package/cpp/src/llama-mmap.h +92 -0
  483. package/cpp/src/llama-model-loader.cpp +1414 -0
  484. package/cpp/src/llama-model-loader.h +203 -0
  485. package/cpp/src/llama-model-saver.cpp +286 -0
  486. package/cpp/src/llama-model-saver.h +37 -0
  487. package/cpp/src/llama-model.cpp +9253 -0
  488. package/cpp/src/llama-model.h +576 -0
  489. package/cpp/src/llama-quant.cpp +1119 -0
  490. package/cpp/src/llama-quant.h +1 -0
  491. package/cpp/src/llama-sampler.cpp +3885 -0
  492. package/cpp/src/llama-sampler.h +42 -0
  493. package/cpp/src/llama-vocab.cpp +3970 -0
  494. package/cpp/src/llama-vocab.h +187 -0
  495. package/cpp/src/llama.cpp +1313 -0
  496. package/cpp/src/models/afmoe.cpp +191 -0
  497. package/cpp/src/models/apertus.cpp +125 -0
  498. package/cpp/src/models/arcee.cpp +135 -0
  499. package/cpp/src/models/arctic.cpp +138 -0
  500. package/cpp/src/models/arwkv7.cpp +86 -0
  501. package/cpp/src/models/baichuan.cpp +122 -0
  502. package/cpp/src/models/bailingmoe.cpp +144 -0
  503. package/cpp/src/models/bailingmoe2.cpp +135 -0
  504. package/cpp/src/models/bert.cpp +178 -0
  505. package/cpp/src/models/bitnet.cpp +160 -0
  506. package/cpp/src/models/bloom.cpp +101 -0
  507. package/cpp/src/models/chameleon.cpp +178 -0
  508. package/cpp/src/models/chatglm.cpp +132 -0
  509. package/cpp/src/models/codeshell.cpp +111 -0
  510. package/cpp/src/models/cogvlm.cpp +102 -0
  511. package/cpp/src/models/cohere2-iswa.cpp +134 -0
  512. package/cpp/src/models/command-r.cpp +122 -0
  513. package/cpp/src/models/dbrx.cpp +123 -0
  514. package/cpp/src/models/deci.cpp +135 -0
  515. package/cpp/src/models/deepseek.cpp +144 -0
  516. package/cpp/src/models/deepseek2.cpp +262 -0
  517. package/cpp/src/models/delta-net-base.cpp +376 -0
  518. package/cpp/src/models/dots1.cpp +134 -0
  519. package/cpp/src/models/dream.cpp +105 -0
  520. package/cpp/src/models/ernie4-5-moe.cpp +150 -0
  521. package/cpp/src/models/ernie4-5.cpp +110 -0
  522. package/cpp/src/models/eurobert.cpp +97 -0
  523. package/cpp/src/models/exaone-moe.cpp +146 -0
  524. package/cpp/src/models/exaone.cpp +114 -0
  525. package/cpp/src/models/exaone4.cpp +123 -0
  526. package/cpp/src/models/falcon-h1.cpp +111 -0
  527. package/cpp/src/models/falcon.cpp +120 -0
  528. package/cpp/src/models/gemma-embedding.cpp +116 -0
  529. package/cpp/src/models/gemma.cpp +112 -0
  530. package/cpp/src/models/gemma2-iswa.cpp +128 -0
  531. package/cpp/src/models/gemma3.cpp +155 -0
  532. package/cpp/src/models/gemma3n-iswa.cpp +384 -0
  533. package/cpp/src/models/glm4-moe.cpp +170 -0
  534. package/cpp/src/models/glm4.cpp +157 -0
  535. package/cpp/src/models/gpt2.cpp +105 -0
  536. package/cpp/src/models/gptneox.cpp +144 -0
  537. package/cpp/src/models/granite-hybrid.cpp +196 -0
  538. package/cpp/src/models/granite.cpp +211 -0
  539. package/cpp/src/models/grok.cpp +159 -0
  540. package/cpp/src/models/grovemoe.cpp +141 -0
  541. package/cpp/src/models/hunyuan-dense.cpp +132 -0
  542. package/cpp/src/models/hunyuan-moe.cpp +154 -0
  543. package/cpp/src/models/internlm2.cpp +120 -0
  544. package/cpp/src/models/jais.cpp +86 -0
  545. package/cpp/src/models/jais2.cpp +123 -0
  546. package/cpp/src/models/jamba.cpp +106 -0
  547. package/cpp/src/models/kimi-linear.cpp +392 -0
  548. package/cpp/src/models/lfm2.cpp +190 -0
  549. package/cpp/src/models/llada-moe.cpp +122 -0
  550. package/cpp/src/models/llada.cpp +99 -0
  551. package/cpp/src/models/llama-iswa.cpp +178 -0
  552. package/cpp/src/models/llama.cpp +168 -0
  553. package/cpp/src/models/maincoder.cpp +117 -0
  554. package/cpp/src/models/mamba-base.cpp +285 -0
  555. package/cpp/src/models/mamba.cpp +54 -0
  556. package/cpp/src/models/mimo2-iswa.cpp +123 -0
  557. package/cpp/src/models/minicpm3.cpp +200 -0
  558. package/cpp/src/models/minimax-m2.cpp +124 -0
  559. package/cpp/src/models/mistral3.cpp +160 -0
  560. package/cpp/src/models/models.h +684 -0
  561. package/cpp/src/models/modern-bert.cpp +109 -0
  562. package/cpp/src/models/mpt.cpp +126 -0
  563. package/cpp/src/models/nemotron-h.cpp +148 -0
  564. package/cpp/src/models/nemotron.cpp +122 -0
  565. package/cpp/src/models/neo-bert.cpp +104 -0
  566. package/cpp/src/models/olmo.cpp +121 -0
  567. package/cpp/src/models/olmo2.cpp +150 -0
  568. package/cpp/src/models/olmoe.cpp +124 -0
  569. package/cpp/src/models/openai-moe-iswa.cpp +127 -0
  570. package/cpp/src/models/openelm.cpp +124 -0
  571. package/cpp/src/models/orion.cpp +123 -0
  572. package/cpp/src/models/paddleocr.cpp +122 -0
  573. package/cpp/src/models/pangu-embedded.cpp +121 -0
  574. package/cpp/src/models/phi2.cpp +121 -0
  575. package/cpp/src/models/phi3.cpp +152 -0
  576. package/cpp/src/models/plamo.cpp +110 -0
  577. package/cpp/src/models/plamo2.cpp +318 -0
  578. package/cpp/src/models/plamo3.cpp +128 -0
  579. package/cpp/src/models/plm.cpp +169 -0
  580. package/cpp/src/models/qwen.cpp +108 -0
  581. package/cpp/src/models/qwen2.cpp +126 -0
  582. package/cpp/src/models/qwen2moe.cpp +151 -0
  583. package/cpp/src/models/qwen2vl.cpp +117 -0
  584. package/cpp/src/models/qwen3.cpp +117 -0
  585. package/cpp/src/models/qwen35.cpp +386 -0
  586. package/cpp/src/models/qwen35moe.cpp +420 -0
  587. package/cpp/src/models/qwen3moe.cpp +124 -0
  588. package/cpp/src/models/qwen3next.cpp +525 -0
  589. package/cpp/src/models/qwen3vl-moe.cpp +140 -0
  590. package/cpp/src/models/qwen3vl.cpp +132 -0
  591. package/cpp/src/models/refact.cpp +94 -0
  592. package/cpp/src/models/rnd1.cpp +126 -0
  593. package/cpp/src/models/rwkv6-base.cpp +164 -0
  594. package/cpp/src/models/rwkv6.cpp +94 -0
  595. package/cpp/src/models/rwkv6qwen2.cpp +86 -0
  596. package/cpp/src/models/rwkv7-base.cpp +137 -0
  597. package/cpp/src/models/rwkv7.cpp +90 -0
  598. package/cpp/src/models/seed-oss.cpp +124 -0
  599. package/cpp/src/models/smallthinker.cpp +126 -0
  600. package/cpp/src/models/smollm3.cpp +128 -0
  601. package/cpp/src/models/stablelm.cpp +146 -0
  602. package/cpp/src/models/starcoder.cpp +100 -0
  603. package/cpp/src/models/starcoder2.cpp +121 -0
  604. package/cpp/src/models/step35-iswa.cpp +168 -0
  605. package/cpp/src/models/t5-dec.cpp +166 -0
  606. package/cpp/src/models/t5-enc.cpp +96 -0
  607. package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
  608. package/cpp/src/models/xverse.cpp +108 -0
  609. package/cpp/src/unicode-data.cpp +7034 -0
  610. package/cpp/src/unicode-data.h +20 -0
  611. package/cpp/src/unicode.cpp +1103 -0
  612. package/cpp/src/unicode.h +111 -0
  613. package/cpp/vendor/nlohmann/json.hpp +25526 -0
  614. package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  615. package/cpp/vendor/stb/stb_image.h +7988 -0
  616. package/ios/LocalLLM-Bridging-Header.h +2 -0
  617. package/ios/LocalLLM.h +5 -0
  618. package/ios/LocalLLM.mm +1267 -0
  619. package/local-llm-rn.podspec +60 -0
  620. package/package.json +35 -0
  621. package/src/NativeLocalLLM.ts +73 -0
  622. package/src/device.ts +50 -0
  623. package/src/download-adapter.ts +17 -0
  624. package/src/index.ts +21 -0
  625. package/src/native-bridge.ts +142 -0
  626. package/src/rn-downloader.ts +37 -0
@@ -0,0 +1,1267 @@
1
+ #import "LocalLLM.h"
2
+
3
+ #import <React/RCTBridge.h>
4
+ #import <React/RCTLog.h>
5
+
6
+ #include <string>
7
+ #include <unordered_map>
8
+ #include <mutex>
9
+ #include <set>
10
+ #include <vector>
11
+ #include <cstdint>
12
+ #include <cmath>
13
+ #include <atomic>
14
+
15
+ #include "llama.h"
16
+ #include "common.h"
17
+ #include "json-schema-to-grammar.h"
18
+ #include "mtmd.h"
19
+ #include "mtmd-helper.h"
20
+
21
+ #import <Metal/Metal.h>
22
+ #import <os/proc.h>
23
+
24
+ // ── UUID generation ──────────────────────────────────────────────────────────
25
+
26
/// Returns a newly generated UUID in its string form; used as the opaque id
/// handed to JS for models, contexts and projector contexts.
static NSString *generateUUID() {
    NSUUID *uuid = [NSUUID UUID];
    return uuid.UUIDString;
}
29
+
30
+ // ── Handle maps ──────────────────────────────────────────────────────────────
31
+
32
// Guards the id -> pointer maps and the live-handle set declared below.
// It is a plain std::mutex (not recursive).
static std::mutex g_mutex;

// Native objects keyed by the UUID string handed out to JS.
static std::unordered_map<std::string, llama_model *> g_models;
static std::unordered_map<std::string, llama_context *> g_contexts;
static std::unordered_map<std::string, mtmd_context *> g_mtmd_contexts;

// Track live handles to prevent double-free
static std::set<void *> g_live_handles;

// Marks `ptr` as a live handle. Acquires g_mutex itself, so the caller must
// not already be holding it.
static void register_handle(void *ptr) {
    std::unique_lock<std::mutex> guard(g_mutex);
    g_live_handles.emplace(ptr);
}

// Removes `ptr` from the live set and reports whether it was present (i.e.
// whether the caller is the one who should free it). Acquires g_mutex itself,
// so the caller must not already be holding it.
static bool unregister_handle(void *ptr) {
    std::unique_lock<std::mutex> guard(g_mutex);
    const auto removed = g_live_handles.erase(ptr);
    return removed != 0;
}
50
+
51
+ // ── Stream cancellation ──────────────────────────────────────────────────────
52
+
53
// Context ids for which a stop has been requested; guarded by g_cancel_mutex.
static std::mutex g_cancel_mutex;
static std::set<std::string> g_cancel_set;

// Flags the stream running on `ctxId` for cancellation.
static void request_cancel(const std::string &ctxId) {
    std::unique_lock<std::mutex> guard(g_cancel_mutex);
    g_cancel_set.emplace(ctxId);
}

// True while a cancellation request for `ctxId` is pending.
static bool is_cancelled(const std::string &ctxId) {
    std::unique_lock<std::mutex> guard(g_cancel_mutex);
    return g_cancel_set.find(ctxId) != g_cancel_set.end();
}

// Drops any pending cancellation request for `ctxId`.
static void clear_cancel(const std::string &ctxId) {
    std::unique_lock<std::mutex> guard(g_cancel_mutex);
    g_cancel_set.erase(ctxId);
}
70
+
71
+ // ── Log state (static so the C callback can access without capturing `self`) ─
72
+
73
// Minimum level forwarded by the log callback; default: info (GGML_LOG_LEVEL_INFO).
static std::atomic<int> g_log_min_level(2);
// Whether log lines are forwarded to JS as events.
static std::atomic<bool> g_log_events_enabled(false);
// Weak back-reference to the module so the C callback can reach it without
// keeping it alive. Statics are zero-initialised, i.e. this starts out nil.
static __weak LocalLLM *g_log_module;
76
+
77
+ // ── Sampler creation ─────────────────────────────────────────────────────────
78
+
79
// Generation settings parsed from the JS `options` dictionary. The
// initialisers are the defaults used when a key is absent.
struct SamplerParams {
    int32_t     max_tokens{256};         // upper bound on generated tokens
    float       temperature{0.7f};       // <= 0 selects greedy sampling
    float       top_p{0.95f};
    int32_t     top_k{40};
    float       repeat_penalty{1.1f};
    float       frequency_penalty{0.0f};
    float       presence_penalty{0.0f};
    int32_t     seed{-1};
    std::string grammar;                 // grammar text; empty = unconstrained
    std::string grammar_root;            // start rule; empty = "root"
    int32_t     n_past{0};               // index of the first prompt token to evaluate
};
92
+
93
// Builds SamplerParams from the JS `options` dictionary.
//
// Keys that are absent keep the SamplerParams defaults. Values of an
// unexpected type are ignored instead of being messaged blindly: a JS `null`
// arrives as NSNull, which is non-nil but responds to neither -intValue nor
// -UTF8String, so the unchecked form raised "unrecognized selector".
static SamplerParams parse_sampler_params(NSDictionary *options) {
    SamplerParams p;

    // Integer-valued option: accepts anything that answers -intValue
    // (NSNumber, NSString), matching what the unchecked code accepted.
    auto read_int = [options](NSString *key, int32_t &dst) {
        id value = options[key];
        if ([value respondsToSelector:@selector(intValue)]) dst = [value intValue];
    };
    // Float-valued option, same acceptance rule as above.
    auto read_float = [options](NSString *key, float &dst) {
        id value = options[key];
        if ([value respondsToSelector:@selector(floatValue)]) dst = [value floatValue];
    };
    // String-valued option; skipped when the string has no UTF-8 form
    // (assigning a NULL char pointer to std::string is undefined).
    auto read_text = [options](NSString *key, std::string &dst) {
        id value = options[key];
        if (![value isKindOfClass:[NSString class]]) return;
        if (const char *utf8 = [(NSString *)value UTF8String]) dst = utf8;
    };

    read_int(@"max_tokens", p.max_tokens);
    read_float(@"temperature", p.temperature);
    read_float(@"top_p", p.top_p);
    read_int(@"top_k", p.top_k);
    read_float(@"repeat_penalty", p.repeat_penalty);
    read_float(@"frequency_penalty", p.frequency_penalty);
    read_float(@"presence_penalty", p.presence_penalty);
    read_int(@"seed", p.seed);
    read_text(@"grammar", p.grammar);
    read_text(@"grammar_root", p.grammar_root);
    read_int(@"n_past", p.n_past);
    return p;
}
108
+
109
// Builds the sampler chain for one generation request. The caller owns the
// returned chain and releases it with llama_sampler_free().
//
// Chain order:
//   1. grammar   (only when p.grammar is non-empty)
//   2. penalties (only when any penalty differs from its neutral value)
//   3. greedy, or top-k -> top-p -> temperature -> dist
//
// The grammar sampler has to run before the sampler that picks the token
// (greedy / dist). Placed after it, the grammar would only mask logits once a
// token had already been selected, so the selection would not be constrained.
static llama_sampler *create_sampler(const SamplerParams &p, const llama_model *model) {
    llama_sampler *chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    if (!p.grammar.empty()) {
        const char *root = p.grammar_root.empty() ? "root" : p.grammar_root.c_str();
        llama_sampler *grammar =
            llama_sampler_init_grammar(llama_model_get_vocab(model), p.grammar.c_str(), root);
        if (grammar) {
            llama_sampler_chain_add(chain, grammar);
        } else {
            // A NULL sampler must never be added to the chain; fall back to
            // unconstrained sampling and make the failure visible.
            RCTLogWarn(@"LocalLLM: grammar could not be initialised; sampling without grammar");
        }
    }

    const bool wants_penalties =
        p.repeat_penalty != 1.0f || p.frequency_penalty != 0.0f || p.presence_penalty != 0.0f;
    if (wants_penalties) {
        llama_sampler_chain_add(chain,
            llama_sampler_init_penalties(
                llama_model_n_ctx_train(model),   // penalty window = training context length
                p.repeat_penalty,
                p.frequency_penalty,
                p.presence_penalty
            ));
    }

    if (p.temperature <= 0.0f) {
        llama_sampler_chain_add(chain, llama_sampler_init_greedy());
    } else {
        llama_sampler_chain_add(chain, llama_sampler_init_top_k(p.top_k));
        llama_sampler_chain_add(chain, llama_sampler_init_top_p(p.top_p, 1));
        llama_sampler_chain_add(chain, llama_sampler_init_temp(p.temperature));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(p.seed));
    }

    return chain;
}
142
+
143
+ // ── Token to string helper ───────────────────────────────────────────────────
144
+
145
// Converts one token id to its text piece. A 256-byte stack buffer is tried
// first; when llama_token_to_piece() returns a negative value its magnitude is
// used as the size of a second, heap-backed attempt. The returned bytes are
// not guaranteed to form complete UTF-8 characters on their own.
static std::string token_to_piece(const llama_model *model, llama_token token) {
    const llama_vocab *vocab = llama_model_get_vocab(model);

    char stack_buf[256];
    const int written = llama_token_to_piece(vocab, token, stack_buf, sizeof(stack_buf), 0, true);
    if (written >= 0) {
        return std::string(stack_buf, written);
    }

    std::string piece(-written, '\0');
    llama_token_to_piece(vocab, token, piece.data(), piece.size(), 0, true);
    return piece;
}
155
+
156
+ // ── Base64 decoding ──────────────────────────────────────────────────────────
157
+
158
// Decodes a base64 string into raw bytes. Returns an empty vector when the
// input is not valid base64 (decoding uses no leniency options).
static std::vector<uint8_t> decode_base64(NSString *base64) {
    NSData *decoded = [[NSData alloc] initWithBase64EncodedString:base64 options:0];
    std::vector<uint8_t> out;
    if (decoded != nil) {
        const uint8_t *first = static_cast<const uint8_t *>(decoded.bytes);
        out.assign(first, first + decoded.length);
    }
    return out;
}
164
+
165
+ // ── Inference dispatch queue ─────────────────────────────────────────────────
166
+
167
// Returns the process-wide serial queue on which model loading and generation
// run. It is created on first use and the same queue is returned afterwards.
static dispatch_queue_t inference_queue() {
    static dispatch_queue_t queue;
    static dispatch_once_t once;
    dispatch_once(&once, ^{
        queue = dispatch_queue_create("com.hilum.llm.inference", DISPATCH_QUEUE_SERIAL);
    });
    return queue;
}
171
+
172
+ // ── Download session management ──────────────────────────────────────────────
173
+
174
/// NSURLSession download delegate that turns session callbacks into
/// `onDownloadProgress`, `onDownloadComplete` and `onDownloadError` events on
/// the owning module.
@interface LLMDownloadDelegate : NSObject <NSURLSessionDownloadDelegate>
/// Module the events are sent through; weak so the delegate does not keep it alive.
@property (nonatomic, weak) LocalLLM *module;
/// Maps a request URL string to the path the finished file should be moved to.
@property (nonatomic, strong) NSMutableDictionary<NSString *, NSString *> *destPaths;

- (instancetype)initWithModule:(LocalLLM *)module;
@end

@implementation LLMDownloadDelegate

- (instancetype)initWithModule:(LocalLLM *)module {
    self = [super init];
    if (self) {
        _module = module;
        _destPaths = [[NSMutableDictionary alloc] init];
    }
    return self;
}

/// Reports an unrecoverable failure for `url`.
- (void)reportFailure:(NSString *)message forURL:(NSString *)url {
    [_module sendEventWithName:@"onDownloadError" body:@{
        @"url": url ?: @"",
        @"error": message ?: @"Download failed",
        @"resumable": @NO,
    }];
}

/// Progress callback: forwards byte counts and a 0-100 percentage (0 when the
/// total size is unknown).
- (void)URLSession:(NSURLSession *)session
      downloadTask:(NSURLSessionDownloadTask *)downloadTask
      didWriteData:(int64_t)bytesWritten
 totalBytesWritten:(int64_t)totalBytesWritten
totalBytesExpectedToWrite:(int64_t)totalBytesExpectedToWrite {
    NSString *url = downloadTask.originalRequest.URL.absoluteString;
    double percent = totalBytesExpectedToWrite > 0
        ? (double)totalBytesWritten / (double)totalBytesExpectedToWrite * 100.0
        : 0.0;
    [_module sendEventWithName:@"onDownloadProgress" body:@{
        @"url": url ?: @"",
        @"downloaded": @(totalBytesWritten),
        @"total": @(totalBytesExpectedToWrite),
        @"percent": @(percent),
    }];
}

/// Completion callback: moves the temporary file to its registered
/// destination (if any) and emits `onDownloadComplete`, or `onDownloadError`
/// when the server answered with a non-2xx status or the move failed.
- (void)URLSession:(NSURLSession *)session
      downloadTask:(NSURLSessionDownloadTask *)downloadTask
didFinishDownloadingToURL:(NSURL *)location {
    NSString *url = downloadTask.originalRequest.URL.absoluteString;

    // NSURLSession treats an HTTP error response as a finished download whose
    // "file" is the error body; do not install that as the requested file.
    NSURLResponse *response = downloadTask.response;
    if ([response isKindOfClass:[NSHTTPURLResponse class]]) {
        NSInteger status = ((NSHTTPURLResponse *)response).statusCode;
        if (status < 200 || status >= 300) {
            [self reportFailure:[NSString stringWithFormat:@"HTTP %ld", (long)status] forURL:url];
            return;
        }
    }

    NSString *destPath = url ? _destPaths[url] : nil;
    if (destPath) {
        NSFileManager *fm = [NSFileManager defaultManager];
        // Remove existing file if present
        [fm removeItemAtPath:destPath error:nil];
        // Create parent directory; a failure here surfaces through the move below.
        [fm createDirectoryAtPath:[destPath stringByDeletingLastPathComponent]
      withIntermediateDirectories:YES attributes:nil error:nil];

        // Judge success by the return value; the error out-parameter is only
        // meaningful when the call reports failure.
        NSError *error = nil;
        BOOL moved = [fm moveItemAtURL:location toURL:[NSURL fileURLWithPath:destPath] error:&error];
        if (!moved) {
            [self reportFailure:(error.localizedDescription ?: @"Move failed") forURL:url];
            return;
        }
    }
    [_module sendEventWithName:@"onDownloadComplete" body:@{
        @"url": url ?: @"",
    }];
}

/// Task-level completion: only failures are reported here. `resumable` is YES
/// when the error carries resume data.
- (void)URLSession:(NSURLSession *)session
              task:(NSURLSessionTask *)task
didCompleteWithError:(NSError *)error {
    if (!error) return;
    NSString *url = task.originalRequest.URL.absoluteString;
    BOOL resumable = error.userInfo[NSURLSessionDownloadTaskResumeData] != nil;
    [_module sendEventWithName:@"onDownloadError" body:@{
        @"url": url ?: @"",
        @"error": error.localizedDescription ?: @"Download failed",
        @"resumable": @(resumable),
    }];
}

@end
249
+
250
+ // ── Module implementation ────────────────────────────────────────────────────
251
+
252
+ @implementation LocalLLM {
253
+ NSURLSession *_downloadSession;
254
+ LLMDownloadDelegate *_downloadDelegate;
255
+ bool _hasListeners;
256
+ }
257
+
258
+ RCT_EXPORT_MODULE()
259
+
260
/// Tells React Native that this module does not need to be set up on the main queue.
+ (BOOL)requiresMainQueueSetup {
    return NO;
}
263
+
264
/// Creates the module together with its download delegate and a background
/// URL session (identifier "com.hilum.llm.downloads") that delivers launch
/// events. Passing a nil delegate queue lets the session create its own.
- (instancetype)init {
    if (!(self = [super init])) {
        return nil;
    }

    _hasListeners = NO;
    _downloadDelegate = [[LLMDownloadDelegate alloc] initWithModule:self];

    NSString *sessionId = @"com.hilum.llm.downloads";
    NSURLSessionConfiguration *configuration =
        [NSURLSessionConfiguration backgroundSessionConfigurationWithIdentifier:sessionId];
    configuration.sessionSendsLaunchEvents = YES;

    _downloadSession = [NSURLSession sessionWithConfiguration:configuration
                                                     delegate:_downloadDelegate
                                                delegateQueue:nil];
    return self;
}
278
+
279
/// Names of every event this module may send to JS.
- (NSArray<NSString *> *)supportedEvents {
    NSArray<NSString *> *inferenceEvents = @[ @"onToken", @"onBatchToken", @"onQuantizeComplete", @"onLog" ];
    NSArray<NSString *> *downloadEvents = @[ @"onDownloadProgress", @"onDownloadComplete", @"onDownloadError" ];
    return [inferenceEvents arrayByAddingObjectsFromArray:downloadEvents];
}
290
+
291
/// Records that a listener is now attached.
- (void)startObserving {
    _hasListeners = YES;
}

/// Records that no listener is attached any more.
- (void)stopObserving {
    _hasListeners = NO;
}
293
+
294
+ // ── Backend info ─────────────────────────────────────────────────────────────
295
+
296
/// Synchronously returns the string produced by llama_print_system_info().
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(backendInfo) {
    const char *systemInfo = llama_print_system_info();
    return @(systemInfo);
}
299
+
300
/// Synchronously returns the backend version. This is currently a fixed
/// string; it is not derived from the llama.cpp build number.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(backendVersion) {
    NSString *version = @"1.0.0";
    return version;
}
304
+
305
+ // ── Model lifecycle ──────────────────────────────────────────────────────────
306
+
307
/// Loads a model file on the inference queue and resolves with a new model id.
///
/// Options: `n_gpu_layers` (default 999) and `use_mmap` (default true).
/// Rejects with E_INVALID_PATH for a missing path, E_INSUFFICIENT_MEMORY when
/// less than 512 MB is available to the process, and E_MODEL_LOAD when
/// llama.cpp cannot load the file.
RCT_EXPORT_METHOD(loadModel:(NSString *)path
                  options:(NSDictionary *)options
                  resolve:(RCTPromiseResolveBlock)resolve
                  reject:(RCTPromiseRejectBlock)reject) {
    dispatch_async(inference_queue(), ^{
        // A nil or empty path would hand a NULL / empty C string to the loader.
        const char *cpath = path.length > 0 ? path.UTF8String : NULL;
        if (!cpath) {
            reject(@"E_INVALID_PATH", @"Model path is missing", nil);
            return;
        }

        // RAM guard
        uint64_t available = os_proc_available_memory();
        uint64_t minimumRAM = 512 * 1024 * 1024; // 512 MB absolute floor

        if (available < minimumRAM) {
            reject(@"E_INSUFFICIENT_MEMORY",
                   [NSString stringWithFormat:
                       @"Insufficient memory to load model. Available: %llu MB, minimum: %llu MB. "
                       @"Close other apps or use a smaller quantized model.",
                       available / (1024 * 1024), minimumRAM / (1024 * 1024)],
                   nil);
            return;
        }

        llama_model_params params = llama_model_default_params();
        params.n_gpu_layers = options[@"n_gpu_layers"] ? [options[@"n_gpu_layers"] intValue] : 999;
        params.use_mmap = options[@"use_mmap"] ? [options[@"use_mmap"] boolValue] : true;

        llama_model *model = llama_model_load_from_file(cpath, params);
        if (!model) {
            reject(@"E_MODEL_LOAD", @"Failed to load model", nil);
            return;
        }

        // register_handle() takes g_mutex itself, so it is called before the
        // map insertion takes the lock below.
        register_handle(model);
        NSString *modelId = generateUUID();
        {
            std::lock_guard<std::mutex> lock(g_mutex);
            g_models[[modelId UTF8String]] = model;
        }
        resolve(modelId);
    });
}
348
+
349
/// Synchronously returns llama_model_size() for the model as a double, or 0
/// when the id is unknown.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(getModelSize:(NSString *)modelId) {
    std::lock_guard<std::mutex> lock(g_mutex);
    const auto entry = g_models.find([modelId UTF8String]);
    if (entry != g_models.end()) {
        const double bytes = (double)llama_model_size(entry->second);
        return @(bytes);
    }
    return @(0);
}
355
+
356
/// Synchronously frees the model registered under `modelId` and forgets the
/// id. Unknown or nil ids are ignored.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(freeModel:(NSString *)modelId) {
    const char *key = modelId.UTF8String;
    if (!key) return nil;

    std::lock_guard<std::mutex> lock(g_mutex);
    auto it = g_models.find(key);
    if (it == g_models.end()) return nil;

    // g_mutex is already held here and is not recursive. unregister_handle()
    // locks it again, which would block this thread forever, so the live-set
    // entry is removed directly under the lock we already own.
    if (g_live_handles.erase(it->second) > 0) {
        llama_model_free(it->second);
    }
    g_models.erase(it);
    return nil;
}
367
+
368
+ // ── Context lifecycle ────────────────────────────────────────────────────────
369
+
370
/// Synchronously creates an inference context for a loaded model and returns
/// its id, or @"" when the model id is unknown or creation fails.
///
/// Recognised options: n_ctx, n_batch, n_threads, n_seq_max, flash_attn_type
/// (enabled when > 0), type_k, type_v.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(createContext:(NSString *)modelId
                                       options:(NSDictionary *)options) {
    const char *modelKey = modelId.UTF8String;
    if (!modelKey) return @"";

    std::lock_guard<std::mutex> lock(g_mutex);
    auto it = g_models.find(modelKey);
    if (it == g_models.end()) return @"";

    llama_context_params params = llama_context_default_params();
    if (options[@"n_ctx"])           params.n_ctx = [options[@"n_ctx"] intValue];
    if (options[@"n_batch"])         params.n_batch = [options[@"n_batch"] intValue];
    if (options[@"n_threads"])       params.n_threads = [options[@"n_threads"] intValue];
    if (options[@"n_seq_max"])       params.n_seq_max = [options[@"n_seq_max"] intValue];
    if (options[@"flash_attn_type"]) params.flash_attn = [options[@"flash_attn_type"] intValue] > 0;
    if (options[@"type_k"])          params.type_k = (enum ggml_type)[options[@"type_k"] intValue];
    if (options[@"type_v"])          params.type_v = (enum ggml_type)[options[@"type_v"] intValue];

    llama_context *ctx = llama_init_from_model(it->second, params);
    if (!ctx) return @"";

    // g_mutex is already held and is not recursive; register_handle() would
    // lock it a second time and block forever. Record the handle directly.
    g_live_handles.insert(ctx);
    NSString *ctxId = generateUUID();
    g_contexts[[ctxId UTF8String]] = ctx;
    return ctxId;
}
393
+
394
/// Synchronously returns llama_n_ctx() for the context, or 0 when the id is
/// unknown.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(getContextSize:(NSString *)contextId) {
    std::lock_guard<std::mutex> lock(g_mutex);
    const auto entry = g_contexts.find([contextId UTF8String]);
    if (entry != g_contexts.end()) {
        const int size = (int)llama_n_ctx(entry->second);
        return @(size);
    }
    return @(0);
}
400
+
401
/// Synchronously frees the context registered under `contextId` and forgets
/// the id. Unknown or nil ids are ignored.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(freeContext:(NSString *)contextId) {
    const char *key = contextId.UTF8String;
    if (!key) return nil;

    std::lock_guard<std::mutex> lock(g_mutex);
    auto it = g_contexts.find(key);
    if (it == g_contexts.end()) return nil;

    // g_mutex is already held here and is not recursive. unregister_handle()
    // locks it again, which would block this thread forever, so the live-set
    // entry is removed directly under the lock we already own.
    if (g_live_handles.erase(it->second) > 0) {
        llama_free(it->second);
    }
    g_contexts.erase(it);
    return nil;
}
412
+
413
+ // ── KV cache ─────────────────────────────────────────────────────────────────
414
+
415
/// Synchronously removes sequence 0's KV-cache entries from position
/// `fromPos` to the end (-1). Unknown context ids are ignored.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(kvCacheClear:(NSString *)contextId
                                       fromPos:(double)fromPos) {
    std::lock_guard<std::mutex> lock(g_mutex);
    const auto entry = g_contexts.find([contextId UTF8String]);
    if (entry == g_contexts.end()) {
        return nil;
    }
    const int firstPos = (int)fromPos;
    llama_kv_cache_seq_rm(entry->second, 0, firstPos, -1);
    return nil;
}
424
+
425
+ // ── Tokenization ─────────────────────────────────────────────────────────────
426
+
427
/// Synchronously tokenizes `text` with the model's vocabulary and returns the
/// token ids as an array of numbers. Returns an empty array when the model id
/// is unknown, an argument is nil, or tokenization fails.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(tokenize:(NSString *)modelId
                                       text:(NSString *)text
                                       addSpecial:(BOOL)addSpecial
                                       parseSpecial:(BOOL)parseSpecial) {
    // nil arguments yield NULL C strings, which must not reach strlen() or
    // the std::string key constructor.
    const char *modelKey = modelId.UTF8String;
    const char *ctext = text.UTF8String;
    if (!modelKey || !ctext) return @[];

    std::lock_guard<std::mutex> lock(g_mutex);
    auto it = g_models.find(modelKey);
    if (it == g_models.end()) return @[];

    int text_len = (int)strlen(ctext);
    const llama_vocab *vocab = llama_model_get_vocab(it->second);

    // First attempt with a guessed capacity; a negative result is the
    // required capacity, negated.
    std::vector<llama_token> tokens(text_len + 16);
    int n = llama_tokenize(vocab, ctext, text_len, tokens.data(), (int)tokens.size(), addSpecial, parseSpecial);
    if (n < 0) {
        tokens.resize(-n);
        n = llama_tokenize(vocab, ctext, text_len, tokens.data(), (int)tokens.size(), addSpecial, parseSpecial);
    }
    // Still negative: resizing to a negative count would request an enormous
    // allocation, so report "no tokens" instead.
    if (n < 0) return @[];
    tokens.resize(n);

    NSMutableArray *result = [NSMutableArray arrayWithCapacity:n];
    for (llama_token token : tokens) {
        [result addObject:@(token)];
    }
    return result;
}
453
+
454
/// Synchronously concatenates the text pieces of `tokens` and returns the
/// result as a string; returns @"" when the model id is unknown.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(detokenize:(NSString *)modelId
                                       tokens:(NSArray<NSNumber *> *)tokens) {
    std::lock_guard<std::mutex> lock(g_mutex);
    const auto entry = g_models.find([modelId UTF8String]);
    if (entry == g_models.end()) {
        return @"";
    }

    const llama_model *model = entry->second;
    std::string text;
    for (NSNumber *tokenNumber in tokens) {
        const llama_token token = [tokenNumber intValue];
        text.append(token_to_piece(model, token));
    }
    return [NSString stringWithUTF8String:text.c_str()];
}
466
+
467
/// Synchronously renders `messages` (dictionaries with "role" and "content")
/// through the model's chat template and returns the prompt text. Returns @""
/// when the model id is unknown or the template cannot be applied.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(applyChatTemplate:(NSString *)modelId
                                       messages:(NSArray<NSDictionary *> *)messages
                                       addAssistant:(BOOL)addAssistant) {
    const char *modelKey = modelId.UTF8String;
    if (!modelKey) return @"";

    std::lock_guard<std::mutex> lock(g_mutex);
    auto it = g_models.find(modelKey);
    if (it == g_models.end()) return @"";

    // A missing or non-string field (e.g. JS null -> NSNull) becomes "" rather
    // than a NULL pointer or an unrecognized-selector exception.
    auto field = [](NSDictionary *msg, NSString *key) -> const char * {
        id value = msg[key];
        const char *utf8 = [value isKindOfClass:[NSString class]] ? [(NSString *)value UTF8String] : NULL;
        return utf8 ? utf8 : "";
    };

    std::vector<llama_chat_msg> chat_msgs;
    for (NSDictionary *msg in messages) {
        llama_chat_msg m;
        m.role = field(msg, @"role");
        m.content = field(msg, @"content");
        chat_msgs.push_back(m);
    }

    const auto tmpl = llama_model_chat_template(it->second, nullptr);

    // First pass with a 4 KB buffer; the return value is the length needed.
    std::string result(4096, '\0');
    int n = llama_chat_apply_template(
        tmpl,
        chat_msgs.data(), chat_msgs.size(),
        addAssistant,
        result.data(), (int)result.size()
    );
    // A negative value signals failure; resizing to it would request an
    // enormous allocation.
    if (n < 0) return @"";
    if (n > (int)result.size()) {
        result.resize(n);
        llama_chat_apply_template(
            tmpl,
            chat_msgs.data(), chat_msgs.size(),
            addAssistant,
            result.data(), (int)result.size()
        );
    }
    result.resize(n);
    return [NSString stringWithUTF8String:result.c_str()];
}
501
+
502
+ // ── Text inference ───────────────────────────────────────────────────────────
503
+
504
/// Runs a complete, non-streaming generation on the inference queue and
/// resolves with the generated text.
///
/// The prompt is tokenized, tokens from index `n_past` onwards are evaluated,
/// and up to `max_tokens` tokens are sampled until an end-of-generation token
/// appears or decoding fails.
///
/// Rejects with E_INVALID_ARGS (nil argument), E_NOT_FOUND (unknown ids),
/// E_TOKENIZE (prompt could not be tokenized) or E_DECODE (prompt could not be
/// evaluated, including when there is nothing to evaluate).
RCT_EXPORT_METHOD(generate:(NSString *)modelId
                  contextId:(NSString *)contextId
                  prompt:(NSString *)prompt
                  options:(NSDictionary *)options
                  resolve:(RCTPromiseResolveBlock)resolve
                  reject:(RCTPromiseRejectBlock)reject) {
    dispatch_async(inference_queue(), ^{
        // nil arguments yield NULL C strings; stop before they reach
        // std::string or strlen().
        const char *modelKey = modelId.UTF8String;
        const char *ctxKey = contextId.UTF8String;
        const char *cprompt = prompt.UTF8String;
        if (!modelKey || !ctxKey || !cprompt) {
            reject(@"E_INVALID_ARGS", @"modelId, contextId and prompt are required", nil);
            return;
        }

        llama_model *model;
        llama_context *ctx;
        {
            std::lock_guard<std::mutex> lock(g_mutex);
            auto mi = g_models.find(modelKey);
            auto ci = g_contexts.find(ctxKey);
            if (mi == g_models.end() || ci == g_contexts.end()) {
                reject(@"E_NOT_FOUND", @"Model or context not found", nil);
                return;
            }
            model = mi->second;
            ctx = ci->second;
        }

        SamplerParams sp = parse_sampler_params(options);
        const llama_vocab *vocab = llama_model_get_vocab(model);

        // Tokenize prompt
        int prompt_len = (int)strlen(cprompt);
        std::vector<llama_token> tokens(prompt_len + 16);
        int n_tokens = llama_tokenize(vocab, cprompt, prompt_len, tokens.data(), (int)tokens.size(), true, true);
        if (n_tokens < 0) {
            tokens.resize(-n_tokens);
            n_tokens = llama_tokenize(vocab, cprompt, prompt_len, tokens.data(), (int)tokens.size(), true, true);
        }
        if (n_tokens < 0) {
            reject(@"E_TOKENIZE", @"Failed to tokenize prompt", nil);
            return;
        }
        tokens.resize(n_tokens);

        // Eval prompt. A negative n_past would index before the token array;
        // an n_past at or past the end leaves nothing to evaluate and so no
        // logits to sample from.
        const int first = sp.n_past < 0 ? 0 : sp.n_past;
        if (first >= n_tokens) {
            reject(@"E_DECODE", @"Failed to decode prompt", nil);
            return;
        }

        // Feed the prompt in slices of at most n_batch tokens, since a single
        // llama_decode() call must not exceed the context's batch size.
        const int n_batch = (int)llama_n_batch(ctx);
        const int remaining = n_tokens - first;
        const int chunk = (n_batch > 0 && n_batch < remaining) ? n_batch : remaining;
        llama_batch batch = llama_batch_init(chunk, 0, 1);
        bool prompt_ok = true;
        for (int start = first; prompt_ok && start < n_tokens; start += chunk) {
            const int end = (start + chunk < n_tokens) ? start + chunk : n_tokens;
            batch.n_tokens = 0;  // reuse the same allocation for every slice
            for (int i = start; i < end; i++) {
                // Only the very last prompt token needs logits.
                llama_batch_add(batch, tokens[i], i, {0}, i == n_tokens - 1);
            }
            prompt_ok = llama_decode(ctx, batch) == 0;
        }
        llama_batch_free(batch);
        if (!prompt_ok) {
            reject(@"E_DECODE", @"Failed to decode prompt", nil);
            return;
        }

        // Sample loop
        llama_sampler *smpl = create_sampler(sp, model);
        std::string result;

        for (int i = 0; i < sp.max_tokens; i++) {
            llama_token new_token = llama_sampler_sample(smpl, ctx, -1);

            if (llama_vocab_is_eog(vocab, new_token)) break;

            result += token_to_piece(model, new_token);

            // Eval the new token at the position right after what came before.
            llama_batch single = llama_batch_init(1, 0, 1);
            llama_batch_add(single, new_token, n_tokens + i, {0}, true);
            const bool decoded = llama_decode(ctx, single) == 0;
            llama_batch_free(single);
            if (!decoded) break;
        }

        llama_sampler_free(smpl);

        // Token pieces are raw bytes, so stopping at max_tokens can leave a
        // truncated multi-byte character at the end; converting that as-is
        // yields nil. Drop up to three trailing bytes until the text decodes.
        NSString *text = nil;
        for (size_t drop = 0; !text && drop <= 3 && drop <= result.size(); drop++) {
            text = [[NSString alloc] initWithBytes:result.data()
                                            length:result.size() - drop
                                          encoding:NSUTF8StringEncoding];
        }
        resolve(text);
    });
}
576
+
577
/// Starts a streaming generation on the inference queue. Output is delivered
/// as `onToken` events carrying `contextId`, `token` and `done`; the final
/// event has `done: YES`, plus `error` when the stream could not start.
///
/// The stream stops at `max_tokens`, at an end-of-generation token, when
/// decoding fails, or when stopStream: was called for this context.
///
/// NOTE(review): the cancel flag is cleared when this method is called, not
/// when the queued work begins, so a stopStream: aimed at a stream that is
/// still running on the same context can be wiped by a following startStream:.
RCT_EXPORT_METHOD(startStream:(NSString *)modelId
                  contextId:(NSString *)contextId
                  prompt:(NSString *)prompt
                  options:(NSDictionary *)options) {
    // nil arguments yield NULL C strings and a nil dictionary value; report
    // them instead of crashing.
    const char *modelKey = modelId.UTF8String;
    const char *ctxKey = contextId.UTF8String;
    if (!modelKey || !ctxKey || !prompt) {
        [self sendEventWithName:@"onToken" body:@{
            @"contextId": contextId ?: @"", @"done": @YES, @"error": @"Invalid arguments"
        }];
        return;
    }
    const std::string modelIdStr = modelKey;
    const std::string ctxIdStr = ctxKey;
    clear_cancel(ctxIdStr);

    dispatch_async(inference_queue(), ^{
        llama_model *model = nullptr;
        llama_context *ctx = nullptr;
        {
            std::lock_guard<std::mutex> lock(g_mutex);
            auto mi = g_models.find(modelIdStr);
            auto ci = g_contexts.find(ctxIdStr);
            if (mi != g_models.end() && ci != g_contexts.end()) {
                model = mi->second;
                ctx = ci->second;
            }
        }
        // Events are sent only after g_mutex has been released.
        if (!model || !ctx) {
            [self sendEventWithName:@"onToken" body:@{
                @"contextId": contextId, @"done": @YES, @"error": @"Model or context not found"
            }];
            return;
        }

        SamplerParams sp = parse_sampler_params(options);
        const char *cprompt = [prompt UTF8String];
        const llama_vocab *vocab = llama_model_get_vocab(model);

        // Tokenize prompt
        int n_tokens = -1;
        std::vector<llama_token> tokens;
        if (cprompt) {
            int prompt_len = (int)strlen(cprompt);
            tokens.resize(prompt_len + 16);
            n_tokens = llama_tokenize(vocab, cprompt, prompt_len, tokens.data(), (int)tokens.size(), true, true);
            if (n_tokens < 0) {
                tokens.resize(-n_tokens);
                n_tokens = llama_tokenize(vocab, cprompt, prompt_len, tokens.data(), (int)tokens.size(), true, true);
            }
        }

        // Eval prompt. A failed tokenization, a negative n_past or an n_past
        // at/past the end all leave nothing valid to evaluate.
        const int first = sp.n_past < 0 ? 0 : sp.n_past;
        bool prompt_ok = n_tokens >= 0 && first < n_tokens;
        if (prompt_ok) {
            tokens.resize(n_tokens);
            // Feed the prompt in slices of at most n_batch tokens, since a
            // single llama_decode() call must not exceed the batch size.
            const int n_batch = (int)llama_n_batch(ctx);
            const int remaining = n_tokens - first;
            const int chunk = (n_batch > 0 && n_batch < remaining) ? n_batch : remaining;
            llama_batch batch = llama_batch_init(chunk, 0, 1);
            for (int start = first; prompt_ok && start < n_tokens; start += chunk) {
                const int end = (start + chunk < n_tokens) ? start + chunk : n_tokens;
                batch.n_tokens = 0;  // reuse the same allocation for every slice
                for (int i = start; i < end; i++) {
                    // Only the very last prompt token needs logits.
                    llama_batch_add(batch, tokens[i], i, {0}, i == n_tokens - 1);
                }
                prompt_ok = llama_decode(ctx, batch) == 0;
            }
            llama_batch_free(batch);
        }
        if (!prompt_ok) {
            [self sendEventWithName:@"onToken" body:@{
                @"contextId": contextId, @"done": @YES, @"error": @"Failed to decode prompt"
            }];
            return;
        }

        // Number of bytes at the end of `s` that begin a multi-byte UTF-8
        // character whose remaining bytes have not arrived yet (0 if none).
        auto incomplete_tail = [](const std::string &s) -> size_t {
            const size_t n = s.size();
            for (size_t back = 1; back <= 4 && back <= n; back++) {
                const unsigned char c = (unsigned char)s[n - back];
                if ((c & 0xC0) == 0x80) continue;  // continuation byte: keep looking for the lead
                const size_t need = c >= 0xF0 ? 4 : c >= 0xE0 ? 3 : c >= 0xC0 ? 2 : 1;
                return need > back ? back : 0;
            }
            return 0;
        };

        // Sample loop
        llama_sampler *smpl = create_sampler(sp, model);
        std::string pending;  // bytes received but not yet emitted

        for (int i = 0; i < sp.max_tokens; i++) {
            if (is_cancelled(ctxIdStr)) break;

            llama_token new_token = llama_sampler_sample(smpl, ctx, -1);

            if (llama_vocab_is_eog(vocab, new_token)) break;

            // A piece may end in the middle of a multi-byte character. Turning
            // such bytes into an NSString yields nil, and nil inside a
            // dictionary literal throws, so only complete characters are
            // emitted and the unfinished tail waits for the next piece.
            pending += token_to_piece(model, new_token);
            const size_t ready = pending.size() - incomplete_tail(pending);
            if (ready > 0) {
                NSString *chunkText = [[NSString alloc] initWithBytes:pending.data()
                                                               length:ready
                                                             encoding:NSUTF8StringEncoding];
                pending.erase(0, ready);
                [self sendEventWithName:@"onToken" body:@{
                    @"contextId": contextId,
                    // Bytes that are not valid UTF-8 at all become U+FFFD.
                    @"token": chunkText ?: @"\uFFFD",
                    @"done": @NO,
                }];
            }

            llama_batch single = llama_batch_init(1, 0, 1);
            llama_batch_add(single, new_token, n_tokens + i, {0}, true);
            const bool decoded = llama_decode(ctx, single) == 0;
            llama_batch_free(single);
            if (!decoded) break;
        }

        llama_sampler_free(smpl);
        clear_cancel(ctxIdStr);
        [self sendEventWithName:@"onToken" body:@{
            @"contextId": contextId, @"done": @YES
        }];
    });
}
662
+
663
/// Synchronously flags the stream on `contextId` for cancellation. The
/// streaming loop checks the flag before sampling each token.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(stopStream:(NSString *)contextId) {
    const std::string ctxKey = [contextId UTF8String];
    request_cancel(ctxKey);
    return nil;
}
667
+
668
+ // ── Vision ───────────────────────────────────────────────────────────────────
669
+
670
/// Synchronously loads a multimodal projector file for a loaded model and
/// returns the id of the new projector context, or @"" when the model id is
/// unknown, an argument is nil, or loading fails.
///
/// Options: `use_gpu` (default true) and `n_threads`.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(loadProjector:(NSString *)modelId
                                       path:(NSString *)path
                                       options:(NSDictionary *)options) {
    // nil arguments yield NULL C strings, which must not reach the loader or
    // the std::string key constructor.
    const char *modelKey = modelId.UTF8String;
    const char *cpath = path.UTF8String;
    if (!modelKey || !cpath) return @"";

    std::lock_guard<std::mutex> lock(g_mutex);
    auto it = g_models.find(modelKey);
    if (it == g_models.end()) return @"";

    mtmd_context_params mparams = mtmd_context_default_params();
    mparams.use_gpu = options[@"use_gpu"] ? [options[@"use_gpu"] boolValue] : true;
    if (options[@"n_threads"]) mparams.n_threads = [options[@"n_threads"] intValue];

    mtmd_context *mctx = mtmd_init_from_file(cpath, it->second, mparams);
    if (!mctx) return @"";

    // g_mutex is already held and is not recursive; register_handle() would
    // lock it a second time and block forever. Record the handle directly.
    g_live_handles.insert(mctx);
    NSString *mtmdId = generateUUID();
    g_mtmd_contexts[[mtmdId UTF8String]] = mctx;
    return mtmdId;
}
689
+
690
/// Synchronously returns the result of mtmd_support_vision() for the
/// projector context, or NO when the id is unknown.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(supportVision:(NSString *)mtmdId) {
    std::lock_guard<std::mutex> lock(g_mutex);
    const auto entry = g_mtmd_contexts.find([mtmdId UTF8String]);
    if (entry != g_mtmd_contexts.end()) {
        return @(mtmd_support_vision(entry->second));
    }
    return @NO;
}
696
+
697
/// Synchronously frees the projector context registered under `mtmdId` and
/// forgets the id. Unknown or nil ids are ignored.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(freeMtmdContext:(NSString *)mtmdId) {
    const char *key = mtmdId.UTF8String;
    if (!key) return nil;

    std::lock_guard<std::mutex> lock(g_mutex);
    auto it = g_mtmd_contexts.find(key);
    if (it == g_mtmd_contexts.end()) return nil;

    // g_mutex is already held here and is not recursive. unregister_handle()
    // locks it again, which would block this thread forever, so the live-set
    // entry is removed directly under the lock we already own.
    if (g_live_handles.erase(it->second) > 0) {
        mtmd_free(it->second);
    }
    g_mtmd_contexts.erase(it);
    return nil;
}
708
+
709
/// Runs a complete, non-streaming generation over a prompt plus base64-encoded
/// images on the inference queue and resolves with the generated text.
///
/// Rejects with E_INVALID_ARGS (nil argument), E_NOT_FOUND (unknown ids),
/// E_VISION_TOKENIZE or E_VISION_EVAL.
RCT_EXPORT_METHOD(generateVision:(NSString *)modelId
                  contextId:(NSString *)contextId
                  mtmdId:(NSString *)mtmdId
                  prompt:(NSString *)prompt
                  imageBase64s:(NSArray<NSString *> *)imageBase64s
                  options:(NSDictionary *)options
                  resolve:(RCTPromiseResolveBlock)resolve
                  reject:(RCTPromiseRejectBlock)reject) {
    dispatch_async(inference_queue(), ^{
        // nil arguments yield NULL C strings; stop before they reach
        // std::string or the tokenizer.
        const char *modelKey = modelId.UTF8String;
        const char *ctxKey = contextId.UTF8String;
        const char *mtmdKey = mtmdId.UTF8String;
        const char *cprompt = prompt.UTF8String;
        if (!modelKey || !ctxKey || !mtmdKey || !cprompt) {
            reject(@"E_INVALID_ARGS", @"modelId, contextId, mtmdId and prompt are required", nil);
            return;
        }

        llama_model *model;
        llama_context *ctx;
        mtmd_context *mctx;
        {
            std::lock_guard<std::mutex> lock(g_mutex);
            auto mi = g_models.find(modelKey);
            auto ci = g_contexts.find(ctxKey);
            auto vi = g_mtmd_contexts.find(mtmdKey);
            if (mi == g_models.end() || ci == g_contexts.end() || vi == g_mtmd_contexts.end()) {
                reject(@"E_NOT_FOUND", @"Model, context, or vision context not found", nil);
                return;
            }
            model = mi->second;
            ctx = ci->second;
            mctx = vi->second;
        }

        // Decode base64 images. Entries that fail to decode are skipped.
        // NOTE(review): the pointer returned by the bitmap helper is copied by
        // value and never released here — confirm ownership against the
        // bundled mtmd headers.
        std::vector<mtmd_bitmap> bitmaps;
        for (NSString *b64 in imageBase64s) {
            std::vector<uint8_t> imgData = decode_base64(b64);
            if (imgData.empty()) continue;
            mtmd_bitmap *bmp = mtmd_helper_bitmap_init_from_buf(imgData.data(), imgData.size());
            if (bmp) bitmaps.push_back(*bmp);
        }

        // Tokenize with vision
        SamplerParams sp = parse_sampler_params(options);
        const llama_vocab *vocab = llama_model_get_vocab(model);
        mtmd_input_chunks *chunks = mtmd_input_chunks_init();

        if (mtmd_tokenize(mctx, chunks, cprompt, bitmaps.data(), bitmaps.size()) != 0) {
            mtmd_input_chunks_free(chunks);
            reject(@"E_VISION_TOKENIZE", @"Failed to tokenize vision input", nil);
            return;
        }

        // Eval chunks
        if (mtmd_helper_eval(mctx, ctx, chunks, llama_n_ctx(ctx), 0) != 0) {
            mtmd_input_chunks_free(chunks);
            reject(@"E_VISION_EVAL", @"Failed to evaluate vision input", nil);
            return;
        }

        int n_past = mtmd_helper_get_n_pos(chunks);
        mtmd_input_chunks_free(chunks);

        // Sample
        llama_sampler *smpl = create_sampler(sp, model);
        std::string result;

        for (int i = 0; i < sp.max_tokens; i++) {
            llama_token new_token = llama_sampler_sample(smpl, ctx, -1);
            if (llama_vocab_is_eog(vocab, new_token)) break;

            result += token_to_piece(model, new_token);

            llama_batch single = llama_batch_init(1, 0, 1);
            llama_batch_add(single, new_token, n_past + i, {0}, true);
            const bool decoded = llama_decode(ctx, single) == 0;
            llama_batch_free(single);
            if (!decoded) break;
        }

        llama_sampler_free(smpl);

        // Token pieces are raw bytes, so stopping at max_tokens can leave a
        // truncated multi-byte character at the end; converting that as-is
        // yields nil. Drop up to three trailing bytes until the text decodes.
        NSString *text = nil;
        for (size_t drop = 0; !text && drop <= 3 && drop <= result.size(); drop++) {
            text = [[NSString alloc] initWithBytes:result.data()
                                            length:result.size() - drop
                                          encoding:NSUTF8StringEncoding];
        }
        resolve(text);
    });
}
788
+
789
/// Streams a vision completion as "onToken" events tagged with `contextId`.
///
/// The cancel flag for the context is cleared before the job is queued and again when it finishes.
/// Lookup, tokenize and eval failures are reported as a single event with `done: YES` and an `error`
/// string. Otherwise text is emitted with `done: NO` until an end-of-generation token, cancellation,
/// `sp.max_tokens`, or a llama_decode failure, followed by one final `done: YES` event.
///
/// Token pieces are raw bytes and may end in the middle of a multi-byte UTF-8 character. They are
/// buffered until they form complete characters, because converting a partial sequence yields a nil
/// NSString and inserting nil into a dictionary literal throws.
RCT_EXPORT_METHOD(startStreamVision:(NSString *)modelId
                  contextId:(NSString *)contextId
                  mtmdId:(NSString *)mtmdId
                  prompt:(NSString *)prompt
                  imageBase64s:(NSArray<NSString *> *)imageBase64s
                  options:(NSDictionary *)options) {
    std::string ctxIdStr = [contextId UTF8String];
    clear_cancel(ctxIdStr);

    dispatch_async(inference_queue(), ^{
        llama_model *model;
        llama_context *ctx;
        mtmd_context *mctx;
        {
            std::lock_guard<std::mutex> lock(g_mutex);
            auto mi = g_models.find([modelId UTF8String]);
            auto ci = g_contexts.find(ctxIdStr);
            auto vi = g_mtmd_contexts.find([mtmdId UTF8String]);
            if (mi == g_models.end() || ci == g_contexts.end() || vi == g_mtmd_contexts.end()) {
                [self sendEventWithName:@"onToken" body:@{
                    @"contextId": contextId, @"done": @YES, @"error": @"Not found"
                }];
                return;
            }
            model = mi->second;
            ctx = ci->second;
            mctx = vi->second;
        }

        // Decode base64 images; entries that fail to decode or to parse are skipped.
        // NOTE(review): the object returned by mtmd_helper_bitmap_init_from_buf is copied by value and
        // the returned pointer is never released in this method — confirm ownership with the mtmd API.
        std::vector<mtmd_bitmap> bitmaps;
        for (NSString *b64 in imageBase64s) {
            std::vector<uint8_t> imgData = decode_base64(b64);
            if (imgData.empty()) continue;
            mtmd_bitmap *bmp = mtmd_helper_bitmap_init_from_buf(imgData.data(), imgData.size());
            if (bmp) bitmaps.push_back(*bmp);
        }

        SamplerParams sp = parse_sampler_params(options);
        const llama_vocab *vocab = llama_model_get_vocab(model);
        mtmd_input_chunks *chunks = mtmd_input_chunks_init();

        if (mtmd_tokenize(mctx, chunks, [prompt UTF8String], bitmaps.data(), bitmaps.size()) != 0) {
            mtmd_input_chunks_free(chunks);
            [self sendEventWithName:@"onToken" body:@{
                @"contextId": contextId, @"done": @YES, @"error": @"Vision tokenize failed"
            }];
            return;
        }

        if (mtmd_helper_eval(mctx, ctx, chunks, llama_n_ctx(ctx), 0) != 0) {
            mtmd_input_chunks_free(chunks);
            [self sendEventWithName:@"onToken" body:@{
                @"contextId": contextId, @"done": @YES, @"error": @"Vision eval failed"
            }];
            return;
        }

        int n_past = mtmd_helper_get_n_pos(chunks);
        mtmd_input_chunks_free(chunks);

        llama_sampler *smpl = create_sampler(sp, model);

        // Removes the longest prefix of `buf` that does not end inside a multi-byte UTF-8 sequence and
        // returns it as an NSString. Returns nil when nothing displayable is available yet.
        auto drain_utf8 = [](std::string &buf) -> NSString * {
            size_t len = buf.size();
            size_t tail = 0;  // trailing continuation bytes (10xxxxxx), at most 3
            while (tail < 3 && tail < len && ((unsigned char)buf[len - 1 - tail] & 0xC0) == 0x80) {
                tail++;
            }
            size_t usable = len;
            if (tail < len) {
                unsigned char lead = (unsigned char)buf[len - 1 - tail];
                size_t expected = lead >= 0xF0 ? 4 : lead >= 0xE0 ? 3 : lead >= 0xC0 ? 2 : 1;
                if (expected > tail + 1) usable = len - 1 - tail;  // hold back the unfinished character
            }
            if (usable == 0) return nil;
            NSString *text = [[NSString alloc] initWithBytes:buf.data()
                                                      length:usable
                                                    encoding:NSUTF8StringEncoding];
            // Bytes that are not valid UTF-8 are dropped here instead of being retried forever.
            buf.erase(0, usable);
            return text.length > 0 ? text : nil;
        };

        std::string pending;  // bytes sampled but not yet emitted

        for (int i = 0; i < sp.max_tokens; i++) {
            if (is_cancelled(ctxIdStr)) break;

            llama_token new_token = llama_sampler_sample(smpl, ctx, -1);
            if (llama_vocab_is_eog(vocab, new_token)) break;

            pending += token_to_piece(model, new_token);
            NSString *text = drain_utf8(pending);
            if (text) {
                [self sendEventWithName:@"onToken" body:@{
                    @"contextId": contextId,
                    @"token": text,
                    @"done": @NO,
                }];
            }

            llama_batch single = llama_batch_init(1, 0, 1);
            llama_batch_add(single, new_token, n_past + i, {0}, true);
            if (llama_decode(ctx, single) != 0) {
                llama_batch_free(single);
                break;
            }
            llama_batch_free(single);
        }

        llama_sampler_free(smpl);
        clear_cancel(ctxIdStr);
        [self sendEventWithName:@"onToken" body:@{
            @"contextId": contextId, @"done": @YES
        }];
    });
}
881
+
882
+ // ── Grammar ──────────────────────────────────────────────────────────────────
883
+
884
/// Converts a JSON schema (given as a JSON string) into a grammar string via json_schema_to_grammar().
///
/// Returns an empty string when `schemaJson` is nil, when parsing or conversion throws, or when the
/// produced grammar cannot be represented as an NSString.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(jsonSchemaToGrammar:(NSString *)schemaJson) {
    // -UTF8String on nil yields NULL; handing NULL to the parser is a crash, not a C++ exception,
    // so the catch below would not protect against it.
    const char *raw = [schemaJson UTF8String];
    if (!raw) return @"";

    try {
        auto schema = nlohmann::ordered_json::parse(raw);
        std::string grammar = json_schema_to_grammar(schema);
        return [NSString stringWithUTF8String:grammar.c_str()] ?: @"";
    } catch (...) {
        return @"";
    }
}
893
+
894
+ // ── Embeddings ───────────────────────────────────────────────────────────────
895
+
896
/// Reports llama_model_n_embd() for the model registered under `modelId`, boxed as an NSNumber.
/// Unknown ids yield 0.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(getEmbeddingDimension:(NSString *)modelId) {
    std::lock_guard<std::mutex> guard(g_mutex);

    auto entry = g_models.find([modelId UTF8String]);
    int dimension = 0;
    if (entry != g_models.end()) {
        dimension = (int)llama_model_n_embd(entry->second);
    }
    return @(dimension);
}
902
+
903
/// Creates a llama context with embeddings enabled for the model registered under `modelId`.
///
/// Recognised `options` keys (all optional): `n_ctx`, `n_batch`, `n_ubatch`, `n_threads`,
/// `pooling_type`, `n_seq_max`. Returns the id under which the new context is stored in g_contexts,
/// or an empty string when the model id is unknown or context creation fails.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(createEmbeddingContext:(NSString *)modelId
                                       options:(NSDictionary *)options) {
    std::lock_guard<std::mutex> lock(g_mutex);
    auto it = g_models.find([modelId UTF8String]);
    if (it == g_models.end()) return @"";

    llama_context_params params = llama_context_default_params();
    params.embeddings = true;
    if (options[@"n_ctx"]) params.n_ctx = [options[@"n_ctx"] intValue];
    if (options[@"n_batch"]) {
        params.n_batch = [options[@"n_batch"] intValue];
        // llama_encode processes a batch in a single micro-batch, so a caller asking for a larger
        // n_batch needs n_ubatch raised with it for that size to be usable.
        params.n_ubatch = params.n_batch;
    }
    if (options[@"n_ubatch"]) params.n_ubatch = [options[@"n_ubatch"] intValue];
    if (options[@"n_threads"]) params.n_threads = [options[@"n_threads"] intValue];
    if (options[@"pooling_type"]) params.pooling_type = (enum llama_pooling_type)[options[@"pooling_type"] intValue];
    // Lets callers that embed several sequences per batch (embedBatch) use sequence ids above 0.
    if (options[@"n_seq_max"]) params.n_seq_max = [options[@"n_seq_max"] intValue];

    llama_context *ctx = llama_init_from_model(it->second, params);
    if (!ctx) return @"";

    register_handle(ctx);
    NSString *ctxId = generateUUID();
    g_contexts[[ctxId UTF8String]] = ctx;
    return ctxId;
}
924
+
925
/// Computes an L2-normalised embedding for a single token sequence.
///
/// @param contextId Id of a context in g_contexts.
/// @param modelId   Id of a model in g_models; used for the embedding width.
/// @param tokens    Token ids, placed at positions 0..n-1 of sequence 0.
/// @return An array of `llama_model_n_embd` numbers, or an empty array when the input is empty, is
///         larger than the context's micro-batch size, encoding fails, or no sequence embedding is
///         available. A zero-norm vector is returned as all zeros.
/// @throws NSException (E_INVALID_HANDLE) when either id is unknown.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(embed:(NSString *)contextId
                                       modelId:(NSString *)modelId
                                       tokens:(NSArray<NSNumber *> *)tokens) {
    std::lock_guard<std::mutex> lock(g_mutex);
    auto ci = g_contexts.find([contextId UTF8String]);
    auto mi = g_models.find([modelId UTF8String]);
    if (ci == g_contexts.end() || mi == g_models.end()) {
        @throw [NSException exceptionWithName:@"E_INVALID_HANDLE"
                                       reason:@"Invalid context or model ID for embed()"
                                     userInfo:nil];
    }

    llama_context *ctx = ci->second;

    // llama_encode handles the whole batch in one micro-batch and asserts (aborting the process)
    // when the batch is larger than n_ubatch, so oversized input is refused here instead.
    int n = (int)tokens.count;
    if (n == 0 || (uint32_t)n > llama_n_ubatch(ctx)) return @[];

    // Build batch
    llama_batch batch = llama_batch_init(n, 0, 1);
    for (int i = 0; i < n; i++) {
        llama_batch_add(batch, [tokens[i] intValue], i, {0}, true);
    }

    if (llama_encode(ctx, batch) != 0) {
        llama_batch_free(batch);
        return @[];
    }
    llama_batch_free(batch);

    // Extract embeddings
    int n_embd = llama_model_n_embd(mi->second);
    const float *embd = llama_get_embeddings_seq(ctx, 0);
    if (!embd) return @[];

    // L2 normalize
    float norm = 0.0f;
    for (int i = 0; i < n_embd; i++) norm += embd[i] * embd[i];
    norm = sqrtf(norm);

    NSMutableArray *result = [NSMutableArray arrayWithCapacity:n_embd];
    for (int i = 0; i < n_embd; i++) {
        [result addObject:@(norm > 0.0f ? embd[i] / norm : 0.0f)];
    }
    return result;
}
968
+
969
/// Computes L2-normalised embeddings for several token sequences in one encode call.
///
/// Sequence `k` of `tokenArrays` is submitted with sequence id `k`, positions starting at 0.
///
/// @return One array per input sequence, in order. A sequence for which no embedding is available
///         contributes an empty array. The whole result is an empty array when there are no tokens,
///         when the combined token count exceeds the context's micro-batch size, or when encoding
///         fails.
/// @throws NSException (E_INVALID_HANDLE) when either id is unknown.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(embedBatch:(NSString *)contextId
                                       modelId:(NSString *)modelId
                                       tokenArrays:(NSArray<NSArray<NSNumber *> *> *)tokenArrays) {
    std::lock_guard<std::mutex> lock(g_mutex);
    auto ci = g_contexts.find([contextId UTF8String]);
    auto mi = g_models.find([modelId UTF8String]);
    if (ci == g_contexts.end() || mi == g_models.end()) {
        @throw [NSException exceptionWithName:@"E_INVALID_HANDLE"
                                       reason:@"Invalid context or model ID for embedBatch()"
                                     userInfo:nil];
    }

    llama_context *ctx = ci->second;
    int n_seqs = (int)tokenArrays.count;
    int n_embd = llama_model_n_embd(mi->second);

    // Calculate total tokens
    int total_tokens = 0;
    for (NSArray *toks in tokenArrays) total_tokens += (int)toks.count;

    // llama_encode handles the whole batch in one micro-batch and asserts (aborting the process)
    // when the batch is larger than n_ubatch, so oversized input is refused here instead.
    if (total_tokens == 0 || (uint32_t)total_tokens > llama_n_ubatch(ctx)) return @[];

    // The third argument is the number of sequence ids attached to each token, which is always 1.
    llama_batch batch = llama_batch_init(total_tokens, 0, 1);
    for (int seq = 0; seq < n_seqs; seq++) {
        NSArray *toks = tokenArrays[seq];
        int count = (int)toks.count;
        for (int i = 0; i < count; i++) {
            llama_batch_add(batch, [toks[i] intValue], i, {seq}, i == count - 1);
        }
    }

    if (llama_encode(ctx, batch) != 0) {
        llama_batch_free(batch);
        return @[];
    }
    llama_batch_free(batch);

    NSMutableArray *results = [NSMutableArray arrayWithCapacity:n_seqs];
    for (int seq = 0; seq < n_seqs; seq++) {
        const float *embd = llama_get_embeddings_seq(ctx, seq);
        if (!embd) {
            [results addObject:@[]];
            continue;
        }

        float norm = 0.0f;
        for (int i = 0; i < n_embd; i++) norm += embd[i] * embd[i];
        norm = sqrtf(norm);

        NSMutableArray *vec = [NSMutableArray arrayWithCapacity:n_embd];
        for (int i = 0; i < n_embd; i++) {
            [vec addObject:@(norm > 0.0f ? embd[i] / norm : 0.0f)];
        }
        [results addObject:vec];
    }
    return results;
}
1024
+
1025
+ // ── Batch inference ──────────────────────────────────────────────────────────
1026
+
1027
/// Generates completions for several prompts in parallel, streaming "onBatchToken" events.
///
/// Prompt `k` is evaluated as sequence id `k`. Events carry `contextId` and `seqIndex`; text arrives
/// with `done: NO`, and each sequence ends with one `done: YES` event whose `finishReason` is
/// "stop" (end-of-generation token), "length" (`sp.max_tokens` reached), "cancelled", or "error".
/// Failures that affect the whole call are reported once with `seqIndex: -1` and an `error` string.
///
/// Each sequence is sampled from its own logits row: the index of the sequence's last token in the
/// most recently decoded batch. All live sequences are advanced with one llama_decode per step.
RCT_EXPORT_METHOD(startBatch:(NSString *)modelId
                  contextId:(NSString *)contextId
                  prompts:(NSArray<NSString *> *)prompts
                  options:(NSDictionary *)options) {
    std::string ctxIdStr = [contextId UTF8String];

    dispatch_async(inference_queue(), ^{
        // Reports a failure that is not tied to a single sequence.
        void (^failAll)(NSString *) = ^(NSString *message) {
            [self sendEventWithName:@"onBatchToken" body:@{
                @"contextId": contextId, @"done": @YES, @"error": message, @"seqIndex": @(-1)
            }];
        };

        llama_model *model;
        llama_context *ctx;
        {
            std::lock_guard<std::mutex> lock(g_mutex);
            auto mi = g_models.find([modelId UTF8String]);
            auto ci = g_contexts.find(ctxIdStr);
            if (mi == g_models.end() || ci == g_contexts.end()) {
                failAll(@"Not found");
                return;
            }
            model = mi->second;
            ctx = ci->second;
        }

        SamplerParams sp = parse_sampler_params(options);
        const llama_vocab *vocab = llama_model_get_vocab(model);
        int n_seqs = (int)prompts.count;

        // Tokenize all prompts
        std::vector<std::vector<llama_token>> all_tokens(n_seqs);
        int total_tokens = 0;
        for (int s = 0; s < n_seqs; s++) {
            const char *cprompt = [prompts[s] UTF8String];
            int plen = (int)strlen(cprompt);
            all_tokens[s].resize(plen + 16);
            int n = llama_tokenize(vocab, cprompt, plen, all_tokens[s].data(), (int)all_tokens[s].size(), true, true);
            if (n < 0) {
                // A negative result is the required buffer size; retry once with that size.
                all_tokens[s].resize(-n);
                n = llama_tokenize(vocab, cprompt, plen, all_tokens[s].data(), (int)all_tokens[s].size(), true, true);
            }
            if (n < 0) {
                // Resizing a vector with a negative count would request an enormous allocation.
                failAll(@"Tokenize failed");
                return;
            }
            all_tokens[s].resize(n);
            total_tokens += n;
        }

        // llama_decode asserts (aborting the process) on batches larger than n_batch.
        if (total_tokens > (int)llama_n_batch(ctx)) {
            failAll(@"Prompts exceed n_batch");
            return;
        }

        // Eval all prompts in one batch, remembering where each sequence's last token sits so the
        // matching logits row can be sampled afterwards. -1 means the sequence produced no logits.
        std::vector<int> logits_idx(n_seqs, -1);
        llama_batch batch = llama_batch_init(total_tokens, 0, 1);
        int filled = 0;
        for (int s = 0; s < n_seqs; s++) {
            int count = (int)all_tokens[s].size();
            for (int i = 0; i < count; i++) {
                bool is_last = (i == count - 1);
                if (is_last) logits_idx[s] = filled;
                llama_batch_add(batch, all_tokens[s][i], i, {s}, is_last);
                filled++;
            }
        }
        if (llama_decode(ctx, batch) != 0) {
            llama_batch_free(batch);
            failAll(@"Decode failed");
            return;
        }
        llama_batch_free(batch);

        // Per-sequence state
        std::vector<llama_sampler *> samplers(n_seqs);
        std::vector<bool> done(n_seqs, false);
        std::vector<int> positions(n_seqs);
        std::vector<std::string> pending(n_seqs);  // bytes sampled but not yet emitted
        for (int s = 0; s < n_seqs; s++) {
            samplers[s] = create_sampler(sp, model);
            positions[s] = (int)all_tokens[s].size();
            if (logits_idx[s] < 0) {
                // A prompt that tokenized to nothing has no logits to sample from.
                done[s] = true;
                [self sendEventWithName:@"onBatchToken" body:@{
                    @"contextId": contextId, @"seqIndex": @(s), @"done": @YES,
                    @"finishReason": @"error", @"error": @"Empty prompt"
                }];
            }
        }

        // Removes the longest prefix of `buf` that does not end inside a multi-byte UTF-8 sequence and
        // returns it as an NSString. Returns nil when nothing displayable is available yet. Needed
        // because a token piece may be a partial character, and a nil NSString in a dictionary
        // literal throws.
        auto drain_utf8 = [](std::string &buf) -> NSString * {
            size_t len = buf.size();
            size_t tail = 0;  // trailing continuation bytes (10xxxxxx), at most 3
            while (tail < 3 && tail < len && ((unsigned char)buf[len - 1 - tail] & 0xC0) == 0x80) {
                tail++;
            }
            size_t usable = len;
            if (tail < len) {
                unsigned char lead = (unsigned char)buf[len - 1 - tail];
                size_t expected = lead >= 0xF0 ? 4 : lead >= 0xE0 ? 3 : lead >= 0xC0 ? 2 : 1;
                if (expected > tail + 1) usable = len - 1 - tail;  // hold back the unfinished character
            }
            if (usable == 0) return nil;
            NSString *text = [[NSString alloc] initWithBytes:buf.data()
                                                      length:usable
                                                    encoding:NSUTF8StringEncoding];
            // Bytes that are not valid UTF-8 are dropped here instead of being retried forever.
            buf.erase(0, usable);
            return text.length > 0 ? text : nil;
        };

        bool cancelled = false;
        bool decode_failed = false;
        for (int iter = 0; iter < sp.max_tokens; iter++) {
            if (is_cancelled(ctxIdStr)) { cancelled = true; break; }

            // One token per live sequence goes into a shared step batch.
            llama_batch step = llama_batch_init(n_seqs, 0, 1);
            int queued = 0;
            for (int s = 0; s < n_seqs; s++) {
                if (done[s]) continue;

                llama_token new_token = llama_sampler_sample(samplers[s], ctx, logits_idx[s]);
                if (llama_vocab_is_eog(vocab, new_token)) {
                    done[s] = true;
                    [self sendEventWithName:@"onBatchToken" body:@{
                        @"contextId": contextId, @"seqIndex": @(s), @"done": @YES, @"finishReason": @"stop"
                    }];
                    continue;
                }

                pending[s] += token_to_piece(model, new_token);
                NSString *text = drain_utf8(pending[s]);
                if (text) {
                    [self sendEventWithName:@"onBatchToken" body:@{
                        @"contextId": contextId, @"seqIndex": @(s),
                        @"token": text, @"done": @NO
                    }];
                }

                // After the next decode this sequence's logits live at its slot in `step`.
                logits_idx[s] = queued;
                llama_batch_add(step, new_token, positions[s], {s}, true);
                positions[s]++;
                queued++;
            }

            if (queued == 0) {
                // Every sequence has finished.
                llama_batch_free(step);
                break;
            }

            int rc = llama_decode(ctx, step);
            llama_batch_free(step);
            if (rc != 0) { decode_failed = true; break; }
        }

        // Cleanup
        for (auto *s : samplers) llama_sampler_free(s);
        clear_cancel(ctxIdStr);

        // Mark any remaining sequences as done
        NSString *reason = cancelled ? @"cancelled" : (decode_failed ? @"error" : @"length");
        for (int s = 0; s < n_seqs; s++) {
            if (done[s]) continue;
            NSMutableDictionary *body = [@{
                @"contextId": contextId, @"seqIndex": @(s), @"done": @YES, @"finishReason": reason
            } mutableCopy];
            if (decode_failed) body[@"error"] = @"Decode failed";
            [self sendEventWithName:@"onBatchToken" body:body];
        }
    });
}
1143
+
1144
+ // ── Quantization ─────────────────────────────────────────────────────────────
1145
+
1146
/// Quantizes the model file at `inputPath` into `outputPath` on the inference queue and reports the
/// outcome with an "onQuantizeComplete" event whose `error` is NSNull on success or a message.
///
/// Recognised `options` keys (all optional): `ftype` (defaults to LLAMA_FTYPE_MOSTLY_Q4_K_M),
/// `nthread`, `allow_requantize`, `quantize_output_tensor`, `pure`.
RCT_EXPORT_METHOD(quantize:(NSString *)inputPath
                  outputPath:(NSString *)outputPath
                  options:(NSDictionary *)options) {
    dispatch_async(inference_queue(), ^{
        const char *src = [inputPath UTF8String];
        const char *dst = [outputPath UTF8String];
        if (!src || !dst) {
            // A nil path would reach llama_model_quantize as a NULL pointer.
            [self sendEventWithName:@"onQuantizeComplete" body:@{
                @"error": @"Quantization requires an input and an output path",
            }];
            return;
        }

        llama_model_quantize_params params = llama_model_quantize_default_params();
        // Objective-C++ has no implicit int -> enum conversion, so the cast is required.
        params.ftype = options[@"ftype"] ? (enum llama_ftype)[options[@"ftype"] intValue]
                                         : LLAMA_FTYPE_MOSTLY_Q4_K_M;
        if (options[@"nthread"]) params.nthread = [options[@"nthread"] intValue];
        if (options[@"allow_requantize"]) params.allow_requantize = [options[@"allow_requantize"] boolValue];
        if (options[@"quantize_output_tensor"]) params.quantize_output_tensor = [options[@"quantize_output_tensor"] boolValue];
        if (options[@"pure"]) params.pure = [options[@"pure"] boolValue];

        uint32_t result = llama_model_quantize(src, dst, &params);

        NSString *error = (result != 0) ? [NSString stringWithFormat:@"Quantization failed with code %u", result] : nil;
        [self sendEventWithName:@"onQuantizeComplete" body:@{
            @"error": error ?: [NSNull null],
        }];
    });
}
1165
+
1166
+ // ── Logging ──────────────────────────────────────────────────────────────────
1167
+
1168
/// Log sink installed with llama_log_set(): forwards a message to JS as an "onLog" event.
///
/// Messages are dropped when log events are disabled, when `level` is below g_log_min_level, when no
/// module is registered in g_log_module, or when that module has no listeners.
static void llm_log_callback(enum ggml_log_level level, const char *text, void * /*user_data*/) {
    if (!g_log_events_enabled.load(std::memory_order_relaxed)) return;
    if ((int)level < g_log_min_level.load(std::memory_order_relaxed)) return;

    LocalLLM *module = g_log_module;
    if (!module || !module->_hasListeners) return;

    // A boxed @(text) raises on NULL and produces nil for bytes that are not valid UTF-8, and nil
    // inside a dictionary literal throws. Logging must never take the process down.
    if (!text) return;
    NSString *message = [NSString stringWithUTF8String:text];
    if (!message) {
        // Latin-1 maps every byte to a character, so this conversion cannot fail on byte content.
        message = [[NSString alloc] initWithCString:text encoding:NSISOLatin1StringEncoding];
    }
    if (!message) return;

    [module sendEventWithName:@"onLog" body:@{
        @"level": @((int)level),
        @"text": message,
    }];
}
1180
+
1181
/// Stores `level`, truncated to int, as the minimum level read by llm_log_callback. Returns nil.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(setLogLevel:(double)level) {
    const int threshold = (int)level;
    g_log_min_level.store(threshold, std::memory_order_relaxed);
    return nil;
}
1185
+
1186
/// Turns forwarding of llama log output to JS on or off.
///
/// Enabling records this module in g_log_module and installs llm_log_callback; disabling passes a
/// null callback to llama_log_set and then clears g_log_module.
RCT_EXPORT_METHOD(enableLogEvents:(BOOL)enabled) {
    g_log_events_enabled.store(enabled, std::memory_order_relaxed);

    if (!enabled) {
        llama_log_set(nullptr, nullptr);
        g_log_module = nil;
        return;
    }

    g_log_module = self;
    llama_log_set(llm_log_callback, nullptr);
}
1196
+
1197
+ // ── Downloads ────────────────────────────────────────────────────────────────
1198
+
1199
/// Starts a download task for `url` on _downloadSession, first recording `destPath` for that url in
/// the download delegate's destPaths. Does nothing when `url` cannot be parsed into an NSURL.
RCT_EXPORT_METHOD(downloadModel:(NSString *)url destPath:(NSString *)destPath) {
    NSURL *remote = [NSURL URLWithString:url];
    if (remote == nil) {
        return;
    }

    _downloadDelegate.destPaths[url] = destPath;
    [[_downloadSession downloadTaskWithURL:remote] resume];
}
1207
+
1208
/// Cancels every download task on _downloadSession whose original request URL string equals `url`.
RCT_EXPORT_METHOD(cancelDownload:(NSString *)url) {
    [_downloadSession getTasksWithCompletionHandler:^(NSArray *dataTasks, NSArray *uploadTasks, NSArray *downloadTasks) {
        for (NSURLSessionDownloadTask *candidate in downloadTasks) {
            NSString *source = candidate.originalRequest.URL.absoluteString;
            if (![source isEqualToString:url]) {
                continue;
            }
            [candidate cancel];
        }
    }];
}
1217
+
1218
+ // ── Device capabilities ──────────────────────────────────────────────────────
1219
+
1220
/// Returns a dictionary describing the device: `totalRAM`, `availableRAM`, `gpuName`, `metalFamily`,
/// `metalVersion`, `iosVersion` and `isLowPowerMode`.
///
/// `metalFamily` is the highest Apple GPU family from 9 down to 4 that the default Metal device
/// reports support for (0 when there is no device or none match). `metalVersion` is derived from it:
/// 3 for family >= 7, 2 for family >= 5, otherwise 1.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(getDeviceCapabilities) {
    NSProcessInfo *process = [NSProcessInfo processInfo];
    id<MTLDevice> device = MTLCreateSystemDefaultDevice();

    uint64_t ramTotal = process.physicalMemory;
    uint64_t ramAvailable = os_proc_available_memory();

    NSOperatingSystemVersion os = process.operatingSystemVersion;
    NSString *osString = [NSString stringWithFormat:@"%ld.%ld.%ld",
                          (long)os.majorVersion, (long)os.minorVersion, (long)os.patchVersion];

    // Probe GPU families from newest to oldest; the first match wins.
    struct FamilyProbe {
        MTLGPUFamily family;
        int generation;
    };
    const FamilyProbe probes[] = {
        {MTLGPUFamilyApple9, 9},
        {MTLGPUFamilyApple8, 8},
        {MTLGPUFamilyApple7, 7},
        {MTLGPUFamilyApple6, 6},
        {MTLGPUFamilyApple5, 5},
        {MTLGPUFamilyApple4, 4},
    };
    int generation = 0;
    if (device) {
        for (const FamilyProbe &probe : probes) {
            if ([device supportsFamily:probe.family]) {
                generation = probe.generation;
                break;
            }
        }
    }

    int metalMajor;
    if (generation >= 7) {
        metalMajor = 3;
    } else if (generation >= 5) {
        metalMajor = 2;
    } else {
        metalMajor = 1;
    }

    return @{
        @"totalRAM": @(ramTotal),
        @"availableRAM": @(ramAvailable),
        @"gpuName": device ? device.name : @"unknown",
        @"metalFamily": @(generation),
        @"metalVersion": @(metalMajor),
        @"iosVersion": osString,
        @"isLowPowerMode": @(process.isLowPowerModeEnabled),
    };
}
1254
+
1255
/// Returns the path of the `local-llm/models` directory inside the user's Application Support
/// directory, creating it (with intermediate directories) when it does not exist yet.
/// A failure to create the directory is not reported; the path is returned regardless.
RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(getModelStoragePath) {
    NSString *supportRoot =
        NSSearchPathForDirectoriesInDomains(NSApplicationSupportDirectory, NSUserDomainMask, YES).firstObject;
    NSString *modelsDir = [supportRoot stringByAppendingPathComponent:@"local-llm/models"];

    NSFileManager *files = [NSFileManager defaultManager];
    BOOL alreadyPresent = [files fileExistsAtPath:modelsDir];
    if (!alreadyPresent) {
        [files createDirectoryAtPath:modelsDir
         withIntermediateDirectories:YES
                          attributes:nil
                               error:nil];
    }
    return modelsDir;
}
1266
+
1267
+ @end