@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -3,7 +3,9 @@
  #include "common.h"
  #include "log.h"
  #include "llama.h"
- #include "common/base64.hpp"
+ #include "arg.h" // common_remote_get_content
+ #include "base64.hpp"
+ #include "mtmd.h"

  // increase max payload length to allow use of larger context size
  #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@@ -21,6 +23,7 @@
  #include <string>
  #include <vector>
  #include <memory>
+ #include <cinttypes>

  #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"

@@ -41,6 +44,8 @@ using json = nlohmann::ordered_json;
  #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
  #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)

+ using raw_buffer = std::vector<uint8_t>;
+
  template <typename T>
  static T json_value(const json & body, const std::string & key, const T & default_value) {
      // Fallback null to default value
@@ -58,6 +63,32 @@ static T json_value(const json & body, const std::string & key, const T & defaul

  const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);

+ // thin wrapper around common_grammar_trigger with (de)serialization functions
+ struct server_grammar_trigger {
+     common_grammar_trigger value;
+
+     server_grammar_trigger() = default;
+     server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
+     server_grammar_trigger(const json & in) {
+         value.type = (common_grammar_trigger_type) in.at("type").get<int>();
+         value.value = in.at("value").get<std::string>();
+         if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+             value.token = (llama_token) in.at("token").get<int>();
+         }
+     }
+
+     json to_json() const {
+         json out {
+             {"type", (int) value.type},
+             {"value", value.value},
+         };
+         if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+             out["token"] = (int) value.token;
+         }
+         return out;
+     }
+ };
+
  //
  // tokenizer and input processing utils
  //
@@ -360,7 +391,7 @@ static inline bool is_base64(uint8_t c) {
      return (isalnum(c) || (c == '+') || (c == '/'));
  }

- static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) {
+ static inline raw_buffer base64_decode(const std::string & encoded_string) {
      int i = 0;
      int j = 0;
      int in_ = 0;
@@ -370,7 +401,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
      uint8_t char_array_4[4];
      uint8_t char_array_3[3];

-     std::vector<uint8_t> ret;
+     raw_buffer ret;

      while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
          char_array_4[i++] = encoded_string[in_]; in_++;
@@ -552,8 +583,11 @@ static json oaicompat_completion_params_parse(const json & body) {
  static json oaicompat_completion_params_parse(
      const json & body, /* openai api json semantics */
      bool use_jinja,
+     bool prefill_assistant,
      common_reasoning_format reasoning_format,
-     const struct common_chat_templates * tmpls)
+     const struct common_chat_templates * tmpls,
+     bool allow_non_text,
+     std::vector<raw_buffer> & out_files)
  {
      json llama_params;

@@ -601,8 +635,89 @@ static json oaicompat_completion_params_parse(
          }
      }

+     // get input files
+     if (!body.contains("messages")) {
+         throw std::runtime_error("'messages' is required");
+     }
+     json messages = body.at("messages");
+     if (!messages.is_array()) {
+         throw std::runtime_error("Expected 'messages' to be an array");
+     }
+     for (auto & msg : messages) {
+         std::string role = json_value(msg, "role", std::string());
+         if (role != "assistant" && !msg.contains("content")) {
+             throw std::runtime_error("All non-assistant messages must contain 'content'");
+         }
+         if (role == "assistant") {
+             if (!msg.contains("content") && !msg.contains("tool_calls")) {
+                 throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
+             }
+             if (!msg.contains("content")) {
+                 continue; // avoid errors with no content
+             }
+         }
+         json & content = msg.at("content");
+         if (content.is_string() || content.is_null()) {
+             continue;
+         }
+
+         if (!content.is_array()) {
+             throw std::runtime_error("Expected 'content' to be a string or an array");
+         }
+
+         for (auto & p : content) {
+             std::string type = json_value(p, "type", std::string());
+             json image_url = json_value(p, "image_url", json::object());
+             if (type == "image_url") {
+                 if (!allow_non_text) {
+                     throw std::runtime_error("image input is not supported by this server");
+                 }
+
+                 std::string url = json_value(image_url, "url", std::string());
+                 if (string_starts_with(url, "http")) {
+                     // download remote image
+                     // TODO @ngxson : maybe make these params configurable
+                     common_remote_params params;
+                     params.headers.push_back("User-Agent: llama.cpp/" + build_info);
+                     params.max_size = 1024 * 1024 * 10; // 10MB
+                     params.timeout = 10; // seconds
+                     SRV_INF("downloading image from '%s'\n", url.c_str());
+                     auto res = common_remote_get_content(url, params);
+                     if (200 <= res.first && res.first < 300) {
+                         SRV_INF("downloaded %ld bytes\n", res.second.size());
+                         raw_buffer data;
+                         data.insert(data.end(), res.second.begin(), res.second.end());
+                         out_files.push_back(data);
+                     } else {
+                         throw std::runtime_error("Failed to download image");
+                     }
+
+                 } else {
+                     // try to decode base64 image
+                     std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
+                     if (parts.size() != 2) {
+                         throw std::runtime_error("Invalid image_url.url value");
+                     } else if (!string_starts_with(parts[0], "data:image/")) {
+                         throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
+                     } else if (!string_ends_with(parts[0], "base64")) {
+                         throw std::runtime_error("image_url.url must be base64 encoded");
+                     } else {
+                         auto base64_data = parts[1];
+                         auto decoded_data = base64_decode(base64_data);
+                         out_files.push_back(decoded_data);
+                     }
+                 }
+
+                 // replace this chunk with a marker
+                 p["type"] = "text";
+                 p["text"] = MTMD_DEFAULT_IMAGE_MARKER;
+                 p.erase("image_url");
+             }
+         }
+     }
+
      common_chat_templates_inputs inputs;
-     inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
+     inputs.messages = common_chat_msgs_parse_oaicompat(messages);
      inputs.tools = common_chat_tools_parse_oaicompat(tools);
      inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
      inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
@@ -616,9 +731,31 @@ static json oaicompat_completion_params_parse(
          throw std::runtime_error("Cannot use custom grammar constraints with tools.");
      }

+     // if the assistant message appears at the end of list, we do not add end-of-turn token
+     // for ex. this can be useful to modify the reasoning process in reasoning models
+     bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
+     common_chat_msg last_message;
+     if (prefill_assistant_message) {
+         last_message = inputs.messages.back();
+         inputs.messages.pop_back();
+
+         /* sanity check, max one assistant message at the end of the list */
+         if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
+             throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
+         }
+
+         inputs.extract_reasoning = false;
+         inputs.add_generation_prompt = true;
+     }
+
      // Apply chat template to the list of messages
      auto chat_params = common_chat_templates_apply(tmpls, inputs);

+     /* Append assistant prefilled message */
+     if (prefill_assistant_message) {
+         chat_params.prompt += last_message.content;
+     }
+
      llama_params["chat_format"] = static_cast<int>(chat_params.format);
      llama_params["prompt"] = chat_params.prompt;
      if (!chat_params.grammar.empty()) {
@@ -627,7 +764,8 @@ static json oaicompat_completion_params_parse(
      llama_params["grammar_lazy"] = chat_params.grammar_lazy;
      auto grammar_triggers = json::array();
      for (const auto & trigger : chat_params.grammar_triggers) {
-         grammar_triggers.push_back(trigger.to_json<json>());
+         server_grammar_trigger ct(trigger);
+         grammar_triggers.push_back(ct.to_json());
      }
      llama_params["grammar_triggers"] = grammar_triggers;
      llama_params["preserved_tokens"] = chat_params.preserved_tokens;
@@ -886,3 +1024,286 @@ static std::vector<common_adapter_lora_info> parse_lora_request(

      return lora;
  }
+
+ //
+ // utils for interacting with libmtmd
+ // (may need to refactor in near future)
+ //
+
+ /**
+  * server_tokens is a helper to manage the input tokens and image for the server.
+  * it is made this way to simplify the logic of KV cache management.
+  */
+ struct server_tokens {
+     bool has_mtmd = false;
+
+ private: // disallow accessing these members directly, risking out-of-sync
+
+     // map a **start** position in tokens to the image chunk
+     std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;
+
+     // list of tokens
+     // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
+     // a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
+     // important: for models using mrope, an image can contain multiple tokens but will use only one **position**
+     llama_tokens tokens;
+
+     // for ex. with input of 5 text tokens and 2 images:
+     //      [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
+     // pos   0   1   2   3   4    5      6      7      8      9
+     // map_pos_to_image will contain: {5, img0}, {8, img1}
+
+ public:
+     server_tokens() = default;
+     ~server_tokens() = default;
+
+     // Prevent copying
+     server_tokens(const server_tokens&) = delete;
+     server_tokens& operator=(const server_tokens&) = delete;
+
+     // Allow moving (usually implicitly generated if members are movable)
+     server_tokens(server_tokens&&) = default;
+     server_tokens& operator=(server_tokens&&) = default;
+
+     // Allow accessing elements using [] operator
+     llama_token operator[](size_t index) { return tokens[index]; }
+     const llama_token& operator[](size_t index) const { return tokens[index]; }
+
+     server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
+         for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
+             push_back(mtmd_chunks[i]);
+         }
+     }
+
+     server_tokens(llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
+
+     // for debugging
+     std::string str() const {
+         std::ostringstream oss;
+         oss << "tokens: ";
+         for (const auto & t : tokens) {
+             if (t == LLAMA_TOKEN_NULL) {
+                 oss << "<embd> ";
+             } else {
+                 oss << t << " ";
+             }
+         }
+         oss << "\n";
+         oss << "image pos: ";
+         for (const auto & it : map_pos_to_image) {
+             oss << it.first << ", ";
+         }
+         return oss.str();
+     }
+
+     const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
+         auto it = map_pos_to_image.find(pos);
+         if (it != map_pos_to_image.end()) {
+             return it->second;
+         } else {
+             throw std::runtime_error("Chunk not found");
+         }
+     }
+
+     void push_back(llama_token tok) {
+         if (tok == LLAMA_TOKEN_NULL) {
+             throw std::runtime_error("Invalid token");
+         }
+         tokens.emplace_back(tok);
+     }
+
+     // will create a copy of the chunk if it contains non-text data
+     void push_back(const mtmd_input_chunk * chunk) {
+         auto type = mtmd_input_chunk_get_type(chunk);
+         if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+             GGML_ASSERT(has_mtmd);
+             auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+             const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+             llama_pos start_pos = tokens.size();
+             for (int i = 0; i < n_pos; ++i) {
+                 tokens.emplace_back(LLAMA_TOKEN_NULL);
+             }
+             mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
+             map_pos_to_image[start_pos] = std::move(new_chunk);
+         } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+             size_t n_tokens;
+             auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+             for (size_t i = 0; i < n_tokens; ++i) {
+                 push_back(text_tokens[i]);
+             }
+         } else {
+             GGML_ABORT("Invalid chunk type");
+         }
+     }
+
+     // for compatibility with context shift and prompt truncation
+     void insert(const llama_tokens & inp_tokens) {
+         GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+         tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
+     }
+
+     // for compatibility with speculative decoding, ctx shift, slot save/load
+     const llama_tokens & get_text_tokens() const {
+         GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+         return tokens;
+     }
+
+     // for compatibility with speculative decoding
+     void set_token(llama_pos pos, llama_token id) {
+         GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+         tokens[pos] = id;
+     }
+
+     size_t size() const {
+         return tokens.size();
+     }
+
+     bool empty() const {
+         return tokens.empty();
+     }
+
+     void clear() {
+         tokens.clear();
+     }
+
+     void keep_first(size_t n) {
+         GGML_ASSERT(n <= tokens.size());
+         if (has_mtmd) {
+             // we throw an error if we try to remove a token in the middle of an image
+             // for ex. with input of 5 text tokens and 2 images:
+             //    [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
+             // n   1   2   3   4   5    6      7      8      9     10
+             // allowed to resize       ^                            ^
+             // disallowed to resize          ^      ^      ^
+             if (n > 0) {
+                 llama_token last_token = tokens[n - 1];
+                 // make sure we never remove tokens in the middle of an image
+                 if (last_token == LLAMA_TOKEN_NULL) {
+                     find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
+                 }
+             }
+             // remove all image chunks that are not used anymore
+             for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
+                 llama_pos pos = it->first;
+                 if (pos >= (llama_pos)n) {
+                     it = map_pos_to_image.erase(it);
+                 } else {
+                     ++it;
+                 }
+             }
+         }
+         tokens.resize(n);
+     }
+
+     std::string detokenize(const llama_context * ctx, bool special) const {
+         llama_tokens text_tokens;
+         text_tokens.reserve(tokens.size());
+         for (const auto & t : tokens) {
+             if (t != LLAMA_TOKEN_NULL) {
+                 text_tokens.push_back(t);
+             }
+         }
+         return common_detokenize(ctx, text_tokens, special);
+     }
+
+     size_t get_common_prefix(const server_tokens & b) const {
+         size_t max_idx = std::min(tokens.size(), b.tokens.size());
+         for (size_t i = 0; i < max_idx; ++i) {
+             auto & ai = tokens[i];
+             auto & bi = b.tokens[i];
+
+             if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
+                 GGML_ASSERT(has_mtmd);
+                 const auto & a_chunk = find_chunk(i);
+                 const auto & b_chunk = b.find_chunk(i);
+                 GGML_ASSERT(a_chunk && b_chunk);
+                 const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
+                 const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
+                 std::string ai_id = mtmd_image_tokens_get_id(a_img);
+                 std::string bi_id = mtmd_image_tokens_get_id(b_img);
+                 size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
+                 size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
+                 if (ai_id == bi_id && a_pos == b_pos) {
+                     GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
+                     i += a_pos - 1; // will be +1 by the for loop
+                     continue;
+                 } else {
+                     return i;
+                 }
+             } else if (ai == bi) {
+                 continue;
+             } else {
+                 return i;
+             }
+         }
+         return max_idx; // all tokens are equal
+     }
+
+     // make sure all text tokens are within the vocab range
+     bool validate(const struct llama_context * ctx) const {
+         const llama_model * model = llama_get_model(ctx);
+         const llama_vocab * vocab = llama_model_get_vocab(model);
+         const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+
+         for (size_t i = 0; i < tokens.size(); ++i) {
+             auto & t = tokens[i];
+             if (t == LLAMA_TOKEN_NULL) {
+                 try {
+                     const auto & chunk = find_chunk(i);
+                     const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
+                     size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+                     i += n_pos - 1; // will be +1 by the for loop
+                 } catch (const std::exception & e) {
+                     return false;
+                 }
+             } else if (t < 0 || t >= n_vocab) {
+                 return false;
+             }
+         }
+         return true;
+     }
+
+     // encode and decode the image chunk
+     int32_t process_chunk(
+                 llama_context * ctx,
+                 mtmd_context * mctx,
+                 llama_pos n_past,
+                 int32_t seq_id,
+                 llama_pos & n_pos_out) {
+         auto it = map_pos_to_image.find(n_past);
+         if (it == map_pos_to_image.end()) {
+             throw std::runtime_error("Chunk not found");
+         }
+         SRV_INF("%s\n", "processing image...");
+         int32_t n_batch = llama_n_batch(ctx);
+         int64_t t0 = ggml_time_ms();
+         llama_pos new_n_past = n_past;
+         int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
+             it->second.get(), // chunk
+             n_past,
+             seq_id,
+             n_batch,
+             true, // logits last
+             &new_n_past);
+         SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
+         if (result != 0) {
+             LOG_ERR("mtmd_helper_eval failed with status %d", result);
+             n_pos_out = n_past;
+             return result;
+         }
+         n_pos_out = new_n_past;
+         return 0;
+     }
+ };
+
+ // Computes FNV-1a hash of the data
+ static std::string fnv_hash(const uint8_t * data, size_t len) {
+     const uint64_t fnv_prime = 0x100000001b3ULL;
+     uint64_t hash = 0xcbf29ce484222325ULL;
+
+     for (size_t i = 0; i < len; ++i) {
+         hash ^= data[i];
+         hash *= fnv_prime;
+     }
+     return std::to_string(hash);
+ }
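
The server utils hunks above add multimodal input to the OpenAI-compatible chat endpoint: each image_url content part is downloaded over HTTP or base64-decoded from a data URL, buffered into out_files, and replaced in the message by a text marker before the chat template is applied. Below is a minimal client sketch of the payload shape that parser accepts; it assumes a llama-server built with multimodal support listening on localhost:8080, and the host, port, model, and image path are assumptions, only the messages/content/image_url structure mirrors the parsing code above.

// Minimal sketch (TypeScript, Node.js 18+ for built-in fetch); not part of this diff.
import { readFile } from "node:fs/promises";

async function describeImage(imagePath: string): Promise<string> {
    // read the image and encode it as a base64 data URL
    const b64 = (await readFile(imagePath)).toString("base64");
    const res = await fetch("http://localhost:8080/v1/chat/completions", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
            messages: [
                {
                    role: "user",
                    content: [
                        { type: "text", text: "Describe this image." },
                        // the server splits the data URL on ',' and requires the prefix
                        // to start with "data:image/" and end with "base64"
                        { type: "image_url", image_url: { url: `data:image/png;base64,${b64}` } },
                    ],
                },
            ],
        }),
    });
    if (!res.ok) {
        throw new Error(`request failed: ${res.status}`);
    }
    const data = await res.json();
    return data.choices[0].message.content;
}

describeImage("./example.png").then(console.log).catch(console.error);
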
@@ -577,12 +577,7 @@ int main(int argc, char ** argv) {

      const llama_vocab * vocab = llama_model_get_vocab(model_ttc);

-     // TODO: refactor in a common struct
-     params.model = params.vocoder.model;
-     params.model_url = params.vocoder.model_url;
-     params.hf_repo = params.vocoder.hf_repo;
-     params.hf_file = params.vocoder.hf_file;
-
+     params.model = params.vocoder.model;
      params.embedding = true;

      common_init_result llama_init_cts = common_init_from_params(params);
@@ -699,11 +694,13 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
      const std::string voice_data = audio_data;

      auto tmp = common_tokenize(vocab, voice_data, false, true);
-     printf("\n\n");
+
+     std::ostringstream tokens_oss;
      for (size_t i = 0; i < tmp.size(); ++i) {
-         printf("%d, ", tmp[i]);
+         tokens_oss << tmp[i] << ", ";
      }
-     printf("\n\n");
+     LOG_INF("\n\n%s: llama tokens: %s\n\n", __func__, tokens_oss.str().c_str());
+
      prompt_add(prompt_inp, tmp);
  #else
      prompt_add(prompt_inp, llama_tokens {
@@ -1,6 +0,0 @@
- set( CMAKE_SYSTEM_NAME Windows )
- set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
- set( target arm64-pc-windows-msvc )
- set( CMAKE_C_COMPILER_TARGET ${target} )
- set( CMAKE_CXX_COMPILER_TARGET ${target} )
@@ -1,5 +0,0 @@
- set(TARGET llama-gbnf-validator)
- add_executable(${TARGET} gbnf-validator.cpp)
- install(TARGETS ${TARGET} RUNTIME)
- target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -1,5 +0,0 @@
- set(TARGET llama-infill)
- add_executable(${TARGET} infill.cpp)
- install(TARGETS ${TARGET} RUNTIME)
- target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_17)