@fugood/llama.node 0.6.2 → 1.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (378)
  1. package/CMakeLists.txt +40 -30
  2. package/README.md +4 -1
  3. package/lib/binding.js +41 -29
  4. package/lib/binding.ts +26 -25
  5. package/package.json +45 -10
  6. package/scripts/build.js +47 -0
  7. package/scripts/llama.cpp.patch +109 -0
  8. package/src/anyascii.c +22223 -0
  9. package/src/anyascii.h +42 -0
  10. package/src/tts_utils.cpp +20 -7
  11. package/src/tts_utils.h +2 -0
  12. package/bin/darwin/arm64/llama-node.node +0 -0
  13. package/bin/darwin/x64/llama-node.node +0 -0
  14. package/bin/linux/arm64/llama-node.node +0 -0
  15. package/bin/linux/x64/llama-node.node +0 -0
  16. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  17. package/bin/linux-cuda/x64/llama-node.node +0 -0
  18. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  19. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  20. package/bin/win32/x64/llama-node.node +0 -0
  21. package/bin/win32/x64/node.lib +0 -0
  22. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  23. package/bin/win32-vulkan/arm64/node.lib +0 -0
  24. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  25. package/bin/win32-vulkan/x64/node.lib +0 -0
  26. package/patches/node-api-headers+1.1.0.patch +0 -26
  27. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +0 -233
  28. package/src/llama.cpp/.github/workflows/build.yml +0 -1078
  29. package/src/llama.cpp/.github/workflows/close-issue.yml +0 -28
  30. package/src/llama.cpp/.github/workflows/docker.yml +0 -178
  31. package/src/llama.cpp/.github/workflows/editorconfig.yml +0 -29
  32. package/src/llama.cpp/.github/workflows/gguf-publish.yml +0 -44
  33. package/src/llama.cpp/.github/workflows/labeler.yml +0 -17
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +0 -33
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +0 -30
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +0 -40
  37. package/src/llama.cpp/.github/workflows/release.yml +0 -739
  38. package/src/llama.cpp/.github/workflows/server.yml +0 -237
  39. package/src/llama.cpp/.github/workflows/winget.yml +0 -42
  40. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +0 -16
  41. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +0 -16
  42. package/src/llama.cpp/cmake/build-info.cmake +0 -64
  43. package/src/llama.cpp/cmake/common.cmake +0 -35
  44. package/src/llama.cpp/cmake/git-vars.cmake +0 -22
  45. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -5
  46. package/src/llama.cpp/common/build-info.cpp.in +0 -4
  47. package/src/llama.cpp/docs/build.md +0 -561
  48. package/src/llama.cpp/examples/CMakeLists.txt +0 -43
  49. package/src/llama.cpp/examples/batched/CMakeLists.txt +0 -5
  50. package/src/llama.cpp/examples/batched/batched.cpp +0 -246
  51. package/src/llama.cpp/examples/chat-13B.bat +0 -57
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +0 -5
  53. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +0 -941
  54. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +0 -35
  55. package/src/llama.cpp/examples/embedding/CMakeLists.txt +0 -5
  56. package/src/llama.cpp/examples/embedding/embedding.cpp +0 -323
  57. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +0 -10
  58. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +0 -194
  59. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +0 -5
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +0 -83
  61. package/src/llama.cpp/examples/gguf/CMakeLists.txt +0 -5
  62. package/src/llama.cpp/examples/gguf/gguf.cpp +0 -265
  63. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +0 -22
  64. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +0 -46
  65. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +0 -295
  66. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +0 -52
  67. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +0 -221
  68. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +0 -24
  69. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +0 -42
  70. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +0 -7093
  71. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +0 -694
  72. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +0 -5
  73. package/src/llama.cpp/examples/gritlm/gritlm.cpp +0 -229
  74. package/src/llama.cpp/examples/jeopardy/questions.txt +0 -100
  75. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +0 -65
  76. package/src/llama.cpp/examples/llama.android/build.gradle.kts +0 -6
  77. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +0 -71
  78. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +0 -53
  79. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +0 -452
  80. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +0 -18
  81. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +0 -5
  82. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -472
  83. package/src/llama.cpp/examples/lookup/CMakeLists.txt +0 -23
  84. package/src/llama.cpp/examples/lookup/lookup-create.cpp +0 -40
  85. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +0 -47
  86. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -157
  87. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -242
  88. package/src/llama.cpp/examples/parallel/CMakeLists.txt +0 -5
  89. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -492
  90. package/src/llama.cpp/examples/passkey/CMakeLists.txt +0 -5
  91. package/src/llama.cpp/examples/passkey/passkey.cpp +0 -277
  92. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +0 -5
  93. package/src/llama.cpp/examples/retrieval/retrieval.cpp +0 -304
  94. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -5
  95. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -246
  96. package/src/llama.cpp/examples/simple/CMakeLists.txt +0 -5
  97. package/src/llama.cpp/examples/simple/simple.cpp +0 -206
  98. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +0 -5
  99. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +0 -206
  100. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +0 -11
  101. package/src/llama.cpp/examples/speculative/CMakeLists.txt +0 -5
  102. package/src/llama.cpp/examples/speculative/speculative.cpp +0 -644
  103. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +0 -5
  104. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +0 -261
  105. package/src/llama.cpp/examples/sycl/CMakeLists.txt +0 -9
  106. package/src/llama.cpp/examples/sycl/build.sh +0 -23
  107. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +0 -13
  108. package/src/llama.cpp/examples/sycl/run-llama2.sh +0 -27
  109. package/src/llama.cpp/examples/sycl/run-llama3.sh +0 -28
  110. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +0 -33
  111. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +0 -9
  112. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +0 -9
  113. package/src/llama.cpp/examples/training/CMakeLists.txt +0 -5
  114. package/src/llama.cpp/examples/training/finetune.cpp +0 -96
  115. package/src/llama.cpp/ggml/cmake/GitVars.cmake +0 -22
  116. package/src/llama.cpp/ggml/cmake/common.cmake +0 -26
  117. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1042
  118. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -255
  119. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -586
  120. package/src/llama.cpp/ggml/src/ggml-backend.cpp +0 -2008
  121. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +0 -87
  122. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +0 -517
  123. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -74
  124. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +0 -179
  125. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +0 -258
  126. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +0 -2863
  127. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +0 -1110
  128. package/src/llama.cpp/ggml/src/ggml-cann/common.h +0 -420
  129. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -2570
  130. package/src/llama.cpp/ggml/src/ggml-common.h +0 -1857
  131. package/src/llama.cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +0 -100
  132. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +0 -184
  133. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +0 -15
  134. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +0 -243
  135. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +0 -140
  136. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -131
  137. package/src/llama.cpp/ggml/src/ggml-impl.h +0 -601
  138. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  139. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  140. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +0 -120
  141. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +0 -622
  142. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -113
  143. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +0 -96
  144. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -5124
  145. package/src/llama.cpp/ggml/src/ggml-opt.cpp +0 -1037
  146. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -5232
  147. package/src/llama.cpp/ggml/src/ggml-quants.h +0 -100
  148. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +0 -9
  149. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +0 -1813
  150. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +0 -189
  151. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +0 -37
  152. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +0 -239
  153. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +0 -39
  154. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -83
  155. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +0 -493
  156. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +0 -197
  157. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +0 -20
  158. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +0 -100
  159. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +0 -20
  160. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +0 -623
  161. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +0 -34
  162. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -701
  163. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +0 -11
  164. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +0 -791
  165. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +0 -1160
  166. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +0 -27
  167. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +0 -2957
  168. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -1536
  169. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +0 -75
  170. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +0 -99
  171. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +0 -311
  172. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +0 -20
  173. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -4443
  174. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +0 -105
  175. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +0 -8
  176. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +0 -136
  177. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +0 -21
  178. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -3030
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +0 -33
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +0 -1108
  181. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +0 -27
  182. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +0 -474
  183. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +0 -26
  184. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +0 -46
  185. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +0 -10
  186. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +0 -74
  187. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +0 -83
  188. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +0 -362
  189. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +0 -20
  190. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +0 -264
  191. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +0 -20
  192. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +0 -13
  193. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +0 -23
  194. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +0 -73
  195. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +0 -20
  196. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +0 -1215
  197. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +0 -305
  198. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +0 -10
  199. package/src/llama.cpp/ggml/src/ggml-threading.cpp +0 -12
  200. package/src/llama.cpp/ggml/src/ggml-threading.h +0 -14
  201. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +0 -196
  202. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +0 -10699
  203. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -39
  204. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +0 -751
  205. package/src/llama.cpp/ggml/src/ggml.c +0 -6550
  206. package/src/llama.cpp/ggml/src/gguf.cpp +0 -1330
  207. package/src/llama.cpp/models/.editorconfig +0 -1
  208. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  209. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  210. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  211. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  212. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  213. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  214. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  215. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  216. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  217. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  218. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  220. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  221. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  222. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  223. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  226. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  227. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  228. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  229. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  230. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  231. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  232. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  233. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  234. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  236. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  237. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  238. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  240. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  241. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  242. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  249. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  250. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  253. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  256. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  257. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  258. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
  259. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  260. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  261. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  262. package/src/llama.cpp/pocs/CMakeLists.txt +0 -14
  263. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +0 -9
  264. package/src/llama.cpp/pocs/vdot/q8dot.cpp +0 -173
  265. package/src/llama.cpp/pocs/vdot/vdot.cpp +0 -311
  266. package/src/llama.cpp/prompts/LLM-questions.txt +0 -49
  267. package/src/llama.cpp/prompts/alpaca.txt +0 -1
  268. package/src/llama.cpp/prompts/assistant.txt +0 -31
  269. package/src/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  270. package/src/llama.cpp/prompts/chat-with-bob.txt +0 -7
  271. package/src/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  272. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  273. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  274. package/src/llama.cpp/prompts/chat.txt +0 -28
  275. package/src/llama.cpp/prompts/dan-modified.txt +0 -1
  276. package/src/llama.cpp/prompts/dan.txt +0 -1
  277. package/src/llama.cpp/prompts/mnemonics.txt +0 -93
  278. package/src/llama.cpp/prompts/parallel-questions.txt +0 -43
  279. package/src/llama.cpp/prompts/reason-act.txt +0 -18
  280. package/src/llama.cpp/requirements/requirements-all.txt +0 -15
  281. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +0 -2
  282. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +0 -7
  283. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -7
  284. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +0 -5
  285. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +0 -1
  286. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +0 -4
  287. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +0 -3
  288. package/src/llama.cpp/requirements/requirements-pydantic.txt +0 -3
  289. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +0 -1
  290. package/src/llama.cpp/requirements/requirements-tool_bench.txt +0 -12
  291. package/src/llama.cpp/requirements.txt +0 -13
  292. package/src/llama.cpp/scripts/build-info.sh +0 -30
  293. package/src/llama.cpp/scripts/install-oneapi.bat +0 -19
  294. package/src/llama.cpp/scripts/xxd.cmake +0 -16
  295. package/src/llama.cpp/tests/CMakeLists.txt +0 -177
  296. package/src/llama.cpp/tests/get-model.cpp +0 -21
  297. package/src/llama.cpp/tests/get-model.h +0 -2
  298. package/src/llama.cpp/tests/test-arg-parser.cpp +0 -178
  299. package/src/llama.cpp/tests/test-autorelease.cpp +0 -24
  300. package/src/llama.cpp/tests/test-backend-ops.cpp +0 -4793
  301. package/src/llama.cpp/tests/test-barrier.cpp +0 -94
  302. package/src/llama.cpp/tests/test-c.c +0 -7
  303. package/src/llama.cpp/tests/test-chat-template.cpp +0 -417
  304. package/src/llama.cpp/tests/test-chat.cpp +0 -985
  305. package/src/llama.cpp/tests/test-double-float.cpp +0 -57
  306. package/src/llama.cpp/tests/test-gbnf-validator.cpp +0 -109
  307. package/src/llama.cpp/tests/test-gguf.cpp +0 -1338
  308. package/src/llama.cpp/tests/test-grammar-integration.cpp +0 -1308
  309. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +0 -1201
  310. package/src/llama.cpp/tests/test-grammar-parser.cpp +0 -519
  311. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +0 -1304
  312. package/src/llama.cpp/tests/test-llama-grammar.cpp +0 -408
  313. package/src/llama.cpp/tests/test-log.cpp +0 -39
  314. package/src/llama.cpp/tests/test-model-load-cancel.cpp +0 -27
  315. package/src/llama.cpp/tests/test-mtmd-c-api.c +0 -63
  316. package/src/llama.cpp/tests/test-opt.cpp +0 -904
  317. package/src/llama.cpp/tests/test-quantize-fns.cpp +0 -186
  318. package/src/llama.cpp/tests/test-quantize-perf.cpp +0 -365
  319. package/src/llama.cpp/tests/test-quantize-stats.cpp +0 -424
  320. package/src/llama.cpp/tests/test-regex-partial.cpp +0 -288
  321. package/src/llama.cpp/tests/test-rope.cpp +0 -262
  322. package/src/llama.cpp/tests/test-sampling.cpp +0 -399
  323. package/src/llama.cpp/tests/test-tokenizer-0.cpp +0 -312
  324. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +0 -155
  325. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +0 -125
  326. package/src/llama.cpp/tools/CMakeLists.txt +0 -39
  327. package/src/llama.cpp/tools/batched-bench/CMakeLists.txt +0 -5
  328. package/src/llama.cpp/tools/batched-bench/batched-bench.cpp +0 -204
  329. package/src/llama.cpp/tools/cvector-generator/CMakeLists.txt +0 -5
  330. package/src/llama.cpp/tools/cvector-generator/completions.txt +0 -582
  331. package/src/llama.cpp/tools/cvector-generator/cvector-generator.cpp +0 -508
  332. package/src/llama.cpp/tools/cvector-generator/mean.hpp +0 -48
  333. package/src/llama.cpp/tools/cvector-generator/negative.txt +0 -4
  334. package/src/llama.cpp/tools/cvector-generator/pca.hpp +0 -315
  335. package/src/llama.cpp/tools/cvector-generator/positive.txt +0 -4
  336. package/src/llama.cpp/tools/export-lora/CMakeLists.txt +0 -5
  337. package/src/llama.cpp/tools/export-lora/export-lora.cpp +0 -434
  338. package/src/llama.cpp/tools/gguf-split/CMakeLists.txt +0 -5
  339. package/src/llama.cpp/tools/gguf-split/gguf-split.cpp +0 -583
  340. package/src/llama.cpp/tools/imatrix/CMakeLists.txt +0 -5
  341. package/src/llama.cpp/tools/imatrix/imatrix.cpp +0 -667
  342. package/src/llama.cpp/tools/llama-bench/CMakeLists.txt +0 -5
  343. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +0 -2024
  344. package/src/llama.cpp/tools/main/CMakeLists.txt +0 -5
  345. package/src/llama.cpp/tools/main/main.cpp +0 -977
  346. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +0 -58
  347. package/src/llama.cpp/tools/mtmd/clip-impl.h +0 -462
  348. package/src/llama.cpp/tools/mtmd/clip.cpp +0 -4024
  349. package/src/llama.cpp/tools/mtmd/clip.h +0 -101
  350. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +0 -22
  351. package/src/llama.cpp/tools/mtmd/miniaudio.h +0 -93468
  352. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +0 -855
  353. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +0 -62
  354. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +0 -377
  355. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +0 -297
  356. package/src/llama.cpp/tools/mtmd/mtmd.cpp +0 -942
  357. package/src/llama.cpp/tools/mtmd/mtmd.h +0 -362
  358. package/src/llama.cpp/tools/mtmd/requirements.txt +0 -5
  359. package/src/llama.cpp/tools/perplexity/CMakeLists.txt +0 -5
  360. package/src/llama.cpp/tools/perplexity/perplexity.cpp +0 -2063
  361. package/src/llama.cpp/tools/quantize/CMakeLists.txt +0 -6
  362. package/src/llama.cpp/tools/quantize/quantize.cpp +0 -519
  363. package/src/llama.cpp/tools/rpc/CMakeLists.txt +0 -4
  364. package/src/llama.cpp/tools/rpc/rpc-server.cpp +0 -322
  365. package/src/llama.cpp/tools/run/CMakeLists.txt +0 -16
  366. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.cpp +0 -1995
  367. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.h +0 -137
  368. package/src/llama.cpp/tools/run/run.cpp +0 -1261
  369. package/src/llama.cpp/tools/server/CMakeLists.txt +0 -51
  370. package/src/llama.cpp/tools/server/bench/requirements.txt +0 -2
  371. package/src/llama.cpp/tools/server/httplib.h +0 -10506
  372. package/src/llama.cpp/tools/server/server.cpp +0 -4966
  373. package/src/llama.cpp/tools/server/tests/requirements.txt +0 -8
  374. package/src/llama.cpp/tools/server/utils.hpp +0 -1337
  375. package/src/llama.cpp/tools/tokenize/CMakeLists.txt +0 -5
  376. package/src/llama.cpp/tools/tokenize/tokenize.cpp +0 -416
  377. package/src/llama.cpp/tools/tts/CMakeLists.txt +0 -5
  378. package/src/llama.cpp/tools/tts/tts.cpp +0 -1092
package/src/llama.cpp/tools/server/utils.hpp
@@ -1,1337 +0,0 @@
1
- #pragma once
2
-
3
- #include "common.h"
4
- #include "log.h"
5
- #include "llama.h"
6
- #include "arg.h" // common_remote_get_content
7
- #include "base64.hpp"
8
- #include "mtmd.h"
9
-
10
- // increase max payload length to allow use of larger context size
11
- #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
12
- // disable Nagle's algorithm
13
- #define CPPHTTPLIB_TCP_NODELAY true
14
- #include "httplib.h"
15
-
16
- // Change JSON_ASSERT from assert() to GGML_ASSERT:
17
- #define JSON_ASSERT GGML_ASSERT
18
- #include "json.hpp"
19
- #include "chat.h"
20
-
21
- #include <random>
22
- #include <sstream>
23
- #include <string>
24
- #include <vector>
25
- #include <memory>
26
- #include <cinttypes>
27
-
28
- #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
29
-
30
- using json = nlohmann::ordered_json;
31
-
32
- #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
33
- #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
34
- #define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
35
- #define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
36
-
37
- #define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
38
- #define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
39
- #define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
40
- #define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
41
-
42
- #define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
43
- #define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
44
- #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
45
- #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
46
-
47
- using raw_buffer = std::vector<uint8_t>;
48
-
49
- template <typename T>
50
- static T json_value(const json & body, const std::string & key, const T & default_value) {
51
- // Fallback null to default value
52
- if (body.contains(key) && !body.at(key).is_null()) {
53
- try {
54
- return body.at(key);
55
- } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
56
- LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
57
- return default_value;
58
- }
59
- } else {
60
- return default_value;
61
- }
62
- }
63
-
64
- const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
65
-
66
- // thin wrapper around common_grammar_trigger with (de)serialization functions
67
- struct server_grammar_trigger {
68
- common_grammar_trigger value;
69
-
70
- server_grammar_trigger() = default;
71
- server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
72
- server_grammar_trigger(const json & in) {
73
- value.type = (common_grammar_trigger_type) in.at("type").get<int>();
74
- value.value = in.at("value").get<std::string>();
75
- if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
76
- value.token = (llama_token) in.at("token").get<int>();
77
- }
78
- }
79
-
80
- json to_json() const {
81
- json out {
82
- {"type", (int) value.type},
83
- {"value", value.value},
84
- };
85
- if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
86
- out["token"] = (int) value.token;
87
- }
88
- return out;
89
- }
90
- };
91
-
92
- //
93
- // tokenizer and input processing utils
94
- //
95
-
96
- static bool json_is_array_of_numbers(const json & data) {
97
- if (data.is_array()) {
98
- for (const auto & e : data) {
99
- if (!e.is_number_integer()) {
100
- return false;
101
- }
102
- }
103
- return true;
104
- }
105
- return false;
106
- }
107
-
108
- // is array having BOTH numbers & strings?
109
- static bool json_is_array_of_mixed_numbers_strings(const json & data) {
110
- bool seen_string = false;
111
- bool seen_number = false;
112
- if (data.is_array()) {
113
- for (const auto & e : data) {
114
- seen_string |= e.is_string();
115
- seen_number |= e.is_number_integer();
116
- if (seen_number && seen_string) {
117
- return true;
118
- }
119
- }
120
- }
121
- return false;
122
- }
123
-
124
- // get value by path(key1 / key2)
125
- static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
126
- json result = json::object();
127
-
128
- for (const std::string & path : paths) {
129
- json current = js;
130
- const auto keys = string_split<std::string>(path, /*separator*/ '/');
131
- bool valid_path = true;
132
- for (const std::string & k : keys) {
133
- if (valid_path && current.is_object() && current.contains(k)) {
134
- current = current[k];
135
- } else {
136
- valid_path = false;
137
- }
138
- }
139
- if (valid_path) {
140
- result[path] = current;
141
- }
142
- }
143
- return result;
144
- }
145
-
146
- /**
147
- * this handles 2 cases:
148
- * - only string, example: "string"
149
- * - mixed string and tokens, example: [12, 34, "string", 56, 78]
150
- */
151
- static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
152
- // If `add_bos` is true, we only add BOS, when json_prompt is a string,
153
- // or the first element of the json_prompt array is a string.
154
- llama_tokens prompt_tokens;
155
-
156
- if (json_prompt.is_array()) {
157
- bool first = true;
158
- for (const auto & p : json_prompt) {
159
- if (p.is_string()) {
160
- auto s = p.template get<std::string>();
161
-
162
- llama_tokens p;
163
- if (first) {
164
- p = common_tokenize(vocab, s, add_special, parse_special);
165
- first = false;
166
- } else {
167
- p = common_tokenize(vocab, s, false, parse_special);
168
- }
169
-
170
- prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
171
- } else {
172
- if (first) {
173
- first = false;
174
- }
175
-
176
- prompt_tokens.push_back(p.template get<llama_token>());
177
- }
178
- }
179
- } else {
180
- auto s = json_prompt.template get<std::string>();
181
- prompt_tokens = common_tokenize(vocab, s, add_special, parse_special);
182
- }
183
-
184
- return prompt_tokens;
185
- }
186
-
187
- /**
188
- * break the input "prompt" object into multiple prompt if needed, then tokenize them
189
- * this supports these cases:
190
- * - "prompt": "string"
191
- * - "prompt": [12, 34, 56]
192
- * - "prompt": [12, 34, "string", 56, 78]
193
- * and multiple prompts (multi-tasks):
194
- * - "prompt": ["string1", "string2"]
195
- * - "prompt": ["string1", [12, 34, 56]]
196
- * - "prompt": [[12, 34, 56], [78, 90, 12]]
197
- * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
198
- */
199
- static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
200
- std::vector<llama_tokens> result;
201
- if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
202
- // string or mixed
203
- result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special));
204
- } else if (json_is_array_of_numbers(json_prompt)) {
205
- // array of tokens
206
- result.push_back(json_prompt.get<llama_tokens>());
207
- } else if (json_prompt.is_array()) {
208
- // array of prompts
209
- result.reserve(json_prompt.size());
210
- for (const auto & p : json_prompt) {
211
- if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
212
- result.push_back(tokenize_mixed(vocab, p, add_special, parse_special));
213
- } else if (json_is_array_of_numbers(p)) {
214
- // array of tokens
215
- result.push_back(p.get<llama_tokens>());
216
- } else {
217
- throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
218
- }
219
- }
220
- } else {
221
- throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
222
- }
223
- if (result.empty()) {
224
- throw std::runtime_error("\"prompt\" must not be empty");
225
- }
226
- return result;
227
- }
228
-
229
- // return the last index of character that can form a valid string
230
- // if the last character is potentially cut in half, return the index before the cut
231
- // if validate_utf8(text) == text.size(), then the whole text is valid utf8
232
- static size_t validate_utf8(const std::string& text) {
233
- size_t len = text.size();
234
- if (len == 0) return 0;
235
-
236
- // Check the last few bytes to see if a multi-byte character is cut off
237
- for (size_t i = 1; i <= 4 && i <= len; ++i) {
238
- unsigned char c = text[len - i];
239
- // Check for start of a multi-byte sequence from the end
240
- if ((c & 0xE0) == 0xC0) {
241
- // 2-byte character start: 110xxxxx
242
- // Needs at least 2 bytes
243
- if (i < 2) return len - i;
244
- } else if ((c & 0xF0) == 0xE0) {
245
- // 3-byte character start: 1110xxxx
246
- // Needs at least 3 bytes
247
- if (i < 3) return len - i;
248
- } else if ((c & 0xF8) == 0xF0) {
249
- // 4-byte character start: 11110xxx
250
- // Needs at least 4 bytes
251
- if (i < 4) return len - i;
252
- }
253
- }
254
-
255
- // If no cut-off multi-byte character is found, return full length
256
- return len;
257
- }
258
-
259
- //
260
- // template utils
261
- //
262
-
263
- // format rerank task: [BOS]query[EOS][SEP]doc[EOS]
264
- static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
265
- llama_tokens result;
266
-
267
- result.reserve(doc.size() + query.size() + 4);
268
- result.push_back(llama_vocab_bos(vocab));
269
- result.insert(result.end(), query.begin(), query.end());
270
- result.push_back(llama_vocab_eos(vocab));
271
- result.push_back(llama_vocab_sep(vocab));
272
- result.insert(result.end(), doc.begin(), doc.end());
273
- result.push_back(llama_vocab_eos(vocab));
274
-
275
- return result;
276
- }
277
-
278
- // format infill task
279
- static llama_tokens format_infill(
280
- const llama_vocab * vocab,
281
- const json & input_prefix,
282
- const json & input_suffix,
283
- const json & input_extra,
284
- const int n_batch,
285
- const int n_predict,
286
- const int n_ctx,
287
- const bool spm_infill,
288
- const llama_tokens & tokens_prompt
289
- ) {
290
- // TODO: optimize this block by reducing memory allocations and movement
291
-
292
- // use FIM repo-level pattern:
293
- // ref: https://arxiv.org/pdf/2409.12186
294
- //
295
- // [FIM_REP]myproject
296
- // [FIM_SEP]filename0
297
- // extra chunk 0
298
- // [FIM_SEP]filename1
299
- // extra chunk 1
300
- // ...
301
- // [FIM_SEP]filename
302
- // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
303
- //
304
- llama_tokens extra_tokens;
305
- extra_tokens.reserve(n_ctx);
306
-
307
- auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false);
308
- auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false);
309
-
310
- if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) {
311
- // TODO: make project name an input
312
- static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false);
313
-
314
- extra_tokens.push_back(llama_vocab_fim_rep(vocab));
315
- extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
316
- }
317
- for (const auto & chunk : input_extra) {
318
- // { "text": string, "filename": string }
319
- const std::string text = json_value(chunk, "text", std::string());
320
- const std::string filename = json_value(chunk, "filename", std::string("tmp"));
321
-
322
- if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
323
- const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false);
324
-
325
- extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
326
- extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
327
- } else {
328
- // chunk separator in binary form to avoid confusing the AI
329
- static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
330
- static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false);
331
-
332
- extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
333
- }
334
-
335
- const auto chunk_tokens = common_tokenize(vocab, text, false, false);
336
- extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
337
- }
338
-
339
- if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
340
- // TODO: current filename
341
- static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false);
342
-
343
- extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
344
- extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
345
- }
346
-
347
- // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
348
- const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4));
349
- const int n_suffix_take = std::min<int>(tokens_suffix.size(), std::max<int>(0, (n_batch/4) - (2 + tokens_prompt.size())));
350
-
351
- SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));
352
-
353
- // fill the rest of the context with extra chunks
354
- const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
355
-
356
- tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
357
- tokens_suffix.resize(n_suffix_take);
358
-
359
- tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab));
360
- tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
361
- tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab));
362
-
363
- auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
364
- auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
365
-
366
- if (llama_vocab_get_add_bos(vocab)) {
367
- embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
368
- }
369
-
370
- SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
371
-
372
- // put the extra context before the FIM prefix
373
- embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
374
-
375
- embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
376
- embd_inp.push_back(llama_vocab_fim_mid(vocab));
377
-
378
- return embd_inp;
379
- }
380
-
381
- //
382
- // base64 utils (TODO: move to common in the future)
383
- //
384
-
385
- static const std::string base64_chars =
386
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
387
- "abcdefghijklmnopqrstuvwxyz"
388
- "0123456789+/";
389
-
390
- static inline bool is_base64(uint8_t c) {
391
- return (isalnum(c) || (c == '+') || (c == '/'));
392
- }
393
-
394
- static inline raw_buffer base64_decode(const std::string & encoded_string) {
395
- int i = 0;
396
- int j = 0;
397
- int in_ = 0;
398
-
399
- int in_len = encoded_string.size();
400
-
401
- uint8_t char_array_4[4];
402
- uint8_t char_array_3[3];
403
-
404
- raw_buffer ret;
405
-
406
- while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
407
- char_array_4[i++] = encoded_string[in_]; in_++;
408
- if (i == 4) {
409
- for (i = 0; i < 4; i++) {
410
- char_array_4[i] = base64_chars.find(char_array_4[i]);
411
- }
412
-
413
- char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
414
- char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
415
- char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
416
-
417
- for (i = 0; (i < 3); i++) {
418
- ret.push_back(char_array_3[i]);
419
- }
420
-
421
- i = 0;
422
- }
423
- }
424
-
425
- if (i) {
426
- for (j = i; j < 4; j++) {
427
- char_array_4[j] = 0;
428
- }
429
-
430
- for (j = 0; j < 4; j++) {
431
- char_array_4[j] = base64_chars.find(char_array_4[j]);
432
- }
433
-
434
- char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
435
- char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
436
- char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
437
-
438
- for (j = 0; j < i - 1; j++) {
439
- ret.push_back(char_array_3[j]);
440
- }
441
- }
442
-
443
- return ret;
444
- }
445
-
446
- //
447
- // random string / id
448
- //
449
-
450
- static std::string random_string() {
451
- static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
452
-
453
- std::random_device rd;
454
- std::mt19937 generator(rd());
455
-
456
- std::string result(32, ' ');
457
-
458
- for (int i = 0; i < 32; ++i) {
459
- result[i] = str[generator() % str.size()];
460
- }
461
-
462
- return result;
463
- }
464
-
465
- static std::string gen_chatcmplid() {
466
- return "chatcmpl-" + random_string();
467
- }
468
-
469
- static std::string gen_tool_call_id() {
470
- return random_string();
471
- }
472
-
473
- //
474
- // other common utils
475
- //
476
-
477
- static bool ends_with(const std::string & str, const std::string & suffix) {
478
- return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
479
- }
480
-
481
- static size_t find_partial_stop_string(const std::string &stop, const std::string &text) {
482
- if (!text.empty() && !stop.empty()) {
483
- const char text_last_char = text.back();
484
- for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
485
- if (stop[char_index] == text_last_char) {
486
- const std::string current_partial = stop.substr(0, char_index + 1);
487
- if (ends_with(text, current_partial)) {
488
- return text.size() - char_index - 1;
489
- }
490
- }
491
- }
492
- }
493
-
494
- return std::string::npos;
495
- }
496
-
497
- // TODO: reuse llama_detokenize
498
- template <class Iter>
499
- static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
500
- std::string ret;
501
- for (; begin != end; ++begin) {
502
- ret += common_token_to_piece(ctx, *begin);
503
- }
504
-
505
- return ret;
506
- }
507
-
508
- // format incomplete utf-8 multibyte character for output
509
- static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
510
- std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token);
511
-
512
- // if the size is 1 and first bit is 1, meaning it's a partial character
513
- // (size > 1 meaning it's already a known token)
514
- if (out.size() == 1 && (out[0] & 0x80) == 0x80) {
515
- std::stringstream ss;
516
- ss << std::hex << (out[0] & 0xff);
517
- std::string res(ss.str());
518
- out = "byte: \\x" + res;
519
- }
520
-
521
- return out;
522
- }
523
-
524
- static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
525
- const std::string str =
526
- std::string(event) + ": " +
527
- data.dump(-1, ' ', false, json::error_handler_t::replace) +
528
- "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
529
-
530
- LOG_DBG("data stream, to_send: %s", str.c_str());
531
-
532
- return sink.write(str.c_str(), str.size());
533
- }
534
-
535
- //
536
- // OAI utils
537
- //
538
-
539
- // used by /completions endpoint
540
- static json oaicompat_completion_params_parse(const json & body) {
541
- json llama_params;
542
-
543
- if (!body.contains("prompt")) {
544
- throw std::runtime_error("\"prompt\" is required");
545
- }
546
-
547
- // Handle "stop" field
548
- if (body.contains("stop") && body.at("stop").is_string()) {
549
- llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
550
- } else {
551
- llama_params["stop"] = json_value(body, "stop", json::array());
552
- }
553
-
554
- // Handle "n" field
555
- int n_choices = json_value(body, "n", 1);
556
- if (n_choices != 1) {
557
- throw std::runtime_error("Only one completion choice is allowed");
558
- }
559
-
560
- // Handle "echo" field
561
- if (json_value(body, "echo", false)) {
562
- throw std::runtime_error("Only no echo is supported");
563
- }
564
-
565
- // Params supported by OAI but unsupported by llama.cpp
566
- static const std::vector<std::string> unsupported_params { "best_of", "suffix" };
567
- for (const auto & param : unsupported_params) {
568
- if (body.contains(param)) {
569
- throw std::runtime_error("Unsupported param: " + param);
570
- }
571
- }
572
-
573
- // Copy remaining properties to llama_params
574
- for (const auto & item : body.items()) {
575
- // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
576
- if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
577
- llama_params[item.key()] = item.value();
578
- }
579
- }
580
-
581
- return llama_params;
582
- }
583
-
584
- struct oaicompat_parser_options {
585
- bool use_jinja;
586
- bool prefill_assistant;
587
- common_reasoning_format reasoning_format;
588
- common_chat_templates * tmpls;
589
- bool allow_image;
590
- bool allow_audio;
591
- };
592
-
593
- // used by /chat/completions endpoint
594
- static json oaicompat_chat_params_parse(
595
- const json & body, /* openai api json semantics */
596
- const oaicompat_parser_options & opt,
597
- std::vector<raw_buffer> & out_files)
598
- {
599
- json llama_params;
600
-
601
- auto tools = json_value(body, "tools", json());
602
- auto stream = json_value(body, "stream", false);
603
-
604
- if (tools.is_array() && !tools.empty()) {
605
- if (stream) {
606
- throw std::runtime_error("Cannot use tools with stream");
607
- }
608
- if (!opt.use_jinja) {
609
- throw std::runtime_error("tools param requires --jinja flag");
610
- }
611
- }
612
- if (!opt.use_jinja) {
613
- if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
614
- throw std::runtime_error("Unsupported param: tool_choice");
615
- }
616
- }
617
-
618
- // Handle "stop" field
619
- if (body.contains("stop") && body.at("stop").is_string()) {
620
- llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
621
- } else {
622
- llama_params["stop"] = json_value(body, "stop", json::array());
623
- }
624
-
625
- auto json_schema = json_value(body, "json_schema", json());
626
- auto grammar = json_value(body, "grammar", std::string());
627
- if (!json_schema.is_null() && !grammar.empty()) {
628
- throw std::runtime_error("Cannot use both json_schema and grammar");
629
- }
630
-
631
- // Handle "response_format" field
632
- if (body.contains("response_format")) {
633
- json response_format = json_value(body, "response_format", json::object());
634
- std::string response_type = json_value(response_format, "type", std::string());
635
- if (response_type == "json_object") {
636
- json_schema = json_value(response_format, "schema", json::object());
637
- } else if (response_type == "json_schema") {
638
- auto schema_wrapper = json_value(response_format, "json_schema", json::object());
639
- json_schema = json_value(schema_wrapper, "schema", json::object());
640
- } else if (!response_type.empty() && response_type != "text") {
641
- throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
642
- }
643
- }
644
-
645
- // get input files
646
- if (!body.contains("messages")) {
647
- throw std::runtime_error("'messages' is required");
648
- }
649
- json messages = body.at("messages");
650
- if (!messages.is_array()) {
651
- throw std::runtime_error("Expected 'messages' to be an array");
652
- }
653
- for (auto & msg : messages) {
654
- std::string role = json_value(msg, "role", std::string());
655
- if (role != "assistant" && !msg.contains("content")) {
656
- throw std::runtime_error("All non-assistant messages must contain 'content'");
657
- }
658
- if (role == "assistant") {
659
- if (!msg.contains("content") && !msg.contains("tool_calls")) {
660
- throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
661
- }
662
- if (!msg.contains("content")) {
663
- continue; // avoid errors with no content
664
- }
665
- }
666
- json & content = msg.at("content");
667
- if (content.is_string() || content.is_null()) {
668
- continue;
669
- }
670
-
671
- if (!content.is_array()) {
672
- throw std::runtime_error("Expected 'content' to be a string or an array");
673
- }
674
-
675
- for (auto & p : content) {
676
- std::string type = json_value(p, "type", std::string());
677
- if (type == "image_url") {
678
- if (!opt.allow_image) {
679
- throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
680
- }
681
-
682
- json image_url = json_value(p, "image_url", json::object());
683
- std::string url = json_value(image_url, "url", std::string());
684
- if (string_starts_with(url, "http")) {
685
- // download remote image
686
- // TODO @ngxson : maybe make these params configurable
687
- common_remote_params params;
688
- params.headers.push_back("User-Agent: llama.cpp/" + build_info);
689
- params.max_size = 1024 * 1024 * 10; // 10MB
690
- params.timeout = 10; // seconds
691
- SRV_INF("downloading image from '%s'\n", url.c_str());
692
- auto res = common_remote_get_content(url, params);
693
- if (200 <= res.first && res.first < 300) {
694
- SRV_INF("downloaded %ld bytes\n", res.second.size());
695
- raw_buffer data;
696
- data.insert(data.end(), res.second.begin(), res.second.end());
697
- out_files.push_back(data);
698
- } else {
699
- throw std::runtime_error("Failed to download image");
700
- }
701
-
702
- } else {
703
- // try to decode base64 image
704
- std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
705
- if (parts.size() != 2) {
706
- throw std::runtime_error("Invalid image_url.url value");
707
- } else if (!string_starts_with(parts[0], "data:image/")) {
708
- throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
709
- } else if (!string_ends_with(parts[0], "base64")) {
710
- throw std::runtime_error("image_url.url must be base64 encoded");
711
- } else {
712
- auto base64_data = parts[1];
713
- auto decoded_data = base64_decode(base64_data);
714
- out_files.push_back(decoded_data);
715
- }
716
- }
717
-
718
- // replace this chunk with a marker
719
- p["type"] = "text";
720
- p["text"] = mtmd_default_marker();
721
- p.erase("image_url");
722
-
723
- } else if (type == "input_audio") {
724
- if (!opt.allow_audio) {
725
- throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
726
- }
727
-
728
- json input_audio = json_value(p, "input_audio", json::object());
729
- std::string data = json_value(input_audio, "data", std::string());
730
- std::string format = json_value(input_audio, "format", std::string());
731
- // while we also support flac, we don't allow it here so we matches the OAI spec
732
- if (format != "wav" && format != "mp3") {
733
- throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
734
- }
735
- auto decoded_data = base64_decode(data); // expected to be base64 encoded
736
- out_files.push_back(decoded_data);
737
-
738
- // replace this chunk with a marker
739
- p["type"] = "text";
740
- p["text"] = mtmd_default_marker();
741
- p.erase("input_audio");
742
-
743
- } else if (type != "text") {
744
- throw std::runtime_error("unsupported content[].type");
745
- }
746
- }
747
- }
-
- common_chat_templates_inputs inputs;
- inputs.messages = common_chat_msgs_parse_oaicompat(messages);
- inputs.tools = common_chat_tools_parse_oaicompat(tools);
- inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
- inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
- inputs.grammar = grammar;
- inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
- inputs.use_jinja = opt.use_jinja;
- inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
- inputs.extract_reasoning = opt.reasoning_format != COMMON_REASONING_FORMAT_NONE;
- inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
- if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
- throw std::runtime_error("Cannot use custom grammar constraints with tools.");
- }
-
- // if the assistant message appears at the end of list, we do not add end-of-turn token
- // for ex. this can be useful to modify the reasoning process in reasoning models
- bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
- common_chat_msg last_message;
- if (prefill_assistant_message) {
- last_message = inputs.messages.back();
- inputs.messages.pop_back();
-
- /* sanity check, max one assistant message at the end of the list */
- if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
- throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
- }
-
- inputs.extract_reasoning = false;
- inputs.add_generation_prompt = true;
- }
-
- // Apply chat template to the list of messages
- auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
-
- /* Append assistant prefilled message */
- if (prefill_assistant_message) {
- chat_params.prompt += last_message.content;
- }
-
- llama_params["chat_format"] = static_cast<int>(chat_params.format);
- llama_params["prompt"] = chat_params.prompt;
- if (!chat_params.grammar.empty()) {
- llama_params["grammar"] = chat_params.grammar;
- }
- llama_params["grammar_lazy"] = chat_params.grammar_lazy;
- auto grammar_triggers = json::array();
- for (const auto & trigger : chat_params.grammar_triggers) {
- server_grammar_trigger ct(trigger);
- grammar_triggers.push_back(ct.to_json());
- }
- llama_params["grammar_triggers"] = grammar_triggers;
- llama_params["preserved_tokens"] = chat_params.preserved_tokens;
- for (const auto & stop : chat_params.additional_stops) {
- llama_params["stop"].push_back(stop);
- }
-
- // Handle "n" field
- int n_choices = json_value(body, "n", 1);
- if (n_choices != 1) {
- throw std::runtime_error("Only one completion choice is allowed");
- }
-
- // Handle "logprobs" field
- // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
- if (json_value(body, "logprobs", false)) {
- llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
- } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
- throw std::runtime_error("top_logprobs requires logprobs to be set to true");
- }
-
- // Copy remaining properties to llama_params
- // This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
- // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
- for (const auto & item : body.items()) {
- // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
- if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
- llama_params[item.key()] = item.value();
- }
- }
-
- return llama_params;
- }
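
The tail of the removed helper above copies any request field it has not already handled straight into llama_params, which is how llama.cpp-specific options such as "mirostat" pass through the OpenAI-compatible endpoint, with "n_predict" allowed to overwrite the value derived from "max_tokens". A minimal sketch of that pass-through step in isolation, using only nlohmann::json; the field values are illustrative, not taken from the package:

    // Sketch of the pass-through loop above (nlohmann::json only; values are illustrative).
    #include <iostream>
    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    int main() {
        json llama_params = { {"prompt", "Hello"}, {"n_predict", 64} }; // n_predict derived from "max_tokens"
        json body         = { {"mirostat", 2}, {"n_predict", 128}, {"prompt", "ignored"} };

        for (const auto & item : body.items()) {
            // "n_predict" may overwrite the earlier value; everything else only fills gaps
            if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
                llama_params[item.key()] = item.value();
            }
        }

        // prompt stays "Hello", mirostat is added, n_predict becomes 128
        std::cout << llama_params.dump(2) << std::endl;
    }
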
-
- static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) {
- json data = json::array();
- int32_t n_tokens = 0;
- int i = 0;
- for (const auto & elem : embeddings) {
- json embedding_obj;
-
- if (use_base64) {
- const auto& vec = json_value(elem, "embedding", json::array()).get<std::vector<float>>();
- const char* data_ptr = reinterpret_cast<const char*>(vec.data());
- size_t data_size = vec.size() * sizeof(float);
- embedding_obj = {
- {"embedding", base64::encode(data_ptr, data_size)},
- {"index", i++},
- {"object", "embedding"},
- {"encoding_format", "base64"}
- };
- } else {
- embedding_obj = {
- {"embedding", json_value(elem, "embedding", json::array())},
- {"index", i++},
- {"object", "embedding"}
- };
- }
- data.push_back(embedding_obj);
-
- n_tokens += json_value(elem, "tokens_evaluated", 0);
- }
-
- json res = json {
- {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
- {"object", "list"},
- {"usage", json {
- {"prompt_tokens", n_tokens},
- {"total_tokens", n_tokens}
- }},
- {"data", data}
- };
-
- return res;
- }
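
When use_base64 is requested, the function above exposes the embedding vector as raw bytes before handing it to the base64 encoder, so a client has to reverse that byte-for-byte to recover the floats. A self-contained sketch of the float/byte packing (the base64 step itself is omitted); all names below are illustrative:

    // Round-trip of the reinterpret_cast packing used by the base64 branch above.
    #include <cassert>
    #include <cstring>
    #include <vector>

    int main() {
        const std::vector<float> embedding = {0.1f, -0.2f, 0.3f};

        // server side: the bytes handed to the base64 encoder
        const char * data_ptr  = reinterpret_cast<const char *>(embedding.data());
        size_t       data_size = embedding.size() * sizeof(float);

        // client side: after base64-decoding, copy the bytes back into floats
        std::vector<float> decoded(data_size / sizeof(float));
        std::memcpy(decoded.data(), data_ptr, data_size);

        assert(decoded == embedding);
    }
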
-
- static json format_response_rerank(
- const json & request,
- const json & ranks,
- bool is_tei_format,
- std::vector<std::string> & texts) {
- json res;
- if (is_tei_format) {
- // TEI response format
- res = json::array();
- bool return_text = json_value(request, "return_text", false);
- for (const auto & rank : ranks) {
- int index = json_value(rank, "index", 0);
- json elem = json{
- {"index", index},
- {"score", json_value(rank, "score", 0.0)},
- };
- if (return_text) {
- elem["text"] = std::move(texts[index]);
- }
- res.push_back(elem);
- }
- } else {
- // Jina response format
- json results = json::array();
- int32_t n_tokens = 0;
- for (const auto & rank : ranks) {
- results.push_back(json{
- {"index", json_value(rank, "index", 0)},
- {"relevance_score", json_value(rank, "score", 0.0)},
- });
-
- n_tokens += json_value(rank, "tokens_evaluated", 0);
- }
-
- res = json{
- {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
- {"object", "list"},
- {"usage", json{
- {"prompt_tokens", n_tokens},
- {"total_tokens", n_tokens}
- }},
- {"results", results}
- };
- }
-
- return res;
- }
-
- static bool is_valid_utf8(const std::string & str) {
- const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
- const unsigned char* end = bytes + str.length();
-
- while (bytes < end) {
- if (*bytes <= 0x7F) {
- // 1-byte sequence (0xxxxxxx)
- bytes++;
- } else if ((*bytes & 0xE0) == 0xC0) {
- // 2-byte sequence (110xxxxx 10xxxxxx)
- if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
- return false;
- bytes += 2;
- } else if ((*bytes & 0xF0) == 0xE0) {
- // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
- if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
- return false;
- bytes += 3;
- } else if ((*bytes & 0xF8) == 0xF0) {
- // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
- if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
- (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
- return false;
- bytes += 4;
- } else {
- // Invalid UTF-8 lead byte
- return false;
- }
- }
-
- return true;
- }
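
A few illustrative inputs for the validator above; it checks lead bytes and continuation bytes only (it does not reject overlong encodings), and the sketch assumes the is_valid_utf8 helper is in scope:

    // Example calls (assumes the is_valid_utf8 helper above is visible).
    #include <cassert>
    #include <string>

    void check_utf8_examples() {
        assert( is_valid_utf8("plain ASCII"));
        assert( is_valid_utf8("caf\xC3\xA9"));   // 2-byte sequence (U+00E9)
        assert( is_valid_utf8("\xE2\x82\xAC"));  // 3-byte sequence (U+20AC)
        assert(!is_valid_utf8("\xC3"));          // truncated 2-byte sequence
        assert(!is_valid_utf8("\xFF"));          // invalid lead byte
    }
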
-
- static json format_tokenizer_response(const json & tokens) {
- return json {
- {"tokens", tokens}
- };
- }
-
- static json format_detokenized_response(const std::string & content) {
- return json {
- {"content", content}
- };
- }
-
- static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) {
- json data = json::array();
- for (const auto & lb : logit_bias) {
- data.push_back(json{
- {"bias", lb.bias},
- {"token", lb.token},
- });
- }
- return data;
- }
-
- static std::string safe_json_to_str(const json & data) {
- return data.dump(-1, ' ', false, json::error_handler_t::replace);
- }
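
safe_json_to_str relies on nlohmann::json's error_handler_t::replace, which substitutes U+FFFD for byte sequences that are not valid UTF-8 instead of throwing, so serializing model output cannot abort the response. A small sketch of the difference, with an illustrative payload:

    // dump() with error_handler_t::replace vs. the default strict handler.
    #include <iostream>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main() {
        json j = std::string("ok \xFF bytes");  // not valid UTF-8

        // the default handler would throw json::type_error; replace emits U+FFFD instead
        std::cout << j.dump(-1, ' ', false, json::error_handler_t::replace) << std::endl;
    }
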
-
- static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
- std::vector<llama_token_data> cur;
- const auto * logits = llama_get_logits_ith(ctx, idx);
-
- const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_model_get_vocab(model);
-
- const int n_vocab = llama_vocab_n_tokens(vocab);
-
- cur.resize(n_vocab);
- for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
- cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
- }
-
- // sort tokens by logits
- std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) {
- return a.logit > b.logit;
- });
-
- // apply softmax
- float max_l = cur[0].logit;
- float cum_sum = 0.0f;
- for (size_t i = 0; i < cur.size(); ++i) {
- float p = expf(cur[i].logit - max_l);
- cur[i].p = p;
- cum_sum += p;
- }
- for (size_t i = 0; i < cur.size(); ++i) {
- cur[i].p /= cum_sum;
- }
-
- return cur;
- }
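
The probability computation above is a standard numerically stable softmax: the maximum logit is subtracted before exponentiation so the exponential cannot overflow, and the results are normalized by their sum. The same step on a plain vector, as a standalone sketch:

    // Stable softmax over raw logits (standalone sketch of the step above; assumes non-empty input).
    #include <algorithm>
    #include <cmath>
    #include <vector>

    std::vector<float> softmax(std::vector<float> logits) {
        const float max_l = *std::max_element(logits.begin(), logits.end());
        float cum_sum = 0.0f;
        for (float & v : logits) {
            v = std::exp(v - max_l);
            cum_sum += v;
        }
        for (float & v : logits) {
            v /= cum_sum;
        }
        return logits; // probabilities summing to ~1
    }
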
-
- static bool are_lora_equal(
- const std::vector<common_adapter_lora_info> & l1,
- const std::vector<common_adapter_lora_info> & l2) {
- if (l1.size() != l2.size()) {
- return false;
- }
- for (size_t i = 0; i < l1.size(); ++i) {
- // we don't check lora.path to reduce the time complexity
- if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) {
- return false;
- }
- }
- return true;
- }
-
- // parse lora config from JSON request, returned a copy of lora_base with updated scale
- static std::vector<common_adapter_lora_info> parse_lora_request(
- const std::vector<common_adapter_lora_info> & lora_base,
- const json & data) {
- std::vector<common_adapter_lora_info> lora(lora_base);
- int max_idx = lora.size();
-
- // clear existing value
- for (auto & entry : lora) {
- entry.scale = 0.0f;
- }
-
- // set value
- for (const auto & entry : data) {
- int id = json_value(entry, "id", -1);
- float scale = json_value(entry, "scale", 0.0f);
- if (0 <= id && id < max_idx) {
- lora[id].scale = scale;
- } else {
- throw std::runtime_error("invalid adapter id");
- }
- }
-
- return lora;
- }
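
Callers hand parse_lora_request the adapters loaded at startup plus the JSON array from the HTTP body; every adapter is first zeroed, then only the listed ids get their requested scale, and an unknown id raises the runtime error above. A usage sketch assuming the types and helper above are in scope:

    // Illustrative call of parse_lora_request (types come from the removed header above).
    #include <nlohmann/json.hpp>
    #include <vector>

    using json = nlohmann::json;

    std::vector<common_adapter_lora_info> activate_one_adapter(
            const std::vector<common_adapter_lora_info> & loaded) {
        // keep only adapter 0, at half strength; all other adapters get scale 0.0
        const json req = json::parse(R"([ { "id": 0, "scale": 0.5 } ])");
        return parse_lora_request(loaded, req); // throws "invalid adapter id" for out-of-range ids
    }
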
-
- //
- // utils for interacting with libmtmd
- // (may need to refactor in near future)
- //
-
- /**
- * server_tokens is a helper to manage the input tokens and image for the server.
- * it is made this way to simplify the logic of KV cache management.
- */
- struct server_tokens {
- bool has_mtmd = false;
-
- private: // disallow accessing these members directly, risking out-of-sync
-
- // map a **start** position in tokens to the image chunk
- std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_media;
-
- // list of tokens
- // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
- // a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
- // important: for models using mrope, an image can contain multiple tokens but will use only one **position**
- llama_tokens tokens;
-
- // for ex. with input of 5 text tokens and 2 images:
- // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
- // pos 0 1 2 3 4 5 6 7 8 9
- // map_pos_to_media will contain: {5, img0}, {8, img1}
-
- public:
- server_tokens() = default;
- ~server_tokens() = default;
-
- // Prevent copying
- server_tokens(const server_tokens&) = delete;
- server_tokens& operator=(const server_tokens&) = delete;
-
- // Allow moving (usually implicitly generated if members are movable)
- server_tokens(server_tokens&&) = default;
- server_tokens& operator=(server_tokens&&) = default;
-
- // Allow accessing elements using [] operator
- llama_token operator[](size_t index) { return tokens[index]; }
- const llama_token& operator[](size_t index) const { return tokens[index]; }
-
- server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
- for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
- push_back(mtmd_chunks[i]);
- }
- }
-
- server_tokens(llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
-
- // for debugging
- std::string str() const {
- std::ostringstream oss;
- oss << "tokens: ";
- for (const auto & t : tokens) {
- if (t == LLAMA_TOKEN_NULL) {
- oss << "<embd> ";
- } else {
- oss << t << " ";
- }
- }
- oss << "\n";
- oss << "image pos: ";
- for (const auto & it : map_pos_to_media) {
- oss << it.first << ", ";
- }
- return oss.str();
- }
-
- const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
- auto it = map_pos_to_media.find(pos);
- if (it != map_pos_to_media.end()) {
- return it->second;
- } else {
- throw std::runtime_error("Chunk not found");
- }
- }
-
- void push_back(llama_token tok) {
- if (tok == LLAMA_TOKEN_NULL) {
- throw std::runtime_error("Invalid token");
- }
- tokens.emplace_back(tok);
- }
-
- // will create a copy of the chunk if it contains non-text data
- void push_back(const mtmd_input_chunk * chunk) {
- auto type = mtmd_input_chunk_get_type(chunk);
- if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
- GGML_ASSERT(has_mtmd);
- const int n_pos = mtmd_input_chunk_get_n_pos(chunk);
- llama_pos start_pos = tokens.size();
- for (int i = 0; i < n_pos; ++i) {
- tokens.emplace_back(LLAMA_TOKEN_NULL);
- }
- mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
- map_pos_to_media[start_pos] = std::move(new_chunk);
- } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
- size_t n_tokens;
- auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
- for (size_t i = 0; i < n_tokens; ++i) {
- push_back(text_tokens[i]);
- }
- } else {
- GGML_ABORT("Invalid chunk type");
- }
- }
-
- // for compatibility with context shift and prompt truncation
- void insert(const llama_tokens & inp_tokens) {
- GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
- tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
- }
-
- // for compatibility with speculative decoding, ctx shift, slot save/load
- const llama_tokens & get_text_tokens() const {
- GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
- return tokens;
- }
-
- // for compatibility with speculative decoding
- void set_token(llama_pos pos, llama_token id) {
- GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
- tokens[pos] = id;
- }
-
- size_t size() const {
- return tokens.size();
- }
-
- bool empty() const {
- return tokens.empty();
- }
-
- void clear() {
- tokens.clear();
- }
-
- void keep_first(size_t n) {
- GGML_ASSERT(n <= tokens.size());
- if (has_mtmd) {
- if (n == tokens.size()) {
- return; // nothing to do
- }
- // we throw an error if we try to remove a token in the middle of an image
- // for ex. with input of 5 text tokens and 2 images:
- // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
- // n 1 2 3 4 5 6 7 8 9 10
- // allowed to resize ^ ^
- // disallowed to resize ^ ^ ^
- if (n > 0) {
- llama_token last_token = tokens[n - 1];
- // make sure we never remove tokens in the middle of an image
- if (last_token == LLAMA_TOKEN_NULL) {
- find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
- }
- }
- // remove all image chunks that are not used anymore
- for (auto it = map_pos_to_media.begin(); it != map_pos_to_media.end(); ) {
- llama_pos pos = it->first;
- if (pos >= (llama_pos)n) {
- it = map_pos_to_media.erase(it);
- } else {
- ++it;
- }
- }
- }
- tokens.resize(n);
- }
-
- std::string detokenize(const llama_context * ctx, bool special) const {
- llama_tokens text_tokens;
- text_tokens.reserve(tokens.size());
- for (const auto & t : tokens) {
- if (t != LLAMA_TOKEN_NULL) {
- text_tokens.push_back(t);
- }
- }
- return common_detokenize(ctx, text_tokens, special);
- }
-
- size_t get_common_prefix(const server_tokens & b) const {
- size_t max_idx = std::min(tokens.size(), b.tokens.size());
- for (size_t i = 0; i < max_idx; ++i) {
- auto & ai = tokens[i];
- auto & bi = b.tokens[i];
-
- if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
- GGML_ASSERT(has_mtmd);
- const auto & a_chunk = find_chunk(i);
- const auto & b_chunk = b.find_chunk(i);
- GGML_ASSERT(a_chunk && b_chunk);
- std::string ai_id = mtmd_input_chunk_get_id(a_chunk.get());
- std::string bi_id = mtmd_input_chunk_get_id(b_chunk.get());
- size_t a_pos = mtmd_input_chunk_get_n_pos(a_chunk.get());
- size_t b_pos = mtmd_input_chunk_get_n_pos(b_chunk.get());
- if (ai_id == bi_id && a_pos == b_pos) {
- GGML_ASSERT(a_pos > 0 && "Invalid media chunk"); // should never happen
- i += a_pos - 1; // will be +1 by the for loop
- continue;
- } else {
- return i;
- }
- } else if (ai == bi) {
- continue;
- } else {
- return i;
- }
- }
- return max_idx; // all tokens are equal
- }
-
- // make sure all text tokens are within the vocab range
- bool validate(const struct llama_context * ctx) const {
- const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_model_get_vocab(model);
- const int32_t n_vocab = llama_vocab_n_tokens(vocab);
-
- for (size_t i = 0; i < tokens.size(); ++i) {
- auto & t = tokens[i];
- if (t == LLAMA_TOKEN_NULL) {
- try {
- const auto & chunk = find_chunk(i);
- size_t n_pos = mtmd_input_chunk_get_n_pos(chunk.get());
- i += n_pos - 1; // will be +1 by the for loop
- } catch (const std::exception & e) {
- return false;
- }
- } else if (t < 0 || t >= n_vocab) {
- return false;
- }
- }
- return true;
- }
-
- // encode and decode the image chunk
- int32_t process_chunk(
- llama_context * ctx,
- mtmd_context * mctx,
- llama_pos n_past,
- int32_t seq_id,
- llama_pos & n_pos_out) {
- auto & chunk = find_chunk(n_past);
- const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
- ? "image" : "audio";
- SRV_INF("processing %s...\n", name);
- int32_t n_batch = llama_n_batch(ctx);
- int64_t t0 = ggml_time_ms();
- llama_pos new_n_past = n_past;
- int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
- chunk.get(),
- n_past,
- seq_id,
- n_batch,
- true, // logits last
- &new_n_past);
- SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
- if (result != 0) {
- LOG_ERR("mtmd_helper_eval failed with status %d", result);
- n_pos_out = n_past;
- return result;
- }
- n_pos_out = new_n_past;
- return 0;
- }
- };
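
The struct above keeps one flat token list in which each media chunk occupies a run of LLAMA_TOKEN_NULL placeholders, plus a map from the run's start position to the chunk, exactly the layout sketched in the 5-text-tokens/2-images comment. Below is a deliberately simplified, self-contained model of that bookkeeping (strings instead of mtmd chunks, and not the real API) that shows why truncation inside a media run is rejected:

    // Simplified, self-contained model of the position bookkeeping above
    // (illustrative only; the real struct stores mtmd chunks, not strings).
    #include <cstdint>
    #include <stdexcept>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct toy_tokens {
        static constexpr int32_t TOKEN_NULL = -1;
        std::vector<int32_t> tokens;                   // text tokens or TOKEN_NULL placeholders
        std::unordered_map<size_t, std::string> media; // run start position -> media id

        void push_text(int32_t tok) { tokens.push_back(tok); }

        void push_media(const std::string & id, size_t n_pos) {
            media[tokens.size()] = id;
            tokens.insert(tokens.end(), n_pos, TOKEN_NULL);
        }

        // keep the first n positions; refuse to cut through the middle of a media run
        void keep_first(size_t n) {
            if (n < tokens.size() && tokens[n] == TOKEN_NULL && media.count(n) == 0) {
                throw std::runtime_error("cannot truncate inside a media chunk");
            }
            for (auto it = media.begin(); it != media.end(); ) {
                if (it->first >= n) { it = media.erase(it); } else { ++it; }
            }
            tokens.resize(n);
        }
    };

With the layout from the comment above (five text tokens, then a three-position image and a two-position image), keep_first(5) and keep_first(8) succeed, while keep_first(6) or keep_first(7) throws.
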
-
- // Computes FNV-1a hash of the data
- static std::string fnv_hash(const uint8_t * data, size_t len) {
- const uint64_t fnv_prime = 0x100000001b3ULL;
- uint64_t hash = 0xcbf29ce484222325ULL;
-
- for (size_t i = 0; i < len; ++i) {
- hash ^= data[i];
- hash *= fnv_prime;
- }
- return std::to_string(hash);
- }
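
The helper above is a straight 64-bit FNV-1a: each byte is folded in by XOR and the hash is then multiplied by the FNV prime, with the result returned as a decimal string that is convenient to use as a map key or cache id. A minimal usage sketch, assuming the helper above is in scope (no specific hash value is asserted):

    // Minimal call of the fnv_hash helper above (prints the decimal digest).
    #include <cstdint>
    #include <iostream>
    #include <string>

    int main() {
        const std::string payload = "example payload bytes";
        const std::string digest  = fnv_hash(reinterpret_cast<const uint8_t *>(payload.data()), payload.size());
        std::cout << "fnv1a-64: " << digest << std::endl;
    }
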