@fugood/llama.node 0.6.3 → 1.0.0-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377)
  1. package/CMakeLists.txt +40 -30
  2. package/README.md +4 -1
  3. package/lib/binding.js +41 -29
  4. package/lib/binding.ts +26 -25
  5. package/package.json +40 -7
  6. package/scripts/build.js +47 -0
  7. package/scripts/llama.cpp.patch +109 -0
  8. package/src/anyascii.c +22223 -0
  9. package/src/anyascii.h +42 -0
  10. package/src/tts_utils.cpp +20 -7
  11. package/src/tts_utils.h +2 -0
  12. package/bin/darwin/arm64/llama-node.node +0 -0
  13. package/bin/darwin/x64/llama-node.node +0 -0
  14. package/bin/linux/arm64/llama-node.node +0 -0
  15. package/bin/linux/x64/llama-node.node +0 -0
  16. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  17. package/bin/linux-cuda/x64/llama-node.node +0 -0
  18. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  19. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  20. package/bin/win32/x64/llama-node.node +0 -0
  21. package/bin/win32/x64/node.lib +0 -0
  22. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  23. package/bin/win32-vulkan/arm64/node.lib +0 -0
  24. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  25. package/bin/win32-vulkan/x64/node.lib +0 -0
  26. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +0 -233
  27. package/src/llama.cpp/.github/workflows/build.yml +0 -1078
  28. package/src/llama.cpp/.github/workflows/close-issue.yml +0 -28
  29. package/src/llama.cpp/.github/workflows/docker.yml +0 -178
  30. package/src/llama.cpp/.github/workflows/editorconfig.yml +0 -29
  31. package/src/llama.cpp/.github/workflows/gguf-publish.yml +0 -44
  32. package/src/llama.cpp/.github/workflows/labeler.yml +0 -17
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +0 -33
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +0 -30
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +0 -40
  36. package/src/llama.cpp/.github/workflows/release.yml +0 -739
  37. package/src/llama.cpp/.github/workflows/server.yml +0 -237
  38. package/src/llama.cpp/.github/workflows/winget.yml +0 -42
  39. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +0 -16
  40. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +0 -16
  41. package/src/llama.cpp/cmake/build-info.cmake +0 -64
  42. package/src/llama.cpp/cmake/common.cmake +0 -35
  43. package/src/llama.cpp/cmake/git-vars.cmake +0 -22
  44. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -5
  45. package/src/llama.cpp/common/build-info.cpp.in +0 -4
  46. package/src/llama.cpp/docs/build.md +0 -561
  47. package/src/llama.cpp/examples/CMakeLists.txt +0 -43
  48. package/src/llama.cpp/examples/batched/CMakeLists.txt +0 -5
  49. package/src/llama.cpp/examples/batched/batched.cpp +0 -246
  50. package/src/llama.cpp/examples/chat-13B.bat +0 -57
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +0 -5
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +0 -941
  53. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +0 -35
  54. package/src/llama.cpp/examples/embedding/CMakeLists.txt +0 -5
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +0 -323
  56. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +0 -10
  57. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +0 -194
  58. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +0 -5
  59. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +0 -83
  60. package/src/llama.cpp/examples/gguf/CMakeLists.txt +0 -5
  61. package/src/llama.cpp/examples/gguf/gguf.cpp +0 -265
  62. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +0 -22
  63. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +0 -46
  64. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +0 -295
  65. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +0 -52
  66. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +0 -221
  67. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +0 -24
  68. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +0 -42
  69. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +0 -7093
  70. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +0 -694
  71. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +0 -5
  72. package/src/llama.cpp/examples/gritlm/gritlm.cpp +0 -229
  73. package/src/llama.cpp/examples/jeopardy/questions.txt +0 -100
  74. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +0 -65
  75. package/src/llama.cpp/examples/llama.android/build.gradle.kts +0 -6
  76. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +0 -71
  77. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +0 -53
  78. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +0 -452
  79. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +0 -18
  80. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +0 -5
  81. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -472
  82. package/src/llama.cpp/examples/lookup/CMakeLists.txt +0 -23
  83. package/src/llama.cpp/examples/lookup/lookup-create.cpp +0 -40
  84. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +0 -47
  85. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -157
  86. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -242
  87. package/src/llama.cpp/examples/parallel/CMakeLists.txt +0 -5
  88. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -492
  89. package/src/llama.cpp/examples/passkey/CMakeLists.txt +0 -5
  90. package/src/llama.cpp/examples/passkey/passkey.cpp +0 -277
  91. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +0 -5
  92. package/src/llama.cpp/examples/retrieval/retrieval.cpp +0 -304
  93. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -5
  94. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -246
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +0 -5
  96. package/src/llama.cpp/examples/simple/simple.cpp +0 -206
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +0 -5
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +0 -206
  99. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +0 -11
  100. package/src/llama.cpp/examples/speculative/CMakeLists.txt +0 -5
  101. package/src/llama.cpp/examples/speculative/speculative.cpp +0 -644
  102. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +0 -5
  103. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +0 -261
  104. package/src/llama.cpp/examples/sycl/CMakeLists.txt +0 -9
  105. package/src/llama.cpp/examples/sycl/build.sh +0 -23
  106. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +0 -13
  107. package/src/llama.cpp/examples/sycl/run-llama2.sh +0 -27
  108. package/src/llama.cpp/examples/sycl/run-llama3.sh +0 -28
  109. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +0 -33
  110. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +0 -9
  111. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +0 -9
  112. package/src/llama.cpp/examples/training/CMakeLists.txt +0 -5
  113. package/src/llama.cpp/examples/training/finetune.cpp +0 -96
  114. package/src/llama.cpp/ggml/cmake/GitVars.cmake +0 -22
  115. package/src/llama.cpp/ggml/cmake/common.cmake +0 -26
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1042
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -255
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -586
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +0 -2008
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +0 -87
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +0 -517
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -74
  123. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +0 -179
  124. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +0 -258
  125. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +0 -2863
  126. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +0 -1110
  127. package/src/llama.cpp/ggml/src/ggml-cann/common.h +0 -420
  128. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -2570
  129. package/src/llama.cpp/ggml/src/ggml-common.h +0 -1857
  130. package/src/llama.cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +0 -100
  131. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +0 -184
  132. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +0 -15
  133. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +0 -243
  134. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +0 -140
  135. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -131
  136. package/src/llama.cpp/ggml/src/ggml-impl.h +0 -601
  137. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  138. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  139. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +0 -120
  140. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +0 -622
  141. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -113
  142. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +0 -96
  143. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -5124
  144. package/src/llama.cpp/ggml/src/ggml-opt.cpp +0 -1037
  145. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -5232
  146. package/src/llama.cpp/ggml/src/ggml-quants.h +0 -100
  147. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +0 -9
  148. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +0 -1813
  149. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +0 -189
  150. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +0 -37
  151. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +0 -239
  152. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +0 -39
  153. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -83
  154. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +0 -493
  155. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +0 -197
  156. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +0 -20
  157. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +0 -100
  158. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +0 -20
  159. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +0 -623
  160. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +0 -34
  161. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -701
  162. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +0 -11
  163. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +0 -791
  164. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +0 -1160
  165. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +0 -27
  166. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +0 -2957
  167. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -1536
  168. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +0 -75
  169. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +0 -99
  170. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +0 -311
  171. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +0 -20
  172. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -4443
  173. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +0 -105
  174. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +0 -8
  175. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +0 -136
  176. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +0 -21
  177. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -3030
  178. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +0 -33
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +0 -1108
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +0 -27
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +0 -474
  182. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +0 -26
  183. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +0 -46
  184. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +0 -10
  185. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +0 -74
  186. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +0 -83
  187. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +0 -362
  188. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +0 -20
  189. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +0 -264
  190. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +0 -20
  191. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +0 -13
  192. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +0 -23
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +0 -73
  194. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +0 -20
  195. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +0 -1215
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +0 -305
  197. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +0 -10
  198. package/src/llama.cpp/ggml/src/ggml-threading.cpp +0 -12
  199. package/src/llama.cpp/ggml/src/ggml-threading.h +0 -14
  200. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +0 -196
  201. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +0 -10699
  202. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -39
  203. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +0 -751
  204. package/src/llama.cpp/ggml/src/ggml.c +0 -6550
  205. package/src/llama.cpp/ggml/src/gguf.cpp +0 -1330
  206. package/src/llama.cpp/models/.editorconfig +0 -1
  207. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  208. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  209. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  210. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  211. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  212. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  213. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  214. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  215. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  216. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  217. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  219. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  220. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  221. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  222. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  223. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  225. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  227. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  228. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  230. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  231. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  232. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  233. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  236. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  237. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  239. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  240. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  241. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  242. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  245. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  248. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  249. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  256. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  257. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  259. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  260. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  261. package/src/llama.cpp/pocs/CMakeLists.txt +0 -14
  262. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +0 -9
  263. package/src/llama.cpp/pocs/vdot/q8dot.cpp +0 -173
  264. package/src/llama.cpp/pocs/vdot/vdot.cpp +0 -311
  265. package/src/llama.cpp/prompts/LLM-questions.txt +0 -49
  266. package/src/llama.cpp/prompts/alpaca.txt +0 -1
  267. package/src/llama.cpp/prompts/assistant.txt +0 -31
  268. package/src/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  269. package/src/llama.cpp/prompts/chat-with-bob.txt +0 -7
  270. package/src/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  271. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  272. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  273. package/src/llama.cpp/prompts/chat.txt +0 -28
  274. package/src/llama.cpp/prompts/dan-modified.txt +0 -1
  275. package/src/llama.cpp/prompts/dan.txt +0 -1
  276. package/src/llama.cpp/prompts/mnemonics.txt +0 -93
  277. package/src/llama.cpp/prompts/parallel-questions.txt +0 -43
  278. package/src/llama.cpp/prompts/reason-act.txt +0 -18
  279. package/src/llama.cpp/requirements/requirements-all.txt +0 -15
  280. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +0 -2
  281. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +0 -7
  282. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -7
  283. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +0 -5
  284. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +0 -1
  285. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +0 -4
  286. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +0 -3
  287. package/src/llama.cpp/requirements/requirements-pydantic.txt +0 -3
  288. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +0 -1
  289. package/src/llama.cpp/requirements/requirements-tool_bench.txt +0 -12
  290. package/src/llama.cpp/requirements.txt +0 -13
  291. package/src/llama.cpp/scripts/build-info.sh +0 -30
  292. package/src/llama.cpp/scripts/install-oneapi.bat +0 -19
  293. package/src/llama.cpp/scripts/xxd.cmake +0 -16
  294. package/src/llama.cpp/tests/CMakeLists.txt +0 -177
  295. package/src/llama.cpp/tests/get-model.cpp +0 -21
  296. package/src/llama.cpp/tests/get-model.h +0 -2
  297. package/src/llama.cpp/tests/test-arg-parser.cpp +0 -178
  298. package/src/llama.cpp/tests/test-autorelease.cpp +0 -24
  299. package/src/llama.cpp/tests/test-backend-ops.cpp +0 -4793
  300. package/src/llama.cpp/tests/test-barrier.cpp +0 -94
  301. package/src/llama.cpp/tests/test-c.c +0 -7
  302. package/src/llama.cpp/tests/test-chat-template.cpp +0 -417
  303. package/src/llama.cpp/tests/test-chat.cpp +0 -985
  304. package/src/llama.cpp/tests/test-double-float.cpp +0 -57
  305. package/src/llama.cpp/tests/test-gbnf-validator.cpp +0 -109
  306. package/src/llama.cpp/tests/test-gguf.cpp +0 -1338
  307. package/src/llama.cpp/tests/test-grammar-integration.cpp +0 -1308
  308. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +0 -1201
  309. package/src/llama.cpp/tests/test-grammar-parser.cpp +0 -519
  310. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +0 -1304
  311. package/src/llama.cpp/tests/test-llama-grammar.cpp +0 -408
  312. package/src/llama.cpp/tests/test-log.cpp +0 -39
  313. package/src/llama.cpp/tests/test-model-load-cancel.cpp +0 -27
  314. package/src/llama.cpp/tests/test-mtmd-c-api.c +0 -63
  315. package/src/llama.cpp/tests/test-opt.cpp +0 -904
  316. package/src/llama.cpp/tests/test-quantize-fns.cpp +0 -186
  317. package/src/llama.cpp/tests/test-quantize-perf.cpp +0 -365
  318. package/src/llama.cpp/tests/test-quantize-stats.cpp +0 -424
  319. package/src/llama.cpp/tests/test-regex-partial.cpp +0 -288
  320. package/src/llama.cpp/tests/test-rope.cpp +0 -262
  321. package/src/llama.cpp/tests/test-sampling.cpp +0 -399
  322. package/src/llama.cpp/tests/test-tokenizer-0.cpp +0 -312
  323. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +0 -155
  324. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +0 -125
  325. package/src/llama.cpp/tools/CMakeLists.txt +0 -39
  326. package/src/llama.cpp/tools/batched-bench/CMakeLists.txt +0 -5
  327. package/src/llama.cpp/tools/batched-bench/batched-bench.cpp +0 -204
  328. package/src/llama.cpp/tools/cvector-generator/CMakeLists.txt +0 -5
  329. package/src/llama.cpp/tools/cvector-generator/completions.txt +0 -582
  330. package/src/llama.cpp/tools/cvector-generator/cvector-generator.cpp +0 -508
  331. package/src/llama.cpp/tools/cvector-generator/mean.hpp +0 -48
  332. package/src/llama.cpp/tools/cvector-generator/negative.txt +0 -4
  333. package/src/llama.cpp/tools/cvector-generator/pca.hpp +0 -315
  334. package/src/llama.cpp/tools/cvector-generator/positive.txt +0 -4
  335. package/src/llama.cpp/tools/export-lora/CMakeLists.txt +0 -5
  336. package/src/llama.cpp/tools/export-lora/export-lora.cpp +0 -434
  337. package/src/llama.cpp/tools/gguf-split/CMakeLists.txt +0 -5
  338. package/src/llama.cpp/tools/gguf-split/gguf-split.cpp +0 -583
  339. package/src/llama.cpp/tools/imatrix/CMakeLists.txt +0 -5
  340. package/src/llama.cpp/tools/imatrix/imatrix.cpp +0 -667
  341. package/src/llama.cpp/tools/llama-bench/CMakeLists.txt +0 -5
  342. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +0 -2024
  343. package/src/llama.cpp/tools/main/CMakeLists.txt +0 -5
  344. package/src/llama.cpp/tools/main/main.cpp +0 -977
  345. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +0 -58
  346. package/src/llama.cpp/tools/mtmd/clip-impl.h +0 -462
  347. package/src/llama.cpp/tools/mtmd/clip.cpp +0 -4024
  348. package/src/llama.cpp/tools/mtmd/clip.h +0 -101
  349. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +0 -22
  350. package/src/llama.cpp/tools/mtmd/miniaudio.h +0 -93468
  351. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +0 -855
  352. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +0 -62
  353. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +0 -377
  354. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +0 -297
  355. package/src/llama.cpp/tools/mtmd/mtmd.cpp +0 -942
  356. package/src/llama.cpp/tools/mtmd/mtmd.h +0 -362
  357. package/src/llama.cpp/tools/mtmd/requirements.txt +0 -5
  358. package/src/llama.cpp/tools/perplexity/CMakeLists.txt +0 -5
  359. package/src/llama.cpp/tools/perplexity/perplexity.cpp +0 -2063
  360. package/src/llama.cpp/tools/quantize/CMakeLists.txt +0 -6
  361. package/src/llama.cpp/tools/quantize/quantize.cpp +0 -519
  362. package/src/llama.cpp/tools/rpc/CMakeLists.txt +0 -4
  363. package/src/llama.cpp/tools/rpc/rpc-server.cpp +0 -322
  364. package/src/llama.cpp/tools/run/CMakeLists.txt +0 -16
  365. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.cpp +0 -1995
  366. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.h +0 -137
  367. package/src/llama.cpp/tools/run/run.cpp +0 -1261
  368. package/src/llama.cpp/tools/server/CMakeLists.txt +0 -51
  369. package/src/llama.cpp/tools/server/bench/requirements.txt +0 -2
  370. package/src/llama.cpp/tools/server/httplib.h +0 -10506
  371. package/src/llama.cpp/tools/server/server.cpp +0 -4966
  372. package/src/llama.cpp/tools/server/tests/requirements.txt +0 -8
  373. package/src/llama.cpp/tools/server/utils.hpp +0 -1337
  374. package/src/llama.cpp/tools/tokenize/CMakeLists.txt +0 -5
  375. package/src/llama.cpp/tools/tokenize/tokenize.cpp +0 -416
  376. package/src/llama.cpp/tools/tts/CMakeLists.txt +0 -5
  377. package/src/llama.cpp/tools/tts/tts.cpp +0 -1092
@@ -1,1337 +0,0 @@
1
- #pragma once
2
-
3
- #include "common.h"
4
- #include "log.h"
5
- #include "llama.h"
6
- #include "arg.h" // common_remote_get_content
7
- #include "base64.hpp"
8
- #include "mtmd.h"
9
-
10
- // increase max payload length to allow use of larger context size
11
- #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
12
- // disable Nagle's algorithm
13
- #define CPPHTTPLIB_TCP_NODELAY true
14
- #include "httplib.h"
15
-
16
- // Change JSON_ASSERT from assert() to GGML_ASSERT:
17
- #define JSON_ASSERT GGML_ASSERT
18
- #include "json.hpp"
19
- #include "chat.h"
20
-
21
- #include <random>
22
- #include <sstream>
23
- #include <string>
24
- #include <vector>
25
- #include <memory>
26
- #include <cinttypes>
27
-
28
- #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
29
-
30
- using json = nlohmann::ordered_json;
31
-
32
- #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
33
- #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
34
- #define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
35
- #define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
36
-
37
- #define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
38
- #define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
39
- #define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
40
- #define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
41
-
42
- #define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
43
- #define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
44
- #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
45
- #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
46
-
47
- using raw_buffer = std::vector<uint8_t>;
48
-
49
- template <typename T>
50
- static T json_value(const json & body, const std::string & key, const T & default_value) {
51
- // Fallback null to default value
52
- if (body.contains(key) && !body.at(key).is_null()) {
53
- try {
54
- return body.at(key);
55
- } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
56
- LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
57
- return default_value;
58
- }
59
- } else {
60
- return default_value;
61
- }
62
- }
63
-
64
- const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
65
-
66
- // thin wrapper around common_grammar_trigger with (de)serialization functions
67
- struct server_grammar_trigger {
68
- common_grammar_trigger value;
69
-
70
- server_grammar_trigger() = default;
71
- server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
72
- server_grammar_trigger(const json & in) {
73
- value.type = (common_grammar_trigger_type) in.at("type").get<int>();
74
- value.value = in.at("value").get<std::string>();
75
- if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
76
- value.token = (llama_token) in.at("token").get<int>();
77
- }
78
- }
79
-
80
- json to_json() const {
81
- json out {
82
- {"type", (int) value.type},
83
- {"value", value.value},
84
- };
85
- if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
86
- out["token"] = (int) value.token;
87
- }
88
- return out;
89
- }
90
- };
91
-
92
- //
93
- // tokenizer and input processing utils
94
- //
95
-
96
- static bool json_is_array_of_numbers(const json & data) {
97
- if (data.is_array()) {
98
- for (const auto & e : data) {
99
- if (!e.is_number_integer()) {
100
- return false;
101
- }
102
- }
103
- return true;
104
- }
105
- return false;
106
- }
107
-
108
- // is array having BOTH numbers & strings?
109
- static bool json_is_array_of_mixed_numbers_strings(const json & data) {
110
- bool seen_string = false;
111
- bool seen_number = false;
112
- if (data.is_array()) {
113
- for (const auto & e : data) {
114
- seen_string |= e.is_string();
115
- seen_number |= e.is_number_integer();
116
- if (seen_number && seen_string) {
117
- return true;
118
- }
119
- }
120
- }
121
- return false;
122
- }
123
-
124
- // get value by path(key1 / key2)
125
- static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
126
- json result = json::object();
127
-
128
- for (const std::string & path : paths) {
129
- json current = js;
130
- const auto keys = string_split<std::string>(path, /*separator*/ '/');
131
- bool valid_path = true;
132
- for (const std::string & k : keys) {
133
- if (valid_path && current.is_object() && current.contains(k)) {
134
- current = current[k];
135
- } else {
136
- valid_path = false;
137
- }
138
- }
139
- if (valid_path) {
140
- result[path] = current;
141
- }
142
- }
143
- return result;
144
- }
145
-
146
- /**
147
- * this handles 2 cases:
148
- * - only string, example: "string"
149
- * - mixed string and tokens, example: [12, 34, "string", 56, 78]
150
- */
151
- static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
152
- // If `add_bos` is true, we only add BOS, when json_prompt is a string,
153
- // or the first element of the json_prompt array is a string.
154
- llama_tokens prompt_tokens;
155
-
156
- if (json_prompt.is_array()) {
157
- bool first = true;
158
- for (const auto & p : json_prompt) {
159
- if (p.is_string()) {
160
- auto s = p.template get<std::string>();
161
-
162
- llama_tokens p;
163
- if (first) {
164
- p = common_tokenize(vocab, s, add_special, parse_special);
165
- first = false;
166
- } else {
167
- p = common_tokenize(vocab, s, false, parse_special);
168
- }
169
-
170
- prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
171
- } else {
172
- if (first) {
173
- first = false;
174
- }
175
-
176
- prompt_tokens.push_back(p.template get<llama_token>());
177
- }
178
- }
179
- } else {
180
- auto s = json_prompt.template get<std::string>();
181
- prompt_tokens = common_tokenize(vocab, s, add_special, parse_special);
182
- }
183
-
184
- return prompt_tokens;
185
- }
186
-
187
- /**
188
- * break the input "prompt" object into multiple prompt if needed, then tokenize them
189
- * this supports these cases:
190
- * - "prompt": "string"
191
- * - "prompt": [12, 34, 56]
192
- * - "prompt": [12, 34, "string", 56, 78]
193
- * and multiple prompts (multi-tasks):
194
- * - "prompt": ["string1", "string2"]
195
- * - "prompt": ["string1", [12, 34, 56]]
196
- * - "prompt": [[12, 34, 56], [78, 90, 12]]
197
- * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
198
- */
199
- static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
200
- std::vector<llama_tokens> result;
201
- if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
202
- // string or mixed
203
- result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special));
204
- } else if (json_is_array_of_numbers(json_prompt)) {
205
- // array of tokens
206
- result.push_back(json_prompt.get<llama_tokens>());
207
- } else if (json_prompt.is_array()) {
208
- // array of prompts
209
- result.reserve(json_prompt.size());
210
- for (const auto & p : json_prompt) {
211
- if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
212
- result.push_back(tokenize_mixed(vocab, p, add_special, parse_special));
213
- } else if (json_is_array_of_numbers(p)) {
214
- // array of tokens
215
- result.push_back(p.get<llama_tokens>());
216
- } else {
217
- throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
218
- }
219
- }
220
- } else {
221
- throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
222
- }
223
- if (result.empty()) {
224
- throw std::runtime_error("\"prompt\" must not be empty");
225
- }
226
- return result;
227
- }
228
-
229
- // return the last index of character that can form a valid string
230
- // if the last character is potentially cut in half, return the index before the cut
231
- // if validate_utf8(text) == text.size(), then the whole text is valid utf8
232
- static size_t validate_utf8(const std::string& text) {
233
- size_t len = text.size();
234
- if (len == 0) return 0;
235
-
236
- // Check the last few bytes to see if a multi-byte character is cut off
237
- for (size_t i = 1; i <= 4 && i <= len; ++i) {
238
- unsigned char c = text[len - i];
239
- // Check for start of a multi-byte sequence from the end
240
- if ((c & 0xE0) == 0xC0) {
241
- // 2-byte character start: 110xxxxx
242
- // Needs at least 2 bytes
243
- if (i < 2) return len - i;
244
- } else if ((c & 0xF0) == 0xE0) {
245
- // 3-byte character start: 1110xxxx
246
- // Needs at least 3 bytes
247
- if (i < 3) return len - i;
248
- } else if ((c & 0xF8) == 0xF0) {
249
- // 4-byte character start: 11110xxx
250
- // Needs at least 4 bytes
251
- if (i < 4) return len - i;
252
- }
253
- }
254
-
255
- // If no cut-off multi-byte character is found, return full length
256
- return len;
257
- }
258
-
259
- //
260
- // template utils
261
- //
262
-
263
- // format rerank task: [BOS]query[EOS][SEP]doc[EOS]
264
- static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
265
- llama_tokens result;
266
-
267
- result.reserve(doc.size() + query.size() + 4);
268
- result.push_back(llama_vocab_bos(vocab));
269
- result.insert(result.end(), query.begin(), query.end());
270
- result.push_back(llama_vocab_eos(vocab));
271
- result.push_back(llama_vocab_sep(vocab));
272
- result.insert(result.end(), doc.begin(), doc.end());
273
- result.push_back(llama_vocab_eos(vocab));
274
-
275
- return result;
276
- }
277
-
278
- // format infill task
279
- static llama_tokens format_infill(
280
- const llama_vocab * vocab,
281
- const json & input_prefix,
282
- const json & input_suffix,
283
- const json & input_extra,
284
- const int n_batch,
285
- const int n_predict,
286
- const int n_ctx,
287
- const bool spm_infill,
288
- const llama_tokens & tokens_prompt
289
- ) {
290
- // TODO: optimize this block by reducing memory allocations and movement
291
-
292
- // use FIM repo-level pattern:
293
- // ref: https://arxiv.org/pdf/2409.12186
294
- //
295
- // [FIM_REP]myproject
296
- // [FIM_SEP]filename0
297
- // extra chunk 0
298
- // [FIM_SEP]filename1
299
- // extra chunk 1
300
- // ...
301
- // [FIM_SEP]filename
302
- // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
303
- //
304
- llama_tokens extra_tokens;
305
- extra_tokens.reserve(n_ctx);
306
-
307
- auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false);
308
- auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false);
309
-
310
- if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) {
311
- // TODO: make project name an input
312
- static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false);
313
-
314
- extra_tokens.push_back(llama_vocab_fim_rep(vocab));
315
- extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
316
- }
317
- for (const auto & chunk : input_extra) {
318
- // { "text": string, "filename": string }
319
- const std::string text = json_value(chunk, "text", std::string());
320
- const std::string filename = json_value(chunk, "filename", std::string("tmp"));
321
-
322
- if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
323
- const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false);
324
-
325
- extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
326
- extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
327
- } else {
328
- // chunk separator in binary form to avoid confusing the AI
329
- static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
330
- static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false);
331
-
332
- extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
333
- }
334
-
335
- const auto chunk_tokens = common_tokenize(vocab, text, false, false);
336
- extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
337
- }
338
-
339
- if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
340
- // TODO: current filename
341
- static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false);
342
-
343
- extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
344
- extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
345
- }
346
-
347
- // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
348
- const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4));
349
- const int n_suffix_take = std::min<int>(tokens_suffix.size(), std::max<int>(0, (n_batch/4) - (2 + tokens_prompt.size())));
350
-
351
- SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));
352
-
353
- // fill the rest of the context with extra chunks
354
- const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
355
-
356
- tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
357
- tokens_suffix.resize(n_suffix_take);
358
-
359
- tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab));
360
- tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
361
- tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab));
362
-
363
- auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
364
- auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
365
-
366
- if (llama_vocab_get_add_bos(vocab)) {
367
- embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
368
- }
369
-
370
- SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
371
-
372
- // put the extra context before the FIM prefix
373
- embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
374
-
375
- embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
376
- embd_inp.push_back(llama_vocab_fim_mid(vocab));
377
-
378
- return embd_inp;
379
- }
380
-
381
- //
382
- // base64 utils (TODO: move to common in the future)
383
- //
384
-
385
- static const std::string base64_chars =
386
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
387
- "abcdefghijklmnopqrstuvwxyz"
388
- "0123456789+/";
389
-
390
- static inline bool is_base64(uint8_t c) {
391
- return (isalnum(c) || (c == '+') || (c == '/'));
392
- }
393
-
394
- static inline raw_buffer base64_decode(const std::string & encoded_string) {
395
- int i = 0;
396
- int j = 0;
397
- int in_ = 0;
398
-
399
- int in_len = encoded_string.size();
400
-
401
- uint8_t char_array_4[4];
402
- uint8_t char_array_3[3];
403
-
404
- raw_buffer ret;
405
-
406
- while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
407
- char_array_4[i++] = encoded_string[in_]; in_++;
408
- if (i == 4) {
409
- for (i = 0; i < 4; i++) {
410
- char_array_4[i] = base64_chars.find(char_array_4[i]);
411
- }
412
-
413
- char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
414
- char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
415
- char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
416
-
417
- for (i = 0; (i < 3); i++) {
418
- ret.push_back(char_array_3[i]);
419
- }
420
-
421
- i = 0;
422
- }
423
- }
424
-
425
- if (i) {
426
- for (j = i; j < 4; j++) {
427
- char_array_4[j] = 0;
428
- }
429
-
430
- for (j = 0; j < 4; j++) {
431
- char_array_4[j] = base64_chars.find(char_array_4[j]);
432
- }
433
-
434
- char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
435
- char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
436
- char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
437
-
438
- for (j = 0; j < i - 1; j++) {
439
- ret.push_back(char_array_3[j]);
440
- }
441
- }
442
-
443
- return ret;
444
- }
445
-
446
- //
447
- // random string / id
448
- //
449
-
450
- static std::string random_string() {
451
- static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
452
-
453
- std::random_device rd;
454
- std::mt19937 generator(rd());
455
-
456
- std::string result(32, ' ');
457
-
458
- for (int i = 0; i < 32; ++i) {
459
- result[i] = str[generator() % str.size()];
460
- }
461
-
462
- return result;
463
- }
464
-
465
- static std::string gen_chatcmplid() {
466
- return "chatcmpl-" + random_string();
467
- }
468
-
469
- static std::string gen_tool_call_id() {
470
- return random_string();
471
- }
472
-
473
- //
474
- // other common utils
475
- //
476
-
477
- static bool ends_with(const std::string & str, const std::string & suffix) {
478
- return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
479
- }
480
-
481
- static size_t find_partial_stop_string(const std::string &stop, const std::string &text) {
482
- if (!text.empty() && !stop.empty()) {
483
- const char text_last_char = text.back();
484
- for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
485
- if (stop[char_index] == text_last_char) {
486
- const std::string current_partial = stop.substr(0, char_index + 1);
487
- if (ends_with(text, current_partial)) {
488
- return text.size() - char_index - 1;
489
- }
490
- }
491
- }
492
- }
493
-
494
- return std::string::npos;
495
- }
496
-
497
- // TODO: reuse llama_detokenize
498
- template <class Iter>
499
- static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
500
- std::string ret;
501
- for (; begin != end; ++begin) {
502
- ret += common_token_to_piece(ctx, *begin);
503
- }
504
-
505
- return ret;
506
- }
507
-
508
- // format incomplete utf-8 multibyte character for output
509
- static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
510
- std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token);
511
-
512
- // if the size is 1 and first bit is 1, meaning it's a partial character
513
- // (size > 1 meaning it's already a known token)
514
- if (out.size() == 1 && (out[0] & 0x80) == 0x80) {
515
- std::stringstream ss;
516
- ss << std::hex << (out[0] & 0xff);
517
- std::string res(ss.str());
518
- out = "byte: \\x" + res;
519
- }
520
-
521
- return out;
522
- }
523
-
524
- static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
525
- const std::string str =
526
- std::string(event) + ": " +
527
- data.dump(-1, ' ', false, json::error_handler_t::replace) +
528
- "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
529
-
530
- LOG_DBG("data stream, to_send: %s", str.c_str());
531
-
532
- return sink.write(str.c_str(), str.size());
533
- }
534
-
535
- //
536
- // OAI utils
537
- //
538
-
539
- // used by /completions endpoint
540
- static json oaicompat_completion_params_parse(const json & body) {
541
- json llama_params;
542
-
543
- if (!body.contains("prompt")) {
544
- throw std::runtime_error("\"prompt\" is required");
545
- }
546
-
547
- // Handle "stop" field
548
- if (body.contains("stop") && body.at("stop").is_string()) {
549
- llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
550
- } else {
551
- llama_params["stop"] = json_value(body, "stop", json::array());
552
- }
553
-
554
- // Handle "n" field
555
- int n_choices = json_value(body, "n", 1);
556
- if (n_choices != 1) {
557
- throw std::runtime_error("Only one completion choice is allowed");
558
- }
559
-
560
- // Handle "echo" field
561
- if (json_value(body, "echo", false)) {
562
- throw std::runtime_error("Only no echo is supported");
563
- }
564
-
565
- // Params supported by OAI but unsupported by llama.cpp
566
- static const std::vector<std::string> unsupported_params { "best_of", "suffix" };
567
- for (const auto & param : unsupported_params) {
568
- if (body.contains(param)) {
569
- throw std::runtime_error("Unsupported param: " + param);
570
- }
571
- }
572
-
573
- // Copy remaining properties to llama_params
574
- for (const auto & item : body.items()) {
575
- // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
576
- if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
577
- llama_params[item.key()] = item.value();
578
- }
579
- }
580
-
581
- return llama_params;
582
- }
583
-
584
- struct oaicompat_parser_options {
585
- bool use_jinja;
586
- bool prefill_assistant;
587
- common_reasoning_format reasoning_format;
588
- common_chat_templates * tmpls;
589
- bool allow_image;
590
- bool allow_audio;
591
- };
592
-
593
- // used by /chat/completions endpoint
594
- static json oaicompat_chat_params_parse(
595
- const json & body, /* openai api json semantics */
596
- const oaicompat_parser_options & opt,
597
- std::vector<raw_buffer> & out_files)
598
- {
599
- json llama_params;
600
-
601
- auto tools = json_value(body, "tools", json());
602
- auto stream = json_value(body, "stream", false);
603
-
604
- if (tools.is_array() && !tools.empty()) {
605
- if (stream) {
606
- throw std::runtime_error("Cannot use tools with stream");
607
- }
608
- if (!opt.use_jinja) {
609
- throw std::runtime_error("tools param requires --jinja flag");
610
- }
611
- }
612
- if (!opt.use_jinja) {
613
- if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
614
- throw std::runtime_error("Unsupported param: tool_choice");
615
- }
616
- }
617
-
618
- // Handle "stop" field
619
- if (body.contains("stop") && body.at("stop").is_string()) {
620
- llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
621
- } else {
622
- llama_params["stop"] = json_value(body, "stop", json::array());
623
- }
624
-
625
- auto json_schema = json_value(body, "json_schema", json());
626
- auto grammar = json_value(body, "grammar", std::string());
627
- if (!json_schema.is_null() && !grammar.empty()) {
628
- throw std::runtime_error("Cannot use both json_schema and grammar");
629
- }
630
-
631
- // Handle "response_format" field
632
- if (body.contains("response_format")) {
633
- json response_format = json_value(body, "response_format", json::object());
634
- std::string response_type = json_value(response_format, "type", std::string());
635
- if (response_type == "json_object") {
636
- json_schema = json_value(response_format, "schema", json::object());
637
- } else if (response_type == "json_schema") {
638
- auto schema_wrapper = json_value(response_format, "json_schema", json::object());
639
- json_schema = json_value(schema_wrapper, "schema", json::object());
640
- } else if (!response_type.empty() && response_type != "text") {
641
- throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
642
- }
643
- }
644
-
645
- // get input files
646
- if (!body.contains("messages")) {
647
- throw std::runtime_error("'messages' is required");
648
- }
649
- json messages = body.at("messages");
650
- if (!messages.is_array()) {
651
- throw std::runtime_error("Expected 'messages' to be an array");
652
- }
653
- for (auto & msg : messages) {
654
- std::string role = json_value(msg, "role", std::string());
655
- if (role != "assistant" && !msg.contains("content")) {
656
- throw std::runtime_error("All non-assistant messages must contain 'content'");
657
- }
658
- if (role == "assistant") {
659
- if (!msg.contains("content") && !msg.contains("tool_calls")) {
660
- throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
661
- }
662
- if (!msg.contains("content")) {
663
- continue; // avoid errors with no content
664
- }
665
- }
666
- json & content = msg.at("content");
667
- if (content.is_string() || content.is_null()) {
668
- continue;
669
- }
670
-
671
- if (!content.is_array()) {
672
- throw std::runtime_error("Expected 'content' to be a string or an array");
673
- }
674
-
675
- for (auto & p : content) {
676
- std::string type = json_value(p, "type", std::string());
677
- if (type == "image_url") {
678
- if (!opt.allow_image) {
679
- throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
680
- }
681
-
682
- json image_url = json_value(p, "image_url", json::object());
683
- std::string url = json_value(image_url, "url", std::string());
684
- if (string_starts_with(url, "http")) {
685
- // download remote image
686
- // TODO @ngxson : maybe make these params configurable
687
- common_remote_params params;
688
- params.headers.push_back("User-Agent: llama.cpp/" + build_info);
689
- params.max_size = 1024 * 1024 * 10; // 10MB
690
- params.timeout = 10; // seconds
691
- SRV_INF("downloading image from '%s'\n", url.c_str());
692
- auto res = common_remote_get_content(url, params);
693
- if (200 <= res.first && res.first < 300) {
694
- SRV_INF("downloaded %ld bytes\n", res.second.size());
695
- raw_buffer data;
696
- data.insert(data.end(), res.second.begin(), res.second.end());
697
- out_files.push_back(data);
698
- } else {
699
- throw std::runtime_error("Failed to download image");
700
- }
701
-
702
- } else {
703
- // try to decode base64 image
704
- std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
705
- if (parts.size() != 2) {
706
- throw std::runtime_error("Invalid image_url.url value");
707
- } else if (!string_starts_with(parts[0], "data:image/")) {
708
- throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
709
- } else if (!string_ends_with(parts[0], "base64")) {
710
- throw std::runtime_error("image_url.url must be base64 encoded");
711
- } else {
712
- auto base64_data = parts[1];
713
- auto decoded_data = base64_decode(base64_data);
714
- out_files.push_back(decoded_data);
715
- }
716
- }
717
-
718
- // replace this chunk with a marker
719
- p["type"] = "text";
720
- p["text"] = mtmd_default_marker();
721
- p.erase("image_url");
722
-
723
- } else if (type == "input_audio") {
724
- if (!opt.allow_audio) {
725
- throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
726
- }
727
-
728
- json input_audio = json_value(p, "input_audio", json::object());
729
- std::string data = json_value(input_audio, "data", std::string());
730
- std::string format = json_value(input_audio, "format", std::string());
731
- // while we also support flac, we don't allow it here so we matches the OAI spec
732
- if (format != "wav" && format != "mp3") {
733
- throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
734
- }
735
- auto decoded_data = base64_decode(data); // expected to be base64 encoded
736
- out_files.push_back(decoded_data);
737
-
738
- // replace this chunk with a marker
739
- p["type"] = "text";
740
- p["text"] = mtmd_default_marker();
741
- p.erase("input_audio");
742
-
743
- } else if (type != "text") {
744
- throw std::runtime_error("unsupported content[].type");
745
- }
746
- }
747
- }
748
-
749
- common_chat_templates_inputs inputs;
750
- inputs.messages = common_chat_msgs_parse_oaicompat(messages);
- inputs.tools = common_chat_tools_parse_oaicompat(tools);
- inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
- inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
- inputs.grammar = grammar;
- inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
- inputs.use_jinja = opt.use_jinja;
- inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
- inputs.extract_reasoning = opt.reasoning_format != COMMON_REASONING_FORMAT_NONE;
- inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
- if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
- throw std::runtime_error("Cannot use custom grammar constraints with tools.");
- }
-
- // if the assistant message appears at the end of list, we do not add end-of-turn token
- // for ex. this can be useful to modify the reasoning process in reasoning models
- bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
- common_chat_msg last_message;
- if (prefill_assistant_message) {
- last_message = inputs.messages.back();
- inputs.messages.pop_back();
-
- /* sanity check, max one assistant message at the end of the list */
- if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
- throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
- }
-
- inputs.extract_reasoning = false;
- inputs.add_generation_prompt = true;
- }
-
- // Apply chat template to the list of messages
- auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
-
- /* Append assistant prefilled message */
- if (prefill_assistant_message) {
- chat_params.prompt += last_message.content;
- }
-
- llama_params["chat_format"] = static_cast<int>(chat_params.format);
- llama_params["prompt"] = chat_params.prompt;
- if (!chat_params.grammar.empty()) {
- llama_params["grammar"] = chat_params.grammar;
- }
- llama_params["grammar_lazy"] = chat_params.grammar_lazy;
- auto grammar_triggers = json::array();
- for (const auto & trigger : chat_params.grammar_triggers) {
- server_grammar_trigger ct(trigger);
- grammar_triggers.push_back(ct.to_json());
- }
- llama_params["grammar_triggers"] = grammar_triggers;
- llama_params["preserved_tokens"] = chat_params.preserved_tokens;
- for (const auto & stop : chat_params.additional_stops) {
- llama_params["stop"].push_back(stop);
- }
-
- // Handle "n" field
- int n_choices = json_value(body, "n", 1);
- if (n_choices != 1) {
- throw std::runtime_error("Only one completion choice is allowed");
- }
-
- // Handle "logprobs" field
- // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
- if (json_value(body, "logprobs", false)) {
- llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
- } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
- throw std::runtime_error("top_logprobs requires logprobs to be set to true");
- }
-
- // Copy remaining properties to llama_params
- // This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
- // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
- for (const auto & item : body.items()) {
- // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
- if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
- llama_params[item.key()] = item.value();
- }
- }
-
- return llama_params;
- }
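For reference, a minimal standalone sketch (assuming only nlohmann::json, the library behind the json alias used in this file) of a request body that exercises the branches above: "logprobs"/"top_logprobs" feed "n_probs", and keys the parser does not set itself, such as "mirostat", are copied through to llama_params unchanged.

    #include <nlohmann/json.hpp>
    #include <iostream>

    int main() {
        nlohmann::json body = {
            {"model", "llama"},
            {"messages", nlohmann::json::array({
                {{"role", "user"}, {"content", "hello"}}
            })},
            {"logprobs", true},
            {"top_logprobs", 5},   // would become llama_params["n_probs"] = 5
            {"mirostat", 2}        // llama.cpp-specific, passed through as-is
        };
        std::cout << body.dump(2) << std::endl;
        return 0;
    }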
-
- static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) {
- json data = json::array();
- int32_t n_tokens = 0;
- int i = 0;
- for (const auto & elem : embeddings) {
- json embedding_obj;
-
- if (use_base64) {
- const auto& vec = json_value(elem, "embedding", json::array()).get<std::vector<float>>();
- const char* data_ptr = reinterpret_cast<const char*>(vec.data());
- size_t data_size = vec.size() * sizeof(float);
- embedding_obj = {
- {"embedding", base64::encode(data_ptr, data_size)},
- {"index", i++},
- {"object", "embedding"},
- {"encoding_format", "base64"}
- };
- } else {
- embedding_obj = {
- {"embedding", json_value(elem, "embedding", json::array())},
- {"index", i++},
- {"object", "embedding"}
- };
- }
- data.push_back(embedding_obj);
-
- n_tokens += json_value(elem, "tokens_evaluated", 0);
- }
-
- json res = json {
- {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
- {"object", "list"},
- {"usage", json {
- {"prompt_tokens", n_tokens},
- {"total_tokens", n_tokens}
- }},
- {"data", data}
- };
-
- return res;
- }
-
- static json format_response_rerank(
- const json & request,
- const json & ranks,
- bool is_tei_format,
- std::vector<std::string> & texts) {
- json res;
- if (is_tei_format) {
- // TEI response format
- res = json::array();
- bool return_text = json_value(request, "return_text", false);
- for (const auto & rank : ranks) {
- int index = json_value(rank, "index", 0);
- json elem = json{
- {"index", index},
- {"score", json_value(rank, "score", 0.0)},
- };
- if (return_text) {
- elem["text"] = std::move(texts[index]);
- }
- res.push_back(elem);
- }
- } else {
- // Jina response format
- json results = json::array();
- int32_t n_tokens = 0;
- for (const auto & rank : ranks) {
- results.push_back(json{
- {"index", json_value(rank, "index", 0)},
- {"relevance_score", json_value(rank, "score", 0.0)},
- });
-
- n_tokens += json_value(rank, "tokens_evaluated", 0);
- }
-
- res = json{
- {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
- {"object", "list"},
- {"usage", json{
- {"prompt_tokens", n_tokens},
- {"total_tokens", n_tokens}
- }},
- {"results", results}
- };
- }
-
- return res;
- }
-
- static bool is_valid_utf8(const std::string & str) {
- const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
- const unsigned char* end = bytes + str.length();
-
- while (bytes < end) {
- if (*bytes <= 0x7F) {
- // 1-byte sequence (0xxxxxxx)
- bytes++;
- } else if ((*bytes & 0xE0) == 0xC0) {
- // 2-byte sequence (110xxxxx 10xxxxxx)
- if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
- return false;
- bytes += 2;
- } else if ((*bytes & 0xF0) == 0xE0) {
- // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
- if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
- return false;
- bytes += 3;
- } else if ((*bytes & 0xF8) == 0xF0) {
- // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
- if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
- (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
- return false;
- bytes += 4;
- } else {
- // Invalid UTF-8 lead byte
- return false;
- }
- }
-
- return true;
- }
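A minimal sketch of the validator's behavior, assuming is_valid_utf8() above is visible in the same translation unit: complete sequences pass, truncated multi-byte sequences are rejected.

    #include <cassert>
    #include <string>

    // assumes is_valid_utf8(...) from above is in scope
    int main() {
        assert(is_valid_utf8("plain ASCII"));
        assert(is_valid_utf8("\xC3\xA9"));                    // U+00E9, complete 2-byte sequence
        assert(!is_valid_utf8("\xC3"));                       // lead byte without continuation
        assert(!is_valid_utf8(std::string("\xE2\x82", 2)));   // truncated 3-byte sequence
        return 0;
    }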
-
- static json format_tokenizer_response(const json & tokens) {
- return json {
- {"tokens", tokens}
- };
- }
-
- static json format_detokenized_response(const std::string & content) {
- return json {
- {"content", content}
- };
- }
-
- static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) {
- json data = json::array();
- for (const auto & lb : logit_bias) {
- data.push_back(json{
- {"bias", lb.bias},
- {"token", lb.token},
- });
- }
- return data;
- }
-
- static std::string safe_json_to_str(const json & data) {
- return data.dump(-1, ' ', false, json::error_handler_t::replace);
- }
-
- static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
- std::vector<llama_token_data> cur;
- const auto * logits = llama_get_logits_ith(ctx, idx);
-
- const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_model_get_vocab(model);
-
- const int n_vocab = llama_vocab_n_tokens(vocab);
-
- cur.resize(n_vocab);
- for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
- cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
- }
-
- // sort tokens by logits
- std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) {
- return a.logit > b.logit;
- });
-
- // apply softmax
- float max_l = cur[0].logit;
- float cum_sum = 0.0f;
- for (size_t i = 0; i < cur.size(); ++i) {
- float p = expf(cur[i].logit - max_l);
- cur[i].p = p;
- cum_sum += p;
- }
- for (size_t i = 0; i < cur.size(); ++i) {
- cur[i].p /= cum_sum;
- }
-
- return cur;
- }
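A self-contained sketch of the same sort-then-softmax pattern used by get_token_probabilities() above, applied to a plain logit vector; subtracting the maximum logit before exponentiating keeps the computation numerically stable.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <functional>
    #include <vector>

    int main() {
        std::vector<float> logits = {2.0f, 0.5f, -1.0f};
        std::sort(logits.begin(), logits.end(), std::greater<float>());  // highest logit first

        const float max_l = logits[0];           // shift by the max for numerical stability
        std::vector<float> p(logits.size());
        float cum_sum = 0.0f;
        for (size_t i = 0; i < logits.size(); ++i) {
            p[i] = std::exp(logits[i] - max_l);
            cum_sum += p[i];
        }
        for (auto & v : p) {
            v /= cum_sum;                        // probabilities now sum to 1
        }
        for (size_t i = 0; i < p.size(); ++i) {
            std::printf("logit %+5.2f -> p %.3f\n", logits[i], p[i]);
        }
        return 0;
    }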
-
- static bool are_lora_equal(
- const std::vector<common_adapter_lora_info> & l1,
- const std::vector<common_adapter_lora_info> & l2) {
- if (l1.size() != l2.size()) {
- return false;
- }
- for (size_t i = 0; i < l1.size(); ++i) {
- // we don't check lora.path to reduce the time complexity
- if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) {
- return false;
- }
- }
- return true;
- }
-
- // parse lora config from JSON request, returned a copy of lora_base with updated scale
- static std::vector<common_adapter_lora_info> parse_lora_request(
- const std::vector<common_adapter_lora_info> & lora_base,
- const json & data) {
- std::vector<common_adapter_lora_info> lora(lora_base);
- int max_idx = lora.size();
-
- // clear existing value
- for (auto & entry : lora) {
- entry.scale = 0.0f;
- }
-
- // set value
- for (const auto & entry : data) {
- int id = json_value(entry, "id", -1);
- float scale = json_value(entry, "scale", 0.0f);
- if (0 <= id && id < max_idx) {
- lora[id].scale = scale;
- } else {
- throw std::runtime_error("invalid adapter id");
- }
- }
-
- return lora;
- }
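A sketch of the payload shape implied by the parsing loop above, assuming nlohmann::json: an array of {"id", "scale"} objects, where any adapter not listed is left at scale 0.0.

    #include <nlohmann/json.hpp>
    #include <iostream>

    int main() {
        nlohmann::json data = nlohmann::json::array({
            { {"id", 0}, {"scale", 1.0} },   // enable adapter 0 at full strength
            { {"id", 2}, {"scale", 0.5} }    // enable adapter 2 at half strength
        });
        std::cout << data.dump(2) << std::endl;  // adapter 1 would keep scale 0.0
        return 0;
    }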
-
- //
- // utils for interacting with libmtmd
- // (may need to refactor in near future)
- //
-
- /**
- * server_tokens is a helper to manage the input tokens and image for the server.
- * it is made this way to simplify the logic of KV cache management.
- */
- struct server_tokens {
- bool has_mtmd = false;
-
- private: // disallow accessing these members directly, risking out-of-sync
-
- // map a **start** position in tokens to the image chunk
- std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_media;
-
- // list of tokens
- // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
- // a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
- // important: for models using mrope, an image can contain multiple tokens but will use only one **position**
- llama_tokens tokens;
-
- // for ex. with input of 5 text tokens and 2 images:
- // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
- // pos 0 1 2 3 4 5 6 7 8 9
- // map_pos_to_media will contain: {5, img0}, {8, img1}
-
- public:
- server_tokens() = default;
- ~server_tokens() = default;
-
- // Prevent copying
- server_tokens(const server_tokens&) = delete;
- server_tokens& operator=(const server_tokens&) = delete;
-
- // Allow moving (usually implicitly generated if members are movable)
- server_tokens(server_tokens&&) = default;
- server_tokens& operator=(server_tokens&&) = default;
-
- // Allow accessing elements using [] operator
- llama_token operator[](size_t index) { return tokens[index]; }
- const llama_token& operator[](size_t index) const { return tokens[index]; }
-
- server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
- for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
- push_back(mtmd_chunks[i]);
- }
- }
-
- server_tokens(llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
-
- // for debugging
- std::string str() const {
- std::ostringstream oss;
- oss << "tokens: ";
- for (const auto & t : tokens) {
- if (t == LLAMA_TOKEN_NULL) {
- oss << "<embd> ";
- } else {
- oss << t << " ";
- }
- }
- oss << "\n";
- oss << "image pos: ";
- for (const auto & it : map_pos_to_media) {
- oss << it.first << ", ";
- }
- return oss.str();
- }
-
- const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
- auto it = map_pos_to_media.find(pos);
- if (it != map_pos_to_media.end()) {
- return it->second;
- } else {
- throw std::runtime_error("Chunk not found");
- }
- }
-
- void push_back(llama_token tok) {
- if (tok == LLAMA_TOKEN_NULL) {
- throw std::runtime_error("Invalid token");
- }
- tokens.emplace_back(tok);
- }
-
- // will create a copy of the chunk if it contains non-text data
- void push_back(const mtmd_input_chunk * chunk) {
- auto type = mtmd_input_chunk_get_type(chunk);
- if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
- GGML_ASSERT(has_mtmd);
- const int n_pos = mtmd_input_chunk_get_n_pos(chunk);
- llama_pos start_pos = tokens.size();
- for (int i = 0; i < n_pos; ++i) {
- tokens.emplace_back(LLAMA_TOKEN_NULL);
- }
- mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
- map_pos_to_media[start_pos] = std::move(new_chunk);
- } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
- size_t n_tokens;
- auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
- for (size_t i = 0; i < n_tokens; ++i) {
- push_back(text_tokens[i]);
- }
- } else {
- GGML_ABORT("Invalid chunk type");
- }
- }
-
- // for compatibility with context shift and prompt truncation
- void insert(const llama_tokens & inp_tokens) {
- GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
- tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
- }
-
- // for compatibility with speculative decoding, ctx shift, slot save/load
- const llama_tokens & get_text_tokens() const {
- GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
- return tokens;
- }
-
- // for compatibility with speculative decoding
- void set_token(llama_pos pos, llama_token id) {
- GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
- tokens[pos] = id;
- }
-
- size_t size() const {
- return tokens.size();
- }
-
- bool empty() const {
- return tokens.empty();
- }
-
- void clear() {
- tokens.clear();
- }
-
- void keep_first(size_t n) {
- GGML_ASSERT(n <= tokens.size());
- if (has_mtmd) {
- if (n == tokens.size()) {
- return; // nothing to do
- }
- // we throw an error if we try to remove a token in the middle of an image
- // for ex. with input of 5 text tokens and 2 images:
- // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
- // n 1 2 3 4 5 6 7 8 9 10
- // allowed to resize ^ ^
- // disallowed to resize ^ ^ ^
- if (n > 0) {
- llama_token last_token = tokens[n - 1];
- // make sure we never remove tokens in the middle of an image
- if (last_token == LLAMA_TOKEN_NULL) {
- find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
- }
- }
- // remove all image chunks that are not used anymore
- for (auto it = map_pos_to_media.begin(); it != map_pos_to_media.end(); ) {
- llama_pos pos = it->first;
- if (pos >= (llama_pos)n) {
- it = map_pos_to_media.erase(it);
- } else {
- ++it;
- }
- }
- }
- tokens.resize(n);
- }
-
- std::string detokenize(const llama_context * ctx, bool special) const {
- llama_tokens text_tokens;
- text_tokens.reserve(tokens.size());
- for (const auto & t : tokens) {
- if (t != LLAMA_TOKEN_NULL) {
- text_tokens.push_back(t);
- }
- }
- return common_detokenize(ctx, text_tokens, special);
- }
-
- size_t get_common_prefix(const server_tokens & b) const {
- size_t max_idx = std::min(tokens.size(), b.tokens.size());
- for (size_t i = 0; i < max_idx; ++i) {
- auto & ai = tokens[i];
- auto & bi = b.tokens[i];
-
- if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
- GGML_ASSERT(has_mtmd);
- const auto & a_chunk = find_chunk(i);
- const auto & b_chunk = b.find_chunk(i);
- GGML_ASSERT(a_chunk && b_chunk);
- std::string ai_id = mtmd_input_chunk_get_id(a_chunk.get());
- std::string bi_id = mtmd_input_chunk_get_id(b_chunk.get());
- size_t a_pos = mtmd_input_chunk_get_n_pos(a_chunk.get());
- size_t b_pos = mtmd_input_chunk_get_n_pos(b_chunk.get());
- if (ai_id == bi_id && a_pos == b_pos) {
- GGML_ASSERT(a_pos > 0 && "Invalid media chunk"); // should never happen
- i += a_pos - 1; // will be +1 by the for loop
- continue;
- } else {
- return i;
- }
- } else if (ai == bi) {
- continue;
- } else {
- return i;
- }
- }
- return max_idx; // all tokens are equal
- }
-
- // make sure all text tokens are within the vocab range
- bool validate(const struct llama_context * ctx) const {
- const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_model_get_vocab(model);
- const int32_t n_vocab = llama_vocab_n_tokens(vocab);
-
- for (size_t i = 0; i < tokens.size(); ++i) {
- auto & t = tokens[i];
- if (t == LLAMA_TOKEN_NULL) {
- try {
- const auto & chunk = find_chunk(i);
- size_t n_pos = mtmd_input_chunk_get_n_pos(chunk.get());
- i += n_pos - 1; // will be +1 by the for loop
- } catch (const std::exception & e) {
- return false;
- }
- } else if (t < 0 || t >= n_vocab) {
- return false;
- }
- }
- return true;
- }
-
- // encode and decode the image chunk
- int32_t process_chunk(
- llama_context * ctx,
- mtmd_context * mctx,
- llama_pos n_past,
- int32_t seq_id,
- llama_pos & n_pos_out) {
- auto & chunk = find_chunk(n_past);
- const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
- ? "image" : "audio";
- SRV_INF("processing %s...\n", name);
- int32_t n_batch = llama_n_batch(ctx);
- int64_t t0 = ggml_time_ms();
- llama_pos new_n_past = n_past;
- int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
- chunk.get(),
- n_past,
- seq_id,
- n_batch,
- true, // logits last
- &new_n_past);
- SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
- if (result != 0) {
- LOG_ERR("mtmd_helper_eval failed with status %d", result);
- n_pos_out = n_past;
- return result;
- }
- n_pos_out = new_n_past;
- return 0;
- }
- };
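A simplified standalone sketch of the layout documented in the struct above: a token list where media chunks occupy sentinel slots, plus a map from each chunk's start position. Plain int32_t/-1 and std::string stand in here for llama_token/LLAMA_TOKEN_NULL and the mtmd chunk pointers.

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <unordered_map>
    #include <vector>

    int main() {
        // 5 text tokens followed by two media chunks, as in the example comment above
        std::vector<int32_t> tokens = {10, 11, 12, 13, 14, -1, -1, -1, -1, -1};
        std::unordered_map<int32_t, std::string> map_pos_to_media = {
            {5, "img0"},   // img0 occupies positions 5..7
            {8, "img1"},   // img1 occupies positions 8..9
        };
        for (int32_t pos = 0; pos < (int32_t) tokens.size(); ++pos) {
            if (tokens[pos] == -1 && map_pos_to_media.count(pos)) {
                std::printf("pos %d: start of %s\n", (int) pos, map_pos_to_media[pos].c_str());
            }
        }
        return 0;
    }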
-
- // Computes FNV-1a hash of the data
- static std::string fnv_hash(const uint8_t * data, size_t len) {
- const uint64_t fnv_prime = 0x100000001b3ULL;
- uint64_t hash = 0xcbf29ce484222325ULL;
-
- for (size_t i = 0; i < len; ++i) {
- hash ^= data[i];
- hash *= fnv_prime;
- }
- return std::to_string(hash);
- }
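A minimal usage sketch, assuming fnv_hash() above is visible in the same translation unit: the 64-bit FNV-1a offset basis and prime shown above are folded over each input byte, and the result is returned as a decimal string.

    #include <cstdint>
    #include <cstdio>
    #include <string>

    // assumes fnv_hash(...) from above is in scope
    int main() {
        const std::string payload = "hello";
        const std::string id = fnv_hash(reinterpret_cast<const uint8_t *>(payload.data()), payload.size());
        std::printf("fnv1a(\"%s\") = %s\n", payload.c_str(), id.c_str());
        return 0;
    }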