@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -20,21 +20,26 @@
  #include <sstream>
  #include <string>
  #include <vector>
+ #include <memory>

- #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
+ #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"

  using json = nlohmann::ordered_json;

- // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
- enum error_type {
- ERROR_TYPE_INVALID_REQUEST,
- ERROR_TYPE_AUTHENTICATION,
- ERROR_TYPE_SERVER,
- ERROR_TYPE_NOT_FOUND,
- ERROR_TYPE_PERMISSION,
- ERROR_TYPE_UNAVAILABLE, // custom error
- ERROR_TYPE_NOT_SUPPORTED, // custom error
- };
+ #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+ #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+ #define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+ #define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+
+ #define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+ #define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+ #define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+ #define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+ #define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+ #define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+ #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+ #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)

  template <typename T>
  static T json_value(const json & body, const std::string & key, const T & default_value) {
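The new SLT_/SRV_/QUE_ macros prefix every server log line with a fixed-width slice of the calling function's name via the "%12.*s" format. A minimal standalone sketch of that formatting trick (the DEMO_LOG macro and function name below are invented for illustration):

    // "%12.*s" prints at most 12 characters of __func__, padded to width 12,
    // so each log line carries its origin in a fixed-width column.
    #include <cstdio>

    #define DEMO_LOG(fmt, ...) std::printf("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)

    static void update_slots_demo() {
        DEMO_LOG("processing %d tasks\n", 3); // prints: srv update_slots: processing 3 tasks
    }

    int main() {
        update_slots_demo();
        return 0;
    }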
@@ -52,12 +57,274 @@ static T json_value(const json & body, const std::string & key, const T & defaul
  }

  //
- // chat template utils
+ // tokenizer and input processing utils
  //

+ static bool json_is_array_of_numbers(const json & data) {
+ if (data.is_array()) {
+ for (const auto & e : data) {
+ if (!e.is_number_integer()) {
+ return false;
+ }
+ }
+ return true;
+ }
+ return false;
+ }
+
+ // is array having BOTH numbers & strings?
+ static bool json_is_array_of_mixed_numbers_strings(const json & data) {
+ bool seen_string = false;
+ bool seen_number = false;
+ if (data.is_array()) {
+ for (const auto & e : data) {
+ seen_string |= e.is_string();
+ seen_number |= e.is_number_integer();
+ if (seen_number && seen_string) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ /**
+ * this handles 2 cases:
+ * - only string, example: "string"
+ * - mixed string and tokens, example: [12, 34, "string", 56, 78]
+ */
+ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+ // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+ // or the first element of the json_prompt array is a string.
+ llama_tokens prompt_tokens;
+
+ if (json_prompt.is_array()) {
+ bool first = true;
+ for (const auto & p : json_prompt) {
+ if (p.is_string()) {
+ auto s = p.template get<std::string>();
+
+ llama_tokens p;
+ if (first) {
+ p = common_tokenize(ctx, s, add_special, parse_special);
+ first = false;
+ } else {
+ p = common_tokenize(ctx, s, false, parse_special);
+ }
+
+ prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+ } else {
+ if (first) {
+ first = false;
+ }
+
+ prompt_tokens.push_back(p.template get<llama_token>());
+ }
+ }
+ } else {
+ auto s = json_prompt.template get<std::string>();
+ prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
+ }
+
+ return prompt_tokens;
+ }
+
+ /**
+ * break the input "prompt" object into multiple prompt if needed, then tokenize them
+ * this supports these cases:
+ * - "prompt": "string"
+ * - "prompt": [12, 34, 56]
+ * - "prompt": [12, 34, "string", 56, 78]
+ * and multiple prompts (multi-tasks):
+ * - "prompt": ["string1", "string2"]
+ * - "prompt": ["string1", [12, 34, 56]]
+ * - "prompt": [[12, 34, 56], [78, 90, 12]]
+ * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
+ */
+ static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+ std::vector<llama_tokens> result;
+ if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
+ // string or mixed
+ result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special));
+ } else if (json_is_array_of_numbers(json_prompt)) {
+ // array of tokens
+ result.push_back(json_prompt.get<llama_tokens>());
+ } else if (json_prompt.is_array()) {
+ // array of prompts
+ result.reserve(json_prompt.size());
+ for (const auto & p : json_prompt) {
+ if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
+ result.push_back(tokenize_mixed(ctx, p, add_special, parse_special));
+ } else if (json_is_array_of_numbers(p)) {
+ // array of tokens
+ result.push_back(p.get<llama_tokens>());
+ } else {
+ throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
+ }
+ }
+ } else {
+ throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
+ }
+ if (result.empty()) {
+ throw std::runtime_error("\"prompt\" must not be empty");
+ }
+ return result;
+ }
+
+ // return the last index of character that can form a valid string
+ // if the last character is potentially cut in half, return the index before the cut
+ // if validate_utf8(text) == text.size(), then the whole text is valid utf8
+ static size_t validate_utf8(const std::string& text) {
+ size_t len = text.size();
+ if (len == 0) return 0;
+
+ // Check the last few bytes to see if a multi-byte character is cut off
+ for (size_t i = 1; i <= 4 && i <= len; ++i) {
+ unsigned char c = text[len - i];
+ // Check for start of a multi-byte sequence from the end
+ if ((c & 0xE0) == 0xC0) {
+ // 2-byte character start: 110xxxxx
+ // Needs at least 2 bytes
+ if (i < 2) return len - i;
+ } else if ((c & 0xF0) == 0xE0) {
+ // 3-byte character start: 1110xxxx
+ // Needs at least 3 bytes
+ if (i < 3) return len - i;
+ } else if ((c & 0xF8) == 0xF0) {
+ // 4-byte character start: 11110xxx
+ // Needs at least 4 bytes
+ if (i < 4) return len - i;
+ }
+ }
+
+ // If no cut-off multi-byte character is found, return full length
+ return len;
+ }
+
+ //
+ // template utils
+ //
+
+ // format rerank task: [BOS]query[EOS][SEP]doc[EOS]
+ static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) {
+ llama_tokens result;
+ result.reserve(doc.size() + query.size() + 4);
+ result.push_back(llama_token_bos(model));
+ result.insert(result.end(), query.begin(), query.end());
+ result.push_back(llama_token_eos(model));
+ result.push_back(llama_token_sep(model));
+ result.insert(result.end(), doc.begin(), doc.end());
+ result.push_back(llama_token_eos(model));
+ return result;
+ }
+
+ // format infill task
+ static llama_tokens format_infill(
+ const llama_context * ctx,
+ const json & input_prefix,
+ const json & input_suffix,
+ const json & input_extra,
+ const int n_batch,
+ const int n_predict,
+ const int n_ctx,
+ const bool spm_infill,
+ const llama_tokens & tokens_prompt
+ ) {
+ // TODO: optimize this block by reducing memory allocations and movement
+
+ // use FIM repo-level pattern:
+ // ref: https://arxiv.org/pdf/2409.12186
+ //
+ // [FIM_REP]myproject
+ // [FIM_SEP]filename0
+ // extra chunk 0
+ // [FIM_SEP]filename1
+ // extra chunk 1
+ // ...
+ // [FIM_SEP]filename
+ // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
+ //
+ llama_tokens extra_tokens;
+ extra_tokens.reserve(n_ctx);
+
+ auto model = llama_get_model(ctx);
+ auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false);
+ auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false);
+
+ if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
+ // TODO: make project name an input
+ static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false);
+
+ extra_tokens.push_back(llama_token_fim_rep(model));
+ extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
+ }
+ for (const auto & chunk : input_extra) {
+ // { "text": string, "filename": string }
+ const std::string text = json_value(chunk, "text", std::string());
+ const std::string filename = json_value(chunk, "filename", std::string("tmp"));
+
+ if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+ const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false);
+
+ extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+ extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+ } else {
+ // chunk separator in binary form to avoid confusing the AI
+ static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
+ static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false);
+
+ extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
+ }
+
+ const auto chunk_tokens = common_tokenize(ctx, text, false, false);
+ extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
+ }
+
+ if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+ // TODO: current filename
+ static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false);
+
+ extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+ extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+ }
+
+ // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+ const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4));
+ const int n_suffix_take = std::min<int>(tokens_suffix.size(), std::max<int>(0, (n_batch/4) - (2 + tokens_prompt.size())));
+
+ SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));
+
+ // fill the rest of the context with extra chunks
+ const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
+
+ tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
+ tokens_suffix.resize(n_suffix_take);
+
+ tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
+ tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
+ tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
+
+ auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
+ auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
+
+ if (llama_add_bos_token(model)) {
+ embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+ }
+
+ SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
+
+ // put the extra context before the FIM prefix
+ embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
+
+ embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+ embd_inp.push_back(llama_token_fim_mid(model));
+
+ return embd_inp;
+ }
+
  // Format given chat. If tmpl is empty, we take the template from model metadata
  inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
- std::vector<llama_chat_msg> chat;
+ std::vector<common_chat_msg> chat;

  for (size_t i = 0; i < messages.size(); ++i) {
  const auto & curr_msg = messages[i];
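Among the helpers added above, validate_utf8 protects streamed partial output from ending in the middle of a multi-byte UTF-8 character. A minimal standalone sketch of the same trailing-byte check, independent of llama.cpp (the function name and sample strings below are invented):

    // Scan up to the last 4 bytes for a multi-byte lead byte; if the sequence
    // it starts is incomplete, report the length up to (but excluding) it.
    #include <cstdio>
    #include <string>

    static size_t utf8_safe_length(const std::string & text) {
        const size_t len = text.size();
        for (size_t i = 1; i <= 4 && i <= len; ++i) {
            const unsigned char c = text[len - i];
            if ((c & 0xE0) == 0xC0 && i < 2) return len - i; // 2-byte lead, continuation missing
            if ((c & 0xF0) == 0xE0 && i < 3) return len - i; // 3-byte lead, bytes missing
            if ((c & 0xF8) == 0xF0 && i < 4) return len - i; // 4-byte lead, bytes missing
        }
        return len; // nothing is cut off
    }

    int main() {
        const std::string full = "caf\xC3\xA9"; // "café", complete
        const std::string cut  = "caf\xC3";     // truncated right after the lead byte
        std::printf("%zu of %zu bytes safe\n", utf8_safe_length(full), full.size()); // 5 of 5
        std::printf("%zu of %zu bytes safe\n", utf8_safe_length(cut),  cut.size());  // 3 of 4
        return 0;
    }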
@@ -84,12 +351,25 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
  chat.push_back({role, content});
  }

- const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
+ const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
  LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());

  return formatted_chat;
  }

+ static std::string llama_get_chat_template(const struct llama_model * model) {
+ std::string template_key = "tokenizer.chat_template";
+ // call with NULL buffer to get the total size of the string
+ int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
+ if (res < 2) {
+ return "";
+ } else {
+ std::vector<char> model_template(res + 1, 0);
+ llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+ return std::string(model_template.data(), model_template.size() - 1);
+ }
+ }
+
  //
  // base64 utils (TODO: move to common in the future)
  //
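The new llama_get_chat_template reads the tokenizer.chat_template metadata with the classic size-then-fill pattern: one call with a null buffer to learn the length, then a second call to copy the string. A standalone sketch of the same pattern, using snprintf as a stand-in for llama_model_meta_val_str (the format string and value are invented):

    // First call measures, second call fills a buffer sized from the first result.
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        const int value = 42;
        const int needed = std::snprintf(nullptr, 0, "template-for-%d", value); // measure only
        std::vector<char> buf(needed + 1, 0);                                   // +1 for the NUL
        std::snprintf(buf.data(), buf.size(), "template-for-%d", value);        // fill
        const std::string result(buf.data(), needed);
        std::printf("%s (%d chars)\n", result.c_str(), needed);
        return 0;
    }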
@@ -182,20 +462,6 @@ static std::string gen_chatcmplid() {
  // other common utils
  //

- static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
- size_t i;
- for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
-
- return i;
- }
-
- static size_t common_part(const std::string & a, const std::string & b) {
- size_t i;
- for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
-
- return i;
- }
-
  static bool ends_with(const std::string & str, const std::string & suffix) {
  return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
  }
@@ -216,24 +482,12 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
  return std::string::npos;
  }

- static bool json_is_array_of_numbers(const json & data) {
- if (data.is_array()) {
- for (const auto & e : data) {
- if (!e.is_number()) {
- return false;
- }
- }
- return true;
- }
- return false;
- }
-
  // TODO: reuse llama_detokenize
  template <class Iter>
  static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
  std::string ret;
  for (; begin != end; ++begin) {
- ret += llama_token_to_piece(ctx, *begin);
+ ret += common_token_to_piece(ctx, *begin);
  }

  return ret;
@@ -241,7 +495,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {

  // format incomplete utf-8 multibyte character for output
  static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
- std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+ std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);

  // if the size is 1 and first bit is 1, meaning it's a partial character
  // (size > 1 meaning it's already a known token)
@@ -255,48 +509,11 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
  return out;
  }

- struct completion_token_output {
- llama_token tok;
- std::string text_to_send;
-
- struct token_prob {
- llama_token tok;
- float prob;
- };
-
- std::vector<token_prob> probs;
- };
-
- // convert a vector of completion_token_output to json
- static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
- json out = json::array();
-
- for (const auto & prob : probs) {
- json probs_for_token = json::array();
-
- for (const auto & p : prob.probs) {
- const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
- probs_for_token.push_back(json {
- {"tok_str", tok_str},
- {"prob", p.prob},
- });
- }
-
- const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
- out.push_back(json {
- {"content", tok_str},
- {"probs", probs_for_token},
- });
- }
-
- return out;
- }
-
  static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
  const std::string str =
  std::string(event) + ": " +
  data.dump(-1, ' ', false, json::error_handler_t::replace) +
- "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
+ "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).

  LOG_DBG("data stream, to_send: %s", str.c_str());

@@ -313,8 +530,6 @@ static json oaicompat_completion_params_parse(
  const std::string & chat_template) {
  json llama_params;

- llama_params["__oaicompat"] = true;
-
  // Apply chat template to the list of messages
  llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));

@@ -347,9 +562,9 @@ static json oaicompat_completion_params_parse(

  // Handle "logprobs" field
  // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
- if (body.contains("logprobs")) {
+ if (json_value(body, "logprobs", false)) {
  llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
- } else if (body.contains("top_logprobs")) {
+ } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
  throw std::runtime_error("top_logprobs requires logprobs to be set to true");
  }

@@ -362,7 +577,7 @@ static json oaicompat_completion_params_parse(
  }

  // Copy remaining properties to llama_params
- // This allows user to use llama.cpp-specific params like "mirostat", "tfs_z",... via OAI endpoint.
+ // This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
  // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
  for (const auto & item : body.items()) {
  // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
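Two hunks up, the logprobs handling now checks the field's boolean value rather than its mere presence, and an explicit null top_logprobs no longer triggers the error. A small standalone sketch of that distinction using nlohmann::json directly (the request body is invented):

    // contains("logprobs") is true even for {"logprobs": false},
    // while reading the value treats false the same as an absent field.
    #include <cstdio>
    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    int main() {
        const json body = json::parse(R"({"logprobs": false, "top_logprobs": null})");

        const bool present = body.contains("logprobs");           // true  (old check)
        const bool enabled = body.value("logprobs", false);       // false (new check)
        const bool top_set = body.contains("top_logprobs") &&
                             !body.at("top_logprobs").is_null();  // false (null is tolerated)

        std::printf("present=%d enabled=%d top_set=%d\n", present, enabled, top_set);
        return 0;
    }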
@@ -374,157 +589,9 @@ static json oaicompat_completion_params_parse(
  return llama_params;
  }

- static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
- bool stopped_word = result.count("stopped_word") != 0;
- bool stopped_eos = json_value(result, "stopped_eos", false);
- int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
- int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
- std::string content = json_value(result, "content", std::string(""));
-
- std::string finish_reason = "length";
- if (stopped_word || stopped_eos) {
- finish_reason = "stop";
- }
-
- json choices =
- streaming ? json::array({json{{"finish_reason", finish_reason},
- {"index", 0},
- {"delta", json::object()}}})
- : json::array({json{{"finish_reason", finish_reason},
- {"index", 0},
- {"message", json{{"content", content},
- {"role", "assistant"}}}}});
-
- std::time_t t = std::time(0);
-
- json res = json {
- {"choices", choices},
- {"created", t},
- {"model",
- json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
- {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
- {"usage", json {
- {"completion_tokens", num_tokens_predicted},
- {"prompt_tokens", num_prompt_tokens},
- {"total_tokens", num_tokens_predicted + num_prompt_tokens}
- }},
- {"id", completion_id}
- };
-
- // extra fields for debugging purposes
- if (verbose) {
- res["__verbose"] = result;
- }
-
- if (result.contains("completion_probabilities")) {
- res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
- }
-
- return res;
- }
-
- // return value is vector as there is one case where we might need to generate two responses
- static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
- if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
- return std::vector<json>({result});
- }
-
- bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
- std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-
- bool stopped_word = json_value(result, "stopped_word", false);
- bool stopped_eos = json_value(result, "stopped_eos", false);
- bool stopped_limit = json_value(result, "stopped_limit", false);
- std::string content = json_value(result, "content", std::string(""));
-
- std::string finish_reason;
- if (stopped_word || stopped_eos) {
- finish_reason = "stop";
- }
- if (stopped_limit) {
- finish_reason = "length";
- }
-
- std::time_t t = std::time(0);
-
- json choices;
-
- if (!finish_reason.empty()) {
- choices = json::array({json{{"finish_reason", finish_reason},
- {"index", 0},
- {"delta", json::object()}}});
- } else {
- if (first) {
- if (content.empty()) {
- choices = json::array({json{{"finish_reason", nullptr},
- {"index", 0},
- {"delta", json{{"role", "assistant"}}}}});
- } else {
- // We have to send this as two updates to conform to openai behavior
- json initial_ret = json{{"choices", json::array({json{
- {"finish_reason", nullptr},
- {"index", 0},
- {"delta", json{
- {"role", "assistant"}
- }}}})},
- {"created", t},
- {"id", completion_id},
- {"model", modelname},
- {"object", "chat.completion.chunk"}};
-
- json second_ret = json{
- {"choices", json::array({json{{"finish_reason", nullptr},
- {"index", 0},
- {"delta", json{
- {"content", content}}}
- }})},
- {"created", t},
- {"id", completion_id},
- {"model", modelname},
- {"object", "chat.completion.chunk"}};
-
- return std::vector<json>({initial_ret, second_ret});
- }
- } else {
- // Some idiosyncrasy in task processing logic makes several trailing calls
- // with empty content, we ignore these at the calee site.
- if (content.empty()) {
- return std::vector<json>({json::object()});
- }
-
- choices = json::array({json{
- {"finish_reason", nullptr},
- {"index", 0},
- {"delta",
- json{
- {"content", content},
- }},
- }});
- }
- }
-
- json ret = json {
- {"choices", choices},
- {"created", t},
- {"id", completion_id},
- {"model", modelname},
- {"object", "chat.completion.chunk"}
- };
- if (!finish_reason.empty()) {
- int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
- int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
- ret.push_back({"usage", json {
- {"completion_tokens", num_tokens_predicted},
- {"prompt_tokens", num_prompt_tokens},
- {"total_tokens", num_tokens_predicted + num_prompt_tokens}
- }});
- }
-
- return std::vector<json>({ret});
- }
-
  static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
  json data = json::array();
+ int32_t n_tokens = 0;
  int i = 0;
  for (const auto & elem : embeddings) {
  data.push_back(json{
@@ -532,14 +599,16 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
  {"index", i++},
  {"object", "embedding"}
  });
+
+ n_tokens += json_value(elem, "tokens_evaluated", 0);
  }

  json res = json {
  {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
  {"object", "list"},
- {"usage", json { // TODO: fill
- {"prompt_tokens", 0},
- {"total_tokens", 0}
+ {"usage", json {
+ {"prompt_tokens", n_tokens},
+ {"total_tokens", n_tokens}
  }},
  {"data", data}
  };
@@ -549,20 +618,23 @@ static json format_embeddings_response_oaicompat(const json & request, const jso

  static json format_response_rerank(const json & request, const json & ranks) {
  json data = json::array();
+ int32_t n_tokens = 0;
  int i = 0;
  for (const auto & rank : ranks) {
  data.push_back(json{
  {"index", i++},
  {"relevance_score", json_value(rank, "score", 0.0)},
  });
+
+ n_tokens += json_value(rank, "tokens_evaluated", 0);
  }

  json res = json {
  {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
  {"object", "list"},
- {"usage", json { // TODO: fill
- {"prompt_tokens", 0},
- {"total_tokens", 0}
+ {"usage", json {
+ {"prompt_tokens", n_tokens},
+ {"total_tokens", n_tokens}
  }},
  {"results", data}
  };
@@ -615,42 +687,47 @@ static json format_detokenized_response(const std::string & content) {
  };
  }

- static json format_error_response(const std::string & message, const enum error_type type) {
- std::string type_str;
- int code = 500;
- switch (type) {
- case ERROR_TYPE_INVALID_REQUEST:
- type_str = "invalid_request_error";
- code = 400;
- break;
- case ERROR_TYPE_AUTHENTICATION:
- type_str = "authentication_error";
- code = 401;
- break;
- case ERROR_TYPE_NOT_FOUND:
- type_str = "not_found_error";
- code = 404;
- break;
- case ERROR_TYPE_SERVER:
- type_str = "server_error";
- code = 500;
- break;
- case ERROR_TYPE_PERMISSION:
- type_str = "permission_error";
- code = 403;
- break;
- case ERROR_TYPE_NOT_SUPPORTED:
- type_str = "not_supported_error";
- code = 501;
- break;
- case ERROR_TYPE_UNAVAILABLE:
- type_str = "unavailable_error";
- code = 503;
- break;
+ static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) {
+ json data = json::array();
+ for (const auto & lb : logit_bias) {
+ data.push_back(json{
+ {"bias", lb.bias},
+ {"token", lb.token},
+ });
  }
- return json {
- {"code", code},
- {"message", message},
- {"type", type_str},
- };
+ return data;
+ }
+
+ static std::string safe_json_to_str(json data) {
+ return data.dump(-1, ' ', false, json::error_handler_t::replace);
+ }
+
+ static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
+ std::vector<llama_token_data> cur;
+ const auto * logits = llama_get_logits_ith(ctx, idx);
+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+ cur.resize(n_vocab);
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+ cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+ }
+
+ // sort tokens by logits
+ std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit > b.logit;
+ });
+
+ // apply softmax
+ float max_l = cur[0].logit;
+ float cum_sum = 0.0f;
+ for (size_t i = 0; i < cur.size(); ++i) {
+ float p = expf(cur[i].logit - max_l);
+ cur[i].p = p;
+ cum_sum += p;
+ }
+ for (size_t i = 0; i < cur.size(); ++i) {
+ cur[i].p /= cum_sum;
+ }
+
+ return cur;
  }
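The new get_token_probabilities turns the raw logits for one position into a full, sorted probability distribution: sort descending, then apply a softmax shifted by the maximum logit so the exponentials cannot overflow. A standalone sketch of that computation over a made-up logit array, with no llama.cpp types (the struct and values below are illustrative):

    // Sort by logit (highest first), then softmax with the max subtracted.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    struct token_prob {
        int   id;
        float logit;
        float p;
    };

    int main() {
        std::vector<token_prob> cur = {
            {0, 1.5f, 0.0f}, {1, 3.0f, 0.0f}, {2, -0.5f, 0.0f}, {3, 2.0f, 0.0f},
        };

        std::sort(cur.begin(), cur.end(), [](const token_prob & a, const token_prob & b) {
            return a.logit > b.logit;
        });

        const float max_l = cur[0].logit; // largest logit after the sort
        float cum_sum = 0.0f;
        for (auto & t : cur) {
            t.p = std::exp(t.logit - max_l); // never exceeds 1, so no overflow
            cum_sum += t.p;
        }
        for (auto & t : cur) {
            t.p /= cum_sum; // normalize so the probabilities sum to 1
        }

        for (const auto & t : cur) {
            std::printf("token %d: logit %.2f -> p %.3f\n", t.id, t.logit, t.p);
        }
        return 0;
    }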