@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/server/utils.hpp

@@ -1,20 +1,45 @@
  #pragma once

- #include "llama.h"
  #include "common.h"
+ #include "log.h"
+ #include "llama.h"
+
+ #ifndef NDEBUG
+ // crash the server in debug mode, otherwise send an http 500 error
+ #define CPPHTTPLIB_NO_EXCEPTIONS 1
+ #endif
+ // increase max payload length to allow use of larger context size
+ #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+ #include "httplib.h"

  // Change JSON_ASSERT from assert() to GGML_ASSERT:
  #define JSON_ASSERT GGML_ASSERT
  #include "json.hpp"

+ #include <random>
+ #include <sstream>
  #include <string>
  #include <vector>
- #include <sstream>
- #include <random>

  #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

  using json = nlohmann::ordered_json;
+ using llama_tokens = std::vector<llama_token>;
+
+ #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+ #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+ #define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+ #define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+
+ #define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+ #define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+ #define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+ #define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+ #define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+ #define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+ #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+ #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)

  // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
  enum error_type {
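Note: the new SLT_*/SRV_*/QUE_* macros are printf-style wrappers over the common log.h API (LOG_INF/LOG_WRN/LOG_ERR/LOG_DBG); they prefix every line with the calling function name and, for the slot variants, the slot and task ids. A minimal usage sketch, assuming a server_slot-like struct with `id` and `id_task` fields (the surrounding server types are not part of this hunk):

    // hypothetical illustration only
    struct server_slot { int id; int id_task; };

    static void example(const server_slot & slot) {
        SRV_INF("listening on port %d\n", 8080);             // logs with a "srv" prefix and the calling function
        SLT_DBG(slot, "processing %d prompt tokens\n", 42);  // adds "id ... | task ..." from the slot
        QUE_WRN("%d tasks pending\n", 3);
    }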
@@ -27,32 +52,6 @@ enum error_type {
      ERROR_TYPE_NOT_SUPPORTED, // custom error
  };

- extern bool server_verbose;
- extern bool server_log_json;
-
- #ifndef SERVER_VERBOSE
- #define SERVER_VERBOSE 1
- #endif
-
- #if SERVER_VERBOSE != 1
- #define LOG_VERBOSE(MSG, ...)
- #else
- #define LOG_VERBOSE(MSG, ...)                                       \
-     do                                                              \
-     {                                                               \
-         if (server_verbose)                                         \
-         {                                                           \
-             server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
-         }                                                           \
-     } while (0)
- #endif
-
- #define LOG_ERROR( MSG, ...) server_log("ERR",  __func__, __LINE__, MSG, __VA_ARGS__)
- #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
- #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
-
- static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
-
  template <typename T>
  static T json_value(const json & body, const std::string & key, const T & default_value) {
      // Fallback null to default value
@@ -60,9 +59,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul
          try {
              return body.at(key);
          } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
-             std::stringstream ss;
-             ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
-             LOG_WARNING(ss.str().c_str(), body);
+             LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
              return default_value;
          }
      } else {
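Note: json_value() is the tolerant accessor used throughout the server: it returns the caller-supplied default when the key is missing, null, or of the wrong type (the wrong-type case now logs through LOG_WRN instead of the removed LOG_WARNING). A small illustrative sketch, with made-up field names:

    json body = json::parse(R"json({"n_predict": 128, "temperature": "hot"})json");

    int   n_predict   = json_value(body, "n_predict",   -1);    // 128
    float temperature = json_value(body, "temperature", 0.8f);  // wrong type -> warning, returns 0.8f
    bool  stream      = json_value(body, "stream",      false); // missing    -> false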
@@ -70,55 +67,241 @@ static T json_value(const json & body, const std::string & key, const T & defaul
      }
  }

- static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
-     std::stringstream ss_tid;
-     ss_tid << std::this_thread::get_id();
-     json log = json{
-         {"tid", ss_tid.str()},
-         {"timestamp", time(nullptr)},
-     };
+ //
+ // tokenizer and input processing utils
+ //

-     if (server_log_json) {
-         log.merge_patch({
-             {"level", level},
-             {"function", function},
-             {"line", line},
-             {"msg", message},
-         });
+ static bool json_is_array_of_numbers(const json & data) {
+     if (data.is_array()) {
+         for (const auto & e : data) {
+             if (!e.is_number_integer()) {
+                 return false;
+             }
+         }
+         return true;
+     }
+     return false;
+ }

-         if (!extra.empty()) {
-             log.merge_patch(extra);
+ // is array having BOTH numbers & strings?
+ static bool json_is_array_of_mixed_numbers_strings(const json & data) {
+     bool seen_string = false;
+     bool seen_number = false;
+     if (data.is_array()) {
+         for (const auto & e : data) {
+             seen_string |= e.is_string();
+             seen_number |= e.is_number_integer();
+             if (seen_number && seen_string) {
+                 return true;
+             }
          }
+     }
+     return false;
+ }

-         printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
-     } else {
-         char buf[1024];
-         snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
+ /**
+  * this handles 2 cases:
+  * - only string, example: "string"
+  * - mixed string and tokens, example: [12, 34, "string", 56, 78]
+  */
+ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+     // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+     // or the first element of the json_prompt array is a string.
+     llama_tokens prompt_tokens;
+
+     if (json_prompt.is_array()) {
+         bool first = true;
+         for (const auto & p : json_prompt) {
+             if (p.is_string()) {
+                 auto s = p.template get<std::string>();
+
+                 llama_tokens p;
+                 if (first) {
+                     p = common_tokenize(ctx, s, add_special, parse_special);
+                     first = false;
+                 } else {
+                     p = common_tokenize(ctx, s, false, parse_special);
+                 }

-         if (!extra.empty()) {
-             log.merge_patch(extra);
-         }
-         std::stringstream ss;
-         ss << buf << " |";
-         for (const auto & el : log.items())
-         {
-             const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
-             ss << " " << el.key() << "=" << value;
+                 prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+             } else {
+                 if (first) {
+                     first = false;
+                 }
+
+                 prompt_tokens.push_back(p.template get<llama_token>());
+             }
          }
+     } else {
+         auto s = json_prompt.template get<std::string>();
+         prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
+     }
+
+     return prompt_tokens;
+ }

-         const std::string str = ss.str();
-         printf("%.*s\n", (int)str.size(), str.data());
+ /**
+  * break the input "prompt" object into multiple prompt if needed, then tokenize them
+  * this supports these cases:
+  * - "prompt": "string"
+  * - "prompt": [12, 34, 56]
+  * - "prompt": [12, 34, "string", 56, 78]
+  * and multiple prompts (multi-tasks):
+  * - "prompt": ["string1", "string2"]
+  * - "prompt": ["string1", [12, 34, 56]]
+  * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
+  */
+ static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+     std::vector<llama_tokens> result;
+     if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
+         // string or mixed
+         result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special));
+     } else if (json_is_array_of_numbers(json_prompt)) {
+         // array of tokens
+         result.push_back(json_prompt.get<llama_tokens>());
+     } else if (json_prompt.is_array()) {
+         // array of prompts
+         result.reserve(json_prompt.size());
+         for (const auto & p : json_prompt) {
+             if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
+                 result.push_back(tokenize_mixed(ctx, p, add_special, parse_special));
+             } else if (json_is_array_of_numbers(p)) {
+                 // array of tokens
+                 result.push_back(p.get<llama_tokens>());
+             } else {
+                 throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
+             }
+         }
+     } else {
+         throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
      }
-     fflush(stdout);
+     return result;
  }
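Note: tokenize_input_prompts() returns one token vector per prompt and accepts exactly the shapes listed in the doc comment above. A sketch of the accepted inputs (`ctx` is assumed to be an initialized llama_context *; values are illustrative):

    // single prompt: a string, a token array, or a mixed string/token array -> one entry in the result
    auto r1 = tokenize_input_prompts(ctx, json("Hello world"),                          true, true);
    auto r2 = tokenize_input_prompts(ctx, json::parse("[12, 34, 56]"),                  true, true);
    auto r3 = tokenize_input_prompts(ctx, json::parse(R"json([12, "world", 34])json"),  true, true);

    // multiple prompts (multi-task): one entry per element
    auto r4 = tokenize_input_prompts(ctx, json::parse(R"json(["first", [12, 34, 56]])json"), true, true);

    // anything else (e.g. a JSON object) throws std::runtime_error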

  //
- // chat template utils
+ // template utils
  //

+ // format rerank task: [BOS]query[EOS][SEP]doc[EOS]
+ static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) {
+     llama_tokens result;
+     result.reserve(doc.size() + query.size() + 4);
+     result.push_back(llama_token_bos(model));
+     result.insert(result.end(), query.begin(), query.end());
+     result.push_back(llama_token_eos(model));
+     result.push_back(llama_token_sep(model));
+     result.insert(result.end(), doc.begin(), doc.end());
+     result.push_back(llama_token_eos(model));
+     return result;
+ }
+
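Note: format_rerank() lays the tokens out exactly as its comment says: [BOS]query[EOS][SEP]doc[EOS]. A tiny sketch with placeholder token ids (`model` is assumed to be a loaded llama_model *):

    llama_tokens query = {101, 102};
    llama_tokens doc   = {201, 202, 203};
    llama_tokens inp   = format_rerank(model, query, doc);
    // inp = [BOS] 101 102 [EOS] [SEP] 201 202 203 [EOS]  ->  inp.size() == 2 + 3 + 4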
+ // format infill task
+ static llama_tokens format_infill(
+         const llama_context * ctx,
+         const json & input_prefix,
+         const json & input_suffix,
+         const json & input_extra,
+         const int n_batch,
+         const int n_predict,
+         const int n_ctx,
+         const bool spm_infill,
+         const llama_tokens & tokens_prompt
+     ) {
+     // TODO: optimize this block by reducing memory allocations and movement
+
+     // use FIM repo-level pattern:
+     // ref: https://arxiv.org/pdf/2409.12186
+     //
+     // [FIM_REP]myproject
+     // [FIM_SEP]filename0
+     // extra chunk 0
+     // [FIM_SEP]filename1
+     // extra chunk 1
+     // ...
+     // [FIM_SEP]filename
+     // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
+     //
+     llama_tokens extra_tokens;
+     extra_tokens.reserve(n_ctx);
+
+     auto model = llama_get_model(ctx);
+     auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false);
+     auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false);
+
+     if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
+         // TODO: make project name an input
+         static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false);
+
+         extra_tokens.push_back(llama_token_fim_rep(model));
+         extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
+     }
+     for (const auto & chunk : input_extra) {
+         // { "text": string, "filename": string }
+         const std::string text = json_value(chunk, "text", std::string());
+         const std::string filename = json_value(chunk, "filename", std::string("tmp"));
+
+         if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+             const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false);
+
+             extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+             extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+         } else {
+             // chunk separator in binary form to avoid confusing the AI
+             static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
+             static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false);
+
+             extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
+         }
+
+         const auto chunk_tokens = common_tokenize(ctx, text, false, false);
+         extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
+     }
+
+     if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+         // TODO: current filename
+         static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false);
+
+         extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+         extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+     }
+
+     // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+     const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4));
+     const int n_suffix_take = std::min<int>(tokens_suffix.size(), std::max<int>(0, (n_batch/4) - (2 + tokens_prompt.size())));
+
+     SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));
+
+     // fill the rest of the context with extra chunks
+     const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
+
+     tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
+     tokens_suffix.resize(n_suffix_take);
+
+     tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
+     tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
+     tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
+
+     auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
+     auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
+
+     if (llama_add_bos_token(model)) {
+         embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+     }
+
+     SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
+
+     // put the extra context before the FIM prefix
+     embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
+
+     embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+     embd_inp.push_back(llama_token_fim_mid(model));
+
+     return embd_inp;
+ }
+
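Note: format_infill() is driven by the infill request fields visible above: `input_prefix`, `input_suffix`, and the optional `input_extra` array of { "text", "filename" } chunks that feed the repo-level FIM pattern. A hypothetical request fragment that would exercise that path (field names taken from the code; everything else about the request is outside this hunk):

    json body = json::parse(R"json({
        "input_prefix": "def helloworld():\n    print(\"hell",
        "input_suffix": "\n\nprint(\"goodbye\")\n",
        "input_extra": [
            { "filename": "utils.py", "text": "def add(a, b):\n    return a + b\n" }
        ]
    })json");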
  // Format given chat. If tmpl is empty, we take the template from model metadata
  inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-     std::vector<llama_chat_msg> chat;
+     std::vector<common_chat_msg> chat;

      for (size_t i = 0; i < messages.size(); ++i) {
          const auto & curr_msg = messages[i];
@@ -145,11 +328,25 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
          chat.push_back({role, content});
      }

-     auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
-     LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
+     const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
+     LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
+
      return formatted_chat;
  }

+ static std::string llama_get_chat_template(const struct llama_model * model) {
+     std::string template_key = "tokenizer.chat_template";
+     // call with NULL buffer to get the total size of the string
+     int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
+     if (res < 0) {
+         return "";
+     } else {
+         std::vector<char> model_template(res, 0);
+         llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+         return std::string(model_template.data(), model_template.size());
+     }
+ }
+
  //
  // base64 utils (TODO: move to common in the future)
  //
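Note: llama_get_chat_template() reads the GGUF `tokenizer.chat_template` metadata with the usual two-call pattern (first call with a NULL buffer to size the result, second call to fill it), and format_chat() now renders the OAI-style messages through common_chat_apply_template. A minimal sketch of the two together (`model` is assumed to be a loaded llama_model *):

    const std::string tmpl = llama_get_chat_template(model);   // "" if the model has no embedded template

    std::vector<json> messages = {
        json{{"role", "system"}, {"content", "You are a helpful assistant."}},
        json{{"role", "user"},   {"content", "Hello!"}},
    };

    const std::string prompt = format_chat(model, tmpl, messages);  // empty tmpl falls back to the model's template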
@@ -235,28 +432,67 @@ static std::string random_string() {
  }

  static std::string gen_chatcmplid() {
-     std::stringstream chatcmplid;
-     chatcmplid << "chatcmpl-" << random_string();
-
-     return chatcmplid.str();
+     return "chatcmpl-" + random_string();
  }

  //
  // other common utils
  //

- static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
+ static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
      size_t i;
      for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}

      return i;
  }

- static size_t common_part(const std::string & a, const std::string & b) {
-     size_t i;
-     for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+ static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
+     // check for empty sequences
+     if (a.empty() || b.empty()) {
+         return 0;
+     }

-     return i;
+     // get the lengths of the input sequences
+     size_t a_len = a.size();
+     size_t b_len = b.size();
+
+     // initialize the maximum length of the longest common subsequence (LCS)
+     size_t max_length = 0;
+
+     // use two rows instead of a 2D matrix to optimize space
+     std::vector<size_t> prev_row(b_len + 1, 0);
+     std::vector<size_t> curr_row(b_len + 1, 0);
+
+     // iterate through the elements of a
+     for (size_t i = 1; i <= a_len; i++) {
+         // iterate through the elements of b
+         for (size_t j = 1; j <= b_len; j++) {
+             // if elements at the current positions match
+             if (a[i - 1] == b[j - 1]) {
+                 // if it's the first element of either sequences, set LCS length to 1
+                 if (i == 1 || j == 1) {
+                     curr_row[j] = 1;
+                 } else {
+                     // increment LCS length by 1 compared to the previous element
+                     curr_row[j] = prev_row[j - 1] + 1;
+                 }
+
+                 // update max_length if necessary
+                 if (curr_row[j] > max_length) {
+                     max_length = curr_row[j];
+                 }
+             } else {
+                 // reset LCS length if elements don't match
+                 curr_row[j] = 0;
+             }
+         }
+
+         // update the previous row for the next iteration
+         prev_row = curr_row;
+     }
+
+     // return the maximum length of the LCS
+     return max_length;
  }

  static bool ends_with(const std::string & str, const std::string & suffix) {
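Note: because curr_row[j] is reset to 0 whenever a[i - 1] != b[j - 1], longest_common_subsequence() as written measures the longest run of consecutive matching tokens, using only O(b_len) memory. A tiny worked example:

    llama_tokens a = {10, 20, 30, 40, 50};
    llama_tokens b = {99, 20, 30, 40, 77};

    size_t n = longest_common_subsequence(a, b);   // n == 3: the contiguous run {20, 30, 40}
    size_t p = longest_common_prefix(a, b);        // p == 0: a[0] != b[0]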
@@ -284,7 +520,7 @@
  static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
      std::string ret;
      for (; begin != end; ++begin) {
-         ret += llama_token_to_piece(ctx, *begin);
+         ret += common_token_to_piece(ctx, *begin);
      }

      return ret;
@@ -292,7 +528,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {

  // format incomplete utf-8 multibyte character for output
  static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
-     std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+     std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);

      // if the size is 1 and first bit is 1, meaning it's a partial character
      // (size > 1 meaning it's already a known token)
@@ -343,6 +579,17 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
      return out;
  }

+ static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
+     const std::string str =
+         std::string(event) + ": " +
+         data.dump(-1, ' ', false, json::error_handler_t::replace) +
+         "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
+
+     LOG_DBG("data stream, to_send: %s", str.c_str());
+
+     return sink.write(str.c_str(), str.size());
+ }
+
  //
  // OAI utils
  //
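Note: server_sent_event() writes one Server-Sent Events frame per call: the event field name, a colon, the JSON payload on a single line, then a blank line; the trailing "\n\n" is the SSE frame terminator, which is why those newlines matter to clients. Example of the bytes that end up on the wire (`sink` is the httplib::DataSink of a chunked response):

    json chunk = {{"content", "Hello"}, {"stop", false}};
    server_sent_event(sink, "data", chunk);
    // writes: data: {"content":"Hello","stop":false}\n\n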
@@ -355,24 +602,6 @@ static json oaicompat_completion_params_parse(

      llama_params["__oaicompat"] = true;

-     // Map OpenAI parameters to llama.cpp parameters
-     //
-     // For parameters that are defined by the OpenAI documentation (e.g.
-     // temperature), we explicitly specify OpenAI's intended default; we
-     // need to do that because sometimes OpenAI disagrees with llama.cpp
-     //
-     // https://platform.openai.com/docs/api-reference/chat/create
-     llama_sampling_params default_sparams;
-     llama_params["model"] = json_value(body, "model", std::string("unknown"));
-     llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
-     llama_params["logit_bias"] = json_value(body, "logit_bias", json::object());
-     llama_params["n_predict"] = json_value(body, "max_tokens", -1);
-     llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
-     llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
-     llama_params["stream"] = json_value(body, "stream", false);
-     llama_params["temperature"] = json_value(body, "temperature", 1.0);
-     llama_params["top_p"] = json_value(body, "top_p", 1.0);
-
      // Apply chat template to the list of messages
      llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));

@@ -389,6 +618,9 @@
      std::string response_type = json_value(response_format, "type", std::string());
      if (response_type == "json_object") {
          llama_params["json_schema"] = json_value(response_format, "schema", json::object());
+     } else if (response_type == "json_schema") {
+         json json_schema = json_value(response_format, "json_schema", json::object());
+         llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
      } else if (!response_type.empty() && response_type != "text") {
          throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
      }
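Note: with the new branch above, the OAI-compatible endpoint accepts both response_format styles; either way the schema lands in llama_params["json_schema"] and is used to constrain generation further down the pipeline. A hypothetical request fragment:

    json body = json::parse(R"json({
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "schema": { "type": "object", "properties": { "answer": { "type": "string" } } }
            }
        }
    })json");
    // equivalent older style: { "response_format": { "type": "json_object", "schema": { ... } } }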
@@ -402,22 +634,22 @@

      // Handle "logprobs" field
      // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
-     if (body.contains("logprobs")) {
+     if (json_value(body, "logprobs", false)) {
          llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
-     } else if (body.contains("top_logprobs")) {
+     } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
          throw std::runtime_error("top_logprobs requires logprobs to be set to true");
      }

      // Params supported by OAI but unsupported by llama.cpp
      static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
-     for (auto & param : unsupported_params) {
+     for (const auto & param : unsupported_params) {
          if (body.contains(param)) {
              throw std::runtime_error("Unsupported param: " + param);
          }
      }

      // Copy remaining properties to llama_params
-     // This allows user to use llama.cpp-specific params like "mirostat", "tfs_z",... via OAI endpoint.
+     // This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
      // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
      for (const auto & item : body.items()) {
          // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
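Note: the reworked check only honors `top_logprobs` when `logprobs` is actually true (previously the mere presence of the key was enough), and an explicit `"top_logprobs": null` no longer triggers the error. A request sketch:

    json ok = json::parse(R"json({ "logprobs": true, "top_logprobs": 5 })json");
    // -> llama_params["n_probs"] == 5 (default 20 when "top_logprobs" is omitted)

    json bad = json::parse(R"json({ "top_logprobs": 5 })json");
    // -> throws "top_logprobs requires logprobs to be set to true"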
@@ -429,7 +661,7 @@
      return llama_params;
  }

- static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
+ static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
      bool stopped_word = result.count("stopped_word") != 0;
      bool stopped_eos = json_value(result, "stopped_eos", false);
      int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
@@ -466,7 +698,8 @@
          {"id", completion_id}
      };

-     if (server_verbose) {
+     // extra fields for debugging purposes
+     if (verbose) {
          res["__verbose"] = result;
      }

@@ -478,7 +711,7 @@
  }

  // return value is vector as there is one case where we might need to generate two responses
- static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
+ static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
      if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
          return std::vector<json>({result});
      }
@@ -580,7 +813,7 @@
  static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
      json data = json::array();
      int i = 0;
-     for (auto & elem : embeddings) {
+     for (const auto & elem : embeddings) {
          data.push_back(json{
              {"embedding", json_value(elem, "embedding", json::array())},
              {"index", i++},
@@ -591,7 +824,7 @@
      json res = json {
          {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
          {"object", "list"},
-         {"usage", json {
+         {"usage", json { // TODO: fill
              {"prompt_tokens", 0},
              {"total_tokens", 0}
          }},
@@ -601,7 +834,63 @@
      return res;
  }

- static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
+ static json format_response_rerank(const json & request, const json & ranks) {
+     json data = json::array();
+     int i = 0;
+     for (const auto & rank : ranks) {
+         data.push_back(json{
+             {"index", i++},
+             {"relevance_score", json_value(rank, "score", 0.0)},
+         });
+     }
+
+     json res = json {
+         {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+         {"object", "list"},
+         {"usage", json { // TODO: fill
+             {"prompt_tokens", 0},
+             {"total_tokens", 0}
+         }},
+         {"results", data}
+     };
+
+     return res;
+ }
+
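Note: format_response_rerank() mirrors the embeddings formatter above, and its usage block is still a placeholder (see the TODO). A sketch of the resulting shape, with illustrative scores (`request` is the original request json, used only for its "model" field):

    json ranks = json::array({ json{{"score", 0.92}}, json{{"score", 0.13}} });
    json res   = format_response_rerank(request, ranks);
    // res = { "model": "...", "object": "list",
    //         "usage": { "prompt_tokens": 0, "total_tokens": 0 },   // TODO: fill
    //         "results": [ { "index": 0, "relevance_score": 0.92 },
    //                      { "index": 1, "relevance_score": 0.13 } ] }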
+ static bool is_valid_utf8(const std::string & str) {
+     const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
+     const unsigned char* end = bytes + str.length();
+
+     while (bytes < end) {
+         if (*bytes <= 0x7F) {
+             // 1-byte sequence (0xxxxxxx)
+             bytes++;
+         } else if ((*bytes & 0xE0) == 0xC0) {
+             // 2-byte sequence (110xxxxx 10xxxxxx)
+             if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
+                 return false;
+             bytes += 2;
+         } else if ((*bytes & 0xF0) == 0xE0) {
+             // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+             if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
+                 return false;
+             bytes += 3;
+         } else if ((*bytes & 0xF8) == 0xF0) {
+             // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+             if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
+                 (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
+                 return false;
+             bytes += 4;
+         } else {
+             // Invalid UTF-8 lead byte
+             return false;
+         }
+     }
+
+     return true;
+ }
+
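Note: is_valid_utf8() walks the byte string and validates the standard 1-4 byte lead/continuation patterns; its call sites are outside this hunk, but presumably it is used to avoid emitting strings cut in the middle of a multibyte character. A few illustrative checks:

    bool a = is_valid_utf8("hello");          // true: plain ASCII
    bool b = is_valid_utf8("caf\xC3\xA9");    // true: 0xC3 0xA9 is a complete 2-byte sequence ("é")
    bool c = is_valid_utf8("caf\xC3");        // false: lead byte with its continuation cut off
    bool d = is_valid_utf8("\xFF");           // false: 0xFF is never a valid UTF-8 lead byte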
+ static json format_tokenizer_response(const json & tokens) {
      return json {
          {"tokens", tokens}
      };
package/src/llama.cpp/examples/simple/CMakeLists.txt

@@ -1,5 +1,5 @@
  set(TARGET llama-simple)
  add_executable(${TARGET} simple.cpp)
  install(TARGETS ${TARGET} RUNTIME)
- target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+ target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
  target_compile_features(${TARGET} PRIVATE cxx_std_11)