@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
package/src/llama.cpp/common/common.cpp +64 -518
@@ -7,9 +7,6 @@
 
 #include "common.h"
 #include "log.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
 #include "llama.h"
 
 #include <algorithm>
@@ -51,47 +48,11 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
-#if defined(LLAMA_USE_CURL)
-#include <curl/curl.h>
-#include <curl/easy.h>
-#include <future>
-#endif
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#if defined(LLAMA_USE_CURL)
-#ifdef __linux__
-#include <linux/limits.h>
-#elif defined(_WIN32)
-#   if !defined(PATH_MAX)
-#       define PATH_MAX MAX_PATH
-#   endif
-#else
-#include <sys/syslimits.h>
-#endif
-#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-
-//
-// CURL utils
-//
-
-using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
-
-// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
-struct curl_slist_ptr {
-    struct curl_slist * ptr = nullptr;
-    ~curl_slist_ptr() {
-        if (ptr) {
-            curl_slist_free_all(ptr);
-        }
-    }
-};
-#endif // LLAMA_USE_CURL
-
-using json = nlohmann::ordered_json;
-
 //
 // CPU utils
 //
@@ -482,6 +443,25 @@ void string_replace_all(std::string & s, const std::string & search, const std::
     s = std::move(builder);
 }
 
+bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
+    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+}
+size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
+    if (!str.empty() && !stop.empty()) {
+        const char text_last_char = str.back();
+        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
+            if (stop[char_index] == text_last_char) {
+                const auto current_partial = stop.substr(0, char_index + 1);
+                if (string_ends_with(str, current_partial)) {
+                    return str.size() - char_index - 1;
+                }
+            }
+        }
+    }
+
+    return std::string::npos;
+}
+
 std::string regex_escape(const std::string & s) {
     static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
     return std::regex_replace(s, special_chars, "\\$0");
@@ -869,7 +849,7 @@ std::string fs_get_cache_directory() {
     if (getenv("LLAMA_CACHE")) {
        cache_directory = std::getenv("LLAMA_CACHE");
     } else {
-#ifdef __linux__
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
        if (std::getenv("XDG_CACHE_HOME")) {
            cache_directory = std::getenv("XDG_CACHE_HOME");
        } else {
@@ -879,7 +859,9 @@ std::string fs_get_cache_directory() {
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
        cache_directory = std::getenv("LOCALAPPDATA");
-#endif // __linux__
+#else
+#  error Unknown architecture
+#endif
        cache_directory = ensure_trailing_slash(cache_directory);
        cache_directory += "llama.cpp";
     }
@@ -900,22 +882,14 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
+
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
 
-    llama_model * model = nullptr;
-
-    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
-    } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
-    } else {
-        model = llama_model_load_from_file(params.model.c_str(), mparams);
-    }
-
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
         return iparams;
     }
 
@@ -950,7 +924,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
     }
@@ -1074,6 +1048,19 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
+std::string get_model_endpoint() {
+    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
+    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
+    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
+    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
+    std::string model_endpoint = "https://huggingface.co/";
+    if (endpoint_env) {
+        model_endpoint = endpoint_env;
+        if (model_endpoint.back() != '/') model_endpoint += '/';
+    }
+    return model_endpoint;
+}
+
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
     llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
@@ -1089,15 +1076,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (!params.devices.empty()) {
         mparams.devices = params.devices.data();
     }
+
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
+
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -1105,6 +1095,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.kv_overrides = params.kv_overrides.data();
     }
 
+    if (params.tensor_buft_overrides.empty()) {
+        mparams.tensor_buft_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
+        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
+    }
+
     return mparams;
 }
 
@@ -1118,7 +1115,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_threads = params.cpuparams.n_threads;
     cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
                               params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base = params.rope_freq_base;
@@ -1136,6 +1132,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
+    cparams.op_offload = !params.no_op_offload;
 
     if (params.reranking) {
         cparams.embeddings = true;
@@ -1164,451 +1161,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
     return tpp;
 }
 
-#ifdef LLAMA_USE_CURL
-
-#define CURL_MAX_RETRY 3
-#define CURL_RETRY_DELAY_SECONDS 2
-
-static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
-    int remaining_attempts = max_attempts;
-
-    while (remaining_attempts > 0) {
-        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
-
-        CURLcode res = curl_easy_perform(curl);
-        if (res == CURLE_OK) {
-            return true;
-        }
-
-        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
-        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
-
-        remaining_attempts--;
-        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
-    }
-
-    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
-
-    return false;
-}
-
-static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-    // Initialize libcurl
-    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
-    bool force_download = false;
-
-    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
-    // Check if hf-token or bearer-token was specified
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-    }
-
-#if defined(_WIN32)
-    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
-    // operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
-    // Check if the file already exists locally
-    auto file_exists = std::filesystem::exists(path);
-
-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata;
-    std::string etag;
-    std::string last_modified;
-
-    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("url") && metadata.at("url").is_string()) {
-                    auto previous_url = metadata.at("url").get<std::string>();
-                    if (previous_url != url) {
-                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
-                        return false;
-                    }
-                }
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-                return false;
-            }
-        }
-    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
-    }
-
-    // Send a HEAD request to retrieve the etag and last-modified headers
-    struct common_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
-    };
-
-    common_load_model_from_url_headers headers;
-
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
-
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
-
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
-
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code != 200) {
-            // HEAD not supported, we don't know if the file has changed
-            // force trigger downloading
-            force_download = true;
-            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-        }
-    }
-
-    bool should_download = !file_exists || force_download;
-    if (!should_download) {
-        if (!etag.empty() && etag != headers.etag) {
-            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
-            should_download = true;
-        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
-            should_download = true;
-        }
-    }
-    if (should_download) {
-        std::string path_temporary = path + ".downloadInProgress";
-        if (file_exists) {
-            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
-            if (remove(path.c_str()) != 0) {
-                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                return false;
-            }
-        }
-
-        // Set the output file
-
-        struct FILE_deleter {
-            void operator()(FILE * f) const {
-                fclose(f);
-            }
-        };
-
-        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
-        if (!outfile) {
-            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
-            return false;
-        }
-
-        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
-        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
-            return fwrite(data, size, nmemb, (FILE *)fd);
-        };
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
-
-        // display download progress
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
-
-        // helper function to hide password in URL
-        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
-            std::size_t protocol_pos = url.find("://");
-            if (protocol_pos == std::string::npos) {
-                return url;  // Malformed URL
-            }
-
-            std::size_t at_pos = url.find('@', protocol_pos + 3);
-            if (at_pos == std::string::npos) {
-                return url;  // No password in URL
-            }
-
-            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
-        };
-
-        // start the download
-        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code < 200 || http_code >= 400) {
-            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
-            return false;
-        }
-
-        // Causes file to be closed explicitly here before we rename it.
-        outfile.reset();
-
-        // Write the updated JSON metadata file.
-        metadata.update({
-            {"url", url},
-            {"etag", headers.etag},
-            {"lastModified", headers.last_modified}
-        });
-        std::ofstream(metadata_path) << metadata.dump(4);
-        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
-
-        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
-            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-            return false;
-        }
-    }
-
-    return true;
-}
-
-struct llama_model * common_load_model_from_url(
-        const std::string & model_url,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // Basic validation of the model_url
-    if (model_url.empty()) {
-        LOG_ERR("%s: invalid model_url\n", __func__);
-        return NULL;
-    }
-
-    if (!common_download_file(model_url, local_path, hf_token)) {
-        return NULL;
-    }
-
-    // check for additional GGUFs split to download
-    int n_split = 0;
-    {
-        struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ NULL,
-        };
-        auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
-        if (!ctx_gguf) {
-            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
-            return NULL;
-        }
-
-        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
-        if (key_n_split >= 0) {
-            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
-        }
-
-        gguf_free(ctx_gguf);
-    }
-
-    if (n_split > 1) {
-        char split_prefix[PATH_MAX] = {0};
-        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-
-        // Verify the first split file format
-        // and extract split URL and PATH prefixes
-        {
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
-                return NULL;
-            }
-
-            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
-                return NULL;
-            }
-        }
-
-        // Prepare download in parallel
-        std::vector<std::future<bool>> futures_download;
-        for (int idx = 1; idx < n_split; idx++) {
-            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
-                char split_path[PATH_MAX] = {0};
-                llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
-
-                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
-
-                return common_download_file(split_url, split_path, hf_token);
-            }, idx));
-        }
-
-        // Wait for all downloads to complete
-        for (auto & f : futures_download) {
-            if (!f.get()) {
-                return NULL;
-            }
-        }
-    }
-
-    return llama_model_load_from_file(local_path.c_str(), params);
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & repo,
-        const std::string & remote_path,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // construct hugging face model url:
-    //
-    //  --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
-    //    https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
-    //
-    //  --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //    https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //
-
-    std::string model_url = "https://huggingface.co/";
-    model_url += repo;
-    model_url += "/resolve/main/";
-    model_url += remote_path;
-
-    return common_load_model_from_url(model_url, local_path, hf_token, params);
-}
-
-/**
- * Allow getting the HF file from the HF repo with tag (like ollama), for example:
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
- * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
- *
- * Return pair of <repo, file> (with "repo" already having tag removed)
- *
- * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
- */
-std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
-    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
-    std::string tag = parts.size() > 1 ? parts.back() : "latest";
-    std::string hf_repo = parts[0];
-    if (string_split<std::string>(hf_repo, '/').size() != 2) {
-        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
-    }
-
-    // fetch model info from Hugging Face Hub API
-    json model_info;
-    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    std::string res_str;
-    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
-    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
-    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
-        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
-        return size * nmemb;
-    };
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
-#if defined(_WIN32)
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-    }
-    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
-    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-    CURLcode res = curl_easy_perform(curl.get());
-
-    if (res != CURLE_OK) {
-        throw std::runtime_error("error: cannot make GET request to HF API");
-    }
-
-    long res_code;
-    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
-    if (res_code == 200) {
-        model_info = json::parse(res_str);
-    } else if (res_code == 401) {
-        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
-    } else {
-        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
-    }
-
-    // check response
-    if (!model_info.contains("ggufFile")) {
-        throw std::runtime_error("error: model does not have ggufFile");
-    }
-    json & gguf_file = model_info.at("ggufFile");
-    if (!gguf_file.contains("rfilename")) {
-        throw std::runtime_error("error: ggufFile does not have rfilename");
-    }
-
-    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
-}
-
-#else
-
-struct llama_model * common_load_model_from_url(
-        const std::string & /*model_url*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
-    return nullptr;
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & /*repo*/,
-        const std::string & /*remote_path*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return nullptr;
-}
-
-std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return std::make_pair("", "");
-}
-
-#endif // LLAMA_USE_CURL
-
 //
 // Batch utils
 //
@@ -2033,25 +1585,19 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
     return result;
 }
 
-template <>
-json common_grammar_trigger::to_json() const {
-    json out {
-        {"type", (int) type},
-        {"value", value},
-    };
-    if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
-        out["token"] = (int) token;
-    }
-    return out;
-}
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
+    const int64_t ne_datapoint = llama_n_ctx(ctx);
+    const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
+    ggml_opt_dataset_t result = ggml_opt_dataset_init(
+        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
+
+    llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data;
+    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
 
-template <>
-common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
-    common_grammar_trigger out;
-    out.type = (common_grammar_trigger_type) in.at("type").get<int>();
-    out.value = in.at("value").get<std::string>();
-    if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
-        out.token = (llama_token) in.at("token").get<int>();
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
+        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
     }
-    return out;
+
+    return result;
 }