@fugood/llama.node 0.3.7 → 0.3.8

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +2 -0
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +61 -6
  25. package/src/LlamaContext.h +1 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
package/src/llama.cpp/common/common.cpp
@@ -2,6 +2,9 @@
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif
 
+#include "ggml.h"
+#include "gguf.h"
+
 #include "common.h"
 #include "log.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
@@ -18,6 +21,7 @@
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <iterator>
@@ -62,11 +66,29 @@
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
-#define PATH_MAX MAX_PATH
+#   if !defined(PATH_MAX)
+#       define PATH_MAX MAX_PATH
+#   endif
 #else
 #include <sys/syslimits.h>
 #endif
 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+//
+// CURL utils
+//
+
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
+    }
+};
 #endif // LLAMA_USE_CURL
 
 using json = nlohmann::ordered_json;
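For context, a minimal standalone sketch (not part of this diff) of how the two RAII helpers added above are meant to be used; the pattern mirrors the common_download_file hunk further down, and the header values here are placeholders:

```cpp
#include <curl/curl.h>
#include <memory>

// Same aliases as the ones introduced in the hunk above.
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;

struct curl_slist_ptr {
    struct curl_slist * ptr = nullptr;
    ~curl_slist_ptr() {
        if (ptr) {
            curl_slist_free_all(ptr);
        }
    }
};

int main() {
    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
    curl_slist_ptr http_headers;

    // The list head may change on every append, so the raw pointer is updated
    // in place; the destructor frees whatever the final head is.
    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: example");
    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);

    // ... curl_easy_perform(curl.get()) would go here ...
    return 0; // both the easy handle and the header list are released automatically
}
```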
@@ -843,7 +865,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     } else if (!params.model_url.empty()) {
         model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
-        model = llama_load_model_from_file(params.model.c_str(), mparams);
+        model = llama_model_load_from_file(params.model.c_str(), mparams);
     }
 
     if (model == NULL) {
@@ -851,26 +873,28 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.reranking) {
         bool ok = true;
 
-        if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
+        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
             ok = false;
         }
 
         if (!ok) {
-            llama_free_model(model);
+            llama_model_free(model);
 
             return iparams;
         }
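For reference, a minimal standalone sketch (not part of this diff) of the renamed llama.cpp API that these hunks switch to: llama_model_load_from_file, llama_model_get_vocab, the llama_vocab_* queries, and llama_model_free replace the older model-centric calls. The model path is a placeholder:

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // was llama_load_model_from_file
    if (model == NULL) {
        llama_backend_free();
        return 1;
    }

    // token queries now go through the vocab object instead of the model
    const llama_vocab * vocab = llama_model_get_vocab(model);
    const bool has_bos = llama_vocab_bos(vocab) != LLAMA_TOKEN_NULL; // was llama_token_bos(model)
    const bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL; // was llama_token_eos(model)
    (void) has_bos;
    (void) has_eos;

    llama_model_free(model); // was llama_free_model
    llama_backend_free();
    return 0;
}
```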
@@ -878,40 +902,40 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     auto cparams = common_context_params_to_llama(params);
 
-    llama_context * lctx = llama_new_context_with_model(model, cparams);
+    llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
-        llama_free_model(model);
+        llama_model_free(model);
         return iparams;
     }
 
     if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
-        llama_free_model(model);
-        return iparams;
+        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        params.ctx_shift = false;
     }
 
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
 
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
             llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);
 
             return iparams;
         }
 
-        int err = llama_control_vector_apply(lctx,
-                                             cvec.data.data(),
-                                             cvec.data.size(),
-                                             cvec.n_embd,
-                                             params.control_vector_layer_start,
-                                             params.control_vector_layer_end);
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);
 
             return iparams;
         }
@@ -919,30 +943,31 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        common_lora_adapter_container loaded_la;
-        loaded_la.path = la.path;
-        loaded_la.scale = la.scale;
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-        if (loaded_la.adapter == nullptr) {
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+        if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);
             return iparams;
         }
-        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
+
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, iparams.lora_adapters);
+        common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
-    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sampling.ignore_eos = false;
     }
 
     if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
-            if (llama_token_is_eog(model, i)) {
+        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+            if (llama_vocab_is_eog(vocab, i)) {
                 LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
                 params.sampling.logit_bias.push_back({i, -INFINITY});
             }
@@ -963,8 +988,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
         std::vector<llama_token> tmp;
-        llama_token bos = llama_token_bos(model);
-        llama_token eos = llama_token_eos(model);
+        llama_token bos = llama_vocab_bos(vocab);
+        llama_token eos = llama_vocab_eos(vocab);
+
         // some models (e.g. T5) don't have a BOS token
         if (bos != LLAMA_TOKEN_NULL) {
             tmp.push_back(bos);
@@ -979,7 +1005,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_encoder(model)) {
             llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-            if (decoder_start_token_id == -1) {
+            if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
                 decoder_start_token_id = bos;
             }
             tmp.clear();
@@ -993,17 +1019,17 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_perf_context_reset(lctx);
     }
 
-    iparams.model = model;
-    iparams.context = lctx;
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);
 
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
-    llama_lora_adapter_clear(ctx);
-    for (auto & la : lora_adapters) {
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
+    for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
         }
     }
 }
@@ -1017,7 +1043,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
-    mparams.rpc_servers = params.rpc_servers.c_str();
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
@@ -1120,7 +1145,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
     // Initialize libcurl
-    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
     if (!curl) {
         LOG_ERR("%s: error initializing libcurl\n", __func__);
         return false;
@@ -1134,11 +1160,9 @@ static bool common_download_file(const std::string & url, const std::string & pa
 
     // Check if hf-token or bearer-token was specified
     if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer ";
-        auth_header += hf_token.c_str();
-        struct curl_slist *http_headers = NULL;
-        http_headers = curl_slist_append(http_headers, auth_header.c_str());
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
     }
 
 #if defined(_WIN32)
@@ -1148,8 +1172,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
 #endif
 
     // Check if the file already exists locally
-    struct stat model_file_info;
-    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+    auto file_exists = std::filesystem::exists(path);
 
     // If the file exists, check its JSON metadata companion file.
     std::string metadata_path = path + ".json";
@@ -1409,7 +1432,7 @@ struct llama_model * common_load_model_from_url(
         }
     }
 
-    return llama_load_model_from_file(local_path.c_str(), params);
+    return llama_model_load_from_file(local_path.c_str(), params);
 }
 
 struct llama_model * common_load_model_from_hf(
@@ -1435,6 +1458,80 @@ struct llama_model * common_load_model_from_hf(
     return common_load_model_from_url(model_url, local_path, hf_token, params);
 }
 
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+
+    // fetch model info from Hugging Face Hub API
+    json model_info;
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::string res_str;
+    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (!hf_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        throw std::runtime_error("error: cannot make GET request to HF API");
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+    if (res_code == 200) {
+        model_info = json::parse(res_str);
+    } else if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+    } else {
+        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+    }
+
+    // check response
+    if (!model_info.contains("ggufFile")) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+    json & gguf_file = model_info.at("ggufFile");
+    if (!gguf_file.contains("rfilename")) {
+        throw std::runtime_error("error: ggufFile does not have rfilename");
+    }
+
+    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
+}
+
 #else
 
 struct llama_model * common_load_model_from_url(
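To illustrate the <user>/<model>[:quant] spec that common_get_hf_file() accepts, here is a small standalone sketch (not part of this diff; split_repo_tag is a hypothetical, simplified helper) of the repo/tag split, with "latest" assumed when no tag is given:

```cpp
#include <iostream>
#include <string>
#include <utility>

// Hypothetical helper mirroring the tag handling documented above.
static std::pair<std::string, std::string> split_repo_tag(const std::string & spec) {
    const size_t pos = spec.find(':');
    if (pos == std::string::npos) {
        return {spec, "latest"}; // no tag -> "latest"
    }
    return {spec.substr(0, pos), spec.substr(pos + 1)};
}

int main() {
    const auto [repo, tag] = split_repo_tag("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M");
    std::cout << repo << " @ " << tag << "\n"; // bartowski/Llama-3.2-3B-Instruct-GGUF @ Q4_K_M
    // The real implementation then queries
    // https://huggingface.co/v2/<repo>/manifests/<tag> and reads ggufFile.rfilename.
    return 0;
}
```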
@@ -1456,6 +1553,11 @@ struct llama_model * common_load_model_from_hf(
     return nullptr;
 }
 
+std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
+    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+    return std::make_pair("", "");
+}
+
 #endif // LLAMA_USE_CURL
 
 //
@@ -1554,21 +1656,23 @@ std::vector<llama_token> common_tokenize(
         const std::string & text,
         bool add_special,
         bool parse_special) {
-    return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_tokenize(vocab, text, add_special, parse_special);
 }
 
 std::vector<llama_token> common_tokenize(
-        const struct llama_model * model,
+        const struct llama_vocab * vocab,
         const std::string & text,
         bool add_special,
         bool parse_special) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -1577,12 +1681,18 @@ std::vector<llama_token> common_tokenize(
 }
 
 std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_token_to_piece(vocab, token, special);
+}
+
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
-    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
     if (n_chars < 0) {
         piece.resize(-n_chars);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
         GGML_ASSERT(check == -n_chars);
     }
     else {
@@ -1592,13 +1702,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
     return piece;
 }
 
-std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_detokenize(vocab, tokens, special);
+}
+
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
 
@@ -1612,9 +1728,14 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //
 
+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+    const char * ptr_tmpl = llama_model_chat_template(model);
+    return ptr_tmpl == nullptr ? "" : ptr_tmpl;
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
-    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
+    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
@@ -1625,16 +1746,16 @@ std::string common_chat_apply_template(const struct llama_model * model,
     int alloc_size = 0;
     bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
-    for (auto & msg : msgs) {
+    for (const auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
         alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
     }
 
-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
     std::vector<char> buf(alloc_size);
 
     // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+    int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
     // error: chat template is not supported
     if (res < 0) {
@@ -1642,18 +1763,17 @@ std::string common_chat_apply_template(const struct llama_model * model,
             // if the custom "tmpl" is not supported, we throw an error
             // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
             throw std::runtime_error("this custom template is not supported");
-        } else {
-            // If the built-in template is not supported, we default to chatml
-            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-            fallback = true;
         }
+
+        // If the built-in template is not supported, we default to chatml
+        res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        fallback = true;
     }
 
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
         res = llama_chat_apply_template(
-            fallback ? nullptr : model,
             fallback ? "chatml" : ptr_tmpl,
             chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
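Finally, a small sketch (not part of this diff) of the updated chat-template call: llama_chat_apply_template() no longer takes a llama_model pointer, and the "measure, then resize and retry" pattern follows the hunk above. The buffer size, helper name, and messages are placeholders:

```cpp
#include "llama.h"

#include <string>
#include <vector>

// Hypothetical helper: render messages with the built-in "chatml" template.
static std::string apply_chatml(const std::vector<llama_chat_message> & msgs, bool add_ass) {
    std::vector<char> buf(1024);
    int32_t res = llama_chat_apply_template("chatml", msgs.data(), msgs.size(), add_ass,
                                            buf.data(), buf.size());
    if (res < 0) {
        return ""; // template not supported
    }
    if ((size_t) res > buf.size()) {
        buf.resize(res); // first call only reported the required size
        res = llama_chat_apply_template("chatml", msgs.data(), msgs.size(), add_ass,
                                        buf.data(), buf.size());
    }
    return std::string(buf.data(), res);
}

int main() {
    std::vector<llama_chat_message> msgs = {
        {"system", "You are a helpful assistant."},
        {"user",   "Hello!"},
    };
    const std::string prompt = apply_chatml(msgs, /*add_ass=*/true);
    return prompt.empty() ? 1 : 0;
}
```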