@fugood/llama.node 0.3.7 → 0.3.9
This diff shows the changes between the two publicly released package versions as they appear in their public registries; it is provided for informational purposes only.
- package/README.md +17 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -0
- package/lib/index.js +16 -1
- package/lib/index.ts +16 -0
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +4 -3
- package/src/LlamaCompletionWorker.cpp +4 -2
- package/src/LlamaContext.cpp +156 -6
- package/src/LlamaContext.h +5 -0
- package/src/common.hpp +6 -11
- package/src/llama.cpp/.github/workflows/build.yml +19 -17
- package/src/llama.cpp/.github/workflows/docker.yml +77 -30
- package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +22 -3
- package/src/llama.cpp/CMakeLists.txt +49 -24
- package/src/llama.cpp/common/arg.cpp +82 -26
- package/src/llama.cpp/common/arg.h +3 -0
- package/src/llama.cpp/common/common.cpp +192 -72
- package/src/llama.cpp/common/common.h +51 -18
- package/src/llama.cpp/common/ngram-cache.cpp +12 -12
- package/src/llama.cpp/common/ngram-cache.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +11 -6
- package/src/llama.cpp/common/speculative.cpp +18 -15
- package/src/llama.cpp/docs/build.md +2 -0
- package/src/llama.cpp/examples/batched/batched.cpp +9 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
- package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
- package/src/llama.cpp/examples/infill/infill.cpp +23 -24
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
- package/src/llama.cpp/examples/llava/clip.cpp +4 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
- package/src/llama.cpp/examples/llava/llava.cpp +2 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
- package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
- package/src/llama.cpp/examples/main/main.cpp +51 -29
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
- package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
- package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
- package/src/llama.cpp/examples/run/run.cpp +175 -61
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
- package/src/llama.cpp/examples/server/httplib.h +1295 -409
- package/src/llama.cpp/examples/server/server.cpp +387 -181
- package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
- package/src/llama.cpp/examples/server/utils.hpp +170 -58
- package/src/llama.cpp/examples/simple/simple.cpp +9 -8
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
- package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
- package/src/llama.cpp/examples/tts/tts.cpp +64 -23
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +36 -145
- package/src/llama.cpp/ggml/include/gguf.h +202 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml.c +117 -1327
- package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
- package/src/llama.cpp/include/llama-cpp.h +6 -1
- package/src/llama.cpp/include/llama.h +138 -75
- package/src/llama.cpp/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/src/llama-adapter.cpp +347 -0
- package/src/llama.cpp/src/llama-adapter.h +74 -0
- package/src/llama.cpp/src/llama-arch.cpp +1487 -0
- package/src/llama.cpp/src/llama-arch.h +400 -0
- package/src/llama.cpp/src/llama-batch.cpp +368 -0
- package/src/llama.cpp/src/llama-batch.h +88 -0
- package/src/llama.cpp/src/llama-chat.cpp +578 -0
- package/src/llama.cpp/src/llama-chat.h +52 -0
- package/src/llama.cpp/src/llama-context.cpp +1775 -0
- package/src/llama.cpp/src/llama-context.h +128 -0
- package/src/llama.cpp/src/llama-cparams.cpp +1 -0
- package/src/llama.cpp/src/llama-cparams.h +37 -0
- package/src/llama.cpp/src/llama-grammar.cpp +5 -4
- package/src/llama.cpp/src/llama-grammar.h +3 -1
- package/src/llama.cpp/src/llama-hparams.cpp +71 -0
- package/src/llama.cpp/src/llama-hparams.h +139 -0
- package/src/llama.cpp/src/llama-impl.cpp +167 -0
- package/src/llama.cpp/src/llama-impl.h +16 -136
- package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
- package/src/llama.cpp/src/llama-kv-cache.h +218 -0
- package/src/llama.cpp/src/llama-mmap.cpp +589 -0
- package/src/llama.cpp/src/llama-mmap.h +67 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
- package/src/llama.cpp/src/llama-model-loader.h +167 -0
- package/src/llama.cpp/src/llama-model.cpp +3953 -0
- package/src/llama.cpp/src/llama-model.h +370 -0
- package/src/llama.cpp/src/llama-quant.cpp +934 -0
- package/src/llama.cpp/src/llama-quant.h +1 -0
- package/src/llama.cpp/src/llama-sampling.cpp +147 -32
- package/src/llama.cpp/src/llama-sampling.h +3 -19
- package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
- package/src/llama.cpp/src/llama-vocab.h +97 -142
- package/src/llama.cpp/src/llama.cpp +7160 -20314
- package/src/llama.cpp/src/unicode.cpp +8 -3
- package/src/llama.cpp/tests/CMakeLists.txt +2 -0
- package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
- package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
- package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
- package/src/llama.cpp/tests/test-gguf.cpp +222 -187
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +0 -1
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
Expanded diff for package/src/llama.cpp/common/common.cpp:

```diff
@@ -2,6 +2,9 @@
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif
 
+#include "ggml.h"
+#include "gguf.h"
+
 #include "common.h"
 #include "log.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
@@ -18,6 +21,7 @@
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <iterator>
@@ -62,11 +66,29 @@
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
-#
+#   if !defined(PATH_MAX)
+#   define PATH_MAX MAX_PATH
+#   endif
 #else
 #include <sys/syslimits.h>
 #endif
 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+//
+// CURL utils
+//
+
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
+    }
+};
 #endif // LLAMA_USE_CURL
 
 using json = nlohmann::ordered_json;
```
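The new `curl_slist_ptr` wrapper above holds a raw `curl_slist *` instead of a `std::unique_ptr` because `curl_slist_append()` returns the (possibly new) list head, which has to be stored back into the same holder; the destructor then frees the whole list once. A minimal usage sketch of the two wrappers, assuming libcurl is available; the URL and header strings are illustrative, not taken from the package:

```cpp
#include <curl/curl.h>
#include <memory>

// same aliases as in the diff above
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;

struct curl_slist_ptr {
    struct curl_slist * ptr = nullptr;
    ~curl_slist_ptr() {
        if (ptr) {
            curl_slist_free_all(ptr);
        }
    }
};

int main() {
    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
    curl_slist_ptr headers;

    // curl_slist_append returns the list head (a new node when starting from nullptr),
    // so the result is written back into .ptr; the list is freed exactly once at scope exit
    headers.ptr = curl_slist_append(headers.ptr, "Accept: application/json");
    headers.ptr = curl_slist_append(headers.ptr, "User-Agent: llama-cpp");

    curl_easy_setopt(curl.get(), CURLOPT_URL, "https://example.com/");
    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, headers.ptr);

    return curl_easy_perform(curl.get()) == CURLE_OK ? 0 : 1;
}
```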
```diff
@@ -843,7 +865,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     } else if (!params.model_url.empty()) {
         model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
-        model =
+        model = llama_model_load_from_file(params.model.c_str(), mparams);
     }
 
     if (model == NULL) {
@@ -851,26 +873,28 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.reranking) {
         bool ok = true;
 
-        if (
-            LOG_WRN("%s: warning:
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (
-            LOG_WRN("%s: warning:
+        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (
-            LOG_WRN("%s: warning:
+        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
             ok = false;
         }
 
         if (!ok) {
-
+            llama_model_free(model);
 
             return iparams;
         }
@@ -878,40 +902,40 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     auto cparams = common_context_params_to_llama(params);
 
-    llama_context * lctx =
+    llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
-
+        llama_model_free(model);
         return iparams;
     }
 
     if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-
-
-        return iparams;
+        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        params.ctx_shift = false;
     }
 
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end =
+        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
 
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
             llama_free(lctx);
-
+            llama_model_free(model);
 
             return iparams;
         }
 
-        int err =
-
-
-
-
-
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
-
+            llama_model_free(model);
 
             return iparams;
         }
@@ -919,30 +943,31 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-
-
-
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-        if (loaded_la.adapter == nullptr) {
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+        if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
-
+            llama_model_free(model);
             return iparams;
         }
-
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
+
     if (!params.lora_init_without_apply) {
-
+        common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
-    if (params.sampling.ignore_eos &&
-        LOG_WRN("%s: warning:
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sampling.ignore_eos = false;
     }
 
     if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i <
-            if (
+        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+            if (llama_vocab_is_eog(vocab, i)) {
                 LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
                 params.sampling.logit_bias.push_back({i, -INFINITY});
             }
```
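The hunks above move `common_init_from_params()` to the vocab-centric API: the vocab handle is fetched once with `llama_model_get_vocab()` and then queried through `llama_vocab_bos()`, `llama_vocab_eos()`, `llama_vocab_n_tokens()` and `llama_vocab_is_eog()`, while model teardown goes through `llama_model_free()`. A minimal sketch of that pattern against the bundled `llama.h`; the model path is illustrative:

```cpp
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // illustrative path
    if (model == NULL) {
        return 1;
    }

    // one vocab handle, queried instead of the model itself
    const llama_vocab * vocab = llama_model_get_vocab(model);
    printf("bos: %d, eos: %d, n_tokens: %d\n",
           llama_vocab_bos(vocab), llama_vocab_eos(vocab), llama_vocab_n_tokens(vocab));

    // e.g. collect all end-of-generation tokens, as common_init_from_params() does for --ignore-eos
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
            printf("eog token: %d\n", i);
        }
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```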
```diff
@@ -963,8 +988,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
         std::vector<llama_token> tmp;
-        llama_token bos =
-        llama_token eos =
+        llama_token bos = llama_vocab_bos(vocab);
+        llama_token eos = llama_vocab_eos(vocab);
+
         // some models (e.g. T5) don't have a BOS token
         if (bos != LLAMA_TOKEN_NULL) {
             tmp.push_back(bos);
@@ -979,7 +1005,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_encoder(model)) {
             llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-            if (decoder_start_token_id ==
+            if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
                 decoder_start_token_id = bos;
             }
             tmp.clear();
@@ -993,17 +1019,17 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_perf_context_reset(lctx);
     }
 
-    iparams.model
-    iparams.context
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);
 
     return iparams;
 }
 
-void
-
-    for (auto & la :
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
+    for (auto & la : lora) {
         if (la.scale != 0.0f) {
-
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
         }
     }
 }
@@ -1017,7 +1043,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
-    mparams.rpc_servers = params.rpc_servers.c_str();
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
@@ -1120,7 +1145,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
     // Initialize libcurl
-
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
     if (!curl) {
         LOG_ERR("%s: error initializing libcurl\n", __func__);
         return false;
@@ -1134,11 +1160,9 @@ static bool common_download_file(const std::string & url, const std::string & pa
 
     // Check if hf-token or bearer-token was specified
    if (!hf_token.empty()) {
-
-
-
-        http_headers = curl_slist_append(http_headers, auth_header.c_str());
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
     }
 
 #if defined(_WIN32)
@@ -1148,8 +1172,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
 #endif
 
     // Check if the file already exists locally
-
-    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+    auto file_exists = std::filesystem::exists(path);
 
     // If the file exists, check its JSON metadata companion file.
     std::string metadata_path = path + ".json";
@@ -1409,7 +1432,7 @@ struct llama_model * common_load_model_from_url(
         }
     }
 
-    return
+    return llama_model_load_from_file(local_path.c_str(), params);
 }
 
 struct llama_model * common_load_model_from_hf(
```
```diff
@@ -1435,6 +1458,80 @@ struct llama_model * common_load_model_from_hf(
     return common_load_model_from_url(model_url, local_path, hf_token, params);
 }
 
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+
+    // fetch model info from Hugging Face Hub API
+    json model_info;
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::string res_str;
+    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (!hf_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        throw std::runtime_error("error: cannot make GET request to HF API");
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+    if (res_code == 200) {
+        model_info = json::parse(res_str);
+    } else if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+    } else {
+        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+    }
+
+    // check response
+    if (!model_info.contains("ggufFile")) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+    json & gguf_file = model_info.at("ggufFile");
+    if (!gguf_file.contains("rfilename")) {
+        throw std::runtime_error("error: ggufFile does not have rfilename");
+    }
+
+    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
+}
+
 #else
 
 struct llama_model * common_load_model_from_url(
```
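The comment block above documents the `<user>/<model>[:quant]` shorthand that `common_get_hf_file()` resolves through the Ollama-compatible Hugging Face manifest endpoint. A hedged usage sketch, assuming a build with `LLAMA_USE_CURL` and that the package's `common.h` exposes a matching declaration for this helper; the repo name is the one from the comment above:

```cpp
#include "common.h"

#include <cstdio>
#include <exception>
#include <string>
#include <utility>

int main() {
    try {
        // "<user>/<model>[:quant]"; the tag defaults to "latest" when omitted
        std::pair<std::string, std::string> repo_file =
            common_get_hf_file("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", /* hf_token = */ "");
        printf("repo: %s\nfile: %s\n", repo_file.first.c_str(), repo_file.second.c_str());
    } catch (const std::exception & e) {
        // thrown for a malformed repo string or an HF API error (see the hunk above)
        fprintf(stderr, "%s\n", e.what());
        return 1;
    }
    return 0;
}
```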
```diff
@@ -1456,6 +1553,11 @@ struct llama_model * common_load_model_from_hf(
     return nullptr;
 }
 
+std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
+    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+    return std::make_pair("", "");
+}
+
 #endif // LLAMA_USE_CURL
 
 //
@@ -1554,21 +1656,23 @@ std::vector<llama_token> common_tokenize(
         const std::string & text,
         bool add_special,
         bool parse_special) {
-
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_tokenize(vocab, text, add_special, parse_special);
 }
 
 std::vector<llama_token> common_tokenize(
-        const struct
+        const struct llama_vocab * vocab,
         const std::string & text,
         bool add_special,
         bool parse_special) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -1577,12 +1681,18 @@ std::vector<llama_token> common_tokenize(
 }
 
 std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_token_to_piece(vocab, token, special);
+}
+
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
-    const int n_chars = llama_token_to_piece(
+    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
     if (n_chars < 0) {
         piece.resize(-n_chars);
-        int check = llama_token_to_piece(
+        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
         GGML_ASSERT(check == -n_chars);
     }
     else {
@@ -1592,13 +1702,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
     return piece;
 }
 
-std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_detokenize(vocab, tokens, special);
+}
+
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
 
```
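The tokenizer helpers above gain `llama_vocab`-based overloads, with the `llama_context` versions now just looking up the vocab and forwarding. A small round-trip sketch using the new overloads; the model path is illustrative and the same bundled `common.h`/`llama.h` headers are assumed:

```cpp
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    llama_backend_init();

    llama_model * model = llama_model_load_from_file("model.gguf", llama_model_default_params()); // illustrative path
    if (model == NULL) {
        return 1;
    }

    // tokenize / detokenize straight against the vocab; no llama_context is required
    const llama_vocab * vocab = llama_model_get_vocab(model);
    std::vector<llama_token> toks = common_tokenize(vocab, "Hello world", /* add_special = */ true, /* parse_special = */ false);
    std::string round_trip = common_detokenize(vocab, toks, /* special = */ false);
    printf("%zu tokens, round trip: '%s'\n", toks.size(), round_trip.c_str());

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```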
```diff
@@ -1612,9 +1728,14 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //
 
+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+    const char * ptr_tmpl = llama_model_chat_template(model);
+    return ptr_tmpl == nullptr ? "" : ptr_tmpl;
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
-    int res = llama_chat_apply_template(
+    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
@@ -1625,16 +1746,16 @@ std::string common_chat_apply_template(const struct llama_model * model,
     int alloc_size = 0;
     bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
-    for (auto & msg : msgs) {
+    for (const auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
         alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
     }
 
-    const char * ptr_tmpl = tmpl.empty() ?
+    const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
     std::vector<char> buf(alloc_size);
 
     // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(
+    int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
     // error: chat template is not supported
     if (res < 0) {
@@ -1642,18 +1763,17 @@ std::string common_chat_apply_template(const struct llama_model * model,
             // if the custom "tmpl" is not supported, we throw an error
             // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
             throw std::runtime_error("this custom template is not supported");
-        } else {
-            // If the built-in template is not supported, we default to chatml
-            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-            fallback = true;
         }
+
+        // If the built-in template is not supported, we default to chatml
+        res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        fallback = true;
     }
 
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
         res = llama_chat_apply_template(
-            fallback ? nullptr : model,
             fallback ? "chatml" : ptr_tmpl,
             chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
```
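The final two hunks drop the model pointer from `llama_chat_apply_template()`: the template string (built-in or custom) is passed directly, and on failure the code retries with the literal `"chatml"` name rather than `nullptr` plus the model. A condensed sketch of that retry shape, assuming the updated `llama.h` signature shown above; unlike `common_chat_apply_template()`, this sketch does not throw for unsupported custom templates, it always falls back:

```cpp
#include "llama.h"

#include <cstdint>
#include <string>
#include <vector>

// apply a chat template by name/string, falling back to "chatml" as the diff above does
std::string apply_with_chatml_fallback(const char * tmpl,
                                       const std::vector<llama_chat_message> & chat,
                                       bool add_ass) {
    std::vector<char> buf(1024);

    // first pass: may fail if the template is not supported
    int32_t res = llama_chat_apply_template(tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

    bool fallback = false;
    if (res < 0) {
        // no model pointer anymore: the fallback is requested by template name
        res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
        fallback = true;
    }

    // second pass with a larger buffer if the formatted output did not fit
    if (res > (int32_t) buf.size()) {
        buf.resize(res);
        res = llama_chat_apply_template(fallback ? "chatml" : tmpl,
                                        chat.data(), chat.size(), add_ass, buf.data(), buf.size());
    }

    return res < 0 ? std::string() : std::string(buf.data(), res);
}
```

Callers would populate `chat` from `{role, content}` pairs exactly as `common_chat_apply_template()` does in the hunk above.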