@fugood/llama.node 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +3 -1
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +61 -6
  25. package/src/LlamaContext.h +1 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
package/README.md CHANGED
@@ -4,9 +4,23 @@
  [![NPM Version](https://img.shields.io/npm/v/%40fugood%2Fllama.node)](https://www.npmjs.com/package/@fugood/llama.node)
  ![NPM Downloads](https://img.shields.io/npm/dw/%40fugood%2Fllama.node)

- Node binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).
+ Another Node binding of [llama.cpp](https://github.com/ggerganov/llama.cpp), aiming to keep its API as close as possible to [llama.rn](https://github.com/mybigday/llama.rn).

- [llama.cpp](https://github.com/ggerganov/llama.cpp): Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
+ - [llama.cpp](https://github.com/ggerganov/llama.cpp): Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
+ - [llama.rn](https://github.com/mybigday/llama.rn): React Native binding of llama.cpp
+
+ ## Platform Support
+
+ - macOS
+   - arm64: CPU and Metal GPU acceleration
+   - x86_64: CPU only
+ - Windows (x86_64 and arm64)
+   - CPU
+   - GPU acceleration via Vulkan
+ - Linux (x86_64 and arm64)
+   - CPU
+   - GPU acceleration via Vulkan
+   - GPU acceleration via CUDA

  ## Installation

@@ -49,6 +63,7 @@ console.log('Result:', text)

  - [x] `default`: General usage, no GPU support except macOS (Metal)
  - [x] `vulkan`: GPU support via Vulkan (Windows/Linux), but some scenarios might be unstable
+ - [x] `cuda`: GPU support via CUDA (Linux only), built for a limited set of compute capabilities (x86_64: 8.9, arm64: 8.7)

  ## License

Binary files changed (prebuilt binaries under package/bin; contents not shown).
package/lib/binding.ts CHANGED
@@ -111,13 +111,15 @@ export interface LlamaContext {
  saveSession(path: string): Promise<void>
  loadSession(path: string): Promise<void>
  release(): Promise<void>
+ // static
+ loadModelInfo(path: string, skip: string[]): Promise<Object>
  }

  export interface Module {
  LlamaContext: LlamaContext
  }

- export type LibVariant = 'default' | 'vulkan'
+ export type LibVariant = 'default' | 'vulkan' | 'cuda'
  const setupEnv = (variant?: string) => {
  const postfix = variant ? `-${variant}` : ''
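To show how the new `cuda` entry in `LibVariant` is meant to surface in the public API, here is a minimal sketch. Only `loadModel`, `lib_variant`, and `release` come from the files in this diff; the `model` option name and the `completion()` call are assumptions based on the package's usage example and may differ in detail.

```ts
import { loadModel } from '@fugood/llama.node'

// Minimal sketch: pick a prebuilt lib variant when loading a model.
// The GGUF path is hypothetical; `model` and `completion()` are assumed names.
const context = await loadModel({
  model: './models/example-q4_k_m.gguf',
  lib_variant: 'cuda', // 'default' | 'vulkan' | 'cuda', per LibVariant above
})

const { text } = await context.completion({ prompt: 'Hello' })
console.log('Result:', text)
await context.release() // release(): Promise<void>, per the interface above
```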
package/lib/index.js CHANGED
@@ -23,7 +23,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
  });
  };
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.loadModel = void 0;
+ exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = void 0;
  const binding_1 = require("./binding");
  __exportStar(require("./binding"), exports);
  const mods = {};
@@ -34,3 +34,18 @@ const loadModel = (options) => __awaiter(void 0, void 0, void 0, function* () {
  return new mods[variant].LlamaContext(options);
  });
  exports.loadModel = loadModel;
+ exports.initLlama = binding_1.loadModule;
+ const modelInfoSkip = [
+ // Large fields
+ 'tokenizer.ggml.tokens',
+ 'tokenizer.ggml.token_type',
+ 'tokenizer.ggml.merges',
+ 'tokenizer.ggml.scores',
+ ];
+ const loadLlamaModelInfo = (path) => __awaiter(void 0, void 0, void 0, function* () {
+ var _a;
+ const variant = 'default';
+ (_a = mods[variant]) !== null && _a !== void 0 ? _a : (mods[variant] = yield (0, binding_1.loadModule)(variant));
+ return mods[variant].LlamaContext.loadModelInfo(path, modelInfoSkip);
+ });
+ exports.loadLlamaModelInfo = loadLlamaModelInfo;
package/lib/index.ts CHANGED
@@ -14,3 +14,19 @@ export const loadModel = async (options: LlamaModelOptionsExtended): Promise<Lla
  mods[variant] ??= await loadModule(options.lib_variant)
  return new mods[variant].LlamaContext(options)
  }
+
+ export const initLlama = loadModule
+
+ const modelInfoSkip = [
+ // Large fields
+ 'tokenizer.ggml.tokens',
+ 'tokenizer.ggml.token_type',
+ 'tokenizer.ggml.merges',
+ 'tokenizer.ggml.scores',
+ ]
+
+ export const loadLlamaModelInfo = async (path: string): Promise<Object> => {
+ const variant = 'default'
+ mods[variant] ??= await loadModule(variant)
+ return mods[variant].LlamaContext.loadModelInfo(path, modelInfoSkip)
+ }
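A short usage sketch of the new `loadLlamaModelInfo` export above; the GGUF path is hypothetical, and the printed key is only an example of a conventional GGUF metadata field.

```ts
import { loadLlamaModelInfo } from '@fugood/llama.node'

// Reads GGUF metadata through the static LlamaContext.loadModelInfo binding
// without creating a full inference context; the large tokenizer arrays are
// filtered out by the modelInfoSkip list above.
const info = await loadLlamaModelInfo('./models/example-q4_k_m.gguf')
console.log(info['general.architecture']) // e.g. 'llama'
```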
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "0.3.6",
+ "version": "0.3.8",
  "description": "Llama.cpp for Node.js",
  "main": "lib/index.js",
  "scripts": {
package/src/EmbeddingWorker.cpp CHANGED
@@ -9,10 +9,11 @@ void EmbeddingWorker::Execute() {
  llama_kv_cache_clear(_sess->context());
  auto tokens = ::common_tokenize(_sess->context(), _text, true);
  // add SEP if not present
- if (tokens.empty() || tokens.back() != llama_token_sep(_sess->model())) {
- tokens.push_back(llama_token_sep(_sess->model()));
+ auto vocab = llama_model_get_vocab(_sess->model());
+ if (tokens.empty() || tokens.back() != llama_vocab_sep(vocab)) {
+ tokens.push_back(llama_vocab_sep(vocab));
  }
- const int n_embd = llama_n_embd(_sess->model());
+ const int n_embd = llama_model_n_embd(_sess->model());
  do {
  auto ctx = _sess->context();
  int ret =
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -59,7 +59,9 @@ void LlamaCompletionWorker::Execute() {
  size_t n_cur = 0;
  size_t n_input = 0;
  const auto model = _sess->model();
- const bool add_bos = llama_add_bos_token(model);
+ auto vocab = llama_model_get_vocab(model);
+
+ const bool add_bos = llama_vocab_get_add_bos(vocab);
  auto ctx = _sess->context();

  auto sparams = llama_sampler_chain_default_params();
@@ -130,7 +132,7 @@ void LlamaCompletionWorker::Execute() {
  });
  }
  // is it an end of generation?
- if (llama_token_is_eog(model, new_token_id)) {
+ if (llama_vocab_is_eog(vocab, new_token_id)) {
  break;
  }
  // check for stop words
package/src/LlamaContext.cpp CHANGED
@@ -1,4 +1,6 @@
  #include "ggml.h"
+ #include "gguf.h"
+ #include "llama-impl.h"
  #include "LlamaContext.h"
  #include "DetokenizeWorker.h"
  #include "DisposeWorker.h"
@@ -8,6 +10,56 @@
  #include "SaveSessionWorker.h"
  #include "TokenizeWorker.h"

+ // loadModelInfo(path: string): object
+ Napi::Value LlamaContext::ModelInfo(const Napi::CallbackInfo& info) {
+ Napi::Env env = info.Env();
+ struct gguf_init_params params = {
+ /*.no_alloc = */ false,
+ /*.ctx = */ NULL,
+ };
+ std::string path = info[0].ToString().Utf8Value();
+
+ // Convert Napi::Array to vector<string>
+ std::vector<std::string> skip;
+ if (info.Length() > 1 && info[1].IsArray()) {
+ Napi::Array skipArray = info[1].As<Napi::Array>();
+ for (uint32_t i = 0; i < skipArray.Length(); i++) {
+ skip.push_back(skipArray.Get(i).ToString().Utf8Value());
+ }
+ }
+
+ struct gguf_context * ctx = gguf_init_from_file(path.c_str(), params);
+
+ Napi::Object metadata = Napi::Object::New(env);
+ if (std::find(skip.begin(), skip.end(), "version") == skip.end()) {
+ metadata.Set("version", Napi::Number::New(env, gguf_get_version(ctx)));
+ }
+ if (std::find(skip.begin(), skip.end(), "alignment") == skip.end()) {
+ metadata.Set("alignment", Napi::Number::New(env, gguf_get_alignment(ctx)));
+ }
+ if (std::find(skip.begin(), skip.end(), "data_offset") == skip.end()) {
+ metadata.Set("data_offset", Napi::Number::New(env, gguf_get_data_offset(ctx)));
+ }
+
+ // kv
+ {
+ const int n_kv = gguf_get_n_kv(ctx);
+
+ for (int i = 0; i < n_kv; ++i) {
+ const char * key = gguf_get_key(ctx, i);
+ if (std::find(skip.begin(), skip.end(), key) != skip.end()) {
+ continue;
+ }
+ const std::string value = gguf_kv_to_str(ctx, i);
+ metadata.Set(key, Napi::String::New(env, value.c_str()));
+ }
+ }
+
+ gguf_free(ctx);
+
+ return metadata;
+ }
+
  std::vector<common_chat_msg> get_messages(Napi::Array messages) {
  std::vector<common_chat_msg> chat;
  for (size_t i = 0; i < messages.Length(); i++) {
@@ -52,7 +104,10 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
  "loadSession",
  static_cast<napi_property_attributes>(napi_enumerable)),
  InstanceMethod<&LlamaContext::Release>(
- "release", static_cast<napi_property_attributes>(napi_enumerable))});
+ "release", static_cast<napi_property_attributes>(napi_enumerable)),
+ StaticMethod<&LlamaContext::ModelInfo>(
+ "loadModelInfo",
+ static_cast<napi_property_attributes>(napi_enumerable))});
  Napi::FunctionReference *constructor = new Napi::FunctionReference();
  *constructor = Napi::Persistent(func);
  #if NAPI_VERSION > 5
@@ -140,14 +195,14 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  llama_backend_init();
  llama_numa_init(params.numa);

- auto result = common_init_from_params(params);
+ auto sess = std::make_shared<LlamaSession>(params);

- if (result.model == nullptr || result.context == nullptr) {
+ if (sess->model() == nullptr || sess->context() == nullptr) {
  Napi::TypeError::New(env, "Failed to load model")
  .ThrowAsJavaScriptException();
  }

- _sess = std::make_shared<LlamaSession>(result.model, result.context, params);
+ _sess = sess;
  _info = common_params_get_system_info(params);
  }

@@ -162,8 +217,8 @@ bool validateModelChatTemplate(const struct llama_model * model) {
  int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
  if (res >= 0) {
  llama_chat_message chat[] = {{"user", "test"}};
- std::string tmpl = std::string(model_template.data(), model_template.size());
- int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
+ const char * tmpl = llama_model_chat_template(model);
+ int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
  return chat_res > 0;
  }
  return res > 0;
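For reference, a rough TypeScript sketch of the object shape that the `ModelInfo` binding above produces, inferred from the gguf calls; apart from the three fixed numeric fields, every key comes straight from the GGUF KV store and is returned as a string.

```ts
// Inferred shape of the value returned by LlamaContext.loadModelInfo();
// keys listed in the `skip` array are omitted from the result.
type GgufModelInfo = {
  version: number      // gguf_get_version()
  alignment: number    // gguf_get_alignment()
  data_offset: number  // gguf_get_data_offset()
  // remaining GGUF key/value pairs, stringified by gguf_kv_to_str(),
  // e.g. 'general.architecture': 'llama'
  [ggufKey: string]: number | string
}
```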
package/src/LlamaContext.h CHANGED
@@ -5,6 +5,7 @@ class LlamaCompletionWorker;
  class LlamaContext : public Napi::ObjectWrap<LlamaContext> {
  public:
  LlamaContext(const Napi::CallbackInfo &info);
+ static Napi::Value ModelInfo(const Napi::CallbackInfo& info);
  static void Init(Napi::Env env, Napi::Object &exports);

  private:
package/src/common.hpp CHANGED
@@ -11,8 +11,6 @@
  #include <tuple>
  #include <vector>

- typedef std::unique_ptr<llama_model, decltype(&llama_free_model)> LlamaCppModel;
- typedef std::unique_ptr<llama_context, decltype(&llama_free)> LlamaCppContext;
  typedef std::unique_ptr<common_sampler, decltype(&common_sampler_free)>
  LlamaCppSampling;
  typedef std::unique_ptr<llama_batch, decltype(&llama_batch_free)> LlamaCppBatch;
@@ -47,17 +45,17 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,

  class LlamaSession {
  public:
- LlamaSession(llama_model *model, llama_context *ctx, common_params params)
- : model_(LlamaCppModel(model, llama_free_model)),
- ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
+ LlamaSession(common_params params)
+ : params_(params) {
+ llama_init_ = common_init_from_params(params);
  tokens_.reserve(params.n_ctx);
  }

  ~LlamaSession() { dispose(); }

- inline llama_context *context() { return ctx_.get(); }
+ inline llama_context *context() { return llama_init_.context.get(); }

- inline llama_model *model() { return model_.get(); }
+ inline llama_model *model() { return llama_init_.model.get(); }

  inline std::vector<llama_token> *tokens_ptr() { return &tokens_; }

@@ -72,13 +70,10 @@ public:
  void dispose() {
  std::lock_guard<std::mutex> lock(mutex);
  tokens_.clear();
- ctx_.reset();
- model_.reset();
  }

  private:
- LlamaCppModel model_;
- LlamaCppContext ctx_;
+ common_init_result llama_init_;
  const common_params params_;
  std::vector<llama_token> tokens_{};
  std::mutex mutex;
package/src/llama.cpp/.github/workflows/build.yml CHANGED
@@ -60,8 +60,7 @@ jobs:
  -DLLAMA_CURL=ON \
  -DGGML_METAL_USE_BF16=ON \
  -DGGML_METAL_EMBED_LIBRARY=ON \
- -DGGML_RPC=ON \
- -DBUILD_SHARED_LIBS=OFF
+ -DGGML_RPC=ON
  cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

  - name: Test
@@ -88,6 +87,7 @@ jobs:
  if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
  run: |
  cp LICENSE ./build/bin/
+ cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
  zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*

  - name: Upload artifacts
@@ -123,8 +123,7 @@ jobs:
  -DLLAMA_FATAL_WARNINGS=ON \
  -DLLAMA_CURL=ON \
  -DGGML_METAL=OFF \
- -DGGML_RPC=ON \
- -DBUILD_SHARED_LIBS=OFF
+ -DGGML_RPC=ON
  cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

  - name: Test
@@ -151,6 +150,7 @@ jobs:
  if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
  run: |
  cp LICENSE ./build/bin/
+ cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
  zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*

  - name: Upload artifacts
@@ -181,7 +181,7 @@ jobs:
  run: |
  mkdir build
  cd build
- cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+ cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
  cmake --build . --config Release -j $(nproc)

  - name: Test
@@ -219,6 +219,7 @@ jobs:
  if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
  run: |
  cp LICENSE ./build/bin/
+ cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
  zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*

  - name: Upload artifacts
@@ -236,7 +237,7 @@ jobs:
  strategy:
  matrix:
  sanitizer: [ADDRESS, THREAD, UNDEFINED]
- build_type: [Debug, Release]
+ build_type: [Debug]

  steps:
  - name: Clone
@@ -651,23 +652,23 @@ jobs:
  matrix:
  include:
  - build: 'noavx-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
  - build: 'avx2-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
  - build: 'avx-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
  - build: 'avx512-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
  - build: 'openblas-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
  - build: 'kompute-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
  - build: 'vulkan-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
  - build: 'llvm-arm64'
- defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+ defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
  - build: 'msvc-arm64'
- defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+ defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
  - build: 'llvm-arm64-opencl-adreno'
  defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

@@ -798,6 +799,7 @@ jobs:
  if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
  run: |
  Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
+ Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
  7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*

  - name: Upload artifacts
@@ -914,7 +916,7 @@ jobs:
  shell: cmd
  run: |
  call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
- cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
+ cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
  set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
  cmake --build build --config Release -j %NINJA_JOBS% -t ggml
  cmake --build build --config Release
@@ -1239,7 +1241,7 @@ jobs:

  - name: Create release
  id: create_release
- uses: anzz1/action-create-release@v1
+ uses: ggml-org/action-create-release@v1
  env:
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  with:
package/src/llama.cpp/.github/workflows/docker.yml CHANGED
@@ -34,21 +34,14 @@ jobs:
  strategy:
  matrix:
  config:
- - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
- - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
- - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
- - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
- - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
+ # Multi-stage build
+ - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
+ - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+ - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+ - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+ - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
  # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
- #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
- - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
+ #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
  steps:
  - name: Check out the repo
  uses: actions/checkout@v4
@@ -56,10 +49,10 @@ jobs:
  fetch-depth: 0 # preserve git history, so we can determine the build number

  - name: Set up QEMU
- uses: docker/setup-qemu-action@v2
+ uses: docker/setup-qemu-action@v3

  - name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v2
+ uses: docker/setup-buildx-action@v3

  - name: Log in to Docker Hub
  uses: docker/login-action@v2
@@ -79,26 +72,34 @@ jobs:

  # determine tag name postfix (build number, commit hash)
  if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
- TAG_POSTFIX="b${BUILD_NUMBER}"
+ TAG_POSTFIX="-b${BUILD_NUMBER}"
  else
  SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
- TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
+ TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
  fi
-
  # list all tags possible
- TAGS=""
- TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
- TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
-
- echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
- echo "output_tags=$TAGS" # print out for debugging
+ if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
+ TYPE=""
+ else
+ TYPE="-${{ matrix.config.tag }}"
+ fi
+ PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
+ FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
+ LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
+ SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
+ echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
+ echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
+ echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
+ echo "full_output_tags=$FULLTAGS" # print out for debugging
+ echo "light_output_tags=$LIGHTTAGS" # print out for debugging
+ echo "server_output_tags=$SERVERTAGS" # print out for debugging
  env:
  GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
  GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

- # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
  - name: Free Disk Space (Ubuntu)
- uses: jlumbroso/free-disk-space@main
+ if: ${{ matrix.config.free_disk_space == true }}
+ uses: ggml-org/free-disk-space@v1.3.1
  with:
  # this might remove tools that are actually needed,
  # if set to "true" but frees about 6 GB
@@ -113,13 +114,59 @@ jobs:
  docker-images: true
  swap-storage: true

- - name: Build and push Docker image (tagged + versioned)
- if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
+ - name: Build and push Full Docker image (tagged + versioned)
+ if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
+ uses: docker/build-push-action@v6
+ with:
+ context: .
+ push: true
+ platforms: ${{ matrix.config.platforms }}
+ # tag list is generated from step above
+ tags: ${{ steps.tag.outputs.full_output_tags }}
+ file: ${{ matrix.config.dockerfile }}
+ target: full
+ provenance: false
+ # using github experimental cache
+ cache-from: type=gha
+ cache-to: type=gha,mode=max
+ # return to this if the experimental github cache is having issues
+ #cache-to: type=local,dest=/tmp/.buildx-cache
+ #cache-from: type=local,src=/tmp/.buildx-cache
+
+ - name: Build and push Light Docker image (tagged + versioned)
+ if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
+ uses: docker/build-push-action@v6
+ with:
+ context: .
+ push: true
+ platforms: ${{ matrix.config.platforms }}
+ # tag list is generated from step above
+ tags: ${{ steps.tag.outputs.light_output_tags }}
+ file: ${{ matrix.config.dockerfile }}
+ target: light
+ provenance: false
+ # using github experimental cache
+ cache-from: type=gha
+ cache-to: type=gha,mode=max
+ # return to this if the experimental github cache is having issues
+ #cache-to: type=local,dest=/tmp/.buildx-cache
+ #cache-from: type=local,src=/tmp/.buildx-cache
+
+ - name: Build and push Server Docker image (tagged + versioned)
+ if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
  uses: docker/build-push-action@v6
  with:
  context: .
  push: true
  platforms: ${{ matrix.config.platforms }}
  # tag list is generated from step above
- tags: ${{ steps.tag.outputs.output_tags }}
+ tags: ${{ steps.tag.outputs.server_output_tags }}
  file: ${{ matrix.config.dockerfile }}
+ target: server
+ provenance: false
+ # using github experimental cache
+ cache-from: type=gha
+ cache-to: type=gha,mode=max
+ # return to this if the experimental github cache is having issues
+ #cache-to: type=local,dest=/tmp/.buildx-cache
+ #cache-from: type=local,src=/tmp/.buildx-cache
package/src/llama.cpp/.github/workflows/editorconfig.yml CHANGED
@@ -23,5 +23,7 @@ jobs:
  runs-on: ubuntu-latest
  steps:
  - uses: actions/checkout@v4
- - uses: editorconfig-checker/action-editorconfig-checker@main
+ - uses: editorconfig-checker/action-editorconfig-checker@v2
+ with:
+ version: v3.0.3
  - run: editorconfig-checker