@fugood/llama.node 0.3.4 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CMakeLists.txt CHANGED
@@ -7,7 +7,7 @@ project (llama-node)
  set(CMAKE_CXX_STANDARD 17)

  execute_process(COMMAND
- git apply ${CMAKE_CURRENT_SOURCE_DIR}/scripts/ggml-cpu-CMakeLists.txt.patch
+ git apply ${CMAKE_CURRENT_SOURCE_DIR}/scripts/llama.cpp.patch
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
  )

Binary files CHANGED (10 files; contents and paths not shown in this diff view)
package/lib/binding.ts CHANGED
@@ -9,13 +9,34 @@ export type LlamaModelOptions = {
  model: string
  embedding?: boolean
  embd_normalize?: number
- pooling_type?: number
+ pooling_type?: 'none' | 'mean' | 'cls' | 'last' | 'rank'
  n_ctx?: number
  n_batch?: number
+ n_ubatch?: number
  n_threads?: number
  n_gpu_layers?: number
+ flash_attn?: boolean
+ cache_type_k?:
+   | 'f16'
+   | 'f32'
+   | 'q8_0'
+   | 'q4_0'
+   | 'q4_1'
+   | 'iq4_nl'
+   | 'q5_0'
+   | 'q5_1'
+ cache_type_v?:
+   | 'f16'
+   | 'f32'
+   | 'q8_0'
+   | 'q4_0'
+   | 'q4_1'
+   | 'iq4_nl'
+   | 'q5_0'
+   | 'q5_1'
  use_mlock?: boolean
  use_mmap?: boolean
+ vocab_only?: boolean
  }

  export type LlamaCompletionOptions = {
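
The options added or changed above (pooling_type as a string union, n_ubatch, flash_attn, cache_type_k/v, vocab_only) are passed straight through to the native context. A minimal sketch of using them; loadModel as the package entry point and the model path are assumptions, only the option names and values come from the LlamaModelOptions typing above:

import { loadModel } from '@fugood/llama.node'

async function createContext(modelPath: string) {
  // loadModel is assumed to be the package's entry point; the option names
  // and string values mirror the LlamaModelOptions typing above.
  return loadModel({
    model: modelPath,
    n_ctx: 4096,
    n_batch: 2048,
    n_ubatch: 512,        // new: micro-batch size
    n_gpu_layers: 99,
    flash_attn: true,     // new: enable flash attention
    cache_type_k: 'q8_0', // new: quantized KV-cache keys
    cache_type_v: 'q8_0', // new: quantized KV-cache values
  })
}
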
@@ -53,6 +74,16 @@ export type LlamaCompletionResult = {
  tokens_predicted: number
  tokens_evaluated: number
  truncated: boolean
+ timings: {
+   prompt_n: number
+   prompt_ms: number
+   prompt_per_token_ms: number
+   prompt_per_second: number
+   predicted_n: number
+   predicted_ms: number
+   predicted_per_token_ms: number
+   predicted_per_second: number
+ }
  }

  export type LlamaCompletionToken = {
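
Completion results now carry a timings object. A sketch of reading it; the context comes from loadModel as in the earlier sketch, and completion() with these option names is an assumption, while the timings shape is the LlamaCompletionResult one defined above:

// `LlamaContextLike` is a hypothetical minimal shape used only for this sketch.
type LlamaContextLike = {
  completion(opts: { prompt: string; n_predict?: number }): Promise<{ timings: Record<string, number> }>
}

async function logTimings(context: LlamaContextLike) {
  const { timings: t } = await context.completion({ prompt: 'Hello', n_predict: 64 })
  console.log(`prompt: ${t.prompt_n} tokens in ${t.prompt_ms.toFixed(1)} ms (${t.prompt_per_second.toFixed(1)} tok/s)`)
  console.log(`predict: ${t.predicted_n} tokens in ${t.predicted_ms.toFixed(1)} ms (${t.predicted_per_second.toFixed(1)} tok/s)`)
}
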
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "0.3.4",
+ "version": "0.3.6",
  "description": "Llama.cpp for Node.js",
  "main": "lib/index.js",
  "scripts": {
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -159,6 +159,22 @@ void LlamaCompletionWorker::OnOK() {
  Napi::Boolean::New(Napi::AsyncWorker::Env(), _result.truncated));
  result.Set("text",
  Napi::String::New(Napi::AsyncWorker::Env(), _result.text.c_str()));
+
+ auto ctx = _sess->context();
+ const auto timings_token = llama_perf_context(ctx);
+
+ auto timingsResult = Napi::Object::New(Napi::AsyncWorker::Env());
+ timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_p_eval));
+ timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms));
+ timingsResult.Set("prompt_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms / timings_token.n_p_eval));
+ timingsResult.Set("prompt_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_p_eval_ms * timings_token.n_p_eval));
+ timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_eval));
+ timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms));
+ timingsResult.Set("predicted_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms / timings_token.n_eval));
+ timingsResult.Set("predicted_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_eval_ms * timings_token.n_eval));
+
+ result.Set("timings", timingsResult);
+
  Napi::Promise::Deferred::Resolve(result);
  }

package/src/LlamaContext.cpp CHANGED
@@ -1,3 +1,4 @@
+ #include "ggml.h"
  #include "LlamaContext.h"
  #include "DetokenizeWorker.h"
  #include "DisposeWorker.h"
@@ -60,6 +61,36 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
  exports.Set("LlamaContext", func);
  }

+ const std::vector<ggml_type> kv_cache_types = {
+   GGML_TYPE_F32,
+   GGML_TYPE_F16,
+   GGML_TYPE_BF16,
+   GGML_TYPE_Q8_0,
+   GGML_TYPE_Q4_0,
+   GGML_TYPE_Q4_1,
+   GGML_TYPE_IQ4_NL,
+   GGML_TYPE_Q5_0,
+   GGML_TYPE_Q5_1,
+ };
+
+ static ggml_type kv_cache_type_from_str(const std::string & s) {
+   for (const auto & type : kv_cache_types) {
+     if (ggml_type_name(type) == s) {
+       return type;
+     }
+   }
+   throw std::runtime_error("Unsupported cache type: " + s);
+ }
+
+ static int32_t pooling_type_from_str(const std::string & s) {
+   if (s == "none") return LLAMA_POOLING_TYPE_NONE;
+   if (s == "mean") return LLAMA_POOLING_TYPE_MEAN;
+   if (s == "cls") return LLAMA_POOLING_TYPE_CLS;
+   if (s == "last") return LLAMA_POOLING_TYPE_LAST;
+   if (s == "rank") return LLAMA_POOLING_TYPE_RANK;
+   return LLAMA_POOLING_TYPE_UNSPECIFIED;
+ }
+
  // construct({ model, embedding, n_ctx, n_batch, n_threads, n_gpu_layers,
  // use_mlock, use_mmap }): LlamaContext throws error
  LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
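
These helpers back the string-typed options in binding.ts: cache_type_k/v strings are matched against ggml type names and an unsupported value throws, while an unrecognized pooling_type string falls back to LLAMA_POOLING_TYPE_UNSPECIFIED. An embedding-oriented sketch under the same loadModel assumption; context.embedding() is also an assumption about the API surface:

import { loadModel } from '@fugood/llama.node'

async function embed(modelPath: string, text: string) {
  const context = await loadModel({
    model: modelPath,
    embedding: true,      // embedding mode; n_ubatch is forced to n_batch (see the constructor below)
    embd_normalize: 2,    // Euclidean (L2) normalization, the constructor's default
    pooling_type: 'mean', // string value resolved by pooling_type_from_str
  })
  // context.embedding() is an assumption about the public API.
  return context.embedding(text)
}
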
@@ -76,20 +107,31 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  Napi::TypeError::New(env, "Model is required").ThrowAsJavaScriptException();
  }

+ params.vocab_only = get_option<bool>(options, "vocab_only", false);
+ if (params.vocab_only) {
+   params.warmup = false;
+ }
+
  params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
  params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
+ params.n_ubatch = get_option<int32_t>(options, "n_ubatch", 512);
  params.embedding = get_option<bool>(options, "embedding", false);
  if (params.embedding) {
    // For non-causal models, batch size must be equal to ubatch size
    params.n_ubatch = params.n_batch;
  }
  params.embd_normalize = get_option<int32_t>(options, "embd_normalize", 2);
- int32_t pooling_type = get_option<int32_t>(options, "pooling_type", -1);
- params.pooling_type = (enum llama_pooling_type) pooling_type;
+ params.pooling_type = (enum llama_pooling_type) pooling_type_from_str(
+   get_option<std::string>(options, "pooling_type", "").c_str()
+ );

  params.cpuparams.n_threads =
    get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
  params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
+ params.flash_attn = get_option<bool>(options, "flash_attn", false);
+ params.cache_type_k = kv_cache_type_from_str(get_option<std::string>(options, "cache_type_k", "f16").c_str());
+ params.cache_type_v = kv_cache_type_from_str(get_option<std::string>(options, "cache_type_v", "f16").c_str());
+
  params.use_mlock = get_option<bool>(options, "use_mlock", false);
  params.use_mmap = get_option<bool>(options, "use_mmap", true);
  params.numa =
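
vocab_only loads only the vocabulary and, as shown above, disables warmup, which is enough for tokenization-only work without paying for a full weight load. A sketch under the same assumptions about the exported API; tokenize() and its result shape are assumptions:

import { loadModel } from '@fugood/llama.node'

async function countTokens(modelPath: string, text: string) {
  // A vocab_only context skips loading the full model weights.
  const context = await loadModel({ model: modelPath, vocab_only: true })
  // tokenize() returning { tokens } is an assumption about the public API.
  const { tokens } = await context.tokenize(text)
  return tokens.length
}
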