@fugood/llama.node 0.3.3 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/CMakeLists.txt +5 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +29 -1
  17. package/package.json +1 -1
  18. package/src/EmbeddingWorker.cpp +15 -5
  19. package/src/EmbeddingWorker.h +2 -1
  20. package/src/LlamaCompletionWorker.cpp +17 -1
  21. package/src/LlamaContext.cpp +86 -18
  22. package/src/LlamaContext.h +2 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +197 -159
  24. package/src/llama.cpp/.github/workflows/docker.yml +5 -8
  25. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  27. package/src/llama.cpp/CMakeLists.txt +11 -6
  28. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  29. package/src/llama.cpp/cmake/common.cmake +33 -0
  30. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  31. package/src/llama.cpp/common/CMakeLists.txt +6 -2
  32. package/src/llama.cpp/common/arg.cpp +426 -245
  33. package/src/llama.cpp/common/common.cpp +143 -80
  34. package/src/llama.cpp/common/common.h +81 -24
  35. package/src/llama.cpp/common/sampling.cpp +53 -19
  36. package/src/llama.cpp/common/sampling.h +22 -1
  37. package/src/llama.cpp/common/speculative.cpp +274 -0
  38. package/src/llama.cpp/common/speculative.h +28 -0
  39. package/src/llama.cpp/docs/build.md +101 -148
  40. package/src/llama.cpp/examples/CMakeLists.txt +32 -13
  41. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +5 -4
  43. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  47. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  48. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  49. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  50. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  52. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  55. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  57. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  59. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
  61. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/infill/infill.cpp +1 -1
  63. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  64. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
  65. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  66. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  67. package/src/llama.cpp/examples/llava/clip.cpp +262 -66
  68. package/src/llama.cpp/examples/llava/clip.h +8 -2
  69. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  70. package/src/llama.cpp/examples/llava/llava.cpp +46 -19
  71. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
  72. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  73. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  75. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  76. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
  77. package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
  78. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/main/main.cpp +9 -5
  80. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  83. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  84. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  87. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  88. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
  90. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  91. package/src/llama.cpp/examples/run/run.cpp +911 -0
  92. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
  94. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
  95. package/src/llama.cpp/examples/server/server.cpp +1758 -886
  96. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  97. package/src/llama.cpp/examples/server/utils.hpp +94 -304
  98. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  99. package/src/llama.cpp/examples/simple/simple.cpp +4 -0
  100. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
  101. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
  102. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
  104. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  106. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
  108. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  109. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  110. package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
  111. package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
  112. package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
  113. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  114. package/src/llama.cpp/ggml/include/ggml.h +106 -24
  115. package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
  123. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  124. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  125. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
  126. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  127. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  128. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  129. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  130. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  131. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  132. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  133. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  134. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  135. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
  136. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  137. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  138. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
  139. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
  140. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
  141. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  142. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
  143. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
  151. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
  152. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
  153. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  155. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
  156. package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
  157. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
  158. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
  159. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
  160. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
  161. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
  162. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  163. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  164. package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
  165. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
  167. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
  169. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
  172. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  173. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  174. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
  175. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
  176. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  177. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
  178. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
  182. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
  183. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  184. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  185. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
  187. package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
  188. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
  189. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
  190. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
  191. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
  192. package/src/llama.cpp/ggml/src/ggml.c +367 -207
  193. package/src/llama.cpp/include/llama-cpp.h +25 -0
  194. package/src/llama.cpp/include/llama.h +26 -19
  195. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  196. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  197. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  198. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  199. package/src/llama.cpp/src/CMakeLists.txt +2 -7
  200. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  201. package/src/llama.cpp/src/llama-grammar.h +2 -5
  202. package/src/llama.cpp/src/llama-sampling.cpp +35 -90
  203. package/src/llama.cpp/src/llama-vocab.cpp +6 -1
  204. package/src/llama.cpp/src/llama.cpp +1748 -640
  205. package/src/llama.cpp/src/unicode.cpp +62 -51
  206. package/src/llama.cpp/src/unicode.h +9 -10
  207. package/src/llama.cpp/tests/CMakeLists.txt +48 -37
  208. package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
  209. package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
  210. package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
  211. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  212. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  213. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  214. package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
  215. package/src/llama.cpp/tests/test-rope.cpp +61 -20
  216. package/src/llama.cpp/tests/test-sampling.cpp +2 -2
  217. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  218. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  219. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  220. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  221. package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
  222. package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
  223. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
  224. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  225. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
package/CMakeLists.txt CHANGED
@@ -6,6 +6,11 @@ project (llama-node)

  set(CMAKE_CXX_STANDARD 17)

+ execute_process(COMMAND
+   git apply ${CMAKE_CURRENT_SOURCE_DIR}/scripts/llama.cpp.patch
+   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ )
+
  if(NOT DEFINED napi_build_version)
    set(napi_build_version 6)
  endif()
package/bin/** (llama-node.node, node.lib): binary files changed, contents not shown
package/lib/binding.ts CHANGED
@@ -8,12 +8,15 @@ export type ChatMessage = {
  export type LlamaModelOptions = {
    model: string
    embedding?: boolean
+   embd_normalize?: number
+   pooling_type?: number
    n_ctx?: number
    n_batch?: number
    n_threads?: number
    n_gpu_layers?: number
    use_mlock?: boolean
    use_mmap?: boolean
+   vocab_only?: boolean
  }

  export type LlamaCompletionOptions = {
@@ -23,7 +26,21 @@ export type LlamaCompletionOptions = {
    temperature?: number
    top_k?: number
    top_p?: number
-   repetition_penalty?: number
+   min_p?: number
+   mirostat?: number
+   mirostat_tau?: number
+   mirostat_eta?: number
+   penalty_last_n?: number
+   penalty_repeat?: number
+   penalty_freq?: number
+   penalty_present?: number
+   typ_p?: number
+   xtc_threshold?: number
+   xtc_probability?: number
+   dry_multiplier?: number
+   dry_base?: number
+   dry_allowed_length?: number
+   dry_penalty_last_n?: number
    n_predict?: number
    max_length?: number
    max_tokens?: number
@@ -37,6 +54,16 @@ export type LlamaCompletionResult = {
    tokens_predicted: number
    tokens_evaluated: number
    truncated: boolean
+   timings: {
+     prompt_n: number
+     prompt_ms: number
+     prompt_per_token_ms: number
+     prompt_per_second: number
+     predicted_n: number
+     predicted_ms: number
+     predicted_per_token_ms: number
+     predicted_per_second: number
+   }
  }

  export type LlamaCompletionToken = {
@@ -54,6 +81,7 @@ export type EmbeddingResult = {
  export interface LlamaContext {
    new (options: LlamaModelOptions): LlamaContext
    getSystemInfo(): string
+   getModelInfo(): object
    getFormattedChat(messages: ChatMessage[]): string
    completion(options: LlamaCompletionOptions, callback?: (token: LlamaCompletionToken) => void): Promise<LlamaCompletionResult>
    stopCompletion(): void
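The new completion options map onto llama.cpp's common sampling parameters (see the LlamaContext.cpp hunks further below). A minimal TypeScript sketch of how they might be passed; the import path is an assumption, and Partial<> is used to avoid asserting which fields the API requires:

    // Hypothetical usage of the new sampling fields; the values mirror the native defaults in this diff.
    import type { LlamaCompletionOptions } from './lib/binding' // import path is an assumption

    const sampling: Partial<LlamaCompletionOptions> = {
      temperature: 0.8,
      top_k: 40,
      top_p: 0.95,
      min_p: 0.05,
      typ_p: 1.0,
      // XTC sampler controls (newly exposed)
      xtc_threshold: 0.0,
      xtc_probability: 0.1,
      // DRY repetition suppression (newly exposed)
      dry_multiplier: 1.75,
      dry_base: 2,
      dry_allowed_length: -1,
      dry_penalty_last_n: 0,
      // split repetition penalties, replacing the removed `repetition_penalty`
      penalty_last_n: 64,
      penalty_repeat: 1.0,
      penalty_freq: 0.0,
      penalty_present: 0.0,
    }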
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
    "name": "@fugood/llama.node",
    "access": "public",
-   "version": "0.3.3",
+   "version": "0.3.5",
    "description": "Llama.cpp for Node.js",
    "main": "lib/index.js",
    "scripts": {
package/src/EmbeddingWorker.cpp CHANGED
@@ -2,8 +2,8 @@
  #include "LlamaContext.h"

  EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
-                                  LlamaSessionPtr &sess, std::string text)
-     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
+                                  LlamaSessionPtr &sess, std::string text, common_params &params)
+     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text), _params(params) {}

  void EmbeddingWorker::Execute() {
    llama_kv_cache_clear(_sess->context());
@@ -14,20 +14,30 @@ void EmbeddingWorker::Execute() {
    }
    const int n_embd = llama_n_embd(_sess->model());
    do {
+     auto ctx = _sess->context();
      int ret =
-         llama_decode(_sess->context(),
+         llama_decode(ctx,
                       llama_batch_get_one(tokens.data(), tokens.size()));
      if (ret < 0) {
        SetError("Failed to inference, code: " + std::to_string(ret));
        break;
      }
-     const float *embd = llama_get_embeddings_seq(_sess->context(), 0);
+
+     float *embd;
+     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+     if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+       embd = llama_get_embeddings(ctx);
+     } else {
+       embd = llama_get_embeddings_seq(ctx, 0);
+     }
      if (embd == nullptr) {
        SetError("Failed to get embeddings");
        break;
      }
      _result.embedding.resize(n_embd);
-     memcpy(_result.embedding.data(), embd, n_embd * sizeof(float));
+     std::vector<float> embedding(embd, embd + n_embd), out(embd, embd + n_embd);
+     common_embd_normalize(embedding.data(), out.data(), n_embd, _params.embd_normalize);
+     memcpy(_result.embedding.data(), out.data(), n_embd * sizeof(float));
    } while (false);
  }
package/src/EmbeddingWorker.h CHANGED
@@ -9,7 +9,7 @@ class EmbeddingWorker : public Napi::AsyncWorker,
                          public Napi::Promise::Deferred {
  public:
    EmbeddingWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
-                   std::string text);
+                   std::string text, common_params &params);

  protected:
    void Execute();
@@ -19,5 +19,6 @@ protected:
  private:
    LlamaSessionPtr _sess;
    std::string _text;
+   common_params _params;
    EmbeddingResult _result;
  };
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -64,7 +64,7 @@ void LlamaCompletionWorker::Execute() {

    auto sparams = llama_sampler_chain_default_params();

-   LlamaCppSampling sampling{common_sampler_init(model, _params.sparams),
+   LlamaCppSampling sampling{common_sampler_init(model, _params.sampling),
                              common_sampler_free};

    std::vector<llama_token> prompt_tokens =
@@ -159,6 +159,22 @@ void LlamaCompletionWorker::OnOK() {
        Napi::Boolean::New(Napi::AsyncWorker::Env(), _result.truncated));
    result.Set("text",
        Napi::String::New(Napi::AsyncWorker::Env(), _result.text.c_str()));
+
+   auto ctx = _sess->context();
+   const auto timings_token = llama_perf_context(ctx);
+
+   auto timingsResult = Napi::Object::New(Napi::AsyncWorker::Env());
+   timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_p_eval));
+   timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms));
+   timingsResult.Set("prompt_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms / timings_token.n_p_eval));
+   timingsResult.Set("prompt_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_p_eval_ms * timings_token.n_p_eval));
+   timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_eval));
+   timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms));
+   timingsResult.Set("predicted_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms / timings_token.n_eval));
+   timingsResult.Set("predicted_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_eval_ms * timings_token.n_eval));
+
+   result.Set("timings", timingsResult);
+
    Napi::Promise::Deferred::Resolve(result);
  }
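The timings object attached to the completion result above mirrors llama_perf_context(): prompt (prefill) versus predicted (decode) token counts, wall-clock milliseconds, and derived per-token and per-second rates. A small helper typed against the updated result shape; the import path is an assumption:

    import type { LlamaCompletionResult } from './lib/binding' // import path is an assumption

    // Summarize prefill vs. decode throughput from a completion result.
    function summarizeTimings(result: LlamaCompletionResult): string {
      const t = result.timings
      return (
        `prompt: ${t.prompt_n} tokens in ${t.prompt_ms.toFixed(1)} ms ` +
        `(${t.prompt_per_second.toFixed(1)} tok/s), ` +
        `predicted: ${t.predicted_n} tokens in ${t.predicted_ms.toFixed(1)} ms ` +
        `(${t.predicted_per_second.toFixed(1)} tok/s)`
      )
    }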
package/src/LlamaContext.cpp CHANGED
@@ -25,6 +25,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
       {InstanceMethod<&LlamaContext::GetSystemInfo>(
            "getSystemInfo",
            static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::GetModelInfo>(
+            "getModelInfo",
+            static_cast<napi_property_attributes>(napi_enumerable)),
        InstanceMethod<&LlamaContext::GetFormattedChat>(
            "getFormattedChat",
            static_cast<napi_property_attributes>(napi_enumerable)),
@@ -72,9 +75,23 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
    if (params.model.empty()) {
      Napi::TypeError::New(env, "Model is required").ThrowAsJavaScriptException();
    }
-   params.embedding = get_option<bool>(options, "embedding", false);
+
+   params.vocab_only = get_option<bool>(options, "vocab_only", false);
+   if (params.vocab_only) {
+     params.warmup = false;
+   }
+
    params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
    params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
+   params.embedding = get_option<bool>(options, "embedding", false);
+   if (params.embedding) {
+     // For non-causal models, batch size must be equal to ubatch size
+     params.n_ubatch = params.n_batch;
+   }
+   params.embd_normalize = get_option<int32_t>(options, "embd_normalize", 2);
+   int32_t pooling_type = get_option<int32_t>(options, "pooling_type", -1);
+   params.pooling_type = (enum llama_pooling_type) pooling_type;
+
    params.cpuparams.n_threads =
        get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
    params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
@@ -102,6 +119,44 @@ Napi::Value LlamaContext::GetSystemInfo(const Napi::CallbackInfo &info) {
    return Napi::String::New(info.Env(), _info);
  }

+ bool validateModelChatTemplate(const struct llama_model * model) {
+   std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+   std::string template_key = "tokenizer.chat_template";
+   int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+   if (res >= 0) {
+     llama_chat_message chat[] = {{"user", "test"}};
+     std::string tmpl = std::string(model_template.data(), model_template.size());
+     int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
+     return chat_res > 0;
+   }
+   return res > 0;
+ }
+
+ // getModelInfo(): object
+ Napi::Value LlamaContext::GetModelInfo(const Napi::CallbackInfo &info) {
+   char desc[1024];
+   auto model = _sess->model();
+   llama_model_desc(model, desc, sizeof(desc));
+
+   int count = llama_model_meta_count(model);
+   Napi::Object metadata = Napi::Object::New(info.Env());
+   for (int i = 0; i < count; i++) {
+     char key[256];
+     llama_model_meta_key_by_index(model, i, key, sizeof(key));
+     char val[2048];
+     llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
+
+     metadata.Set(key, val);
+   }
+   Napi::Object details = Napi::Object::New(info.Env());
+   details.Set("desc", desc);
+   details.Set("nParams", llama_model_n_params(model));
+   details.Set("size", llama_model_size(model));
+   details.Set("isChatTemplateSupported", validateModelChatTemplate(model));
+   details.Set("metadata", metadata);
+   return details;
+ }
+
  // getFormattedChat(messages: [{ role: string, content: string }]): string
  Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
    Napi::Env env = info.Env();
@@ -146,29 +201,34 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
        .ThrowAsJavaScriptException();
    }
    params.n_predict = get_option<int32_t>(options, "n_predict", -1);
-   params.sparams.temp = get_option<float>(options, "temperature", 0.80f);
-   params.sparams.top_k = get_option<int32_t>(options, "top_k", 40);
-   params.sparams.top_p = get_option<float>(options, "top_p", 0.95f);
-   params.sparams.min_p = get_option<float>(options, "min_p", 0.05f);
-   params.sparams.mirostat = get_option<int32_t>(options, "mirostat", 0.00f);
-   params.sparams.mirostat_tau =
+   params.sampling.temp = get_option<float>(options, "temperature", 0.80f);
+   params.sampling.top_k = get_option<int32_t>(options, "top_k", 40);
+   params.sampling.top_p = get_option<float>(options, "top_p", 0.95f);
+   params.sampling.min_p = get_option<float>(options, "min_p", 0.05f);
+   params.sampling.mirostat = get_option<int32_t>(options, "mirostat", 0.00f);
+   params.sampling.mirostat_tau =
        get_option<float>(options, "mirostat_tau", 5.00f);
-   params.sparams.mirostat_eta =
+   params.sampling.mirostat_eta =
        get_option<float>(options, "mirostat_eta", 0.10f);
-   params.sparams.penalty_last_n =
+   params.sampling.penalty_last_n =
        get_option<int32_t>(options, "penalty_last_n", 64);
-   params.sparams.penalty_repeat =
+   params.sampling.penalty_repeat =
        get_option<float>(options, "penalty_repeat", 1.00f);
-   params.sparams.penalty_freq =
+   params.sampling.penalty_freq =
        get_option<float>(options, "penalty_freq", 0.00f);
-   params.sparams.penalty_present =
+   params.sampling.penalty_present =
        get_option<float>(options, "penalty_present", 0.00f);
-   params.sparams.penalize_nl = get_option<bool>(options, "penalize_nl", false);
-   params.sparams.typ_p = get_option<float>(options, "typical_p", 1.00f);
-   params.sparams.ignore_eos = get_option<float>(options, "ignore_eos", false);
-   params.sparams.grammar = get_option<std::string>(options, "grammar", "");
+   params.sampling.typ_p = get_option<float>(options, "typical_p", 1.00f);
+   params.sampling.xtc_threshold = get_option<float>(options, "xtc_threshold", 0.00f);
+   params.sampling.xtc_probability = get_option<float>(options, "xtc_probability", 0.10f);
+   params.sampling.dry_multiplier = get_option<float>(options, "dry_multiplier", 1.75f);
+   params.sampling.dry_base = get_option<float>(options, "dry_base", 2);
+   params.sampling.dry_allowed_length = get_option<float>(options, "dry_allowed_length", -1);
+   params.sampling.dry_penalty_last_n = get_option<float>(options, "dry_penalty_last_n", 0);
+   params.sampling.ignore_eos = get_option<bool>(options, "ignore_eos", false);
+   params.sampling.grammar = get_option<std::string>(options, "grammar", "");
    params.n_keep = get_option<int32_t>(options, "n_keep", 0);
-   params.sparams.seed = get_option<int32_t>(options, "seed", LLAMA_DEFAULT_SEED);
+   params.sampling.seed = get_option<int32_t>(options, "seed", LLAMA_DEFAULT_SEED);
    std::vector<std::string> stop_words;
    if (options.Has("stop") && options.Get("stop").IsArray()) {
      auto stop_words_array = options.Get("stop").As<Napi::Array>();
@@ -243,8 +303,16 @@ Napi::Value LlamaContext::Embedding(const Napi::CallbackInfo &info) {
      Napi::TypeError::New(env, "Context is disposed")
          .ThrowAsJavaScriptException();
    }
+   auto options = Napi::Object::New(env);
+   if (info.Length() >= 2 && info[1].IsObject()) {
+     options = info[1].As<Napi::Object>();
+   }
+
+   common_params embdParams;
+   embdParams.embedding = true;
+   embdParams.embd_normalize = get_option<int32_t>(options, "embd_normalize", 2);
    auto text = info[0].ToString().Utf8Value();
-   auto *worker = new EmbeddingWorker(info, _sess, text);
+   auto *worker = new EmbeddingWorker(info, _sess, text, embdParams);
    worker->Queue();
    return worker->Promise();
  }
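A sketch of the new model-level options and getModelInfo(), typed against the updated binding.ts. Construction of the context is left abstract (createContext is a placeholder, not part of the package API shown in this diff); the comments restate the behavior of the native code above plus the usual llama.cpp conventions:

    import type { LlamaContext, LlamaModelOptions } from './lib/binding' // import path is an assumption

    const modelOptions: LlamaModelOptions = {
      model: './model.gguf',
      embedding: true,   // the native layer also forces n_ubatch = n_batch in this case
      embd_normalize: 2, // default: Euclidean normalization of returned embeddings
      pooling_type: -1,  // -1 leaves the pooling type unspecified (model default)
      vocab_only: false, // true loads vocab/metadata only and disables warmup
    }

    declare function createContext(options: LlamaModelOptions): LlamaContext // placeholder

    const ctx = createContext(modelOptions)
    const info = ctx.getModelInfo() as {
      desc: string
      nParams: number
      size: number
      isChatTemplateSupported: boolean
      metadata: Record<string, string>
    }
    console.log(info.desc, info.nParams, info.size, info.isChatTemplateSupported)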
package/src/LlamaContext.h CHANGED
@@ -9,6 +9,7 @@ public:

  private:
    Napi::Value GetSystemInfo(const Napi::CallbackInfo &info);
+   Napi::Value GetModelInfo(const Napi::CallbackInfo &info);
    Napi::Value GetFormattedChat(const Napi::CallbackInfo &info);
    Napi::Value Completion(const Napi::CallbackInfo &info);
    void StopCompletion(const Napi::CallbackInfo &info);
@@ -20,6 +21,7 @@ private:
    Napi::Value Release(const Napi::CallbackInfo &info);

    std::string _info;
+   Napi::Object _meta;
    LlamaSessionPtr _sess = nullptr;
    LlamaCompletionWorker *_wip = nullptr;
  };