@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/CMakeLists.txt CHANGED
@@ -53,8 +53,6 @@ if(CMAKE_BUILD_TYPE STREQUAL "Release")
   endif()
 endif()
 
-include_directories(${CMAKE_JS_INC})
-
 # flags: -fPIC
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
@@ -76,6 +74,11 @@ add_custom_target(
 set(LLAMA_STATIC ON CACHE BOOL "Build llama as static library")
 add_subdirectory("src/llama.cpp")
 
+include_directories(
+  ${CMAKE_JS_INC}
+  "src/llama.cpp"
+)
+
 file(
   GLOB SOURCE_FILES
   "src/addons.cc"
Binary files changed (see the package/bin entries in the file list above); no textual diff is shown.
package/lib/binding.ts CHANGED
@@ -1,5 +1,10 @@
 import * as path from 'path'
 
+export type ChatMessage = {
+  role: string
+  text: string
+}
+
 export type LlamaModelOptions = {
   model: string
   embedding?: boolean
@@ -12,7 +17,8 @@ export type LlamaModelOptions = {
 }
 
 export type LlamaCompletionOptions = {
-  prompt: string
+  messages?: ChatMessage[]
+  prompt?: string
   n_samples?: number
   temperature?: number
   top_k?: number
@@ -48,6 +54,7 @@ export type EmbeddingResult = {
 export interface LlamaContext {
   new (options: LlamaModelOptions): LlamaContext
   getSystemInfo(): string
+  getFormattedChat(messages: ChatMessage[]): string
   completion(options: LlamaCompletionOptions, callback?: (token: LlamaCompletionToken) => void): Promise<LlamaCompletionResult>
   stopCompletion(): void
   tokenize(text: string): Promise<TokenizeResult>
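To make the new chat API concrete, here is a minimal TypeScript sketch based only on the declarations visible in this diff: `LlamaCompletionOptions` now accepts `messages` in place of `prompt`, and `LlamaContext` gains `getFormattedChat()`. The import path and the way a `LlamaContext` instance is obtained are assumptions, not part of this diff. Note also that `ChatMessage` is declared with a `text` field while the native code (`LlamaContext.cpp`, further down) reads each message's `content` key; the sketch follows the native side.

```ts
// Sketch only: the import path and how the context is constructed are
// assumptions; they are not shown in this diff.
import type { LlamaContext } from '@fugood/llama.node/lib/binding'

async function runChat(ctx: LlamaContext): Promise<void> {
  // The native helper get_messages() (see LlamaContext.cpp) reads
  // `role` and `content` from each message object.
  const messages = [
    { role: 'system', content: 'You are a helpful assistant.' },
    { role: 'user', content: 'Hello!' },
  ]

  // New in 0.3.0: pass `messages` instead of `prompt`; the addon renders
  // them with the model's chat template before running the completion.
  const result = await ctx.completion({ messages: messages as any })
  console.log(result)

  // The rendered prompt can also be inspected directly:
  console.log(ctx.getFormattedChat(messages as any))
}
```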
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "0.2.2",
+  "version": "0.3.0",
   "description": "Llama.cpp for Node.js",
   "main": "lib/index.js",
   "scripts": {
package/patches/llama.patch CHANGED
@@ -1,20 +1,20 @@
-diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
-index b9449be0..cfa0f774 100644
---- a/ggml-vulkan.cpp
-+++ b/ggml-vulkan.cpp
-@@ -525,9 +525,15 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
+diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
+index fa68360b..f9ff7b5d 100644
+--- a/ggml/src/ggml-vulkan.cpp
++++ b/ggml/src/ggml-vulkan.cpp
+@@ -617,9 +617,15 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co
      vk::PipelineCreateFlags(),
      pipeline_shader_create_info,
      pipeline->layout);
--    pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
+-    pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
 
--    ctx->device->pipelines.push_back(pipeline);
+-    device->pipelines.push_back(pipeline);
 +    try {
-+        pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
-+        ctx->device->pipelines.push_back(pipeline);
-+    } catch (vk::UnknownError const&) {
-+        std::cerr << "ggml_vk_create_pipeline: Failed to create pipeline " << name << std::endl;
-+        ggml_vk_destroy_pipeline(ctx->device->device, pipeline);
++        pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
++        device->pipelines.push_back(pipeline);
++    } catch(vk::UnknownError const&) {
++        VK_LOG_DEBUG("Failed to create pipeline " << name);
++        ggml_vk_destroy_pipeline(device->device, pipeline);
 +        pipeline.reset();
 +    }
     }
package/src/DetokenizeWorker.cpp CHANGED
@@ -8,7 +8,7 @@ DetokenizeWorker::DetokenizeWorker(const Napi::CallbackInfo &info,
       _tokens(std::move(tokens)) {}
 
 void DetokenizeWorker::Execute() {
-  const auto text = ::llama_detokenize_bpe(_sess->context(), _tokens);
+  const auto text = ::llama_detokenize(_sess->context(), _tokens);
   _text = std::move(text);
 }
 
package/src/LlamaContext.cpp CHANGED
@@ -7,12 +7,27 @@
 #include "SaveSessionWorker.h"
 #include "TokenizeWorker.h"
 
+std::vector<llama_chat_msg> get_messages(Napi::Array messages) {
+  std::vector<llama_chat_msg> chat;
+  for (size_t i = 0; i < messages.Length(); i++) {
+    auto message = messages.Get(i).As<Napi::Object>();
+    chat.push_back({
+        get_option<std::string>(message, "role", ""),
+        get_option<std::string>(message, "content", ""),
+    });
+  }
+  return std::move(chat);
+}
+
 void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
   Napi::Function func = DefineClass(
       env, "LlamaContext",
       {InstanceMethod<&LlamaContext::GetSystemInfo>(
            "getSystemInfo",
            static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::GetFormattedChat>(
+           "getFormattedChat",
+           static_cast<napi_property_attributes>(napi_enumerable)),
        InstanceMethod<&LlamaContext::Completion>(
            "completion",
            static_cast<napi_property_attributes>(napi_enumerable)),
@@ -89,6 +104,17 @@ Napi::Value LlamaContext::GetSystemInfo(const Napi::CallbackInfo &info) {
   return Napi::String::New(info.Env(), _info);
 }
 
+// getFormattedChat(messages: [{ role: string, content: string }]): string
+Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() < 1 || !info[0].IsArray()) {
+    Napi::TypeError::New(env, "Array expected").ThrowAsJavaScriptException();
+  }
+  auto messages = info[0].As<Napi::Array>();
+  auto formatted = llama_chat_apply_template(_sess->model(), "", get_messages(messages), true);
+  return Napi::String::New(env, formatted);
+}
+
 // completion(options: LlamaCompletionOptions, onToken?: (token: string) =>
 // void): Promise<LlamaCompletionResult>
 Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
@@ -110,7 +136,13 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   auto options = info[0].As<Napi::Object>();
 
   gpt_params params = _sess->params();
-  params.prompt = get_option<std::string>(options, "prompt", "");
+  if (options.Has("messages") && options.Get("messages").IsArray()) {
+    auto messages = options.Get("messages").As<Napi::Array>();
+    auto formatted = llama_chat_apply_template(_sess->model(), "", get_messages(messages), true);
+    params.prompt = formatted;
+  } else {
+    params.prompt = get_option<std::string>(options, "prompt", "");
+  }
   if (params.prompt.empty()) {
     Napi::TypeError::New(env, "Prompt is required")
         .ThrowAsJavaScriptException();
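Equivalently, the template can be rendered explicitly and passed as a plain prompt; the `Completion` change above also shows that `messages`, when present, takes precedence over `prompt`. A small sketch under the same assumptions as the previous one:

```ts
import type { LlamaContext } from '@fugood/llama.node/lib/binding' // path assumed

// Manual path: getFormattedChat() wraps llama_chat_apply_template(), so this
// is equivalent to passing `messages` directly to completion().
async function runChatManually(
  ctx: LlamaContext,
  messages: { role: string; content: string }[],
) {
  const prompt = ctx.getFormattedChat(messages as any)
  return ctx.completion({ prompt })
}
```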
package/src/LlamaContext.h CHANGED
@@ -9,6 +9,7 @@ public:
 
 private:
   Napi::Value GetSystemInfo(const Napi::CallbackInfo &info);
+  Napi::Value GetFormattedChat(const Napi::CallbackInfo &info);
   Napi::Value Completion(const Napi::CallbackInfo &info);
   void StopCompletion(const Napi::CallbackInfo &info);
   Napi::Value Tokenize(const Napi::CallbackInfo &info);
package/src/LoadSessionWorker.cpp CHANGED
@@ -15,6 +15,7 @@ void LoadSessionWorker::Execute() {
                         tokens.capacity(), &count)) {
     SetError("Failed to load session");
   }
+  tokens.resize(count);
   _sess->set_tokens(std::move(tokens));
   _sess->get_mutex().unlock();
 }
package/src/llama.cpp/.github/workflows/bench.yml ADDED
@@ -0,0 +1,310 @@
+# Benchmark
+name: Benchmark
+
+on:
+  workflow_dispatch:
+    inputs:
+      gpu-series:
+        description: 'Azure GPU series to run with'
+        required: true
+        type: choice
+        options:
+          - Standard_NC4as_T4_v3
+          - Standard_NC24ads_A100_v4
+          - Standard_NC80adis_H100_v5
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
+      duration:
+        description: 'Duration of the bench'
+        type: string
+        default: 10m
+
+  push:
+    branches:
+      - master
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+  pull_request_target:
+    types: [opened, synchronize, reopened]
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+  schedule:
+    - cron: '04 2 * * *'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
+  cancel-in-progress: true
+
+jobs:
+  bench-server-baseline:
+    runs-on: Standard_NC4as_T4_v3
+    env:
+      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
+      N_USERS: 8
+      DURATION: 10m
+
+    strategy:
+      matrix:
+        model: [phi-2]
+        ftype: [q4_0, q8_0, f16]
+        include:
+          - model: phi-2
+            ftype: q4_0
+            pr_comment_enabled: "true"
+
+    if: |
+      inputs.gpu-series == 'Standard_NC4as_T4_v3'
+      || (
+        github.event_name == 'schedule'
+        && github.ref_name == 'master'
+        && github.repository_owner == 'ggerganov'
+      )
+      || github.event_name == 'pull_request_target'
+      || (
+        github.event_name == 'push'
+        && github.event.ref == 'refs/heads/master'
+        && github.repository_owner == 'ggerganov'
+      )
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Install python env
+        id: pipenv
+        run: |
+          cd examples/server/bench
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install -r requirements.txt
+
+      - name: Prometheus
+        id: install_prometheus
+        run: |
+          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
+          tar xzf prometheus*.tar.gz --strip-components=1
+          ./prometheus --config.file=examples/server/bench/prometheus.yml &
+          while ! nc -z localhost 9090; do
+            sleep 0.1
+          done
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+
+      - name: Install k6 and xk6-sse
+        id: k6_installation
+        run: |
+          cd examples/server/bench
+          go install go.k6.io/xk6/cmd/xk6@latest
+          xk6 build master \
+            --with github.com/phymbert/xk6-sse
+
+      - name: Build
+        id: cmake_build
+        run: |
+          set -eux
+          cmake -B build \
+            -DGGML_NATIVE=OFF \
+            -DLLAMA_BUILD_SERVER=ON \
+            -DLLAMA_CURL=ON \
+            -DLLAMA_CUBLAS=ON \
+            -DCUDAToolkit_ROOT=/usr/local/cuda \
+            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+            -DCMAKE_CUDA_ARCHITECTURES=75 \
+            -DLLAMA_FATAL_WARNINGS=OFF \
+            -DLLAMA_ALL_WARNINGS=OFF \
+            -DCMAKE_BUILD_TYPE=Release;
+          cmake --build build --config Release -j $(nproc) --target llama-server
+
+      - name: Download the dataset
+        id: download_dataset
+        run: |
+          cd examples/server/bench
+          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+      - name: Server bench
+        id: server_bench
+        run: |
+          set -eux
+
+          cd examples/server/bench
+          source venv/bin/activate
+          python bench.py \
+            --runner-label ${{ env.RUNNER_LABEL }} \
+            --name ${{ github.job }} \
+            --branch ${{ github.head_ref || github.ref_name }} \
+            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
+            --scenario script.js \
+            --duration ${{ github.event.inputs.duration || env.DURATION }} \
+            --hf-repo ggml-org/models \
+            --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
+            --model-path-prefix /models \
+            --parallel ${{ env.N_USERS }} \
+            -ngl 33 \
+            --batch-size 2048 \
+            --ubatch-size 256 \
+            --ctx-size 16384 \
+            --n-prompts 1000 \
+            --max-prompt-tokens 1024 \
+            --max-tokens 2048
+
+          cat results.github.env >> $GITHUB_ENV
+
+          # Remove dataset as we do not want it in the artefact
+          rm ShareGPT_V3_unfiltered_cleaned_split.json
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+          compression-level: 9
+          path: |
+            examples/server/bench/*.jpg
+            examples/server/bench/*.json
+            examples/server/bench/*.log
+
+      - name: Commit status
+        uses: Sibz/github-status-action@v1
+        with:
+          authToken: ${{secrets.GITHUB_TOKEN}}
+          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
+          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+          description: |
+            ${{ env.BENCH_RESULTS }}
+          state: 'success'
+
+      - name: Upload benchmark images
+        uses: devicons/public-upload-to-imgur@v2.2.2
+        continue-on-error: true # Important as it looks unstable: 503
+        id: imgur_step
+        with:
+          client_id: ${{secrets.IMGUR_CLIENT_ID}}
+          path: |
+            examples/server/bench/prompt_tokens_seconds.jpg
+            examples/server/bench/predicted_tokens_seconds.jpg
+            examples/server/bench/kv_cache_usage_ratio.jpg
+            examples/server/bench/requests_processing.jpg
+
+      - name: Extract mermaid
+        id: set_mermaid
+        run: |
+          set -eux
+
+          cd examples/server/bench
+          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
+          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
+          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
+          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
+          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
+          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
+          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Extract image url
+        id: extract_image_url
+        continue-on-error: true
+        run: |
+          set -eux
+
+          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
+          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
+          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
+          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
+
+      - name: Comment PR
+        uses: mshick/add-pr-comment@v2
+        id: comment_pr
+        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
+        with:
+          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+          message: |
+            <p align="center">
+
+            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+
+            </p>
+
+            <details>
+
+            <summary>Expand details for performance related PR only</summary>
+
+            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
+            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
+            - ${{ env.BENCH_GRAPH_XLABEL }}
+
+
+            <p align="center">
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
+
+            <details>
+
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PROMPT_TOKENS_SECONDS }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
+
+            <details>
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PREDICTED_TOKENS_SECONDS }}
+            ```
+
+            </details>
+
+            </p>
+
+            <details>
+
+            <summary>Details</summary>
+
+            <p align="center">
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
+
+            <details>
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.KV_CACHE_USAGE_RATIO }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
+
+            <details>
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.REQUESTS_PROCESSING }}
+            ```
+
+            </details>
+
+            </p>
+            </details>
+            </details>