@fugood/llama.node 0.3.0 → 0.3.2
This diff compares the contents of publicly released versions of this package, as published to their respective registries. It is provided for informational purposes only.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
--- package/src/llama.cpp/.github/workflows/bench.yml
+++ /dev/null
@@ -1,310 +0,0 @@
-# Benchmark
-name: Benchmark
-
-on:
-  workflow_dispatch:
-    inputs:
-      gpu-series:
-        description: 'Azure GPU series to run with'
-        required: true
-        type: choice
-        options:
-          - Standard_NC4as_T4_v3
-          - Standard_NC24ads_A100_v4
-          - Standard_NC80adis_H100_v5
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      duration:
-        description: 'Duration of the bench'
-        type: string
-        default: 10m
-
-  push:
-    branches:
-      - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
-  pull_request_target:
-    types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
-  schedule:
-    - cron: '04 2 * * *'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
-  cancel-in-progress: true
-
-jobs:
-  bench-server-baseline:
-    runs-on: Standard_NC4as_T4_v3
-    env:
-      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
-      N_USERS: 8
-      DURATION: 10m
-
-    strategy:
-      matrix:
-        model: [phi-2]
-        ftype: [q4_0, q8_0, f16]
-        include:
-          - model: phi-2
-            ftype: q4_0
-            pr_comment_enabled: "true"
-
-    if: |
-      inputs.gpu-series == 'Standard_NC4as_T4_v3'
-      || (
-        github.event_name == 'schedule'
-        && github.ref_name == 'master'
-        && github.repository_owner == 'ggerganov'
-      )
-      || github.event_name == 'pull_request_target'
-      || (
-        github.event_name == 'push'
-        && github.event.ref == 'refs/heads/master'
-        && github.repository_owner == 'ggerganov'
-      )
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Install python env
-        id: pipenv
-        run: |
-          cd examples/server/bench
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-
-      - name: Prometheus
-        id: install_prometheus
-        run: |
-          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
-          tar xzf prometheus*.tar.gz --strip-components=1
-          ./prometheus --config.file=examples/server/bench/prometheus.yml &
-          while ! nc -z localhost 9090; do
-            sleep 0.1
-          done
-
-      - name: Set up Go
-        uses: actions/setup-go@v5
-        with:
-          go-version: '1.21'
-
-      - name: Install k6 and xk6-sse
-        id: k6_installation
-        run: |
-          cd examples/server/bench
-          go install go.k6.io/xk6/cmd/xk6@latest
-          xk6 build master \
-            --with github.com/phymbert/xk6-sse
-
-      - name: Build
-        id: cmake_build
-        run: |
-          set -eux
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DLLAMA_CURL=ON \
-            -DLLAMA_CUBLAS=ON \
-            -DCUDAToolkit_ROOT=/usr/local/cuda \
-            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
-            -DCMAKE_CUDA_ARCHITECTURES=75 \
-            -DLLAMA_FATAL_WARNINGS=OFF \
-            -DLLAMA_ALL_WARNINGS=OFF \
-            -DCMAKE_BUILD_TYPE=Release;
-          cmake --build build --config Release -j $(nproc) --target llama-server
-
-      - name: Download the dataset
-        id: download_dataset
-        run: |
-          cd examples/server/bench
-          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-      - name: Server bench
-        id: server_bench
-        run: |
-          set -eux
-
-          cd examples/server/bench
-          source venv/bin/activate
-          python bench.py \
-            --runner-label ${{ env.RUNNER_LABEL }} \
-            --name ${{ github.job }} \
-            --branch ${{ github.head_ref || github.ref_name }} \
-            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
-            --scenario script.js \
-            --duration ${{ github.event.inputs.duration || env.DURATION }} \
-            --hf-repo ggml-org/models \
-            --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
-            --model-path-prefix /models \
-            --parallel ${{ env.N_USERS }} \
-            -ngl 33 \
-            --batch-size 2048 \
-            --ubatch-size 256 \
-            --ctx-size 16384 \
-            --n-prompts 1000 \
-            --max-prompt-tokens 1024 \
-            --max-tokens 2048
-
-          cat results.github.env >> $GITHUB_ENV
-
-          # Remove dataset as we do not want it in the artefact
-          rm ShareGPT_V3_unfiltered_cleaned_split.json
-
-      - uses: actions/upload-artifact@v4
-        with:
-          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          compression-level: 9
-          path: |
-            examples/server/bench/*.jpg
-            examples/server/bench/*.json
-            examples/server/bench/*.log
-
-      - name: Commit status
-        uses: Sibz/github-status-action@v1
-        with:
-          authToken: ${{secrets.GITHUB_TOKEN}}
-          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
-          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          description: |
-            ${{ env.BENCH_RESULTS }}
-          state: 'success'
-
-      - name: Upload benchmark images
-        uses: devicons/public-upload-to-imgur@v2.2.2
-        continue-on-error: true # Important as it looks unstable: 503
-        id: imgur_step
-        with:
-          client_id: ${{secrets.IMGUR_CLIENT_ID}}
-          path: |
-            examples/server/bench/prompt_tokens_seconds.jpg
-            examples/server/bench/predicted_tokens_seconds.jpg
-            examples/server/bench/kv_cache_usage_ratio.jpg
-            examples/server/bench/requests_processing.jpg
-
-      - name: Extract mermaid
-        id: set_mermaid
-        run: |
-          set -eux
-
-          cd examples/server/bench
-          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
-          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
-          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
-          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
-          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
-          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
-          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
-          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
-          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-      - name: Extract image url
-        id: extract_image_url
-        continue-on-error: true
-        run: |
-          set -eux
-
-          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
-          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
-          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
-          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
-
-      - name: Comment PR
-        uses: mshick/add-pr-comment@v2
-        id: comment_pr
-        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
-        with:
-          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          message: |
-            <p align="center">
-
-            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
-
-            </p>
-
-            <details>
-
-            <summary>Expand details for performance related PR only</summary>
-
-            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
-            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
-            - ${{ env.BENCH_GRAPH_XLABEL }}
-
-
-            <p align="center">
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
-
-            <details>
-
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PROMPT_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
-
-            <details>
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PREDICTED_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            </p>
-
-            <details>
-
-            <summary>Details</summary>
-
-            <p align="center">
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
-
-            <details>
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.KV_CACHE_USAGE_RATIO }}
-            ```
-
-            </details>
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
-
-            <details>
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.REQUESTS_PROCESSING }}
-            ```
-
-            </details>
-
-            </p>
-            </details>
-            </details>