@fugood/llama.node 0.3.3 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +29 -1
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +15 -5
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +17 -1
- package/src/LlamaContext.cpp +86 -18
- package/src/LlamaContext.h +2 -0
- package/src/llama.cpp/.github/workflows/build.yml +197 -159
- package/src/llama.cpp/.github/workflows/docker.yml +5 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +11 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -2
- package/src/llama.cpp/common/arg.cpp +426 -245
- package/src/llama.cpp/common/common.cpp +143 -80
- package/src/llama.cpp/common/common.h +81 -24
- package/src/llama.cpp/common/sampling.cpp +53 -19
- package/src/llama.cpp/common/sampling.h +22 -1
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +101 -148
- package/src/llama.cpp/examples/CMakeLists.txt +32 -13
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +5 -4
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +262 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +46 -19
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +9 -5
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
- package/src/llama.cpp/examples/server/server.cpp +1758 -886
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +94 -304
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +4 -0
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
- package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml.h +106 -24
- package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
- package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
- package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
- package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
- package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
- package/src/llama.cpp/ggml/src/ggml.c +367 -207
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +26 -19
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/CMakeLists.txt +2 -7
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +35 -90
- package/src/llama.cpp/src/llama-vocab.cpp +6 -1
- package/src/llama.cpp/src/llama.cpp +1748 -640
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -37
- package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
- package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
- package/src/llama.cpp/tests/test-rope.cpp +61 -20
- package/src/llama.cpp/tests/test-sampling.cpp +2 -2
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
|
@@ -1,30 +1,8 @@
|
|
|
1
1
|
#pragma once
|
|
2
2
|
|
|
3
|
+
#include "ggml-cpu-traits.h"
|
|
3
4
|
#include "ggml.h"
|
|
4
5
|
|
|
5
6
|
// GGML internal header
|
|
6
7
|
|
|
7
|
-
|
|
8
|
-
extern "C" {
|
|
9
|
-
#endif
|
|
10
|
-
|
|
11
|
-
// Quantization
|
|
12
|
-
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
|
|
13
|
-
|
|
14
|
-
// GEMV
|
|
15
|
-
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
16
|
-
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
17
|
-
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
18
|
-
|
|
19
|
-
// GEMM
|
|
20
|
-
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
21
|
-
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
22
|
-
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
23
|
-
|
|
24
|
-
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
|
|
25
|
-
enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
|
|
26
|
-
|
|
27
|
-
#ifdef __cplusplus
|
|
28
|
-
}
|
|
29
|
-
#endif
|
|
30
|
-
|
|
8
|
+
ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#ifdef GGML_USE_CPU_HBM
|
|
2
|
+
|
|
3
|
+
#include "ggml-backend.h"
|
|
4
|
+
#include "ggml-backend-impl.h"
|
|
5
|
+
#include "ggml-cpu.h"
|
|
6
|
+
#include "ggml-impl.h"
|
|
7
|
+
|
|
8
|
+
#include "ggml-cpu-hbm.h"
|
|
9
|
+
|
|
10
|
+
// buffer type HBM
|
|
11
|
+
|
|
12
|
+
#include <hbwmalloc.h>
|
|
13
|
+
|
|
14
|
+
// get_name callback for the HBM buffer type.
// Always returns the string literal "CPU_HBM"; `buft` is not consulted.
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
|
15
|
+
return "CPU_HBM";
|
|
16
|
+
|
|
17
|
+
// Sits after the return, so it is never executed; it only names the parameter
// (presumably to silence unused-parameter warnings - confirm against the GGML_UNUSED definition).
GGML_UNUSED(buft);
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
// free_buffer callback that ggml_backend_cpu_hbm_buffer_type_alloc_buffer installs on the
// buffers it creates: passes buffer->context to hbw_free().
// NOTE(review): presumably buffer->context is the pointer obtained from hbw_posix_memalign -
// confirm against ggml_backend_cpu_buffer_from_ptr.
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
21
|
+
hbw_free(buffer->context);
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
// alloc_buffer callback for the HBM buffer type.
//
// Requests `size` bytes from hbw_posix_memalign, aligned to
// ggml_backend_cpu_buffer_type_get_alignment(buft), then wraps the memory with
// ggml_backend_cpu_buffer_from_ptr.
//
// Returns NULL (after logging an error) when hbw_posix_memalign reports a non-zero result;
// otherwise returns the wrapped buffer with its `buft` and `iface.free_buffer` overridden.
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|
25
|
+
size_t size) {
|
|
26
|
+
// left uninitialized; only read below after hbw_posix_memalign returned 0
void * ptr;
|
|
27
|
+
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
|
28
|
+
if (result != 0) {
|
|
29
|
+
GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
|
|
30
|
+
return NULL;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
// NOTE(review): `buffer` is dereferenced below without a NULL check - confirm that
// ggml_backend_cpu_buffer_from_ptr cannot fail, otherwise `ptr` would also leak.
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
|
34
|
+
// tag the buffer with the buffer type this callback was invoked for
buffer->buft = buft;
|
|
35
|
+
// replace the free callback so the buffer is released through hbw_free
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
|
36
|
+
|
|
37
|
+
return buffer;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
// Returns the HBM buffer type descriptor.
// The descriptor is a function-local static, so every call returns the address of the
// same object. Its interface combines the HBM-specific get_name / alloc_buffer callbacks
// with the ggml_backend_cpu_buffer_type_* callbacks for alignment and is_host.
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
|
41
|
+
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
|
42
|
+
/* .iface = */ {
|
|
43
|
+
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
|
44
|
+
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
|
45
|
+
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
|
46
|
+
/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
|
|
47
|
+
/* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
|
|
48
|
+
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
|
49
|
+
},
|
|
50
|
+
// no per-type state is attached to this buffer type
/* .context = */ nullptr,
|
|
51
|
+
};
|
|
52
|
+
|
|
53
|
+
return &ggml_backend_cpu_buffer_type_hbm;
|
|
54
|
+
}
|
|
55
|
+
#endif
|
|
@@ -15,6 +15,18 @@
|
|
|
15
15
|
extern "C" {
|
|
16
16
|
#endif
|
|
17
17
|
|
|
18
|
+
// Parameter bundle passed by pointer to compute routines
// (e.g. ggml_cpu_extra_compute_forward and tensor_traits::compute_forward).
struct ggml_compute_params {
|
|
19
|
+
// ith = thread index, nth = number of threads
|
|
20
|
+
int ith, nth;
|
|
21
|
+
|
|
22
|
+
// work buffer for all threads
// (presumably wsize is the size of wdata in bytes - TODO confirm)
|
|
23
|
+
size_t wsize;
|
|
24
|
+
void * wdata;
|
|
25
|
+
|
|
26
|
+
// NOTE(review): presumably the thread pool the ith/nth threads belong to - confirm with callers
struct ggml_threadpool * threadpool;
|
|
27
|
+
};
|
|
28
|
+
|
|
29
|
+
|
|
18
30
|
#if defined(_MSC_VER)
|
|
19
31
|
|
|
20
32
|
#define m512bh(p) p
|
|
@@ -366,6 +378,9 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
|
|
|
366
378
|
}
|
|
367
379
|
#endif
|
|
368
380
|
|
|
381
|
+
// TODO: move to ggml-threading
|
|
382
|
+
void ggml_barrier(struct ggml_threadpool * tp);
|
|
383
|
+
|
|
369
384
|
#ifdef __cplusplus
|
|
370
385
|
}
|
|
371
386
|
#endif
|
|
@@ -1791,11 +1791,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
|
1791
1791
|
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
|
1792
1792
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
|
1793
1793
|
|
|
1794
|
-
float32_t _scale[4] = {
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1794
|
+
float32_t _scale[4] = {
|
|
1795
|
+
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
|
1796
|
+
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
|
1797
|
+
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
|
1798
|
+
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
|
|
1799
|
+
};
|
|
1799
1800
|
float32x4_t scale = vld1q_f32(_scale);
|
|
1800
1801
|
|
|
1801
1802
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
|
@@ -1811,13 +1812,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
|
1811
1812
|
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
|
1812
1813
|
|
|
1813
1814
|
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
|
|
1814
|
-
|
|
1815
|
+
l1, r1)), l2, r2)), l3, r3))), scale);
|
|
1815
1816
|
}
|
|
1816
|
-
|
|
1817
|
+
|
|
1818
|
+
float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
|
|
1817
1819
|
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
|
1818
1820
|
|
|
1819
|
-
vst1_f32(s, vget_low_f32(sumv2));
|
|
1821
|
+
vst1_f32(s, vget_low_f32 (sumv2));
|
|
1820
1822
|
vst1_f32(s + bs, vget_high_f32(sumv2));
|
|
1823
|
+
|
|
1821
1824
|
return;
|
|
1822
1825
|
}
|
|
1823
1826
|
#endif
|
|
@@ -2345,10 +2348,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
|
2345
2348
|
const block_q8_1 * restrict b_y0 = &vy0[i];
|
|
2346
2349
|
const block_q8_1 * restrict b_y1 = &vy1[i];
|
|
2347
2350
|
|
|
2348
|
-
float32_t summs_t[4] = {
|
|
2349
|
-
|
|
2350
|
-
|
|
2351
|
-
|
|
2351
|
+
float32_t summs_t[4] = {
|
|
2352
|
+
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
|
|
2353
|
+
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
|
|
2354
|
+
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
|
|
2355
|
+
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)
|
|
2356
|
+
};
|
|
2352
2357
|
summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
|
|
2353
2358
|
|
|
2354
2359
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
|
@@ -2369,10 +2374,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
|
2369
2374
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
|
2370
2375
|
|
|
2371
2376
|
// mmla into int32x4_t
|
|
2372
|
-
float32_t _scale[4] = {
|
|
2373
|
-
|
|
2374
|
-
|
|
2375
|
-
|
|
2377
|
+
float32_t _scale[4] = {
|
|
2378
|
+
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
|
2379
|
+
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
|
2380
|
+
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
|
2381
|
+
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
|
|
2382
|
+
};
|
|
2376
2383
|
float32x4_t scale = vld1q_f32(_scale);
|
|
2377
2384
|
|
|
2378
2385
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
|
@@ -2387,15 +2394,17 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
|
2387
2394
|
int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
|
2388
2395
|
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
|
2389
2396
|
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
|
|
2390
|
-
|
|
2397
|
+
l1, r1)), l2, r2)), l3, r3))), scale);
|
|
2391
2398
|
}
|
|
2392
2399
|
|
|
2393
|
-
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
|
|
2400
|
+
float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
|
|
2394
2401
|
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
|
2402
|
+
|
|
2395
2403
|
sumv2 = vaddq_f32(sumv2, summs0);
|
|
2396
2404
|
|
|
2397
2405
|
vst1_f32(s, vget_low_f32 (sumv2));
|
|
2398
2406
|
vst1_f32(s + bs, vget_high_f32(sumv2));
|
|
2407
|
+
|
|
2399
2408
|
return;
|
|
2400
2409
|
}
|
|
2401
2410
|
#endif
|
|
@@ -3372,10 +3381,12 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
|
3372
3381
|
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
|
3373
3382
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
|
3374
3383
|
|
|
3375
|
-
float32_t _scale[4] = {
|
|
3376
|
-
|
|
3377
|
-
|
|
3378
|
-
|
|
3384
|
+
float32_t _scale[4] = {
|
|
3385
|
+
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
|
3386
|
+
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
|
3387
|
+
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
|
3388
|
+
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
|
|
3389
|
+
};
|
|
3379
3390
|
float32x4_t scale = vld1q_f32(_scale);
|
|
3380
3391
|
|
|
3381
3392
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
|
@@ -3391,13 +3402,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
|
3391
3402
|
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
|
3392
3403
|
|
|
3393
3404
|
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
|
|
3394
|
-
|
|
3405
|
+
l1, r1)), l2, r2)), l3, r3))), scale);
|
|
3395
3406
|
}
|
|
3396
|
-
|
|
3407
|
+
|
|
3408
|
+
float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
|
|
3397
3409
|
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
|
3398
3410
|
|
|
3399
|
-
vst1_f32(s,
|
|
3411
|
+
vst1_f32(s, vget_low_f32 (sumv2));
|
|
3400
3412
|
vst1_f32(s + bs, vget_high_f32(sumv2));
|
|
3413
|
+
|
|
3401
3414
|
return;
|
|
3402
3415
|
}
|
|
3403
3416
|
#endif
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
#include "ggml-cpu-traits.h"
|
|
2
|
+
|
|
3
|
+
#include "ggml-backend-impl.h"
|
|
4
|
+
#include "ggml-backend.h"
|
|
5
|
+
|
|
6
|
+
// Out-of-line definitions of the destructors of tensor_traits and extra_buffer_type.
// Both bodies are empty.
namespace ggml::cpu {
|
|
7
|
+
tensor_traits::~tensor_traits() {}
|
|
8
|
+
|
|
9
|
+
extra_buffer_type::~extra_buffer_type() {}
|
|
10
|
+
} // namespace ggml::cpu
|
|
11
|
+
|
|
12
|
+
// Offers `op` to each entry of ggml_backend_cpu_get_extra_buffers_type() in order.
//
// For every entry that is non-null and has a non-null context, the context is cast to
// ggml::cpu::extra_buffer_type and asked for the tensor traits of `op`. The first traits
// object whose compute_forward(params, op) returns true ends the search.
//
// Returns true if some traits object accepted the op, false otherwise.
bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
|
|
13
|
+
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
|
|
14
|
+
// skip entries that carry no context to cast
if (extra && extra->context) {
|
|
15
|
+
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
|
|
16
|
+
auto tensor_traits = buf_extra->get_tensor_traits(op);
|
|
17
|
+
// a null traits pointer or a false result moves on to the next entry
if (tensor_traits && tensor_traits->compute_forward(params, op)) {
|
|
18
|
+
return true;
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
return false;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
// Asks each entry of ggml_backend_cpu_get_extra_buffers_type(), in order, for the work
// size of `op`.
//
// For every entry that is non-null and has a non-null context, the context is cast to
// ggml::cpu::extra_buffer_type and asked for the tensor traits of `op`. The first traits
// object whose work_size(n_threads, op, *size) returns true ends the search.
//
// `size` is dereferenced and handed over by reference, so it must be non-null whenever a
// traits object is found (presumably the implementation writes the result there - TODO confirm).
//
// Returns true if some traits object's work_size returned true, false otherwise.
bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
|
|
26
|
+
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
|
|
27
|
+
// skip entries that carry no context to cast
if (extra && extra->context) {
|
|
28
|
+
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
|
|
29
|
+
auto tensor_traits = buf_extra->get_tensor_traits(op);
|
|
30
|
+
// a null traits pointer or a false result moves on to the next entry
if (tensor_traits && tensor_traits->work_size(n_threads, op, *size)) {
|
|
31
|
+
return true;
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
return false;
|
|
36
|
+
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
#include "ggml-backend-impl.h"
|
|
3
|
+
#include "ggml-cpu-impl.h"
|
|
4
|
+
#include "ggml.h"
|
|
5
|
+
|
|
6
|
+
#ifdef __cplusplus
|
|
7
|
+
# include <vector>
|
|
8
|
+
extern "C" {
|
|
9
|
+
#endif
|
|
10
|
+
|
|
11
|
+
// return true if op is part of an extra "accelerator"
|
|
12
|
+
bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
|
|
13
|
+
bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);
|
|
14
|
+
|
|
15
|
+
#ifdef __cplusplus
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
// C++-only interfaces used by ggml_cpu_extra_compute_forward / ggml_cpu_extra_work_size.
namespace ggml::cpu {
|
|
19
|
+
// register in tensor->extra
// Abstract interface: both operations are pure virtual.
|
|
20
|
+
class tensor_traits {
|
|
21
|
+
public:
|
|
22
|
+
virtual ~tensor_traits();
|
|
23
|
+
// `size` is taken by non-const reference; ggml_cpu_extra_work_size stops at the first
// object for which this returns true.
// (presumably writes the required work-buffer size into `size` - TODO confirm with implementations)
virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
|
|
24
|
+
// ggml_cpu_extra_compute_forward stops at the first object for which this returns true.
virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
|
|
25
|
+
};
|
|
26
|
+
|
|
27
|
+
// Abstract interface that ggml_cpu_extra_compute_forward / ggml_cpu_extra_work_size obtain
// by casting the `context` pointer of an extra buffer type.
class extra_buffer_type {
|
|
28
|
+
public:
|
|
29
|
+
virtual ~extra_buffer_type();
|
|
30
|
+
virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
|
|
31
|
+
// Both ggml_cpu_extra_* helpers null-check the returned pointer before using it.
virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
|
|
32
|
+
};
|
|
33
|
+
} // namespace ggml::cpu
|
|
34
|
+
|
|
35
|
+
// implemented in ggml-cpu.cpp.
|
|
36
|
+
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
|
|
37
|
+
|
|
38
|
+
#endif
|