@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
|
@@ -1,9 +1,5 @@
|
|
|
1
|
-
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
|
|
2
1
|
#pragma once
|
|
3
2
|
|
|
4
|
-
#define GGML_COMMON_DECL_C
|
|
5
|
-
#include "ggml-common.h"
|
|
6
|
-
|
|
7
3
|
#include "ggml.h"
|
|
8
4
|
|
|
9
5
|
// GGML internal header
|
|
@@ -12,27 +8,11 @@
|
|
|
12
8
|
extern "C" {
|
|
13
9
|
#endif
|
|
14
10
|
|
|
15
|
-
// Quantization
|
|
16
|
-
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
17
|
-
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
18
|
-
|
|
19
|
-
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
|
|
20
|
-
|
|
21
11
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
|
22
12
|
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
23
13
|
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
24
14
|
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
25
15
|
|
|
26
|
-
// GEMV
|
|
27
|
-
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
28
|
-
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
29
|
-
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
30
|
-
|
|
31
|
-
// GEMM
|
|
32
|
-
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
33
|
-
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
34
|
-
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
35
|
-
|
|
36
16
|
#ifdef __cplusplus
|
|
37
17
|
}
|
|
38
18
|
#endif
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
//#define GGML_ALLOCATOR_DEBUG
|
|
16
16
|
|
|
17
|
-
//#define AT_PRINTF(...)
|
|
17
|
+
//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
|
18
18
|
#define AT_PRINTF(...)
|
|
19
19
|
|
|
20
20
|
|
|
@@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
|
|
|
89
89
|
size = GGML_PAD(size, talloc->alignment);
|
|
90
90
|
|
|
91
91
|
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
|
|
92
|
-
|
|
92
|
+
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
|
|
93
93
|
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
|
|
94
94
|
GGML_ABORT("not enough space in the buffer");
|
|
95
95
|
}
|
|
@@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
|
|
|
172
172
|
best_fit_block = alloc->n_free_blocks - 1;
|
|
173
173
|
} else {
|
|
174
174
|
// this should never happen
|
|
175
|
-
|
|
175
|
+
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
|
|
176
176
|
__func__, size, max_avail);
|
|
177
177
|
GGML_ABORT("not enough space in the buffer");
|
|
178
178
|
}
|
|
@@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
|
|
|
209
209
|
}
|
|
210
210
|
}
|
|
211
211
|
}
|
|
212
|
-
|
|
212
|
+
GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
|
213
213
|
for (int i = 0; i < 1024; i++) {
|
|
214
214
|
if (alloc->allocated_tensors[i].tensor) {
|
|
215
|
-
|
|
215
|
+
GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
|
|
216
216
|
alloc->allocated_tensors[i].offset,
|
|
217
217
|
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
|
|
218
218
|
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
|
|
219
219
|
}
|
|
220
220
|
}
|
|
221
|
-
|
|
221
|
+
GGML_LOG_DEBUG("\n");
|
|
222
222
|
}
|
|
223
223
|
#endif
|
|
224
224
|
|
|
@@ -294,6 +294,12 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
|
|
|
294
294
|
alloc->free_blocks[0].offset = 0;
|
|
295
295
|
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
|
|
296
296
|
alloc->max_size = 0;
|
|
297
|
+
|
|
298
|
+
#ifdef GGML_ALLOCATOR_DEBUG
|
|
299
|
+
for (int i = 0; i < 1024; i++) {
|
|
300
|
+
alloc->allocated_tensors[i].tensor = NULL;
|
|
301
|
+
}
|
|
302
|
+
#endif
|
|
297
303
|
}
|
|
298
304
|
|
|
299
305
|
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
|
|
@@ -342,7 +348,6 @@ struct tensor_alloc {
|
|
|
342
348
|
};
|
|
343
349
|
|
|
344
350
|
struct leaf_alloc {
|
|
345
|
-
int buffer_id;
|
|
346
351
|
struct tensor_alloc leaf;
|
|
347
352
|
};
|
|
348
353
|
|
|
@@ -461,18 +466,12 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
|
|
461
466
|
return ggml_gallocr_hash_get(galloc, t)->allocated;
|
|
462
467
|
}
|
|
463
468
|
|
|
464
|
-
static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
|
|
465
|
-
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
|
466
|
-
hn->buffer_id = buffer_id;
|
|
467
|
-
hn->offset = offset;
|
|
468
|
-
hn->allocated = true;
|
|
469
|
-
}
|
|
470
|
-
|
|
471
469
|
static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
|
472
470
|
return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
|
|
473
471
|
}
|
|
474
472
|
|
|
475
473
|
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
|
|
474
|
+
GGML_ASSERT(buffer_id >= 0);
|
|
476
475
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
|
477
476
|
|
|
478
477
|
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
|
|
@@ -734,7 +733,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
|
734
733
|
for (int i = 0; i < graph->n_leafs; i++) {
|
|
735
734
|
struct ggml_tensor * leaf = graph->leafs[i];
|
|
736
735
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
|
737
|
-
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
|
|
738
736
|
if (leaf->view_src || leaf->data) {
|
|
739
737
|
galloc->leaf_allocs[i].leaf.buffer_id = -1;
|
|
740
738
|
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
|
|
@@ -762,13 +760,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
|
762
760
|
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
|
|
763
761
|
if (new_size > cur_size || galloc->buffers[i] == NULL) {
|
|
764
762
|
#ifndef NDEBUG
|
|
765
|
-
|
|
763
|
+
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
|
766
764
|
#endif
|
|
767
765
|
|
|
768
766
|
ggml_backend_buffer_free(galloc->buffers[i]);
|
|
769
767
|
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
|
770
768
|
if (galloc->buffers[i] == NULL) {
|
|
771
|
-
|
|
769
|
+
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
|
772
770
|
return false;
|
|
773
771
|
}
|
|
774
772
|
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
@@ -812,21 +810,25 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
|
|
812
810
|
}
|
|
813
811
|
|
|
814
812
|
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
|
|
815
|
-
size_t node_size =
|
|
813
|
+
size_t node_size = 0;
|
|
814
|
+
if (!node->data && !node->view_src) {
|
|
815
|
+
GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
|
|
816
|
+
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
|
|
817
|
+
}
|
|
816
818
|
return talloc->size_max >= node_size;
|
|
817
819
|
}
|
|
818
820
|
|
|
819
821
|
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
|
|
820
822
|
if (galloc->n_nodes != graph->n_nodes) {
|
|
821
823
|
#ifndef NDEBUG
|
|
822
|
-
|
|
824
|
+
GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
|
|
823
825
|
#endif
|
|
824
826
|
return true;
|
|
825
827
|
}
|
|
826
828
|
|
|
827
829
|
if (galloc->n_leafs != graph->n_leafs) {
|
|
828
830
|
#ifndef NDEBUG
|
|
829
|
-
|
|
831
|
+
GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
|
|
830
832
|
#endif
|
|
831
833
|
return true;
|
|
832
834
|
}
|
|
@@ -837,7 +839,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
|
|
837
839
|
|
|
838
840
|
if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
|
|
839
841
|
#ifndef NDEBUG
|
|
840
|
-
|
|
842
|
+
GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
|
|
841
843
|
#endif
|
|
842
844
|
return true;
|
|
843
845
|
}
|
|
@@ -849,7 +851,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
|
|
849
851
|
}
|
|
850
852
|
if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
|
|
851
853
|
#ifndef NDEBUG
|
|
852
|
-
|
|
854
|
+
GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
|
|
853
855
|
#endif
|
|
854
856
|
return true;
|
|
855
857
|
}
|
|
@@ -863,14 +865,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
|
|
863
865
|
if (ggml_gallocr_needs_realloc(galloc, graph)) {
|
|
864
866
|
if (galloc->n_buffers == 1) {
|
|
865
867
|
#ifndef NDEBUG
|
|
866
|
-
|
|
868
|
+
GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
|
|
867
869
|
#endif
|
|
868
870
|
if (!ggml_gallocr_reserve(galloc, graph)) {
|
|
869
871
|
return false;
|
|
870
872
|
}
|
|
871
873
|
} else {
|
|
872
874
|
#ifndef NDEBUG
|
|
873
|
-
|
|
875
|
+
GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
|
|
874
876
|
#endif
|
|
875
877
|
return false;
|
|
876
878
|
}
|
|
@@ -934,7 +936,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
|
|
934
936
|
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
|
|
935
937
|
if (buffer == NULL) {
|
|
936
938
|
#ifndef NDEBUG
|
|
937
|
-
|
|
939
|
+
GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
|
|
938
940
|
#endif
|
|
939
941
|
for (size_t i = 0; i < *n_buffers; i++) {
|
|
940
942
|
ggml_backend_buffer_free((*buffers)[i]);
|
|
@@ -984,7 +986,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|
|
984
986
|
}
|
|
985
987
|
|
|
986
988
|
if (this_size > max_size) {
|
|
987
|
-
|
|
989
|
+
GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
|
|
988
990
|
__func__, t->name,
|
|
989
991
|
ggml_backend_buft_name(buft),
|
|
990
992
|
this_size, max_size);
|
|
@@ -1016,7 +1018,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|
|
1016
1018
|
|
|
1017
1019
|
if (n_buffers == 0) {
|
|
1018
1020
|
#ifndef NDEBUG
|
|
1019
|
-
|
|
1021
|
+
GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
|
|
1020
1022
|
#endif
|
|
1021
1023
|
return NULL;
|
|
1022
1024
|
}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
|
|
2
|
+
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
|
3
|
+
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND
|
|
4
|
+
CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
|
|
5
|
+
message(STATUS "Using AMX")
|
|
6
|
+
|
|
7
|
+
file(GLOB GGML_HEADERS_AMX "*.h")
|
|
8
|
+
list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")
|
|
9
|
+
|
|
10
|
+
file(GLOB GGML_SOURCES_AMX "*.cpp")
|
|
11
|
+
|
|
12
|
+
add_library(ggml-amx
|
|
13
|
+
${GGML_HEADERS_AMX}
|
|
14
|
+
${GGML_SOURCES_AMX})
|
|
15
|
+
|
|
16
|
+
target_link_libraries(ggml-amx PRIVATE ggml-base)
|
|
17
|
+
target_include_directories(ggml-amx PRIVATE . ..)
|
|
18
|
+
|
|
19
|
+
# this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
|
|
20
|
+
# TODO: integrate AMX backend into the CPU backend
|
|
21
|
+
if (MSVC)
|
|
22
|
+
# instruction set detection for MSVC only
|
|
23
|
+
if (GGML_NATIVE)
|
|
24
|
+
# TODO: improve, should not reference files from the parent folder
|
|
25
|
+
include(../ggml-cpu/cmake/FindSIMD.cmake)
|
|
26
|
+
endif ()
|
|
27
|
+
if (GGML_AVX512)
|
|
28
|
+
list(APPEND ARCH_FLAGS /arch:AVX512)
|
|
29
|
+
# MSVC has no compile-time flags enabling specific
|
|
30
|
+
# AVX512 extensions, neither it defines the
|
|
31
|
+
# macros corresponding to the extensions.
|
|
32
|
+
# Do it manually.
|
|
33
|
+
if (GGML_AVX512_VBMI)
|
|
34
|
+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
|
|
35
|
+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
|
|
36
|
+
endif()
|
|
37
|
+
if (GGML_AVX512_VNNI)
|
|
38
|
+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
|
|
39
|
+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
|
|
40
|
+
endif()
|
|
41
|
+
if (GGML_AVX512_BF16)
|
|
42
|
+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
|
|
43
|
+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
|
|
44
|
+
endif()
|
|
45
|
+
if (GGML_AMX_TILE)
|
|
46
|
+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
|
|
47
|
+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
|
|
48
|
+
endif()
|
|
49
|
+
if (GGML_AMX_INT8)
|
|
50
|
+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
|
|
51
|
+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
|
|
52
|
+
endif()
|
|
53
|
+
if (GGML_AMX_BF16)
|
|
54
|
+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
|
|
55
|
+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
|
|
56
|
+
endif()
|
|
57
|
+
elseif (GGML_AVX2)
|
|
58
|
+
list(APPEND ARCH_FLAGS /arch:AVX2)
|
|
59
|
+
elseif (GGML_AVX)
|
|
60
|
+
list(APPEND ARCH_FLAGS /arch:AVX)
|
|
61
|
+
endif()
|
|
62
|
+
else()
|
|
63
|
+
if (GGML_NATIVE)
|
|
64
|
+
list(APPEND ARCH_FLAGS -march=native)
|
|
65
|
+
endif()
|
|
66
|
+
if (GGML_F16C)
|
|
67
|
+
list(APPEND ARCH_FLAGS -mf16c)
|
|
68
|
+
endif()
|
|
69
|
+
if (GGML_FMA)
|
|
70
|
+
list(APPEND ARCH_FLAGS -mfma)
|
|
71
|
+
endif()
|
|
72
|
+
if (GGML_AVX)
|
|
73
|
+
list(APPEND ARCH_FLAGS -mavx)
|
|
74
|
+
endif()
|
|
75
|
+
if (GGML_AVX2)
|
|
76
|
+
list(APPEND ARCH_FLAGS -mavx2)
|
|
77
|
+
endif()
|
|
78
|
+
if (GGML_AVX512)
|
|
79
|
+
list(APPEND ARCH_FLAGS -mavx512f)
|
|
80
|
+
list(APPEND ARCH_FLAGS -mavx512dq)
|
|
81
|
+
list(APPEND ARCH_FLAGS -mavx512bw)
|
|
82
|
+
endif()
|
|
83
|
+
if (GGML_AVX512_VBMI)
|
|
84
|
+
list(APPEND ARCH_FLAGS -mavx512vbmi)
|
|
85
|
+
endif()
|
|
86
|
+
if (GGML_AVX512_VNNI)
|
|
87
|
+
list(APPEND ARCH_FLAGS -mavx512vnni)
|
|
88
|
+
endif()
|
|
89
|
+
if (GGML_AVX512_BF16)
|
|
90
|
+
list(APPEND ARCH_FLAGS -mavx512bf16)
|
|
91
|
+
endif()
|
|
92
|
+
if (GGML_AMX_TILE)
|
|
93
|
+
list(APPEND ARCH_FLAGS -mamx-tile)
|
|
94
|
+
endif()
|
|
95
|
+
if (GGML_AMX_INT8)
|
|
96
|
+
list(APPEND ARCH_FLAGS -mamx-int8)
|
|
97
|
+
endif()
|
|
98
|
+
if (GGML_AMX_BF16)
|
|
99
|
+
list(APPEND ARCH_FLAGS -mamx-bf16)
|
|
100
|
+
endif()
|
|
101
|
+
endif()
|
|
102
|
+
|
|
103
|
+
target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
|
|
104
|
+
else()
|
|
105
|
+
set(GGML_AMX OFF PARENT_SCOPE)
|
|
106
|
+
message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
|
|
107
|
+
endif()
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "ggml.h"
|
|
4
|
+
// hack until AMX is moved into the CPU backend
|
|
5
|
+
#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
|
|
6
|
+
|
|
7
|
+
#include <algorithm>
|
|
8
|
+
#include <memory>
|
|
9
|
+
#include <type_traits>
|
|
10
|
+
|
|
11
|
+
#if defined(_OPENMP)
|
|
12
|
+
#include <omp.h>
|
|
13
|
+
#endif
|
|
14
|
+
|
|
15
|
+
#define TILE_M 16
|
|
16
|
+
#define TILE_N 16
|
|
17
|
+
#define TILE_K 32
|
|
18
|
+
#define VNNI_BLK 4
|
|
19
|
+
|
|
20
|
+
#define AMX_BLK_SIZE 32
|
|
21
|
+
|
|
22
|
+
#define TMM0 0
|
|
23
|
+
#define TMM1 1
|
|
24
|
+
#define TMM2 2
|
|
25
|
+
#define TMM3 3
|
|
26
|
+
#define TMM4 4
|
|
27
|
+
#define TMM5 5
|
|
28
|
+
#define TMM6 6
|
|
29
|
+
#define TMM7 7
|
|
30
|
+
|
|
31
|
+
// parallel routines
|
|
32
|
+
template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
|
|
33
|
+
inline T div_up(T x, T y) { return (x + y - 1) / y; }
|
|
34
|
+
|
|
35
|
+
template <typename T>
|
|
36
|
+
inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
|
|
37
|
+
#if 0
|
|
38
|
+
// onednn partition pattern
|
|
39
|
+
T& n_my = n_end;
|
|
40
|
+
if (nth <= 1 || n == 0) {
|
|
41
|
+
n_start = 0;
|
|
42
|
+
n_my = n;
|
|
43
|
+
} else {
|
|
44
|
+
T n1 = div_up(n, nth);
|
|
45
|
+
T n2 = n1 - 1;
|
|
46
|
+
T T1 = n - n2 * nth;
|
|
47
|
+
n_my = ith < T1 ? n1 : n2;
|
|
48
|
+
n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
|
|
49
|
+
}
|
|
50
|
+
n_end += n_start;
|
|
51
|
+
#else
|
|
52
|
+
// pytorch aten partition pattern
|
|
53
|
+
T n_my = div_up(n, nth);
|
|
54
|
+
n_start = ith * n_my;
|
|
55
|
+
n_end = std::min(n_start + n_my, n);
|
|
56
|
+
#endif
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
template <typename func_t>
|
|
60
|
+
inline void parallel_for(int nth, int n, const func_t& f) {
|
|
61
|
+
#if defined(_OPENMP)
|
|
62
|
+
#pragma omp parallel num_threads(nth)
|
|
63
|
+
{
|
|
64
|
+
//int nth = omp_get_num_threads();
|
|
65
|
+
int ith = omp_get_thread_num();
|
|
66
|
+
int tbegin, tend;
|
|
67
|
+
balance211(n, nth, ith, tbegin, tend);
|
|
68
|
+
f(tbegin, tend);
|
|
69
|
+
}
|
|
70
|
+
#else
|
|
71
|
+
f(0, n);
|
|
72
|
+
|
|
73
|
+
GGML_UNUSED(nth);
|
|
74
|
+
#endif
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
// quantized types that have AMX support
|
|
78
|
+
inline bool qtype_has_amx_kernels(const enum ggml_type type) {
|
|
79
|
+
// TODO: fix padding for vnni format
|
|
80
|
+
return (type == GGML_TYPE_Q4_0) ||
|
|
81
|
+
(type == GGML_TYPE_Q4_1);
|
|
82
|
+
//(type == GGML_TYPE_Q8_0) ||
|
|
83
|
+
//(type == GGML_TYPE_Q4_K) ||
|
|
84
|
+
//(type == GGML_TYPE_Q5_K) ||
|
|
85
|
+
//(type == GGML_TYPE_Q6_K) ||
|
|
86
|
+
//(type == GGML_TYPE_IQ4_XS);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// ggml backend context
|
|
90
|
+
struct ggml_backend_amx_context {
|
|
91
|
+
int n_threads = GGML_DEFAULT_N_THREADS;
|
|
92
|
+
std::unique_ptr<char[]> work_data;
|
|
93
|
+
size_t work_size = 0;
|
|
94
|
+
};
|