@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt

@@ -9,6 +9,7 @@ file(GLOB SRC_FILES
     get_row_q8_0.cpp
     quantize_f32_q8_0.cpp
     quantize_f16_q8_0.cpp
+    quantize_float_to_q4_0.cpp
     dup.cpp
 )
 
@@ -29,4 +30,4 @@ ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
 )
 
-#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
+# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h

@@ -8,6 +8,8 @@
 
 #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
 #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
+#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
+#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
 
 #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
 #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp (new file)

@@ -0,0 +1,278 @@
+#include "kernel_operator.h"
+
+using namespace AscendC;
+
+#define BUFFER_NUM 2
+#define Group_Size 32
+
+template <typename SRC_T>
+class QUANTIZE_FLOAT_TO_Q4_0 {
+   public:
+    __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
+    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
+                                int64_t *input_ne_ub, size_t *input_nb_ub,
+                                int64_t *output_ne_ub) {
+        // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
+        //                         permute=[0,0,0,0]):
+        // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
+        int64_t op_block_num = GetBlockNum();
+        int64_t op_block_idx = GetBlockIdx();
+
+        // input stride of data elements
+        for (int i = 0; i < 4; i++) {
+            input_ne[i] = input_ne_ub[i];
+            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
+            output_ne[i] = output_ne_ub[i];
+        }
+
+        // output stride of data elements
+        output_stride[0] = 1;
+        for (int i = 1; i < 4; i++) {
+            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
+        }
+
+        // scale saved one by one after data:. [group1_scale, group2_scale, ...]
+        scale_ne = input_ne;
+        scale_stride[0] = 1;
+        scale_stride[1] = input_ne[0] / Group_Size;
+        for (int i = 2; i < 4; i++) {
+            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
+        }
+
+        // split input tensor by rows.
+        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
+        dr = nr / op_block_num;
+
+        uint64_t tails = nr % op_block_num;
+        if (op_block_idx < tails) {
+            dr += 1;
+            ir = dr * op_block_idx;
+        } else {
+            ir = dr * op_block_idx + tails;
+        }
+
+        group_size_in_row = scale_stride[1];
+        int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
+                               output_ne[3] * sizeof(uint8_t) / 2;
+
+        input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
+        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
+        scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
+                                                 group_size_in_row *
+                                                 sizeof(half)));
+
+        pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
+        pipe.InitBuffer(output_queue, BUFFER_NUM,
+                        Group_Size * sizeof(int8_t) / 2);
+        pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
+        pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
+        pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
+    }
+
+    __aicore__ inline void copy_in(uint32_t offset) {
+        LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
+        DataCopy(input_local, input_gm[offset], Group_Size);
+        input_queue.EnQue(input_local);
+    }
+
+    __aicore__ inline void copy_out(uint32_t offset) {
+        // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t,
+        // and using DataCopyPad to avoid 32 bits align.
+        LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
+        LocalTensor<int8_t> output_int8_local =
+            output_local.ReinterpretCast<int8_t>();
+
+        DataCopyExtParams dataCopyParams;
+        dataCopyParams.blockCount = 1;
+        dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
+        DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
+
+        output_queue.FreeTensor(output_local);
+    }
+
+    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
+                                         LocalTensor<float> input_local) {
+        DataCopy(cast_local, input_local, Group_Size);
+    }
+
+    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
+                                         LocalTensor<half> input_local) {
+        Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
+    }
+
+    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
+        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
+        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
+        const int64_t i1 =
+            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
+
+        const int64_t input_offset = i1 * input_stride[1] +
+                                     i2 * input_stride[2] +
+                                     i3 * input_stride[3] + Group_Size * group;
+
+        // output_offset is stride for output_gm which datatype is int8_t and
+        // divided by 2 is needed for int4b_t.
+        const int64_t output_offset = (i1 * output_stride[1] +
+                                       i2 * output_stride[2] +
+                                       i3 * output_stride[3] +
+                                       Group_Size * group) / 2;
+        copy_in(input_offset);
+
+        LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
+        LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
+        LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
+        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
+        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
+        LocalTensor<float> min_local = min_queue.AllocTensor<float>();
+        LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
+        LocalTensor<half> half_local = half_queue.AllocTensor<half>();
+
+        input_to_cast(cast_local, input_local);
+
+        ReduceMax(max_local, cast_local, work_local, Group_Size);
+        ReduceMin(min_local, cast_local, work_local, Group_Size);
+        const float max_value = max_local.GetValue(0);
+        const float min_value = min_local.GetValue(0);
+        float d = max_value;
+        if (min_value < 0 && (-1 * min_value) > max_value) {
+            d = min_value;
+        }
+
+        d = d / (-8);
+        if (d != 0) {
+            Muls(cast_local, cast_local, 1.0f / d, Group_Size);
+        }
+
+        // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
+        float scalar = 8.5f;
+        Adds(cast_local, cast_local, scalar, Group_Size);
+        Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
+        scalar = 15.0f;
+        Mins(cast_local, cast_local, scalar, Group_Size);
+        scalar = -8.0f;
+        Adds(cast_local, cast_local, scalar, Group_Size);
+
+        // float->half->int4b
+        Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
+        Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
+
+        output_queue.EnQue(output_local);
+        copy_out(output_offset);
+
+        input_queue.FreeTensor(input_local);
+        work_queue.FreeTensor(work_local);
+        max_queue.FreeTensor(max_local);
+        min_queue.FreeTensor(min_local);
+        int8_queue.FreeTensor(int8_local);
+        half_queue.FreeTensor(half_local);
+        cast_queue.FreeTensor(cast_local);
+        return (half)d;
+    }
+
+    __aicore__ inline void calculate() {
+        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
+        uint32_t scale_local_offset = 0;
+        uint32_t scale_global_offset = 0;
+        for (int64_t i = ir; i < ir + dr; i++) {
+            for (int64_t j = 0; j < group_size_in_row; j++) {
+                half scale = calculate_group(i, j);
+                scale_local.SetValue(scale_local_offset++, scale);
+                // Copy Group_Size/2 length data each time.
+                if (scale_local_offset == Group_Size / 2) {
+                    scale_local_offset = 0;
+                    // TODO: OPTIMIZE ME
+                    pipe_barrier(PIPE_ALL);
+                    DataCopy(scale_gm[scale_global_offset], scale_local,
+                             Group_Size / 2);
+                    pipe_barrier(PIPE_ALL);
+                    scale_global_offset += Group_Size / 2;
+                }
+            }
+        }
+
+        if (scale_local_offset != 0) {
+            pipe_barrier(PIPE_ALL);
+            DataCopyExtParams dataCopyParams;
+            dataCopyParams.blockCount = 1;
+            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
+            DataCopyPad(scale_gm[scale_global_offset], scale_local,
+                        dataCopyParams);
+            pipe_barrier(PIPE_ALL);
+        }
+        scale_queue.FreeTensor(scale_local);
+    }
+
+   private:
+    int64_t input_ne[4];
+    size_t input_stride[4];
+
+    int64_t *scale_ne;
+    size_t scale_stride[4];
+
+    int64_t output_ne[4];
+    size_t output_stride[4];
+
+    int64_t group_size_in_row;
+
+    int64_t ir;
+    int64_t dr;
+
+    TPipe pipe;
+    GlobalTensor<SRC_T> input_gm;
+    GlobalTensor<half> scale_gm;
+    GlobalTensor<int8_t> output_gm;
+    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
+};
+
+template <typename T>
+__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+    auto gm_ptr = (__gm__ uint8_t *)gm;
+    auto ub_ptr = (uint8_t *)(ub);
+    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+        *ub_ptr = *gm_ptr;
+    }
+}
+
+extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
+    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t output_ne_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+
+    QUANTIZE_FLOAT_TO_Q4_0<half> op;
+    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
+    op.calculate();
+}
+
+extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
+    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t output_ne_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+
+    QUANTIZE_FLOAT_TO_Q4_0<float> op;
+    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
+    op.calculate();
+}
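The kernel above is a vectorized form of the usual Q4_0 group quantization: each group of 32 values gets one half-precision scale d chosen so that the largest-magnitude value maps to -8, every value is divided by d, shifted by 8.5, floored, and clamped, exactly as the `[-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]` comment spells out. A minimal scalar sketch of that mapping follows; the helper name and the unsigned-nibble packing are illustrative only, not code from this package:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar sketch: quantize one group of 32 floats to 4-bit codes plus a scale.
// Packs two codes per byte as unsigned nibbles in [0,15]; the AscendC kernel
// above instead emits signed int4 values in [-8,7], but the mapping is the same.
static float quantize_group_q4_0_sketch(const float *x, uint8_t *q) {
    const int group_size = 32;
    // the value with the largest magnitude (sign included) decides the scale
    float d = 0.0f, amax = 0.0f;
    for (int i = 0; i < group_size; ++i) {
        if (std::fabs(x[i]) > amax) { amax = std::fabs(x[i]); d = x[i]; }
    }
    d /= -8.0f;                                  // extreme value maps to code -8
    const float id = (d != 0.0f) ? 1.0f / d : 0.0f;
    for (int i = 0; i < group_size; i += 2) {
        // [-8,8] -> [0.5,16.5] -> floor -> [0,16] -> clamp -> [0,15]
        const int v0 = std::min(15, (int)std::floor(x[i]     * id + 8.5f));
        const int v1 = std::min(15, (int)std::floor(x[i + 1] * id + 8.5f));
        q[i / 2] = (uint8_t)(v0 | (v1 << 4));
    }
    return d;                                    // stored as the group's half-precision scale
}
```

Dividing by -8 pins the signed extreme exactly to the bottom of the 4-bit range, which is why Q4_0 stores only the scale d per group and no separate zero-point.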
package/src/llama.cpp/ggml/src/ggml-common.h

@@ -227,6 +227,25 @@ typedef struct {
 } block_q8_0x8;
 static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
 
+//
+// Ternary quantization
+//
+
+// 1.6875 bpw
+typedef struct {
+    uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256)
+    uint8_t qh[QK_K/64]; // 4 elements per byte
+    ggml_half d;
+} block_tq1_0;
+static_assert(sizeof(block_tq1_0) == sizeof(ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding");
+
+// 2.0625 bpw
+typedef struct {
+    uint8_t qs[QK_K/4]; // 2 bits per element
+    ggml_half d;
+} block_tq2_0;
+static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
+
 //
 // Super-block quantization structures
 //

@@ -361,6 +380,7 @@ typedef struct {
 } block_iq3_s;
 static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
 
+// 1.5625 bpw
 typedef struct {
     ggml_half d;
     uint8_t qs[QK_K/8];
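The bits-per-weight figures in those comments follow directly from the struct layouts once the super-block size QK_K is fixed at 256, its standard value: block_tq1_0 packs 256 weights into 48 + 4 + 2 = 54 bytes (54 * 8 / 256 = 1.6875 bpw) and block_tq2_0 into 64 + 2 = 66 bytes (66 * 8 / 256 = 2.0625 bpw). A small compile-time check of that arithmetic, a sketch assuming QK_K == 256:

```cpp
// Sketch only: verifies the bpw numbers quoted in the comments above,
// assuming QK_K == 256 (the super-block size used throughout ggml-common.h).
constexpr int QK_K_assumed = 256;

constexpr int tq1_0_bytes = 2 /* ggml_half d */ + QK_K_assumed / 64
                          + (QK_K_assumed - 4 * QK_K_assumed / 64) / 5;   // 2 + 4 + 48 = 54
constexpr int tq2_0_bytes = 2 /* ggml_half d */ + QK_K_assumed / 4;       // 2 + 64 = 66

static_assert(tq1_0_bytes * 8.0 / QK_K_assumed == 1.6875, "tq1_0: 1.6875 bits per weight");
static_assert(tq2_0_bytes * 8.0 / QK_K_assumed == 2.0625, "tq2_0: 2.0625 bits per weight");
```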
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt (new file)

@@ -0,0 +1,261 @@
+add_library(ggml-cpu
+            ggml-cpu.c
+            ggml-cpu.cpp
+            ggml-cpu-aarch64.c
+            ggml-cpu-aarch64.h
+            ggml-cpu-quants.c
+            ggml-cpu-quants.h
+            )
+
+target_link_libraries(ggml-cpu PRIVATE ggml-base)
+target_include_directories(ggml-cpu PRIVATE . ..)
+
+if (APPLE AND GGML_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        add_compile_definitions(GGML_USE_ACCELERATE)
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+
+        target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
+endif()
+
+if (GGML_OPENMP)
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        message(STATUS "OpenMP found")
+
+        add_compile_definitions(GGML_USE_OPENMP)
+
+        target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+
+        # FIXME: should be replaced with a compiler id check
+        #if (GGML_MUSA)
+        #    list(APPEND GGML_CPU_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include")
+        #    list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
+        #endif()
+    else()
+        message(WARNING "OpenMP not found")
+    endif()
+endif()
+
+if (GGML_LLAMAFILE)
+    message(STATUS "Using llamafile")
+
+    add_compile_definitions(GGML_USE_LLAMAFILE)
+
+    target_sources(ggml-cpu PRIVATE
+                   llamafile/sgemm.cpp
+                   llamafile/sgemm.h)
+endif()
+
+if (GGML_CPU_HBM)
+    find_library(memkind memkind REQUIRED)
+
+    message(STATUS "Using memkind for CPU HBM")
+
+    add_compile_definitions(GGML_USE_CPU_HBM)
+
+    target_link_libraries(ggml-cpu PUBLIC memkind)
+endif()
+
+if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
+    CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
+    (NOT CMAKE_OSX_ARCHITECTURES AND
+     NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+     CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
+
+    message(STATUS "ARM detected")
+
+    if (MSVC)
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+        add_compile_definitions(__ARM_NEON)
+        add_compile_definitions(__ARM_FEATURE_FMA)
+
+        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
+        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+        if (GGML_COMPILER_SUPPORT_DOTPROD)
+            add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        endif ()
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+
+        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+        endif ()
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+        endif ()
+
+        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
+    else()
+        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
+            # Raspberry Pi 1, Zero
+            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+                # Android armeabi-v7a
+                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
+            else()
+                # Raspberry Pi 2
+                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            endif()
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Android arm64-v8a
+            # Raspberry Pi 3, 4, Zero 2 (32-bit)
+            list(APPEND ARCH_FLAGS -mno-unaligned-access)
+        endif()
+        if (GGML_SVE)
+            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
+        endif()
+    endif()
+elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
+    message(STATUS "x86 detected")
+    if (MSVC)
+        # instruction set detection for MSVC only
+        if (GGML_NATIVE)
+            # TODO: improve, should not reference files from the parent folder
+            include(cmake/FindSIMD.cmake)
+        endif ()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS /arch:AVX512)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (GGML_AVX512_VBMI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512vbmi)
+                endif()
+            endif()
+            if (GGML_AVX512_VNNI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512vnni)
+                endif()
+            endif()
+            if (GGML_AVX512_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512bf16)
+                endif()
+            endif()
+            if (GGML_AMX_TILE)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
+            endif()
+            if (GGML_AMX_INT8)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
+            endif()
+            if (GGML_AMX_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
+            endif()
+        elseif (GGML_AVX2)
+            list(APPEND ARCH_FLAGS /arch:AVX2)
+        elseif (GGML_AVX)
+            list(APPEND ARCH_FLAGS /arch:AVX)
+        endif()
+    else()
+        if (GGML_NATIVE)
+            list(APPEND ARCH_FLAGS -march=native)
+        endif()
+        if (GGML_F16C)
+            list(APPEND ARCH_FLAGS -mf16c)
+        endif()
+        if (GGML_FMA)
+            list(APPEND ARCH_FLAGS -mfma)
+        endif()
+        if (GGML_AVX)
+            list(APPEND ARCH_FLAGS -mavx)
+        endif()
+        if (GGML_AVX2)
+            list(APPEND ARCH_FLAGS -mavx2)
+        endif()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS -mavx512f)
+            list(APPEND ARCH_FLAGS -mavx512dq)
+            list(APPEND ARCH_FLAGS -mavx512bw)
+        endif()
+        if (GGML_AVX512_VBMI)
+            list(APPEND ARCH_FLAGS -mavx512vbmi)
+        endif()
+        if (GGML_AVX512_VNNI)
+            list(APPEND ARCH_FLAGS -mavx512vnni)
+        endif()
+        if (GGML_AVX512_BF16)
+            list(APPEND ARCH_FLAGS -mavx512bf16)
+        endif()
+        if (GGML_AMX_TILE)
+            list(APPEND ARCH_FLAGS -mamx-tile)
+        endif()
+        if (GGML_AMX_INT8)
+            list(APPEND ARCH_FLAGS -mamx-int8)
+        endif()
+        if (GGML_AMX_BF16)
+            list(APPEND ARCH_FLAGS -mamx-bf16)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    message(STATUS "PowerPC detected")
+    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
+    string(FIND "${POWER10_M}" "POWER10" substring_index)
+    if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
+        set(substring_index -1)
+    endif()
+
+    if (${substring_index} GREATER_EQUAL 0)
+        list(APPEND ARCH_FLAGS -mcpu=power10)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
+    else()
+        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
+        #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+    message(STATUS "loongarch64 detected")
+
+    list(APPEND ARCH_FLAGS -march=loongarch64)
+    if (GGML_LASX)
+        list(APPEND ARCH_FLAGS -mlasx)
+    endif()
+    if (GGML_LSX)
+        list(APPEND ARCH_FLAGS -mlsx)
+    endif()
+else()
+    message(STATUS "Unknown architecture")
+endif()
+
+if (GGML_CPU_AARCH64)
+    message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
+    add_compile_definitions(GGML_USE_CPU_AARCH64)
+endif()
+
+target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
+target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
+
+if (EMSCRIPTEN)
+    set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
+endif()