whispercpp 1.3.2 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -3
- data/README.md +71 -14
- data/Rakefile +20 -7
- data/ext/.gitignore +4 -6
- data/ext/dependencies.rb +36 -24
- data/ext/extconf.rb +1 -1
- data/ext/options.rb +48 -184
- data/ext/ruby_whisper.c +18 -0
- data/ext/ruby_whisper_context.c +43 -12
- data/ext/ruby_whisper_model.c +1 -1
- data/ext/ruby_whisper_params.c +4 -2
- data/ext/ruby_whisper_segment.c +81 -4
- data/ext/ruby_whisper_transcribe.cpp +13 -7
- data/ext/ruby_whisper_vad_params.c +1 -1
- data/ext/sources/CMakeLists.txt +5 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
- data/ext/sources/examples/addon.node/addon.cpp +150 -31
- data/ext/sources/examples/addon.node/index.js +3 -0
- data/ext/sources/examples/addon.node/vad-example.js +132 -0
- data/ext/sources/examples/bench/bench.cpp +3 -2
- data/ext/sources/examples/cli/cli.cpp +3 -2
- data/ext/sources/examples/command/command.cpp +32 -8
- data/ext/sources/examples/common-whisper.cpp +14 -7
- data/ext/sources/examples/lsp/lsp.cpp +2 -0
- data/ext/sources/examples/quantize/quantize.cpp +3 -0
- data/ext/sources/examples/server/CMakeLists.txt +3 -0
- data/ext/sources/examples/server/server.cpp +169 -22
- data/ext/sources/examples/stream/stream.cpp +6 -0
- data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
- data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
- data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
- data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
- data/ext/sources/examples/talk-llama/llama-context.h +38 -17
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
- data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
- data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
- data/ext/sources/examples/talk-llama/llama-model.h +27 -0
- data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
- data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
- data/ext/sources/examples/talk-llama/llama.cpp +11 -7
- data/ext/sources/examples/talk-llama/llama.h +147 -40
- data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
- data/ext/sources/ggml/CMakeLists.txt +48 -3
- data/ext/sources/ggml/cmake/common.cmake +24 -0
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +2 -0
- data/ext/sources/ggml/include/ggml.h +144 -5
- data/ext/sources/ggml/src/CMakeLists.txt +82 -24
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- data/ext/sources/ggml/src/ggml-common.h +4 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
- data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
- data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-impl.h +127 -183
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
- data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- data/ext/sources/ggml/src/ggml-quants.c +6 -8
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
- data/ext/sources/ggml/src/ggml.c +328 -48
- data/ext/sources/ggml/src/ggml.cpp +26 -0
- data/ext/sources/ggml/src/gguf.cpp +24 -3
- data/ext/sources/include/whisper.h +2 -0
- data/ext/sources/src/CMakeLists.txt +2 -0
- data/ext/sources/src/coreml/whisper-compat.h +10 -0
- data/ext/sources/src/coreml/whisper-compat.m +35 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
- data/ext/sources/src/whisper.cpp +218 -169
- data/extsources.rb +15 -9
- data/lib/whisper/context.rb +15 -0
- data/lib/whisper/model/uri.rb +56 -1
- data/lib/whisper/segment.rb +58 -0
- data/sig/whisper.rbs +68 -38
- data/{tests → test}/helper.rb +1 -12
- data/{tests → test}/test_model.rb +9 -0
- data/test/test_package.rb +51 -0
- data/test/test_segment.rb +146 -0
- data/{tests → test}/test_whisper.rb +70 -0
- data/whispercpp.gemspec +2 -3
- metadata +91 -43
- data/ext/sources/.dockerignore +0 -3
- data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
- data/ext/sources/ci/run.sh +0 -336
- data/ext/sources/close-issue.yml +0 -28
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
- data/tests/test_package.rb +0 -46
- data/tests/test_segment.rb +0 -74
- /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /data/{tests → test}/jfk_reader/.gitignore +0 -0
- /data/{tests → test}/jfk_reader/extconf.rb +0 -0
- /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
- /data/{tests → test}/test_callback.rb +0 -0
- /data/{tests → test}/test_error.rb +0 -0
- /data/{tests → test}/test_params.rb +0 -0
- /data/{tests → test}/test_vad.rb +0 -0
- /data/{tests → test}/test_vad_params.rb +0 -0
@@ -31,6 +31,8 @@
|
|
31
31
|
#include <mutex>
|
32
32
|
#include <queue>
|
33
33
|
#include <chrono>
|
34
|
+
#include <unordered_set>
|
35
|
+
#include <optional>
|
34
36
|
|
35
37
|
#include "ggml-impl.h"
|
36
38
|
#include "ggml-backend-impl.h"
|
@@ -93,6 +95,26 @@ int32_t ggml_cann_get_device() {
|
|
93
95
|
return id;
|
94
96
|
}
|
95
97
|
|
98
|
+
/**
|
99
|
+
* @brief Get the value of the specified environment variable (name).
|
100
|
+
* if not empty, return a std::string object
|
101
|
+
*/
|
102
|
+
std::optional<std::string> get_env(const std::string& name) {
|
103
|
+
const char* val = std::getenv(name.c_str());
|
104
|
+
if (!val) return std::nullopt;
|
105
|
+
std::string res = std::string(val);
|
106
|
+
std::transform(res.begin(), res.end(), res.begin(), ::tolower);
|
107
|
+
return res;
|
108
|
+
}
|
109
|
+
|
110
|
+
/**
|
111
|
+
* @brief Verify whether the environment variable is a valid value.
|
112
|
+
*/
|
113
|
+
bool parse_bool(const std::string& value) {
|
114
|
+
std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
|
115
|
+
return valid_values.find(value) != valid_values.end();
|
116
|
+
}
|
117
|
+
|
96
118
|
/**
|
97
119
|
* @brief Initialize the CANN device information.
|
98
120
|
*
|
@@ -214,7 +236,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
214
236
|
* @param device The device ID to associate with this buffer pool.
|
215
237
|
*/
|
216
238
|
explicit ggml_cann_pool_buf_prio(int device) : device(device) {
|
217
|
-
disable_clean =
|
239
|
+
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
218
240
|
}
|
219
241
|
|
220
242
|
/**
|
@@ -410,7 +432,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
410
432
|
* @param device The device ID to associate with this buffer pool.
|
411
433
|
*/
|
412
434
|
explicit ggml_cann_pool_buf(int device) : device(device) {
|
413
|
-
disable_clean =
|
435
|
+
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
414
436
|
}
|
415
437
|
|
416
438
|
/**
|
@@ -731,16 +753,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
731
753
|
*/
|
732
754
|
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
733
755
|
int device) {
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
738
|
-
}
|
739
|
-
bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
|
740
|
-
if (enable_buf_prio) {
|
756
|
+
std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
|
757
|
+
|
758
|
+
if (mem_pool_type == "prio") {
|
741
759
|
GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
|
742
760
|
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
|
743
761
|
}
|
762
|
+
|
763
|
+
if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
|
764
|
+
GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
|
765
|
+
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
766
|
+
}
|
767
|
+
|
744
768
|
GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
|
745
769
|
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
|
746
770
|
}
|
@@ -1074,6 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
|
|
1074
1074
|
0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
|
1075
1075
|
GGML_TABLE_END()
|
1076
1076
|
|
1077
|
+
GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
|
1078
|
+
-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
|
1079
|
+
GGML_TABLE_END()
|
1080
|
+
|
1077
1081
|
#define NGRID_IQ1S 2048
|
1078
1082
|
#define IQ1S_DELTA 0.125f
|
1079
1083
|
#define IQ1M_DELTA 0.125f
|
@@ -1,3 +1,17 @@
|
|
1
|
+
function(ggml_add_cpu_backend_features cpu_name arch)
|
2
|
+
# The feature detection code is compiled as a separate target so that
|
3
|
+
# it can be built without the architecture flags
|
4
|
+
# Since multiple variants of the CPU backend may be included in the same
|
5
|
+
# build, using set_source_files_properties() to set the arch flags is not possible
|
6
|
+
set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
|
7
|
+
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
|
8
|
+
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ../include)
|
9
|
+
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
|
10
|
+
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
11
|
+
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
12
|
+
target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
|
13
|
+
endfunction()
|
14
|
+
|
1
15
|
function(ggml_add_cpu_backend_variant_impl tag_name)
|
2
16
|
if (tag_name)
|
3
17
|
set(GGML_CPU_NAME ggml-cpu-${tag_name})
|
@@ -10,14 +24,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
10
24
|
list (APPEND GGML_CPU_SOURCES
|
11
25
|
ggml-cpu/ggml-cpu.c
|
12
26
|
ggml-cpu/ggml-cpu.cpp
|
13
|
-
ggml-cpu/
|
14
|
-
ggml-cpu/
|
15
|
-
ggml-cpu/
|
16
|
-
ggml-cpu/
|
17
|
-
ggml-cpu/
|
18
|
-
ggml-cpu/
|
19
|
-
ggml-cpu/
|
20
|
-
ggml-cpu/
|
27
|
+
ggml-cpu/repack.cpp
|
28
|
+
ggml-cpu/repack.h
|
29
|
+
ggml-cpu/hbm.cpp
|
30
|
+
ggml-cpu/hbm.h
|
31
|
+
ggml-cpu/quants.c
|
32
|
+
ggml-cpu/quants.h
|
33
|
+
ggml-cpu/traits.cpp
|
34
|
+
ggml-cpu/traits.h
|
21
35
|
ggml-cpu/amx/amx.cpp
|
22
36
|
ggml-cpu/amx/amx.h
|
23
37
|
ggml-cpu/amx/mmq.cpp
|
@@ -82,12 +96,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
82
96
|
target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
|
83
97
|
endif()
|
84
98
|
|
85
|
-
if (
|
86
|
-
CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
|
87
|
-
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
88
|
-
CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
|
89
|
-
|
99
|
+
if (GGML_SYSTEM_ARCH STREQUAL "ARM")
|
90
100
|
message(STATUS "ARM detected")
|
101
|
+
list(APPEND GGML_CPU_SOURCES
|
102
|
+
ggml-cpu/arch/arm/quants.c
|
103
|
+
ggml-cpu/arch/arm/repack.cpp
|
104
|
+
)
|
91
105
|
|
92
106
|
if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
93
107
|
message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
|
@@ -143,6 +157,49 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
143
157
|
else()
|
144
158
|
if (GGML_CPU_ARM_ARCH)
|
145
159
|
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
|
160
|
+
elseif(GGML_CPU_ALL_VARIANTS)
|
161
|
+
# Begin with the lowest baseline
|
162
|
+
set(ARM_MCPU "armv8-a")
|
163
|
+
set(ARCH_TAGS "")
|
164
|
+
set(ARCH_DEFINITIONS "")
|
165
|
+
|
166
|
+
# When a feature is selected, bump the MCPU to the first
|
167
|
+
# version that supported it
|
168
|
+
if (GGML_INTERNAL_DOTPROD)
|
169
|
+
set(ARM_MCPU "armv8.2-a")
|
170
|
+
set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
|
171
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
|
172
|
+
endif()
|
173
|
+
if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
|
174
|
+
set(ARM_MCPU "armv8.2-a")
|
175
|
+
set(ARCH_TAGS "${ARCH_TAGS}+fp16")
|
176
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
|
177
|
+
endif()
|
178
|
+
if (GGML_INTERNAL_SVE)
|
179
|
+
set(ARM_MCPU "armv8.2-a")
|
180
|
+
set(ARCH_TAGS "${ARCH_TAGS}+sve")
|
181
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
|
182
|
+
endif()
|
183
|
+
if (GGML_INTERNAL_MATMUL_INT8)
|
184
|
+
set(ARM_MCPU "armv8.6-a")
|
185
|
+
set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
|
186
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
|
187
|
+
endif()
|
188
|
+
if (GGML_INTERNAL_SVE2)
|
189
|
+
set(ARM_MCPU "armv8.6-a")
|
190
|
+
set(ARCH_TAGS "${ARCH_TAGS}+sve2")
|
191
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
|
192
|
+
endif()
|
193
|
+
if (GGML_INTERNAL_NOSVE)
|
194
|
+
set(ARCH_TAGS "${ARCH_TAGS}+nosve")
|
195
|
+
endif()
|
196
|
+
if (GGML_INTERNAL_SME)
|
197
|
+
set(ARM_MCPU "armv9.2-a")
|
198
|
+
set(ARCH_TAGS "${ARCH_TAGS}+sme")
|
199
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
|
200
|
+
endif()
|
201
|
+
list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
|
202
|
+
ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
|
146
203
|
endif()
|
147
204
|
endif()
|
148
205
|
|
@@ -170,11 +227,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
170
227
|
endforeach()
|
171
228
|
endif()
|
172
229
|
endif()
|
173
|
-
elseif (
|
174
|
-
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
175
|
-
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
|
176
|
-
|
230
|
+
elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
|
177
231
|
message(STATUS "x86 detected")
|
232
|
+
list(APPEND GGML_CPU_SOURCES
|
233
|
+
ggml-cpu/arch/x86/quants.c
|
234
|
+
ggml-cpu/arch/x86/repack.cpp
|
235
|
+
)
|
178
236
|
|
179
237
|
if (MSVC)
|
180
238
|
# instruction set detection for MSVC only
|
@@ -305,21 +363,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
305
363
|
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
|
306
364
|
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
|
307
365
|
endif()
|
308
|
-
|
309
|
-
# The feature detection code is compiled as a separate target so that
|
310
|
-
# it can be built without the architecture flags
|
311
|
-
# Since multiple variants of the CPU backend may be included in the same
|
312
|
-
# build, using set_source_files_properties() to set the arch flags is not possible
|
313
|
-
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
|
314
|
-
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
|
315
|
-
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
|
316
|
-
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
317
|
-
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
318
|
-
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
319
|
-
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
|
366
|
+
ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
|
320
367
|
endif()
|
321
|
-
elseif (
|
368
|
+
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
|
322
369
|
message(STATUS "PowerPC detected")
|
370
|
+
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/powerpc/quants.c)
|
323
371
|
if (GGML_NATIVE)
|
324
372
|
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
325
373
|
file(READ "/proc/cpuinfo" POWER10_M)
|
@@ -327,7 +375,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
327
375
|
execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
|
328
376
|
endif()
|
329
377
|
|
330
|
-
string(
|
378
|
+
string(TOUPPER "${POWER10_M}" POWER10_M_UPPER)
|
379
|
+
string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
|
331
380
|
string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
|
332
381
|
|
333
382
|
if (EXTRACTED_NUMBER GREATER_EQUAL 10)
|
@@ -339,13 +388,35 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
339
388
|
else()
|
340
389
|
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
|
341
390
|
endif()
|
391
|
+
elseif(GGML_CPU_ALL_VARIANTS)
|
392
|
+
# Begin with the lowest baseline
|
393
|
+
set(ARCH_DEFINITIONS "")
|
394
|
+
|
395
|
+
# When a feature is selected, bump the MCPU to the first
|
396
|
+
# version that supported it
|
397
|
+
foreach(PVER RANGE 7 11)
|
398
|
+
if(DEFINED GGML_INTERNAL_POWER${PVER})
|
399
|
+
set(POWERPC_MCPU "power${PVER}")
|
400
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_POWER${PVER})
|
401
|
+
endif()
|
402
|
+
endforeach()
|
403
|
+
if (GGML_INTERNAL_VSX)
|
404
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_VSX)
|
405
|
+
list(APPEND ARCH_FLAGS -mvsx)
|
406
|
+
endif()
|
407
|
+
|
408
|
+
if (DEFINED POWERPC_MCPU)
|
409
|
+
list(APPEND ARCH_FLAGS -mcpu=${POWERPC_MCPU})
|
410
|
+
endif()
|
411
|
+
ggml_add_cpu_backend_features(${GGML_CPU_NAME} powerpc ${ARCH_DEFINITIONS})
|
342
412
|
else()
|
343
413
|
if (GGML_CPU_POWERPC_CPUTYPE)
|
344
414
|
list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
|
345
415
|
endif()
|
346
416
|
endif()
|
347
|
-
elseif (
|
417
|
+
elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64")
|
348
418
|
message(STATUS "loongarch64 detected")
|
419
|
+
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/loongarch/quants.c)
|
349
420
|
|
350
421
|
list(APPEND ARCH_FLAGS -march=loongarch64)
|
351
422
|
if (GGML_LASX)
|
@@ -354,8 +425,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
354
425
|
if (GGML_LSX)
|
355
426
|
list(APPEND ARCH_FLAGS -mlsx)
|
356
427
|
endif()
|
357
|
-
elseif (
|
358
|
-
message(STATUS "
|
428
|
+
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
|
429
|
+
message(STATUS "riscv64 detected")
|
430
|
+
list(APPEND GGML_CPU_SOURCES
|
431
|
+
ggml-cpu/arch/riscv/quants.c
|
432
|
+
ggml-cpu/arch/riscv/repack.cpp
|
433
|
+
)
|
359
434
|
if (GGML_RVV)
|
360
435
|
if (GGML_XTHEADVECTOR)
|
361
436
|
list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
|
@@ -365,13 +440,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
365
440
|
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
|
366
441
|
endif()
|
367
442
|
endif()
|
368
|
-
elseif (
|
443
|
+
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
|
369
444
|
message(STATUS "s390x detected")
|
445
|
+
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
|
370
446
|
file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
|
371
447
|
string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
|
372
448
|
|
373
449
|
# TODO: Separation to determine activation of VX/VXE/VXE2
|
374
450
|
if (${S390X_M} MATCHES "8561|8562")
|
451
|
+
set(GGML_NNPA OFF)
|
375
452
|
message(STATUS "z15 target")
|
376
453
|
list(APPEND ARCH_FLAGS -march=z15)
|
377
454
|
elseif (${S390X_M} MATCHES "3931")
|
@@ -388,14 +465,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
388
465
|
endif()
|
389
466
|
|
390
467
|
if (GGML_VXE)
|
468
|
+
message(STATUS "VX/VXE/VXE2 enabled")
|
391
469
|
list(APPEND ARCH_FLAGS -mvx -mzvector)
|
470
|
+
list(APPEND ARCH_DEFINITIONS GGML_VXE)
|
471
|
+
endif()
|
472
|
+
|
473
|
+
if (GGML_NNPA)
|
474
|
+
message(STATUS "NNPA enabled")
|
475
|
+
list(APPEND ARCH_DEFINITIONS GGML_NNPA)
|
392
476
|
endif()
|
477
|
+
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
|
478
|
+
message(STATUS "Wasm detected")
|
479
|
+
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
|
393
480
|
else()
|
394
|
-
message(
|
481
|
+
message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
|
482
|
+
list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
|
395
483
|
endif()
|
396
484
|
|
397
|
-
if (
|
398
|
-
target_compile_definitions(${GGML_CPU_NAME} PRIVATE
|
485
|
+
if (GGML_CPU_REPACK)
|
486
|
+
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_REPACK)
|
399
487
|
endif()
|
400
488
|
|
401
489
|
if (GGML_CPU_KLEIDIAI)
|
@@ -406,9 +494,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
406
494
|
|
407
495
|
# Fetch KleidiAI sources:
|
408
496
|
include(FetchContent)
|
409
|
-
set(KLEIDIAI_COMMIT_TAG "v1.
|
497
|
+
set(KLEIDIAI_COMMIT_TAG "v1.9.0")
|
410
498
|
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
411
|
-
set(KLEIDIAI_ARCHIVE_MD5 "
|
499
|
+
set(KLEIDIAI_ARCHIVE_MD5 "2a8e1bb55d201557553545536489a017")
|
412
500
|
|
413
501
|
if (POLICY CMP0135)
|
414
502
|
cmake_policy(SET CMP0135 NEW)
|
@@ -501,4 +589,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
501
589
|
if (EMSCRIPTEN)
|
502
590
|
set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
|
503
591
|
endif()
|
592
|
+
|
593
|
+
if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
|
594
|
+
# The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math"
|
595
|
+
target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math")
|
596
|
+
endif()
|
504
597
|
endfunction()
|
@@ -8,7 +8,8 @@
|
|
8
8
|
#include "mmq.h"
|
9
9
|
#include "ggml-impl.h"
|
10
10
|
#include "ggml-cpu-impl.h"
|
11
|
-
#include "
|
11
|
+
#include "simd-mappings.h"
|
12
|
+
#include "quants.h"
|
12
13
|
#include "ggml-quants.h"
|
13
14
|
#include <algorithm>
|
14
15
|
#include <type_traits>
|
@@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_
|
|
453
454
|
|
454
455
|
// Quantize these floats
|
455
456
|
const float iscale = 127.f / amax;
|
456
|
-
y[i].d =
|
457
|
+
y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale);
|
457
458
|
const float id = ( amax != 0.0f ) ? iscale : 0.f;
|
458
459
|
const __m512 vscale = _mm512_set1_ps(id);
|
459
460
|
|
@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
|
|
1090
1091
|
const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
|
1091
1092
|
|
1092
1093
|
for (int m = 0; m < nr; ++m) {
|
1093
|
-
const __m512 vd1 = _mm512_set1_ps(
|
1094
|
+
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
|
1094
1095
|
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
|
1095
1096
|
|
1096
1097
|
__m512 vsum;
|
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
|
|
1113
1114
|
const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));
|
1114
1115
|
|
1115
1116
|
for (int m = 0; m < nr; ++m) {
|
1116
|
-
const __m512 vd1 = _mm512_set1_ps(
|
1117
|
-
const __m512 vs1 = _mm512_set1_ps(
|
1117
|
+
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
|
1118
|
+
const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s));
|
1118
1119
|
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
|
1119
1120
|
|
1120
1121
|
__m512 vsum;
|
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
|
|
1137
1138
|
const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
|
1138
1139
|
|
1139
1140
|
for (int m = 0; m < nr; ++m) {
|
1140
|
-
const __m512 vd1 = _mm512_set1_ps(
|
1141
|
+
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
|
1141
1142
|
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
|
1142
1143
|
|
1143
1144
|
__m512 vsum;
|
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
|
|
1437
1438
|
va[k] = _mm512_set1_epi32(a_ptr[k]);
|
1438
1439
|
vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
|
1439
1440
|
}
|
1440
|
-
vd1 = _mm512_set1_ps(
|
1441
|
+
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
|
1441
1442
|
}
|
1442
1443
|
|
1443
1444
|
// load b
|
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
|
|
1498
1499
|
for (int k = 0; k < 8; ++k) {
|
1499
1500
|
va[k] = _mm512_set1_epi32(a_ptr[k]);
|
1500
1501
|
}
|
1501
|
-
vd1 = _mm512_set1_ps(
|
1502
|
-
vs1 = _mm512_set1_ps(
|
1502
|
+
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
|
1503
|
+
vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
|
1503
1504
|
}
|
1504
1505
|
|
1505
1506
|
// load b
|
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
|
|
1571
1572
|
va[k] = _mm512_set1_epi32(a_ptr[k]);
|
1572
1573
|
va[k] = _mm512_add_epi8(va[k], off);
|
1573
1574
|
}
|
1574
|
-
vd1 = _mm512_set1_ps(
|
1575
|
+
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
|
1575
1576
|
}
|
1576
1577
|
|
1577
1578
|
// load b
|
@@ -0,0 +1,94 @@
|
|
1
|
+
#include "ggml-backend-impl.h"
|
2
|
+
|
3
|
+
#if defined(__aarch64__)
|
4
|
+
|
5
|
+
#if defined(__linux__)
|
6
|
+
#include <sys/auxv.h>
|
7
|
+
#elif defined(__APPLE__)
|
8
|
+
#include <sys/sysctl.h>
|
9
|
+
#endif
|
10
|
+
|
11
|
+
// Fallback definitions for the HWCAP2 feature bits tested below.
// Each one is only defined when no previously included header already did so.
#ifndef HWCAP2_I8MM
#define HWCAP2_I8MM (1 << 13)
#endif

#ifndef HWCAP2_SME
#define HWCAP2_SME (1 << 23)
#endif
|
18
|
+
|
19
|
+
struct aarch64_features {
|
20
|
+
// has_neon not needed, aarch64 has NEON guaranteed
|
21
|
+
bool has_dotprod = false;
|
22
|
+
bool has_fp16_va = false;
|
23
|
+
bool has_sve = false;
|
24
|
+
bool has_sve2 = false;
|
25
|
+
bool has_i8mm = false;
|
26
|
+
bool has_sme = false;
|
27
|
+
|
28
|
+
aarch64_features() {
|
29
|
+
#if defined(__linux__)
|
30
|
+
uint32_t hwcap = getauxval(AT_HWCAP);
|
31
|
+
uint32_t hwcap2 = getauxval(AT_HWCAP2);
|
32
|
+
|
33
|
+
has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
|
34
|
+
has_fp16_va = !!(hwcap & HWCAP_FPHP);
|
35
|
+
has_sve = !!(hwcap & HWCAP_SVE);
|
36
|
+
has_sve2 = !!(hwcap2 & HWCAP2_SVE2);
|
37
|
+
has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
|
38
|
+
has_sme = !!(hwcap2 & HWCAP2_SME);
|
39
|
+
#elif defined(__APPLE__)
|
40
|
+
int oldp = 0;
|
41
|
+
size_t size = sizeof(oldp);
|
42
|
+
|
43
|
+
if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
|
44
|
+
has_dotprod = static_cast<bool>(oldp);
|
45
|
+
}
|
46
|
+
|
47
|
+
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
|
48
|
+
has_i8mm = static_cast<bool>(oldp);
|
49
|
+
}
|
50
|
+
|
51
|
+
if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
|
52
|
+
has_sme = static_cast<bool>(oldp);
|
53
|
+
}
|
54
|
+
|
55
|
+
// Apple apparently does not implement SVE yet
|
56
|
+
#endif
|
57
|
+
}
|
58
|
+
};
|
59
|
+
|
60
|
+
static int ggml_backend_cpu_aarch64_score() {
|
61
|
+
int score = 1;
|
62
|
+
aarch64_features af;
|
63
|
+
|
64
|
+
#ifdef GGML_USE_DOTPROD
|
65
|
+
if (!af.has_dotprod) { return 0; }
|
66
|
+
score += 1<<1;
|
67
|
+
#endif
|
68
|
+
#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC
|
69
|
+
if (!af.has_fp16_va) { return 0; }
|
70
|
+
score += 1<<2;
|
71
|
+
#endif
|
72
|
+
#ifdef GGML_USE_SVE
|
73
|
+
if (!af.has_sve) { return 0; }
|
74
|
+
score += 1<<3;
|
75
|
+
#endif
|
76
|
+
#ifdef GGML_USE_MATMUL_INT8
|
77
|
+
if (!af.has_i8mm) { return 0; }
|
78
|
+
score += 1<<4;
|
79
|
+
#endif
|
80
|
+
#ifdef GGML_USE_SVE2
|
81
|
+
if (!af.has_sve2) { return 0; }
|
82
|
+
score += 1<<5;
|
83
|
+
#endif
|
84
|
+
#ifdef GGML_USE_SME
|
85
|
+
if (!af.has_sme) { return 0; }
|
86
|
+
score += 1<<6;
|
87
|
+
#endif
|
88
|
+
|
89
|
+
return score;
|
90
|
+
}
|
91
|
+
|
92
|
+
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score)
|
93
|
+
|
94
|
+
# endif // defined(__aarch64__)
|