@fugood/llama.node 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +15 -5
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +1 -1
- package/src/LlamaContext.cpp +81 -18
- package/src/LlamaContext.h +2 -0
- package/src/llama.cpp/.github/workflows/build.yml +197 -159
- package/src/llama.cpp/.github/workflows/docker.yml +5 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +11 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -2
- package/src/llama.cpp/common/arg.cpp +426 -245
- package/src/llama.cpp/common/common.cpp +143 -80
- package/src/llama.cpp/common/common.h +81 -24
- package/src/llama.cpp/common/sampling.cpp +53 -19
- package/src/llama.cpp/common/sampling.h +22 -1
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +101 -148
- package/src/llama.cpp/examples/CMakeLists.txt +32 -13
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +5 -4
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +262 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +46 -19
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +9 -5
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
- package/src/llama.cpp/examples/server/server.cpp +1758 -886
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +94 -304
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +4 -0
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
- package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml.h +106 -24
- package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
- package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
- package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
- package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
- package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
- package/src/llama.cpp/ggml/src/ggml.c +367 -207
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +26 -19
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/CMakeLists.txt +2 -7
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +35 -90
- package/src/llama.cpp/src/llama-vocab.cpp +6 -1
- package/src/llama.cpp/src/llama.cpp +1748 -640
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -37
- package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
- package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
- package/src/llama.cpp/tests/test-rope.cpp +61 -20
- package/src/llama.cpp/tests/test-sampling.cpp +2 -2
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
|
@@ -32,13 +32,10 @@ static bool test_build_grammar_fails(const std::string & grammar_str) {
|
|
|
32
32
|
static bool match_string(const std::string & input, llama_grammar * grammar) {
|
|
33
33
|
const auto cpts = unicode_cpts_from_utf8(input);
|
|
34
34
|
|
|
35
|
-
|
|
36
|
-
llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
|
|
35
|
+
auto & stacks_cur = llama_grammar_get_stacks(grammar);
|
|
37
36
|
|
|
38
37
|
for (const auto & cpt : cpts) {
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
|
|
38
|
+
llama_grammar_accept(grammar, cpt);
|
|
42
39
|
|
|
43
40
|
if (stacks_cur.empty()) {
|
|
44
41
|
// no stacks means that the grammar failed to match at this point
|
|
@@ -63,7 +60,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
|
|
|
63
60
|
auto * grammar = build_grammar(grammar_str);
|
|
64
61
|
|
|
65
62
|
// Save the original grammar stacks so that we can reset after every new string we want to test
|
|
66
|
-
const llama_grammar_stacks stacks_org = llama_grammar_get_stacks(grammar);
|
|
63
|
+
const llama_grammar_stacks stacks_org = llama_grammar_get_stacks(grammar); // copy
|
|
67
64
|
|
|
68
65
|
llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
|
|
69
66
|
|
|
@@ -113,12 +113,10 @@ int main()
|
|
|
113
113
|
}
|
|
114
114
|
}
|
|
115
115
|
|
|
116
|
-
llama_grammar * grammar = NULL;
|
|
117
116
|
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
|
118
117
|
|
|
119
|
-
grammar = llama_grammar_init_impl(nullptr, grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
|
120
|
-
if (grammar == nullptr)
|
|
121
|
-
{
|
|
118
|
+
llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
|
119
|
+
if (grammar == nullptr) {
|
|
122
120
|
throw std::runtime_error("Failed to initialize llama_grammar");
|
|
123
121
|
}
|
|
124
122
|
|
|
@@ -79,9 +79,9 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
|
|
|
79
79
|
}
|
|
80
80
|
|
|
81
81
|
// Total dot product error
|
|
82
|
-
static float dot_product_error(
|
|
83
|
-
|
|
84
|
-
|
|
82
|
+
static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
|
|
83
|
+
GGML_UNUSED(qfns);
|
|
84
|
+
|
|
85
85
|
std::vector<uint8_t> tmp_q1(2*test_size);
|
|
86
86
|
std::vector<uint8_t> tmp_q2(2*test_size);
|
|
87
87
|
|
|
@@ -138,7 +138,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
|
|
|
138
138
|
struct ggml_tensor * x;
|
|
139
139
|
|
|
140
140
|
// rope f32
|
|
141
|
-
for (int m = 0; m <
|
|
141
|
+
for (int m = 0; m < 5; ++m) {
|
|
142
142
|
const int ndims = 4;
|
|
143
143
|
|
|
144
144
|
const int64_t n_rot = 128;
|
|
@@ -147,28 +147,69 @@ int main(int /*argc*/, const char ** /*argv*/) {
|
|
|
147
147
|
const int n_past_0 = 100;
|
|
148
148
|
const int n_past_2 = 33;
|
|
149
149
|
|
|
150
|
-
struct ggml_tensor *
|
|
151
|
-
struct ggml_tensor *
|
|
152
|
-
struct ggml_tensor *
|
|
153
|
-
|
|
154
|
-
for (int i = 0; i < ne[2]; ++i) {
|
|
155
|
-
((int32_t *) p0->data)[i] = n_past_0 + i;
|
|
156
|
-
((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
|
|
157
|
-
((int32_t *) p2->data)[i] = n_past_2 + i;
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
// test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
|
|
161
|
-
const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
|
|
162
|
-
|
|
150
|
+
struct ggml_tensor * r0;
|
|
151
|
+
struct ggml_tensor * r1;
|
|
152
|
+
struct ggml_tensor * r2;
|
|
163
153
|
x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
|
|
154
|
+
int mode = -1;
|
|
164
155
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
156
|
+
if (m < 3) {
|
|
157
|
+
struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
|
|
158
|
+
struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
|
|
159
|
+
struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
|
|
169
160
|
|
|
170
|
-
|
|
171
|
-
|
|
161
|
+
for (int i = 0; i < ne[2]; ++i) {
|
|
162
|
+
((int32_t *) p0->data)[i] = n_past_0 + i;
|
|
163
|
+
((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
|
|
164
|
+
((int32_t *) p2->data)[i] = n_past_2 + i;
|
|
165
|
+
}
|
|
166
|
+
// test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
|
|
167
|
+
mode = m == 0 ? 0 : m == 1 ? 2 : 4;
|
|
168
|
+
|
|
169
|
+
// 100, 101, 102, ..., 172
|
|
170
|
+
r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
|
|
171
|
+
// -67, -67, -67, ..., -67
|
|
172
|
+
r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
|
|
173
|
+
|
|
174
|
+
// 33, 34, 35, ..., 105
|
|
175
|
+
r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
|
|
176
|
+
} else {
|
|
177
|
+
// testing multi-dimension rope position embedding mode
|
|
178
|
+
struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
|
|
179
|
+
struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
|
|
180
|
+
struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
|
|
181
|
+
|
|
182
|
+
int sections[4] = {16, 24, 24, 0};
|
|
183
|
+
mode = (m == 3) ? GGML_ROPE_TYPE_MROPE : GGML_ROPE_TYPE_VISION;
|
|
184
|
+
|
|
185
|
+
for (int i = 0; i < ne[2]; ++i) {
|
|
186
|
+
for (int j = 0; j < 4; ++j) {
|
|
187
|
+
((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j;
|
|
188
|
+
((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0;
|
|
189
|
+
((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j;
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
// [[100, 101, 102, ..., 172],
|
|
194
|
+
// [101, 102, 103, ..., 173],
|
|
195
|
+
// [102, 103, 104, ..., 174]]
|
|
196
|
+
r0 = ggml_rope_multi(
|
|
197
|
+
ctx0, x, p0, nullptr,
|
|
198
|
+
n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
|
|
199
|
+
// [[-67, -67, -67, ..., -67]
|
|
200
|
+
// [-67, -67, -67, ..., -67]
|
|
201
|
+
// [-67, -67, -67, ..., -67]]
|
|
202
|
+
r1 = ggml_rope_multi(
|
|
203
|
+
ctx0, r0, p1, nullptr,
|
|
204
|
+
n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
|
|
205
|
+
|
|
206
|
+
// [[33, 34, 35, ..., 105]
|
|
207
|
+
// [34, 35, 36, ..., 106]
|
|
208
|
+
// [35, 36, 37, ..., 107]]
|
|
209
|
+
r2 = ggml_rope_multi(
|
|
210
|
+
ctx0, x, p2, nullptr,
|
|
211
|
+
n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
|
|
212
|
+
}
|
|
172
213
|
|
|
173
214
|
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
|
174
215
|
|
|
@@ -145,7 +145,7 @@ static void test_penalties(
|
|
|
145
145
|
sampler_tester tester(probs, probs_expected);
|
|
146
146
|
|
|
147
147
|
const size_t n_vocab = probs.size();
|
|
148
|
-
auto * sampler = llama_sampler_init_penalties(
|
|
148
|
+
auto * sampler = llama_sampler_init_penalties(last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
|
|
149
149
|
|
|
150
150
|
for (size_t i = 0; i < last_tokens.size(); i++) {
|
|
151
151
|
llama_sampler_accept(sampler, last_tokens[i]);
|
|
@@ -284,7 +284,7 @@ static void test_perf() {
|
|
|
284
284
|
|
|
285
285
|
data.reserve(n_vocab);
|
|
286
286
|
for (int i = 0; i < n_vocab; i++) {
|
|
287
|
-
const float logit = 2.0f*((
|
|
287
|
+
const float logit = 2.0f*((double)(rand())/RAND_MAX - 0.5);
|
|
288
288
|
data.emplace_back(llama_token_data{i, logit, 0.0f});
|
|
289
289
|
}
|
|
290
290
|
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
name: Nix aarch64 builds
|
|
2
|
-
|
|
3
|
-
on:
|
|
4
|
-
workflow_dispatch: # allows manual triggering
|
|
5
|
-
schedule:
|
|
6
|
-
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
|
|
7
|
-
# 1.5h instead of minutes with the cold cache).
|
|
8
|
-
#
|
|
9
|
-
# randint(0, 59), randint(0, 23)
|
|
10
|
-
- cron: '26 12 * * *'
|
|
11
|
-
# But also rebuild if we touched any of the Nix expressions:
|
|
12
|
-
push:
|
|
13
|
-
branches:
|
|
14
|
-
- master
|
|
15
|
-
paths: ['**/*.nix', 'flake.lock']
|
|
16
|
-
pull_request:
|
|
17
|
-
types: [opened, synchronize, reopened]
|
|
18
|
-
paths: ['**/*.nix', 'flake.lock']
|
|
19
|
-
|
|
20
|
-
concurrency:
|
|
21
|
-
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
|
22
|
-
cancel-in-progress: true
|
|
23
|
-
|
|
24
|
-
# Fine-grant permission
|
|
25
|
-
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
|
26
|
-
permissions:
|
|
27
|
-
# https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
|
|
28
|
-
id-token: write
|
|
29
|
-
contents: read
|
|
30
|
-
|
|
31
|
-
jobs:
|
|
32
|
-
nix-build-aarch64:
|
|
33
|
-
runs-on: ubuntu-latest
|
|
34
|
-
steps:
|
|
35
|
-
- name: Checkout repository
|
|
36
|
-
uses: actions/checkout@v4
|
|
37
|
-
- name: Install QEMU
|
|
38
|
-
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
|
|
39
|
-
run: |
|
|
40
|
-
sudo apt-get update
|
|
41
|
-
sudo apt-get install -y qemu-user-static qemu-system-aarch64
|
|
42
|
-
sudo usermod -a -G kvm $USER
|
|
43
|
-
- name: Install Nix
|
|
44
|
-
uses: DeterminateSystems/nix-installer-action@v9
|
|
45
|
-
with:
|
|
46
|
-
github-token: ${{ secrets.GITHUB_TOKEN }}
|
|
47
|
-
extra-conf: |
|
|
48
|
-
extra-platforms = aarch64-linux
|
|
49
|
-
extra-system-features = nixos-test kvm
|
|
50
|
-
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
|
51
|
-
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
|
52
|
-
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
|
53
|
-
with:
|
|
54
|
-
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
|
55
|
-
- name: Set-up cachix to push the results to
|
|
56
|
-
uses: cachix/cachix-action@v13
|
|
57
|
-
with:
|
|
58
|
-
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
|
59
|
-
name: llama-cpp
|
|
60
|
-
- name: Show all output paths
|
|
61
|
-
run: >
|
|
62
|
-
nix run github:nix-community/nix-eval-jobs
|
|
63
|
-
-- --gc-roots-dir gcroot
|
|
64
|
-
--flake
|
|
65
|
-
".#packages.aarch64-linux"
|
|
66
|
-
- name: Build
|
|
67
|
-
run: >
|
|
68
|
-
nix run github:Mic92/nix-fast-build
|
|
69
|
-
-- --skip-cached --no-nom
|
|
70
|
-
--systems aarch64-linux
|
|
71
|
-
--flake
|
|
72
|
-
".#checks.aarch64-linux"
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
name: Nix CI
|
|
2
|
-
|
|
3
|
-
on:
|
|
4
|
-
workflow_dispatch: # allows manual triggering
|
|
5
|
-
push:
|
|
6
|
-
branches:
|
|
7
|
-
- master
|
|
8
|
-
pull_request:
|
|
9
|
-
types: [opened, synchronize, reopened]
|
|
10
|
-
|
|
11
|
-
concurrency:
|
|
12
|
-
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
|
13
|
-
cancel-in-progress: true
|
|
14
|
-
|
|
15
|
-
# Fine-grant permission
|
|
16
|
-
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
|
17
|
-
permissions:
|
|
18
|
-
# https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
|
|
19
|
-
id-token: write
|
|
20
|
-
contents: read
|
|
21
|
-
|
|
22
|
-
jobs:
|
|
23
|
-
nix-eval:
|
|
24
|
-
strategy:
|
|
25
|
-
fail-fast: false
|
|
26
|
-
matrix:
|
|
27
|
-
os: [ ubuntu-latest, macos-latest ]
|
|
28
|
-
runs-on: ${{ matrix.os }}
|
|
29
|
-
steps:
|
|
30
|
-
- name: Checkout repository
|
|
31
|
-
uses: actions/checkout@v4
|
|
32
|
-
- name: Install Nix
|
|
33
|
-
uses: DeterminateSystems/nix-installer-action@v9
|
|
34
|
-
with:
|
|
35
|
-
github-token: ${{ secrets.GITHUB_TOKEN }}
|
|
36
|
-
extra-conf: |
|
|
37
|
-
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
|
38
|
-
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
|
39
|
-
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
|
40
|
-
with:
|
|
41
|
-
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
|
42
|
-
- name: List all flake outputs
|
|
43
|
-
run: nix flake show --all-systems
|
|
44
|
-
- name: Show all output paths
|
|
45
|
-
run: >
|
|
46
|
-
nix run github:nix-community/nix-eval-jobs
|
|
47
|
-
-- --gc-roots-dir gcroot
|
|
48
|
-
--flake
|
|
49
|
-
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
|
50
|
-
nix-build:
|
|
51
|
-
strategy:
|
|
52
|
-
fail-fast: false
|
|
53
|
-
matrix:
|
|
54
|
-
os: [ ubuntu-latest, macos-latest ]
|
|
55
|
-
runs-on: ${{ matrix.os }}
|
|
56
|
-
steps:
|
|
57
|
-
- name: Checkout repository
|
|
58
|
-
uses: actions/checkout@v4
|
|
59
|
-
- name: Install Nix
|
|
60
|
-
uses: DeterminateSystems/nix-installer-action@v9
|
|
61
|
-
with:
|
|
62
|
-
github-token: ${{ secrets.GITHUB_TOKEN }}
|
|
63
|
-
extra-conf: |
|
|
64
|
-
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
|
65
|
-
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
|
66
|
-
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
|
67
|
-
with:
|
|
68
|
-
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
|
69
|
-
- name: Set-up cachix to push the results to
|
|
70
|
-
uses: cachix/cachix-action@v13
|
|
71
|
-
with:
|
|
72
|
-
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
|
73
|
-
name: llama-cpp
|
|
74
|
-
- name: Build
|
|
75
|
-
run: >
|
|
76
|
-
nix run github:Mic92/nix-fast-build
|
|
77
|
-
-- --skip-cached --no-nom
|
|
78
|
-
--flake
|
|
79
|
-
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
name: update-flake-lock
|
|
2
|
-
on:
|
|
3
|
-
workflow_dispatch:
|
|
4
|
-
schedule:
|
|
5
|
-
- cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
|
|
6
|
-
|
|
7
|
-
jobs:
|
|
8
|
-
lockfile:
|
|
9
|
-
runs-on: ubuntu-latest
|
|
10
|
-
steps:
|
|
11
|
-
- name: Checkout repository
|
|
12
|
-
uses: actions/checkout@v4
|
|
13
|
-
- name: Install Nix
|
|
14
|
-
uses: DeterminateSystems/nix-installer-action@main
|
|
15
|
-
- name: Update flake.lock
|
|
16
|
-
uses: DeterminateSystems/update-flake-lock@main
|
|
17
|
-
with:
|
|
18
|
-
pr-title: "nix: update flake.lock"
|
|
19
|
-
pr-labels: |
|
|
20
|
-
nix
|
|
21
|
-
pr-reviewers: philiptaron,SomeoneSerge
|
|
22
|
-
token: ${{ secrets.FLAKE_TOKEN }}
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
|
|
2
|
-
name: "Publish a flake to flakestry & flakehub"
|
|
3
|
-
on:
|
|
4
|
-
push:
|
|
5
|
-
tags:
|
|
6
|
-
- "*"
|
|
7
|
-
workflow_dispatch:
|
|
8
|
-
inputs:
|
|
9
|
-
tag:
|
|
10
|
-
description: "The existing tag to publish"
|
|
11
|
-
type: "string"
|
|
12
|
-
required: true
|
|
13
|
-
jobs:
|
|
14
|
-
flakestry-publish:
|
|
15
|
-
runs-on: ubuntu-latest
|
|
16
|
-
permissions:
|
|
17
|
-
id-token: "write"
|
|
18
|
-
contents: "read"
|
|
19
|
-
steps:
|
|
20
|
-
- uses: flakestry/flakestry-publish@main
|
|
21
|
-
with:
|
|
22
|
-
version: "${{ inputs.tag || github.ref_name }}"
|
|
23
|
-
flakehub-publish:
|
|
24
|
-
runs-on: "ubuntu-latest"
|
|
25
|
-
permissions:
|
|
26
|
-
id-token: "write"
|
|
27
|
-
contents: "read"
|
|
28
|
-
steps:
|
|
29
|
-
- uses: "actions/checkout@v4"
|
|
30
|
-
with:
|
|
31
|
-
ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
|
|
32
|
-
- uses: "DeterminateSystems/nix-installer-action@main"
|
|
33
|
-
- uses: "DeterminateSystems/flakehub-push@main"
|
|
34
|
-
with:
|
|
35
|
-
visibility: "public"
|
|
36
|
-
tag: "${{ inputs.tag }}"
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
#pragma once
|
|
2
|
-
|
|
3
|
-
#include "ggml.h"
|
|
4
|
-
#include "ggml-backend.h"
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
#ifdef __cplusplus
|
|
8
|
-
extern "C" {
|
|
9
|
-
#endif
|
|
10
|
-
|
|
11
|
-
// buffer_type API
|
|
12
|
-
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
|
|
13
|
-
|
|
14
|
-
GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
|
|
15
|
-
|
|
16
|
-
// backend API
|
|
17
|
-
GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
|
|
18
|
-
|
|
19
|
-
GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
|
|
20
|
-
|
|
21
|
-
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
|
|
22
|
-
|
|
23
|
-
#ifdef __cplusplus
|
|
24
|
-
}
|
|
25
|
-
#endif
|
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
#define GGML_COMMON_DECL_C
|
|
2
|
-
#include "ggml-common.h"
|
|
3
|
-
|
|
4
|
-
#include "ggml-aarch64.h"
|
|
5
|
-
#include "ggml-impl.h"
|
|
6
|
-
#include "ggml-quants.h"
|
|
7
|
-
#include <assert.h>
|
|
8
|
-
|
|
9
|
-
#define UNUSED GGML_UNUSED
|
|
10
|
-
|
|
11
|
-
// Builds one block_q4_0x4 from the four consecutive block_q4_0 entries at `in`.
//
// The four deltas are copied first. The quant bytes are then interleaved
// round-robin across the four source blocks in chunks of
// blck_size_interleave bytes, and every chunk is XORed with a 0x88... mask
// (i.e. the top bit of each 4-bit nibble is flipped) on the way through.
//
// Supported chunk sizes are 4 and 8 bytes; anything else hits GGML_ASSERT.
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x4 out;

    for (int row = 0; row < 4; ++row) {
        out.d[row] = in[row].d;
    }

    // total number of chunks to emit across the 4 source blocks
    const int n_chunks = QK4_0 * 2 / blck_size_interleave;

    switch (blck_size_interleave) {
        case 8: {
            const uint64_t nibble_flip = 0x8888888888888888ULL;

            for (int chunk = 0; chunk < n_chunks; ++chunk) {
                const int row     = chunk % 4;                          // source block for this chunk
                const int src_pos = (chunk / 4) * blck_size_interleave; // byte offset inside that block
                const int dst_pos = chunk * blck_size_interleave;       // byte offset in the output

                // go through memcpy so no unaligned 64-bit access is performed
                uint64_t bits;
                memcpy(&bits, &in[row].qs[src_pos], sizeof(uint64_t));
                bits ^= nibble_flip;
                memcpy(&out.qs[dst_pos], &bits, sizeof(uint64_t));
            }
        } break;

        case 4: {
            const uint32_t nibble_flip = 0x88888888;

            for (int chunk = 0; chunk < n_chunks; ++chunk) {
                const int row     = chunk % 4;
                const int src_pos = (chunk / 4) * blck_size_interleave;
                const int dst_pos = chunk * blck_size_interleave;

                uint32_t bits;
                memcpy(&bits, &in[row].qs[src_pos], sizeof(uint32_t));
                bits ^= nibble_flip;
                memcpy(&out.qs[dst_pos], &bits, sizeof(uint32_t));
            }
        } break;

        default:
            GGML_ASSERT(false);
    }

    return out;
}
|
|
51
|
-
|
|
52
|
-
// interleave 8 block_q4_0s in blocks of blck_size_interleave
|
|
53
|
-
// returns an interleaved block_q4_0x8
|
|
54
|
-
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
|
|
55
|
-
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
|
|
56
|
-
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
|
|
57
|
-
block_q4_0x8 out;
|
|
58
|
-
|
|
59
|
-
for (int i = 0; i < 8; i++) {
|
|
60
|
-
out.d[i] = in[i].d;
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
const int end = QK4_0 * 4 / blck_size_interleave;
|
|
64
|
-
const uint64_t xor_mask = 0x8888888888888888ULL;
|
|
65
|
-
|
|
66
|
-
for (int i = 0; i < end; ++i) {
|
|
67
|
-
int src_id = i % 8;
|
|
68
|
-
int src_offset = (i / 8) * blck_size_interleave;
|
|
69
|
-
int dst_offset = i * blck_size_interleave;
|
|
70
|
-
|
|
71
|
-
uint64_t elems;
|
|
72
|
-
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
|
73
|
-
elems ^= xor_mask;
|
|
74
|
-
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
return out;
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
|
|
81
|
-
assert(n_per_row % QK4_0 == 0);
|
|
82
|
-
const int nb = n_per_row / QK4_0;
|
|
83
|
-
|
|
84
|
-
void * out_ptr = NULL;
|
|
85
|
-
if (nrows_interleaved == 8) {
|
|
86
|
-
out_ptr = (block_q4_0x8 *) dst;
|
|
87
|
-
}
|
|
88
|
-
else if (nrows_interleaved == 4) {
|
|
89
|
-
out_ptr = (block_q4_0x4 *) dst;
|
|
90
|
-
}
|
|
91
|
-
assert(nrows_interleaved <= 8);
|
|
92
|
-
block_q4_0 dst_tmp[8];
|
|
93
|
-
|
|
94
|
-
for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) {
|
|
95
|
-
|
|
96
|
-
for (int64_t x = 0; x < nb; x++) {
|
|
97
|
-
|
|
98
|
-
for (int i = 0; i < nrows_interleaved; i++ ) {
|
|
99
|
-
quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
if (nrows_interleaved == 8) {
|
|
103
|
-
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
|
|
104
|
-
out_ptr = (block_q4_0x8 *) out_ptr + 1;
|
|
105
|
-
}
|
|
106
|
-
else if (nrows_interleaved == 4) {
|
|
107
|
-
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
|
|
108
|
-
out_ptr = (block_q4_0x4 *) out_ptr + 1;
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
// Q4_0 quantization with 4 rows interleaved in 4-byte chunks.
// quant_weights is accepted but not used.
size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    const int rows_per_block = 4;
    const int chunk_bytes    = 4;

    UNUSED(quant_weights);

    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, rows_per_block, chunk_bytes);
}
|
|
120
|
-
|
|
121
|
-
// Q4_0 quantization with 4 rows interleaved in 8-byte chunks.
// quant_weights is accepted but not used.
size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    const int rows_per_block = 4;
    const int chunk_bytes    = 8;

    UNUSED(quant_weights);

    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, rows_per_block, chunk_bytes);
}
|
|
125
|
-
|
|
126
|
-
// Q4_0 quantization with 8 rows interleaved in 8-byte chunks.
// quant_weights is accepted but not used.
size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    const int rows_per_block = 8;
    const int chunk_bytes    = 8;

    UNUSED(quant_weights);

    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, rows_per_block, chunk_bytes);
}
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
#pragma once
|
|
2
|
-
|
|
3
|
-
#include "ggml.h"
|
|
4
|
-
|
|
5
|
-
// GGML internal header
|
|
6
|
-
|
|
7
|
-
#ifdef __cplusplus
|
|
8
|
-
extern "C" {
|
|
9
|
-
#endif
|
|
10
|
-
|
|
11
|
-
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
|
12
|
-
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
13
|
-
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
14
|
-
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
15
|
-
|
|
16
|
-
#ifdef __cplusplus
|
|
17
|
-
}
|
|
18
|
-
#endif
|
|
19
|
-
|
|
@@ -1,107 +0,0 @@
|
|
|
1
|
-
# Build script for the ggml-amx backend library.
#
# The backend is only built when targeting x86 with GCC newer than 11.0;
# otherwise GGML_AMX is switched off in the parent scope and a warning is
# printed.
#
# The x86 detection is grouped explicitly. CMake evaluates AND/OR left to
# right with equal precedence, so the extra parentheses do not change the
# result; they only make the intent visible:
#     (target is x86) AND (compiler is GCC) AND (version > 11.0)
if ((CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR
     CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
     (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
      CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) AND
    CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
    message(STATUS "Using AMX")

    # CONFIGURE_DEPENDS re-runs the glob at build time so that added/removed
    # files are picked up without a manual re-configure.
    file(GLOB GGML_HEADERS_AMX CONFIGURE_DEPENDS "*.h")
    list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")

    file(GLOB GGML_SOURCES_AMX CONFIGURE_DEPENDS "*.cpp")

    add_library(ggml-amx
                ${GGML_HEADERS_AMX}
                ${GGML_SOURCES_AMX})

    target_link_libraries(ggml-amx PRIVATE ggml-base)
    target_include_directories(ggml-amx PRIVATE . ..)

    # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
    # TODO: integrate AMX backend into the CPU backend
    if (MSVC)
        # NOTE(review): the enclosing condition requires CMAKE_COMPILER_IS_GNUCC,
        # so this MSVC branch looks unreachable in practice; kept for parity
        # with the CPU backend.

        # instruction set detection for MSVC only
        if (GGML_NATIVE)
            # TODO: improve, should not reference files from the parent folder
            include(../ggml-cpu/cmake/FindSIMD.cmake)
        endif()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS /arch:AVX512)
            # MSVC has no compile-time flags enabling specific
            # AVX512 extensions, neither it defines the
            # macros corresponding to the extensions.
            # Do it manually (scoped to the ggml-amx target only).
            if (GGML_AVX512_VBMI)
                target_compile_definitions(ggml-amx PRIVATE
                    $<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
            endif()
            if (GGML_AVX512_VNNI)
                target_compile_definitions(ggml-amx PRIVATE
                    $<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
            if (GGML_AVX512_BF16)
                target_compile_definitions(ggml-amx PRIVATE
                    $<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
            endif()
            if (GGML_AMX_TILE)
                target_compile_definitions(ggml-amx PRIVATE
                    $<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
            endif()
            if (GGML_AMX_INT8)
                target_compile_definitions(ggml-amx PRIVATE
                    $<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
            endif()
            if (GGML_AMX_BF16)
                target_compile_definitions(ggml-amx PRIVATE
                    $<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
            endif()
        elseif (GGML_AVX2)
            list(APPEND ARCH_FLAGS /arch:AVX2)
        elseif (GGML_AVX)
            list(APPEND ARCH_FLAGS /arch:AVX)
        endif()
    else()
        # GCC-style flags: one -m<feature> switch per enabled GGML option
        if (GGML_NATIVE)
            list(APPEND ARCH_FLAGS -march=native)
        endif()
        if (GGML_F16C)
            list(APPEND ARCH_FLAGS -mf16c)
        endif()
        if (GGML_FMA)
            list(APPEND ARCH_FLAGS -mfma)
        endif()
        if (GGML_AVX)
            list(APPEND ARCH_FLAGS -mavx)
        endif()
        if (GGML_AVX2)
            list(APPEND ARCH_FLAGS -mavx2)
        endif()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS -mavx512f)
            list(APPEND ARCH_FLAGS -mavx512dq)
            list(APPEND ARCH_FLAGS -mavx512bw)
        endif()
        if (GGML_AVX512_VBMI)
            list(APPEND ARCH_FLAGS -mavx512vbmi)
        endif()
        if (GGML_AVX512_VNNI)
            list(APPEND ARCH_FLAGS -mavx512vnni)
        endif()
        if (GGML_AVX512_BF16)
            list(APPEND ARCH_FLAGS -mavx512bf16)
        endif()
        if (GGML_AMX_TILE)
            list(APPEND ARCH_FLAGS -mamx-tile)
        endif()
        if (GGML_AMX_INT8)
            list(APPEND ARCH_FLAGS -mamx-int8)
        endif()
        if (GGML_AMX_BF16)
            list(APPEND ARCH_FLAGS -mamx-bf16)
        endif()
    endif()

    target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
else()
    # Propagate the decision to the including directory so it does not expect the backend.
    set(GGML_AMX OFF PARENT_SCOPE)
    message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
endif()
|