@fugood/llama.node 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +8 -9
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +43 -9
- package/src/llama.cpp/.github/workflows/docker.yml +3 -0
- package/src/llama.cpp/CMakeLists.txt +7 -4
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +0 -2
- package/src/llama.cpp/common/arg.cpp +642 -607
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +79 -281
- package/src/llama.cpp/common/common.h +130 -100
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +116 -108
- package/src/llama.cpp/common/sampling.h +20 -20
- package/src/llama.cpp/docs/build.md +37 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +14 -14
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
- package/src/llama.cpp/examples/infill/infill.cpp +40 -86
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/clip.cpp +1 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +37 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
- package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
- package/src/llama.cpp/examples/main/main.cpp +64 -109
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
- package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
- package/src/llama.cpp/examples/server/server.cpp +553 -691
- package/src/llama.cpp/examples/server/utils.hpp +312 -25
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +128 -96
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +53 -393
- package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
- package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
- package/src/llama.cpp/include/llama.h +67 -33
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +745 -105
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +49 -9
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +2636 -2406
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
- package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +1 -0
- package/src/llama.cpp/tests/test-sampling.cpp +162 -137
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
- /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
|
@@ -1,9 +1,5 @@
|
|
|
1
|
-
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
|
|
2
1
|
#pragma once
|
|
3
2
|
|
|
4
|
-
#define GGML_COMMON_DECL_C
|
|
5
|
-
#include "ggml-common.h"
|
|
6
|
-
|
|
7
3
|
#include "ggml.h"
|
|
8
4
|
|
|
9
5
|
// GGML internal header
|
|
@@ -12,27 +8,11 @@
|
|
|
12
8
|
extern "C" {
|
|
13
9
|
#endif
|
|
14
10
|
|
|
15
|
-
// Quantization
|
|
16
|
-
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
17
|
-
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
18
|
-
|
|
19
|
-
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
|
|
20
|
-
|
|
21
11
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
|
22
12
|
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
23
13
|
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
24
14
|
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
|
25
15
|
|
|
26
|
-
// GEMV
|
|
27
|
-
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
28
|
-
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
29
|
-
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
30
|
-
|
|
31
|
-
// GEMM
|
|
32
|
-
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
33
|
-
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
34
|
-
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
35
|
-
|
|
36
16
|
#ifdef __cplusplus
|
|
37
17
|
}
|
|
38
18
|
#endif
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
//#define GGML_ALLOCATOR_DEBUG
|
|
16
16
|
|
|
17
|
-
//#define AT_PRINTF(...)
|
|
17
|
+
//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
|
18
18
|
#define AT_PRINTF(...)
|
|
19
19
|
|
|
20
20
|
|
|
@@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
|
|
|
89
89
|
size = GGML_PAD(size, talloc->alignment);
|
|
90
90
|
|
|
91
91
|
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
|
|
92
|
-
|
|
92
|
+
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
|
|
93
93
|
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
|
|
94
94
|
GGML_ABORT("not enough space in the buffer");
|
|
95
95
|
}
|
|
@@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
|
|
|
172
172
|
best_fit_block = alloc->n_free_blocks - 1;
|
|
173
173
|
} else {
|
|
174
174
|
// this should never happen
|
|
175
|
-
|
|
175
|
+
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
|
|
176
176
|
__func__, size, max_avail);
|
|
177
177
|
GGML_ABORT("not enough space in the buffer");
|
|
178
178
|
}
|
|
@@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
|
|
|
209
209
|
}
|
|
210
210
|
}
|
|
211
211
|
}
|
|
212
|
-
|
|
212
|
+
GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
|
213
213
|
for (int i = 0; i < 1024; i++) {
|
|
214
214
|
if (alloc->allocated_tensors[i].tensor) {
|
|
215
|
-
|
|
215
|
+
GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
|
|
216
216
|
alloc->allocated_tensors[i].offset,
|
|
217
217
|
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
|
|
218
218
|
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
|
|
219
219
|
}
|
|
220
220
|
}
|
|
221
|
-
|
|
221
|
+
GGML_LOG_DEBUG("\n");
|
|
222
222
|
}
|
|
223
223
|
#endif
|
|
224
224
|
|
|
@@ -348,7 +348,6 @@ struct tensor_alloc {
|
|
|
348
348
|
};
|
|
349
349
|
|
|
350
350
|
struct leaf_alloc {
|
|
351
|
-
int buffer_id;
|
|
352
351
|
struct tensor_alloc leaf;
|
|
353
352
|
};
|
|
354
353
|
|
|
@@ -467,18 +466,12 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
|
|
467
466
|
return ggml_gallocr_hash_get(galloc, t)->allocated;
|
|
468
467
|
}
|
|
469
468
|
|
|
470
|
-
static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
|
|
471
|
-
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
|
472
|
-
hn->buffer_id = buffer_id;
|
|
473
|
-
hn->offset = offset;
|
|
474
|
-
hn->allocated = true;
|
|
475
|
-
}
|
|
476
|
-
|
|
477
469
|
// Reports whether tensor `t` already has storage: either its data pointer is
// set, or the allocator's hash entry for it is flagged as allocated.
static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    if (t->data != NULL) {
        // data pointer already set: the hash table is not consulted
        return true;
    }
    return ggml_gallocr_hash_get(galloc, t)->allocated;
}
|
|
480
472
|
|
|
481
473
|
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
|
|
474
|
+
GGML_ASSERT(buffer_id >= 0);
|
|
482
475
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
|
483
476
|
|
|
484
477
|
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
|
|
@@ -740,7 +733,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
|
740
733
|
for (int i = 0; i < graph->n_leafs; i++) {
|
|
741
734
|
struct ggml_tensor * leaf = graph->leafs[i];
|
|
742
735
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
|
743
|
-
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
|
|
744
736
|
if (leaf->view_src || leaf->data) {
|
|
745
737
|
galloc->leaf_allocs[i].leaf.buffer_id = -1;
|
|
746
738
|
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
|
|
@@ -768,13 +760,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
|
768
760
|
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
|
|
769
761
|
if (new_size > cur_size || galloc->buffers[i] == NULL) {
|
|
770
762
|
#ifndef NDEBUG
|
|
771
|
-
|
|
763
|
+
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
|
772
764
|
#endif
|
|
773
765
|
|
|
774
766
|
ggml_backend_buffer_free(galloc->buffers[i]);
|
|
775
767
|
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
|
776
768
|
if (galloc->buffers[i] == NULL) {
|
|
777
|
-
|
|
769
|
+
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
|
778
770
|
return false;
|
|
779
771
|
}
|
|
780
772
|
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
@@ -818,21 +810,25 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
|
|
818
810
|
}
|
|
819
811
|
|
|
820
812
|
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
|
|
821
|
-
size_t node_size =
|
|
813
|
+
size_t node_size = 0;
|
|
814
|
+
if (!node->data && !node->view_src) {
|
|
815
|
+
GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
|
|
816
|
+
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
|
|
817
|
+
}
|
|
822
818
|
return talloc->size_max >= node_size;
|
|
823
819
|
}
|
|
824
820
|
|
|
825
821
|
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
|
|
826
822
|
if (galloc->n_nodes != graph->n_nodes) {
|
|
827
823
|
#ifndef NDEBUG
|
|
828
|
-
|
|
824
|
+
GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
|
|
829
825
|
#endif
|
|
830
826
|
return true;
|
|
831
827
|
}
|
|
832
828
|
|
|
833
829
|
if (galloc->n_leafs != graph->n_leafs) {
|
|
834
830
|
#ifndef NDEBUG
|
|
835
|
-
|
|
831
|
+
GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
|
|
836
832
|
#endif
|
|
837
833
|
return true;
|
|
838
834
|
}
|
|
@@ -843,7 +839,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
|
|
843
839
|
|
|
844
840
|
if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
|
|
845
841
|
#ifndef NDEBUG
|
|
846
|
-
|
|
842
|
+
GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
|
|
847
843
|
#endif
|
|
848
844
|
return true;
|
|
849
845
|
}
|
|
@@ -855,7 +851,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
|
|
855
851
|
}
|
|
856
852
|
if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
|
|
857
853
|
#ifndef NDEBUG
|
|
858
|
-
|
|
854
|
+
GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
|
|
859
855
|
#endif
|
|
860
856
|
return true;
|
|
861
857
|
}
|
|
@@ -869,14 +865,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
|
|
869
865
|
if (ggml_gallocr_needs_realloc(galloc, graph)) {
|
|
870
866
|
if (galloc->n_buffers == 1) {
|
|
871
867
|
#ifndef NDEBUG
|
|
872
|
-
|
|
868
|
+
GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
|
|
873
869
|
#endif
|
|
874
870
|
if (!ggml_gallocr_reserve(galloc, graph)) {
|
|
875
871
|
return false;
|
|
876
872
|
}
|
|
877
873
|
} else {
|
|
878
874
|
#ifndef NDEBUG
|
|
879
|
-
|
|
875
|
+
GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
|
|
880
876
|
#endif
|
|
881
877
|
return false;
|
|
882
878
|
}
|
|
@@ -940,7 +936,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
|
|
940
936
|
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
|
|
941
937
|
if (buffer == NULL) {
|
|
942
938
|
#ifndef NDEBUG
|
|
943
|
-
|
|
939
|
+
GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
|
|
944
940
|
#endif
|
|
945
941
|
for (size_t i = 0; i < *n_buffers; i++) {
|
|
946
942
|
ggml_backend_buffer_free((*buffers)[i]);
|
|
@@ -990,7 +986,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|
|
990
986
|
}
|
|
991
987
|
|
|
992
988
|
if (this_size > max_size) {
|
|
993
|
-
|
|
989
|
+
GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
|
|
994
990
|
__func__, t->name,
|
|
995
991
|
ggml_backend_buft_name(buft),
|
|
996
992
|
this_size, max_size);
|
|
@@ -1022,7 +1018,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|
|
1022
1018
|
|
|
1023
1019
|
if (n_buffers == 0) {
|
|
1024
1020
|
#ifndef NDEBUG
|
|
1025
|
-
|
|
1021
|
+
GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
|
|
1026
1022
|
#endif
|
|
1027
1023
|
return NULL;
|
|
1028
1024
|
}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# AMX backend target. It is only configured when the condition below holds
# (x86 target, GNU compiler, compiler version greater than 11.0); otherwise
# GGML_AMX is switched off in the parent scope and a warning is printed.
if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND
        CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
    message(STATUS "Using AMX")

    # headers in this directory plus the public AMX header
    file(GLOB GGML_HEADERS_AMX "*.h")
    list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")

    file(GLOB GGML_SOURCES_AMX "*.cpp")

    add_library(ggml-amx ${GGML_HEADERS_AMX} ${GGML_SOURCES_AMX})

    target_link_libraries(ggml-amx PRIVATE ggml-base)
    target_include_directories(ggml-amx PRIVATE . ..)

    # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
    # TODO: integrate AMX backend into the CPU backend
    if (MSVC)
        # instruction set detection for MSVC only
        if (GGML_NATIVE)
            # TODO: improve, should not reference files from the parent folder
            include(../ggml-cpu/cmake/FindSIMD.cmake)
        endif()

        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS /arch:AVX512)
            # MSVC has no compile-time flags that enable specific AVX512
            # extensions, nor does it define the macros corresponding to
            # those extensions, so define them manually for C and C++.
            if (GGML_AVX512_VBMI)
                add_compile_definitions(
                    $<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
            endif()
            if (GGML_AVX512_VNNI)
                add_compile_definitions(
                    $<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
            if (GGML_AVX512_BF16)
                add_compile_definitions(
                    $<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
            endif()
            if (GGML_AMX_TILE)
                add_compile_definitions(
                    $<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
            endif()
            if (GGML_AMX_INT8)
                add_compile_definitions(
                    $<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
            endif()
            if (GGML_AMX_BF16)
                add_compile_definitions(
                    $<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>
                    $<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
            endif()
        elseif (GGML_AVX2)
            list(APPEND ARCH_FLAGS /arch:AVX2)
        elseif (GGML_AVX)
            list(APPEND ARCH_FLAGS /arch:AVX)
        endif()
    else()
        # non-MSVC: one -m flag per enabled feature, appended in a fixed order
        if (GGML_NATIVE)
            list(APPEND ARCH_FLAGS -march=native)
        endif()
        if (GGML_F16C)
            list(APPEND ARCH_FLAGS -mf16c)
        endif()
        if (GGML_FMA)
            list(APPEND ARCH_FLAGS -mfma)
        endif()
        if (GGML_AVX)
            list(APPEND ARCH_FLAGS -mavx)
        endif()
        if (GGML_AVX2)
            list(APPEND ARCH_FLAGS -mavx2)
        endif()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS -mavx512f -mavx512dq -mavx512bw)
        endif()
        if (GGML_AVX512_VBMI)
            list(APPEND ARCH_FLAGS -mavx512vbmi)
        endif()
        if (GGML_AVX512_VNNI)
            list(APPEND ARCH_FLAGS -mavx512vnni)
        endif()
        if (GGML_AVX512_BF16)
            list(APPEND ARCH_FLAGS -mavx512bf16)
        endif()
        if (GGML_AMX_TILE)
            list(APPEND ARCH_FLAGS -mamx-tile)
        endif()
        if (GGML_AMX_INT8)
            list(APPEND ARCH_FLAGS -mamx-int8)
        endif()
        if (GGML_AMX_BF16)
            list(APPEND ARCH_FLAGS -mamx-bf16)
        endif()
    endif()

    target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
else()
    set(GGML_AMX OFF PARENT_SCOPE)
    message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
endif()
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "ggml.h"
|
|
4
|
+
// hack until AMX is moved into the CPU backend
|
|
5
|
+
#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
|
|
6
|
+
|
|
7
|
+
#include <algorithm>
|
|
8
|
+
#include <memory>
|
|
9
|
+
#include <type_traits>
|
|
10
|
+
|
|
11
|
+
#if defined(_OPENMP)
|
|
12
|
+
#include <omp.h>
|
|
13
|
+
#endif
|
|
14
|
+
|
|
15
|
+
// Blocking constants shared by the AMX kernels.
// NOTE(review): presumably the M/N/K extents of one AMX tile and the VNNI
// packing factor; confirm against the kernels that consume them.
#define TILE_M 16
#define TILE_N 16
#define TILE_K 32
#define VNNI_BLK 4

// NOTE(review): presumably the block size processed per AMX step; confirm.
#define AMX_BLK_SIZE 32

// Symbolic names for the integers 0..7.
// NOTE(review): presumably AMX tile register indices (tmm0..tmm7); confirm.
#define TMM0 0
#define TMM1 1
#define TMM2 2
#define TMM3 3
#define TMM4 4
#define TMM5 5
#define TMM6 6
#define TMM7 7
|
|
30
|
+
|
|
31
|
+
// parallel routines
|
|
32
|
+
// Integer ceiling division: the smallest q with q * y >= x.
//
// Precondition: y > 0.
//
// Computed as quotient plus a remainder test rather than (x + y - 1) / y, so
// the intermediate sum cannot overflow (signed) or wrap (unsigned) when x is
// close to the maximum of T. For x >= 0 the result equals the classic formula.
template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
inline T div_up(T x, T y) {
    const T quot = x / y;
    const T rem  = x % y;
    return rem > 0 ? static_cast<T>(quot + 1) : quot;
}
|
|
34
|
+
|
|
35
|
+
// Splits the index range [0, n) into `nth` contiguous chunks of equal
// (rounded-up) size and writes the half-open sub-range [n_start, n_end)
// belonging to worker `ith`.
//
//   n       total number of items
//   nth     number of workers; values <= 1 give the whole range to worker 0
//   ith     index of the calling worker, 0 <= ith < nth
//   n_start out: first index owned by `ith`
//   n_end   out: one past the last index owned by `ith`
//
// Both outputs are clamped to n, so n_start <= n_end always holds. Workers
// whose chunk would begin past the end receive the empty range [n, n).
template <typename T>
inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
#if 0
    // onednn partition pattern
    T& n_my = n_end;
    if (nth <= 1 || n == 0) {
        n_start = 0;
        n_my = n;
    } else {
        T n1 = div_up(n, nth);
        T n2 = n1 - 1;
        T T1 = n - n2 * nth;
        n_my = ith < T1 ? n1 : n2;
        n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
    }
    n_end += n_start;
#else
    // pytorch aten partition pattern
    // chunk = ceil(n / nth); with nth <= 1 a single chunk covers everything
    // (this also avoids dividing by zero)
    const T chunk = nth > 1
        ? static_cast<T>(n / nth + (n % nth > 0 ? 1 : 0))
        : n;
    // clamp the start as well as the end, otherwise trailing workers would
    // get an inverted range (start > end) when n is not a multiple of chunk
    n_start = std::min(static_cast<T>(ith * chunk), n);
    n_end   = std::min(static_cast<T>(n_start + chunk), n);
#endif
}
|
|
58
|
+
|
|
59
|
+
template <typename func_t>
|
|
60
|
+
inline void parallel_for(int nth, int n, const func_t& f) {
|
|
61
|
+
#if defined(_OPENMP)
|
|
62
|
+
#pragma omp parallel num_threads(nth)
|
|
63
|
+
{
|
|
64
|
+
//int nth = omp_get_num_threads();
|
|
65
|
+
int ith = omp_get_thread_num();
|
|
66
|
+
int tbegin, tend;
|
|
67
|
+
balance211(n, nth, ith, tbegin, tend);
|
|
68
|
+
f(tbegin, tend);
|
|
69
|
+
}
|
|
70
|
+
#else
|
|
71
|
+
f(0, n);
|
|
72
|
+
|
|
73
|
+
GGML_UNUSED(nth);
|
|
74
|
+
#endif
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
// quantized types that have AMX support
|
|
78
|
+
inline bool qtype_has_amx_kernels(const enum ggml_type type) {
|
|
79
|
+
// TODO: fix padding for vnni format
|
|
80
|
+
return (type == GGML_TYPE_Q4_0) ||
|
|
81
|
+
(type == GGML_TYPE_Q4_1);
|
|
82
|
+
//(type == GGML_TYPE_Q8_0) ||
|
|
83
|
+
//(type == GGML_TYPE_Q4_K) ||
|
|
84
|
+
//(type == GGML_TYPE_Q5_K) ||
|
|
85
|
+
//(type == GGML_TYPE_Q6_K) ||
|
|
86
|
+
//(type == GGML_TYPE_IQ4_XS);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// ggml backend context
|
|
90
|
+
struct ggml_backend_amx_context {
|
|
91
|
+
int n_threads = GGML_DEFAULT_N_THREADS;
|
|
92
|
+
std::unique_ptr<char[]> work_data;
|
|
93
|
+
size_t work_size = 0;
|
|
94
|
+
};
|