llama_cpp 0.15.0 → 0.15.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +3 -4
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +289 -17
- data/vendor/tmp/llama.cpp/ggml-impl.h +77 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +23 -8
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1 -1
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +18 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +11 -9
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +950 -267
- data/vendor/tmp/llama.cpp/ggml.c +1090 -89
- data/vendor/tmp/llama.cpp/ggml.h +15 -7
- data/vendor/tmp/llama.cpp/llama.cpp +57 -17
- data/vendor/tmp/llama.cpp/llama.h +7 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1187 -655
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -1
- data/vendor/tmp/llama.cpp/unicode.cpp +254 -122
- data/vendor/tmp/llama.cpp/unicode.h +4 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ce6d72aeb5fb9aff775d44284bf934e164f8470973619507ef6e6eb1ac0bec4d
+  data.tar.gz: 7c1ae823c90f957219b3edbc20f091b65a50caa984c1a6f4d137a46c376b2f0c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d23cb6a63b7734df2547c5e61a699fa206878c747e274e004c829b77335a7cc7434e92168a55d8ab0a617b11eddb5d45d5057a91b92e848735fd9e852b2476cd
+  data.tar.gz: f54b09de3cc60de81be977e9706a9beb3bf28e7740a19a57f6add543fe10cd6dc4101cbbe22dd5b62870c78a1ad4d10f57dd29b7c3e3e12b950e6575cf67b0c7
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
+## [[0.15.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.0...v0.15.1)] - 2024-05-11
+
+- Bump llama.cpp from b2781 to b2839.
+- Add constants for pre-tokenization types.
+- Add constant for model file type.
+
 ## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03
 
 - Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -3428,6 +3428,11 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
@@ -3465,6 +3470,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_BF16", INT2NUM(LLAMA_FTYPE_MOSTLY_BF16));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
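Note on the mechanism above: each new constant is surfaced to Ruby through the extension's init function via rb_define_const. A minimal sketch of the same pattern (the module and constant names here are illustrative, not part of the gem):

    // sketch.cpp -- minimal Ruby C extension exposing a constant, mirroring
    // the rb_define_const calls above. "Example" and "ANSWER" are made up.
    #include <ruby.h>

    extern "C" void Init_example(void) {
        VALUE mod = rb_define_module("Example");
        // After `require "example"`, Ruby code sees Example::ANSWER == 42.
        rb_define_const(mod, "ANSWER", INT2NUM(42));
    }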
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.15.0'
+  VERSION = '0.15.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2781'
+  LLAMA_CPP_VERSION = 'b2839'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -24,6 +24,11 @@ module LLaMACpp
   LLAMA_VOCAB_PRE_TYPE_MPT: Integer
   LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
   LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
+  LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
+  LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
+  LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
+  LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
+  LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
 
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
@@ -53,6 +58,7 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
   LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
   LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
+  LLAMA_FTYPE_MOSTLY_BF16: Integer
 
   LLAMA_KV_OVERRIDE_TYPE_INT: Integer
   LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 			continue; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
@@ -436,7 +435,7 @@ ifdef LLAMA_CUDA
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
 	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@@ -761,7 +760,7 @@ lib: llama.o ggml.o $(OBJS)
 	ar rcs libllama.a $^
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -vrf ggml-cuda/*.o
 
 #
data/vendor/tmp/llama.cpp/ggml-cuda.cu
CHANGED
@@ -113,7 +113,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -259,7 +259,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };
 
 // pool with virtual memory
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
 
@@ -356,7 +356,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 #endif // !defined(GGML_USE_HIPBLAS)
 
 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
     }
@@ -1647,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(
     }
 }
 
-static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -1670,7 +1670,7 @@ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const gg
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }
 
-static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
@@ -2410,32 +2410,304 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     GGML_UNUSED(backend);
 }
 
+static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    graph_node_properties->node_address = node->data;
+    graph_node_properties->node_op = node->op;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        graph_node_properties->ne[i] = node->ne[i];
+        graph_node_properties->nb[i] = node->nb[i];
+    }
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
+    }
+}
+
+static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    if (node->data != graph_node_properties->node_address &&
+        node->op != GGML_OP_CPY &&
+        node->op != GGML_OP_VIEW) {
+        return false;
+    }
+
+    if (node->op != graph_node_properties->node_op) {
+        return false;
+    }
+
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (node->ne[i] != graph_node_properties->ne[i]) {
+            return false;
+        }
+        if (node->nb[i] != graph_node_properties->nb[i]) {
+            return false;
+        }
+    }
+
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (node->src[i] &&
+            node->src[i]->data != graph_node_properties->src_address[i] &&
+            node->op != GGML_OP_CPY &&
+            node->op != GGML_OP_VIEW
+        ) {
+            return false;
+        }
+    }
+    return true;
+}
+
 GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     ggml_cuda_set_device(cuda_ctx->device);
 
-
-
+#ifdef USE_CUDA_GRAPH
+    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
 
-
-
+    // Objects required for CUDA Graph
+    if (cuda_ctx->cuda_graph == nullptr) {
+        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+    }
+
+    bool use_cuda_graph = true;
+    bool cuda_graph_update_required = false;
+    // pointer to CUDA cpy kernel, which is required to identify
+    // kernel parameters which need updated in the graph for each token
+    void * ggml_cuda_cpy_fn_ptr = nullptr;
+
+    if (cuda_ctx->cuda_graph->graph == nullptr) {
+        if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
+            cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
+#ifndef NDEBUG
+            fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+#endif
+        }
+    }
+
+    // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
+    // or previous graph capture failure.
+    // Also disable for multi-gpu for now. TO DO investigate
+    if (disable_cuda_graphs_due_to_env
+        || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
+        || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
+        || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
+        use_cuda_graph = false;
+    }
+
+    if (use_cuda_graph) {
+        if (cuda_ctx->cuda_graph->instance == nullptr) {
+            cuda_graph_update_required = true;
         }
 
+        // Check if the graph size has changed
+        if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+            cuda_graph_update_required = true;
+            cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+        }
+
+        // Loop over nodes in GGML graph to determine if CUDA graph update is required
+        // and store properties to allow this comparison for the next token
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            bool has_matching_properties = true;
+            if (!cuda_graph_update_required) {
+                has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+            }
+            if (!has_matching_properties) {
+                cuda_graph_update_required = true;
+            }
+            set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+        }
+
+        // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+        cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            ggml_tensor * node = cgraph->nodes[i];
+
+            if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
+                use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-
-
-
-
+                fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+#endif
+            }
+
+            if (node->op == GGML_OP_MUL_MAT_ID) {
+                use_cuda_graph = false; // This node type is not supported by CUDA graph capture
+#ifndef NDEBUG
+                fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+#endif
+            }
+
+            if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+                // disable CUDA graphs for batch size > 1 for now.
+                // Changes in batch size or context size can cause changes to the grid size of some kernels.
+                use_cuda_graph = false;
+#ifndef NDEBUG
+                fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+#endif
+            }
+
+            if (node->op == GGML_OP_CPY) {
+                // store the copy op parameter which changes with each token.
+                cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+                if (ggml_cuda_cpy_fn_ptr == nullptr) {
+                    // store a pointer to the copy op CUDA kernel to identify it later
+                    ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+                }
+            }
+
+            if (!use_cuda_graph) {
+                break;
+            }
+        }
+
+        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+        if (cuda_graph_update_required) {
+            cuda_ctx->cuda_graph->number_consecutive_updates++;
+        } else {
+            cuda_ctx->cuda_graph->number_consecutive_updates = 0;
+        }
+
+        if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
+            cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
+#ifndef NDEBUG
+            fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+#endif
+        }
+    }
+
+    if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+    }
+
+#else
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
+#endif // USE_CUDA_GRAPH
+
+    bool graph_evaluated_or_captured = false;
+
+    while (!graph_evaluated_or_captured) {
+        // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
+        // With the use of CUDA graphs, the execution will be performed by the graph launch.
+        if (!use_cuda_graph || cuda_graph_update_required) {
+            for (int i = 0; i < cgraph->n_nodes; i++) {
+                ggml_tensor * node = cgraph->nodes[i];
+
+                if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+                    continue;
+                }
+
+#ifndef NDEBUG
+                assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    if (node->src[j] != nullptr) {
+                        assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+                    }
+                }
+#endif
+
+                bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
+                if (!ok) {
+                    fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                }
+                GGML_ASSERT(ok);
             }
         }
+
+#ifdef USE_CUDA_GRAPH
+        if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
+            if (cuda_ctx->cuda_graph->graph != nullptr) {
+                CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
+                cuda_ctx->cuda_graph->graph = nullptr;
+            }
+            CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
+
+#if 0
+            if (disable_cuda_graphs_due_to_failed_capture) {
+                use_cuda_graph = false;
+                cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
+#ifndef NDEBUG
+                fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
+            } else {
+                graph_evaluated_or_captured = true; // CUDA graph has been captured
+            }
+#endif
+            graph_evaluated_or_captured = true; // CUDA graph has been captured
+        } else {
+            graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
+        }
+    }
+
+    if (use_cuda_graph) {
+        if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+        }
+
+        // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+
+        if (cuda_graph_update_required) {
+            // Extract nodes from graph
+            if (cuda_ctx->cuda_graph->num_nodes == 0) {
+                // First call with null argument gets number of nodes in graph
+                CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+            }
+            // Subsequent call with non-null argument gets nodes
+            cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+            cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+            if (cuda_ctx->cuda_graph->num_nodes > 0) {
+                CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+                // Loop over nodes, and extract kernel parameters from each node
+                for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+                    cudaGraphNodeType node_type;
+                    CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+                    if (node_type == cudaGraphNodeTypeKernel) {
+                        cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+                        if (stat == cudaErrorInvalidDeviceFunction) {
+                            // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+                            // We don't need to update blas nodes, so clear error and move on.
+                            cudaGetLastError();
+                        } else {
+                            GGML_ASSERT(stat == cudaSuccess);
+                        }
+                    }
+                }
+            }
+        }
+
+        // One of the arguments to the copy kernel is updated for each token, hence we need to
+        // replace that argument with the updated value in the CUDA graph
+        if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
+            int k = 0;
+            for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+                if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
+                    char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+                    cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
+                    CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+                }
+            }
+        }
 
-
-
-
+        // Update graph executable
+        cudaGraphExecUpdateResultInfo result_info;
+        cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+        if (stat == cudaErrorGraphExecUpdateFailure) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+#endif
+            // The pre-existing graph exec cannot be updated due to violated constraints
+            // so instead clear error and re-instantiate
+            cudaGetLastError();
+            CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
+            cuda_ctx->cuda_graph->instance = nullptr;
+            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+        } else {
+            GGML_ASSERT(stat == cudaSuccess);
         }
-
+        // Launch graph
+        CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
+#else
+        graph_evaluated_or_captured = true;
+#endif // USE_CUDA_GRAPH
     }
 
     return GGML_STATUS_SUCCESS;
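This hunk is the substance of the llama.cpp b2781 → b2839 bump: when built with GGML_CUDA_USE_GRAPHS, the backend records each token's kernel sequence into a CUDA graph, relaunches it on later tokens, and only re-captures or patches kernel arguments when node properties change. A minimal standalone sketch of the underlying capture/instantiate/launch pattern (not ggml code; a memset stands in for the real kernels, and error checking is omitted for brevity):

    // graph_sketch.cpp -- host-side C++ against the CUDA runtime API,
    // link with -lcudart.
    #include <cuda_runtime.h>

    int main() {
        float * d = nullptr;
        cudaMalloc(&d, 1024 * sizeof(float));
        cudaStream_t stream;
        cudaStreamCreate(&stream);

        // Capture the work issued on the stream into a graph (one "token").
        cudaGraph_t graph;
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed);
        cudaMemsetAsync(d, 0, 1024 * sizeof(float), stream);
        cudaStreamEndCapture(stream, &graph);

        // Instantiate once, then relaunch cheaply for subsequent tokens;
        // the ggml code additionally patches per-token kernel arguments
        // via cudaGraphKernelNodeSetParams / cudaGraphExecUpdate.
        cudaGraphExec_t instance;
        cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
        for (int step = 0; step < 4; ++step) {
            cudaGraphLaunch(instance, stream);
        }
        cudaStreamSynchronize(stream);

        cudaGraphExecDestroy(instance);
        cudaGraphDestroy(graph);
        cudaFree(d);
        cudaStreamDestroy(stream);
        return 0;
    }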
data/vendor/tmp/llama.cpp/ggml-impl.h
CHANGED
@@ -17,6 +17,83 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+/**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───┐
+ *     0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───────────────────┐
+ *     0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ *       ┌sign
+ *       │
+ *       │  ┌exponent
+ *       │  │
+ *       │  │    ┌mantissa
+ *       │  │    │
+ *       │┌─┴─┐┌─┴──────┐
+ *     0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.i = (uint32_t)h.bits << 16;
+    return u.f;
+}
+
+/**
+ * Converts float32 to brain16.
+ *
+ * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
+ * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This code should vectorize nicely if using modern compilers.
+ */
+static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+    ggml_bf16_t h;
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.f = s;
+    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+        h.bits = (u.i >> 16) | 64; /* force to quiet */
+        return h;
+    }
+    if (!(u.i & 0x7f800000)) { /* subnormal */
+        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
+        return h;
+    }
+    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+    return h;
+}
+
+#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
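The conversion pair above is the whole trick behind the new BF16 support: bf16 is simply the top half of an IEEE binary32, so decoding is a 16-bit shift and encoding is round-to-nearest-even on bit 16. A self-contained sketch of the same bit manipulation, using memcpy in place of the union (the usual C++ spelling of type punning):

    // bf16_sketch.cpp -- standalone round trip, not the ggml API.
    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    static float bf16_to_fp32(uint16_t bits) {
        uint32_t u = (uint32_t)bits << 16;  // bf16 = top 16 bits of a float32
        float f;
        std::memcpy(&f, &u, sizeof(f));
        return f;
    }

    static uint16_t fp32_to_bf16(float s) {
        uint32_t u;
        std::memcpy(&u, &s, sizeof(u));
        if ((u & 0x7fffffff) > 0x7f800000) {      // NaN: force quiet, keep sign
            return (uint16_t)((u >> 16) | 64);
        }
        if (!(u & 0x7f800000)) {                  // subnormal: flush to signed zero
            return (uint16_t)((u & 0x80000000) >> 16);
        }
        // round to nearest, ties to even, then truncate to the top half
        return (uint16_t)((u + (0x7fff + ((u >> 16) & 1))) >> 16);
    }

    int main() {
        float x = 3.14159f;
        uint16_t h = fp32_to_bf16(x);
        // prints 3.141590 -> 0x4049 -> 3.140625 (about 0.03% error)
        std::printf("%f -> 0x%04x -> %f\n", x, h, bf16_to_fp32(h));
        return 0;
    }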
data/vendor/tmp/llama.cpp/ggml-metal.m
CHANGED
@@ -265,11 +265,20 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
 
 static void * ggml_metal_host_malloc(size_t n) {
     void * data = NULL;
+
+#if TARGET_OS_OSX
+    kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
+    if (err != KERN_SUCCESS) {
+        GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
+        return NULL;
+    }
+#else
     const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
     if (result != 0) {
         GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
         return NULL;
     }
+#endif
 
     return data;
 }
@@ -803,7 +812,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_GET_ROWS:
             {
-                return op->ne[3] == 1;
+                return op->src[0]->type != GGML_TYPE_BF16 && op->ne[3] == 1;
             }
         default:
             return false;
@@ -2840,7 +2849,11 @@ GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_
     ggml_backend_metal_free_device();
 
     if (ctx->owned) {
+#if TARGET_OS_OSX
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size);
+#else
         free(ctx->all_data);
+#endif
     }
 
     free(ctx);
@@ -2944,14 +2957,16 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
     ctx->owned = true;
     ctx->n_buffers = 1;
 
-    ctx->
-
-
-
-
+    if (ctx->all_data != NULL) {
+        ctx->buffers[0].data = ctx->all_data;
+        ctx->buffers[0].size = size;
+        ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
+                                                          length:size_aligned
+                                                         options:MTLResourceStorageModeShared
+                                                     deallocator:nil];
+    }
 
-    if (ctx->buffers[0].metal == nil) {
+    if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) {
        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
        free(ctx);
        ggml_backend_metal_free_device();
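The macOS-specific branch above swaps posix_memalign for Mach vm_allocate/vm_deallocate, giving page-allocated memory that can be wrapped zero-copy by newBufferWithBytesNoCopy. A compilable sketch of that allocation strategy (guarded here by __APPLE__ rather than the backend's TARGET_OS_OSX check, and without the Metal wrapping):

    // host_alloc_sketch.cpp -- page-aligned host allocation, two paths.
    #include <stdlib.h>
    #include <unistd.h>
    #ifdef __APPLE__
    #include <mach/mach.h>
    #endif

    static void * host_malloc(size_t n) {
        void * data = NULL;
    #ifdef __APPLE__
        // Mach VM allocation: always page-aligned, freed with vm_deallocate.
        kern_return_t err = vm_allocate((vm_map_t) mach_task_self(),
                                        (vm_address_t *) &data, n, VM_FLAGS_ANYWHERE);
        if (err != KERN_SUCCESS) return NULL;
    #else
        // Portable fallback: align to the system page size.
        if (posix_memalign(&data, sysconf(_SC_PAGESIZE), n) != 0) return NULL;
    #endif
        return data;
    }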
data/vendor/tmp/llama.cpp/ggml-metal.metal
CHANGED
@@ -2175,7 +2175,7 @@ kernel void kernel_flash_attn_ext_f16(
 
     const short D4 = D/4;
     const short D8 = D/8;
-
+  //const short Q8 = Q/8;
     const short NW = N_SIMDWIDTH;
     const short SH = (C + Q); // shared memory per simdgroup in (half)
 
data/vendor/tmp/llama.cpp/ggml-opencl.cpp
CHANGED
@@ -2119,6 +2119,7 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
     if (alignment == (cl_uint)-1) {
         ggml_cl_init();
         clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
+        alignment /= 8; // bits to bytes
     }
     return alignment;
 
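The one-line fix above exists because the OpenCL spec defines CL_DEVICE_MEM_BASE_ADDR_ALIGN in bits, while ggml treats alignments as bytes; without the division the backend over-aligned by a factor of eight. A small query sketch (the header path assumes a Linux OpenCL SDK; on macOS it is OpenCL/cl.h):

    // cl_align_sketch.cpp -- query and convert the base address alignment.
    #include <CL/cl.h>
    #include <cstdio>

    int main() {
        cl_platform_id platform;
        cl_device_id device;
        clGetPlatformIDs(1, &platform, NULL);
        clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);

        cl_uint align_bits = 0;
        clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN,
                        sizeof(align_bits), &align_bits, NULL);
        // The spec reports bits; byte-oriented allocators want bits / 8.
        std::printf("alignment: %u bits = %u bytes\n", align_bits, align_bits / 8);
        return 0;
    }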
data/vendor/tmp/llama.cpp/ggml-quants.c
CHANGED
@@ -12450,6 +12450,24 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
     const size_t nb = nbytes/ggml_type_size(type);
 
     switch (type) {
+        case GGML_TYPE_BF16:
+            {
+                int nans = 0;
+                int infs = 0;
+                const unsigned short * f = (const unsigned short *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                    nans += (f[i] & 0x7fff) > 0x7f80;
+                    infs += (f[i] & 0x7fff) == 0x7f80;
+                }
+                if (nans) {
+                    fprintf(stderr, "%s: found %d NaNs in row of %zu BF16 values\n", __func__, nans, nb);
+                    return false;
+                }
+                if (infs) {
+                    fprintf(stderr, "%s: found %d infinities in row of %zu BF16 values\n", __func__, infs, nb);
+                    return false;
+                }
+            } break;
        case GGML_TYPE_F16:
            {
                const ggml_fp16_t * f = (const ggml_fp16_t *) data;
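The masks in the new BF16 case follow from the format documented in ggml-impl.h: with the sign bit cleared (& 0x7fff), a bf16 payload of exactly 0x7f80 (all eight exponent bits set, mantissa zero) is an infinity, and anything greater is a NaN. A standalone check of those predicates:

    // bf16_validate_sketch.cpp -- the same classification on a few values.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint16_t values[] = { 0x3f80 /* 1.0 */, 0x7f80 /* +inf */,
                                    0xff80 /* -inf */, 0x7fc0 /* NaN */ };
        for (uint16_t v : values) {
            bool is_nan = (v & 0x7fff) > 0x7f80;   // exponent all ones, mantissa != 0
            bool is_inf = (v & 0x7fff) == 0x7f80;  // exponent all ones, mantissa == 0
            std::printf("0x%04x: nan=%d inf=%d\n", v, is_nan, is_inf);
        }
        return 0;
    }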