llama_cpp 0.15.0 → 0.15.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -7
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +303 -23
- data/vendor/tmp/llama.cpp/ggml-impl.h +84 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +137 -133
- data/vendor/tmp/llama.cpp/ggml-metal.metal +87 -110
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +2220 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +35 -152
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +953 -268
- data/vendor/tmp/llama.cpp/ggml.c +1762 -681
- data/vendor/tmp/llama.cpp/ggml.h +43 -24
- data/vendor/tmp/llama.cpp/llama.cpp +533 -296
- data/vendor/tmp/llama.cpp/llama.h +10 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -1637
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +286 -176
- data/vendor/tmp/llama.cpp/unicode.h +44 -10
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
---
SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 30dd4c29b86098faf7c78de5fa8e57021b631bb5eb3d14c93f63f1d186383ab8
+  data.tar.gz: b011d891f1cd725f84821428a8db24004b52c9614e785f493f721f7abde71029
SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6c1628f93762747688f802db8593946e8581c869f63c610669b45759f644b3d19b061825b788e328b6b984977112837586ed398b6118a8f8e5f0c7f6fd0eb2dd
+  data.tar.gz: 2f8c3d9f1e6c0f6db7e0682995c8d34179d5405d32784bf00f04a3408cb5bf4c95557bfa1692026f8d3dc9e672d6b15dec5d33cbd76ddc1d94e5ec964a9d0409
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,15 @@
+## [[0.15.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.1...v0.15.2)] - 2024-05-18
+
+- Bump llama.cpp from b2839 to b2917.
+
+Implementation binding for rpc_servers in llama_model_params has been skipped.
+
+## [[0.15.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.0...v0.15.1)] - 2024-05-11
+
+- Bump llama.cpp from b2781 to b2839.
+- Add constants for pre-tokenization types.
+- Add constant for model file type.
+
## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03

- Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -3428,6 +3428,11 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));

  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
@@ -3465,6 +3470,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_BF16", INT2NUM(LLAMA_FTYPE_MOSTLY_BF16));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

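The added `rb_define_const` calls above follow the usual pattern for surfacing a llama.cpp enum value through the gem's C extension. A minimal, illustrative sketch of that pattern (the module and constant names below are placeholders, not part of the gem):

```cpp
#include <ruby.h>

// Hypothetical C-side enum value to expose; in the gem the values come
// from the vendored llama.h (e.g. LLAMA_VOCAB_PRE_TYPE_REFACT).
enum { EXAMPLE_VOCAB_PRE_TYPE_DEMO = 42 };

extern "C" void Init_example_ext(void) {
    // Define (or reopen) a module and attach the constant to it.
    VALUE mExample = rb_define_module("ExampleExt");
    // INT2NUM converts the C int into a Ruby Integer object.
    rb_define_const(mExample, "VOCAB_PRE_TYPE_DEMO", INT2NUM(EXAMPLE_VOCAB_PRE_TYPE_DEMO));
}
```

On the Ruby side such a value is then read as `ExampleExt::VOCAB_PRE_TYPE_DEMO`, which is why each new constant in this release also gains a matching `Integer` entry in `sig/llama_cpp.rbs` below.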
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
module LLaMACpp
  # The version of llama_cpp.rb you install.
-  VERSION = '0.15.0'
+  VERSION = '0.15.2'

  # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2781'
+  LLAMA_CPP_VERSION = 'b2917'
end
data/sig/llama_cpp.rbs
CHANGED
@@ -24,6 +24,11 @@ module LLaMACpp
  LLAMA_VOCAB_PRE_TYPE_MPT: Integer
  LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
  LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
+  LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
+  LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
+  LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
+  LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
+  LLAMA_VOCAB_PRE_TYPE_DBRX: Integer

  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
@@ -53,6 +58,7 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
  LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
  LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
+  LLAMA_FTYPE_MOSTLY_BF16: Integer

  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
		./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
		./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
		./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-		./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
-		./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
		./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
		./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
		./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+		./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
	elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
		continue; \
	elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
@@ -436,7 +435,7 @@ ifdef LLAMA_CUDA
else
	CUDA_PATH ?= /usr/local/cuda
endif
-	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
	OBJS += ggml-cuda.o
	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
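The functional change in this hunk is the extra `-DGGML_CUDA_USE_GRAPHS` define, which switches on the CUDA-graph path added to `ggml-cuda.cu` further down. As a rough sketch of how such a `-D` flag typically gates a feature at compile time (the exact internal macro that the vendored headers derive from this flag is assumed here, not taken from the source):

```cpp
// Built with: g++ -DGGML_CUDA_USE_GRAPHS gate_demo.cpp
#include <cstdio>

#ifdef GGML_CUDA_USE_GRAPHS
#define USE_CUDA_GRAPH  // assumed internal alias; ggml-cuda.cu guards its graph code with #ifdef USE_CUDA_GRAPH
#endif

int main() {
#ifdef USE_CUDA_GRAPH
    std::puts("CUDA graph support compiled in");
#else
    std::puts("CUDA graph support compiled out");
#endif
    return 0;
}
```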
@@ -563,10 +562,10 @@ endif # LLAMA_VULKAN
ifdef LLAMA_HIPBLAS
	ifeq ($(wildcard /opt/rocm),)
		ROCM_PATH ?= /usr
-
+		AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
	else
		ROCM_PATH ?= /opt/rocm
-
+		AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
	endif
	HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
	LLAMA_CUDA_DMMV_X ?= 32
@@ -578,7 +577,7 @@ ifdef LLAMA_HIP_UMA
endif # LLAMA_HIP_UMA
	MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
	MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
-	HIPFLAGS += $(addprefix --offload-arch=,$(
+	HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
	HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
	HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
	HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
@@ -761,7 +760,7 @@ lib: llama.o ggml.o $(OBJS)
	ar rcs libllama.a $^

clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
	rm -vrf ggml-cuda/*.o

#
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -1182,9 +1182,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
static char * fmt_size(size_t size) {
    static char buffer[128];
    if (size >= 1024*1024) {
-
+        snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
    } else {
-
+        snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
    }
    return buffer;
}
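The replacement lines in `fmt_size` use `snprintf(buffer, sizeof(buffer), ...)`, which ties the size argument to the actual array so the output can never overrun it. A tiny standalone illustration of the idiom (not the library's code; `fmt_size_demo` is a hypothetical name):

```cpp
#include <cstdio>

// Formats a byte count into a static buffer, truncating safely if needed.
static const char * fmt_size_demo(std::size_t size) {
    static char buffer[128];
    if (size >= 1024 * 1024) {
        std::snprintf(buffer, sizeof(buffer), "%zuM", size / 1024 / 1024);
    } else {
        std::snprintf(buffer, sizeof(buffer), "%zuK", size / 1024);
    }
    return buffer;
}

int main() {
    std::printf("%s\n", fmt_size_demo(3 * 1024 * 1024));  // prints "3M"
    return 0;
}
```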
@@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t

    tensor->buffer = buffer;
    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    tensor->backend = tensor->view_src->backend;
    ggml_backend_buffer_init_tensor(buffer, tensor);
}

data/vendor/tmp/llama.cpp/ggml-cuda.cu
CHANGED
@@ -4,7 +4,6 @@

#include "ggml-cuda/common.cuh"
#include "ggml-cuda/acc.cuh"
-#include "ggml-cuda/alibi.cuh"
#include "ggml-cuda/arange.cuh"
#include "ggml-cuda/argsort.cuh"
#include "ggml-cuda/binbcast.cuh"
@@ -113,7 +112,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
    for (int id = 0; id < info.device_count; ++id) {
        int device_vmm = 0;

-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
        CUdevice device;
        CU_CHECK(cuDeviceGet(&device, id));
        CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -259,7 +258,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
};

// pool with virtual memory
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
    static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

@@ -356,7 +355,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
#endif // !defined(GGML_USE_HIPBLAS)

std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
    if (ggml_cuda_info().devices[device].vmm) {
        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
    }
@@ -1647,7 +1646,7 @@ static void ggml_cuda_op_mul_mat(
    }
}

-static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -1670,7 +1669,7 @@ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const gg
    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
}

-static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));
    GGML_ASSERT(!ggml_is_permuted(src0));
@@ -2205,6 +2204,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_UNARY_OP_RELU:
            ggml_cuda_op_relu(ctx, dst);
            break;
+        case GGML_UNARY_OP_SIGMOID:
+            ggml_cuda_op_sigmoid(ctx, dst);
+            break;
        case GGML_UNARY_OP_HARDSIGMOID:
            ggml_cuda_op_hardsigmoid(ctx, dst);
            break;
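The new `GGML_UNARY_OP_SIGMOID` case dispatches to a CUDA implementation of the element-wise logistic function. As a reference for what that op computes (a plain host-side sketch, not the actual `ggml_cuda_op_sigmoid` kernel):

```cpp
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Element-wise sigmoid: y = 1 / (1 + exp(-x)). The CUDA version assigns
// one thread per element; this loop is the same math on the host.
static void sigmoid_ref(const float * x, float * y, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        y[i] = 1.0f / (1.0f + std::exp(-x[i]));
    }
}

int main() {
    std::vector<float> x = {-2.0f, 0.0f, 2.0f};
    std::vector<float> y(x.size());
    sigmoid_ref(x.data(), y.data(), x.size());
    for (float v : y) {
        std::printf("%f\n", v);  // ~0.119, 0.5, ~0.881
    }
    return 0;
}
```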
@@ -2277,9 +2279,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_ROPE:
            ggml_cuda_op_rope(ctx, dst);
            break;
-        case GGML_OP_ALIBI:
-            ggml_cuda_op_alibi(ctx, dst);
-            break;
        case GGML_OP_IM2COL:
            ggml_cuda_op_im2col(ctx, dst);
            break;
@@ -2410,44 +2409,318 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
    GGML_UNUSED(backend);
}

+static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    graph_node_properties->node_address = node->data;
+    graph_node_properties->node_op = node->op;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        graph_node_properties->ne[i] = node->ne[i];
+        graph_node_properties->nb[i] = node->nb[i];
+    }
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
+    }
+}
+
+static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    if (node->data != graph_node_properties->node_address &&
+          node->op != GGML_OP_CPY &&
+          node->op != GGML_OP_VIEW) {
+        return false;
+    }
+
+    if (node->op != graph_node_properties->node_op) {
+        return false;
+    }
+
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (node->ne[i] != graph_node_properties->ne[i]) {
+            return false;
+        }
+        if (node->nb[i] != graph_node_properties->nb[i]) {
+            return false;
+        }
+    }
+
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (node->src[i] &&
+            node->src[i]->data != graph_node_properties->src_address[i] &&
+            node->op != GGML_OP_CPY &&
+            node->op != GGML_OP_VIEW
+        ) {
+            return false;
+        }
+    }
+    return true;
+}
+
GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    ggml_cuda_set_device(cuda_ctx->device);

-
-
+#ifdef USE_CUDA_GRAPH
+    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

-
-
+    // Objects required for CUDA Graph
+    if (cuda_ctx->cuda_graph == nullptr) {
+        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+    }
+
+    bool use_cuda_graph = true;
+    bool cuda_graph_update_required = false;
+    // pointer to CUDA cpy kernel, which is required to identify
+    // kernel parameters which need updated in the graph for each token
+    void * ggml_cuda_cpy_fn_ptr = nullptr;
+
+    if (cuda_ctx->cuda_graph->graph == nullptr) {
+        if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
+            cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
+#ifndef NDEBUG
+            fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+#endif
+        }
+    }
+
+    // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
+    // or previous graph capture failure.
+    // Also disable for multi-gpu for now. TO DO investigate
+    if (disable_cuda_graphs_due_to_env
+        || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
+        || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
+        || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
+        use_cuda_graph = false;
+    }
+
+    if (use_cuda_graph) {
+        if (cuda_ctx->cuda_graph->instance == nullptr) {
+            cuda_graph_update_required = true;
+        }
+
+        // Check if the graph size has changed
+        if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+            cuda_graph_update_required = true;
+            cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+        }
+
+        // Loop over nodes in GGML graph to determine if CUDA graph update is required
+        // and store properties to allow this comparison for the next token
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            bool has_matching_properties = true;
+            if (!cuda_graph_update_required) {
+                has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+            }
+            if (!has_matching_properties) {
+                cuda_graph_update_required = true;
+            }
+            set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+        }
+
+        // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+        cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            ggml_tensor * node = cgraph->nodes[i];
+
+            if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
+                use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
+#ifndef NDEBUG
+                fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+#endif
+            }
+
+            if (node->op == GGML_OP_MUL_MAT_ID) {
+                use_cuda_graph = false; // This node type is not supported by CUDA graph capture
+#ifndef NDEBUG
+                fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+#endif
+            }
+
+            if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+                // disable CUDA graphs for batch size > 1 for now.
+                // Changes in batch size or context size can cause changes to the grid size of some kernels.
+                use_cuda_graph = false;
+#ifndef NDEBUG
+                fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+#endif
+            }
+
+            if (node->op == GGML_OP_CPY) {
+                // store the copy op parameter which changes with each token.
+                cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+                if (ggml_cuda_cpy_fn_ptr == nullptr) {
+                    // store a pointer to the copy op CUDA kernel to identify it later
+                    ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+                }
+            }
+
+            if (!use_cuda_graph) {
+                break;
+            }
+        }
+
+        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+        if (use_cuda_graph && cuda_graph_update_required) {
+            cuda_ctx->cuda_graph->number_consecutive_updates++;
+        } else {
+            cuda_ctx->cuda_graph->number_consecutive_updates = 0;
        }

+        if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
+            cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
#ifndef NDEBUG
-
-
-
-
+            fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+#endif
+        }
+    }
+
+    if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+    }
+
+#else
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
+#endif // USE_CUDA_GRAPH
+
+    bool graph_evaluated_or_captured = false;
+
+    while (!graph_evaluated_or_captured) {
+        // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
+        // With the use of CUDA graphs, the execution will be performed by the graph launch.
+        if (!use_cuda_graph || cuda_graph_update_required) {
+            for (int i = 0; i < cgraph->n_nodes; i++) {
+                ggml_tensor * node = cgraph->nodes[i];
+
+                if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+                    continue;
+                }
+
+#ifndef NDEBUG
+                assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    if (node->src[j] != nullptr) {
+                        assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+                    }
+                }
+#endif
+
+                bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
+                if (!ok) {
+                    fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                }
+                GGML_ASSERT(ok);
            }
        }
+
+#ifdef USE_CUDA_GRAPH
+        if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
+            if (cuda_ctx->cuda_graph->graph != nullptr) {
+                CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
+                cuda_ctx->cuda_graph->graph = nullptr;
+            }
+            CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
+
+#if 0
+            if (disable_cuda_graphs_due_to_failed_capture) {
+                use_cuda_graph = false;
+                cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
+#ifndef NDEBUG
+                fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
#endif
+            } else {
+                graph_evaluated_or_captured = true; // CUDA graph has been captured
+            }
+#endif
+            graph_evaluated_or_captured = true; // CUDA graph has been captured
+        } else {
+            graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
+        }
+    }
+
+    if (use_cuda_graph) {
+        if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+        }
+
+        // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+
+        if (cuda_graph_update_required) {
+            // Extract nodes from graph
+            if (cuda_ctx->cuda_graph->num_nodes == 0) {
+                // First call with null argument gets number of nodes in graph
+                CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+            }
+            // Subsequent call with non-null argument gets nodes
+            cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+            cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+            if (cuda_ctx->cuda_graph->num_nodes > 0) {
+                CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+                // Loop over nodes, and extract kernel parameters from each node
+                for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+                    cudaGraphNodeType node_type;
+                    CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+                    if (node_type == cudaGraphNodeTypeKernel) {
+                        cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+                        if (stat == cudaErrorInvalidDeviceFunction) {
+                            // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+                            // We don't need to update blas nodes, so clear error and move on.
+                            cudaGetLastError();
+                        } else {
+                            GGML_ASSERT(stat == cudaSuccess);
+                        }
+                    }
+                }
+            }
+        }

-
-
-
+        // One of the arguments to the copy kernel is updated for each token, hence we need to
+        // replace that argument with the updated value in the CUDA graph
+        if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
+            int k = 0;
+            for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+                if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
+                    char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+                    cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
+                    CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+                }
+            }
+        }
+
+        // Update graph executable
+        cudaGraphExecUpdateResultInfo result_info;
+        cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+        if (stat == cudaErrorGraphExecUpdateFailure) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+#endif
+            // The pre-existing graph exec cannot be updated due to violated constraints
+            // so instead clear error and re-instantiate
+            cudaGetLastError();
+            CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
+            cuda_ctx->cuda_graph->instance = nullptr;
+            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+        } else {
+            GGML_ASSERT(stat == cudaSuccess);
        }
-
+        // Launch graph
+        CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
+#else
+        graph_evaluated_or_captured = true;
+#endif // USE_CUDA_GRAPH
    }

    return GGML_STATUS_SUCCESS;
}

GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
    switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_GELU_QUICK:
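The bulk of this hunk wires the ggml backend into the CUDA graph API: capture the stream once, instantiate an executable graph, launch it for subsequent tokens, and patch kernel parameters with `cudaGraphExecUpdate` / `cudaGraphKernelNodeSetParams` instead of re-capturing. Below is a compressed, self-contained sketch of that capture → instantiate → launch → update cycle. It is illustrative only: it records a trivial `cudaMemsetAsync` rather than ggml kernels, uses `cudaGraphInstantiateWithFlags` for instantiation, and assumes a CUDA 12 toolkit for the `cudaGraphExecUpdateResultInfo` form of `cudaGraphExecUpdate` used in the hunk above.

```cpp
#include <cuda_runtime.h>
#include <cstdio>

#define CHECK(call) do { cudaError_t e = (call); if (e != cudaSuccess) { \
    std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e)); return 1; } } while (0)

int main() {
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    void * buf = nullptr;
    CHECK(cudaMalloc(&buf, 1 << 20));

    // 1. Capture the work issued on the stream into a graph.
    cudaGraph_t graph = nullptr;
    CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
    CHECK(cudaMemsetAsync(buf, 0, 1 << 20, stream));   // stand-in for the ggml kernels
    CHECK(cudaStreamEndCapture(stream, &graph));

    // 2. Instantiate an executable graph once.
    cudaGraphExec_t instance = nullptr;
    CHECK(cudaGraphInstantiateWithFlags(&instance, graph, 0));

    // 3. Launch it as many times as needed; this replaces per-kernel launch overhead.
    for (int token = 0; token < 4; ++token) {
        CHECK(cudaGraphLaunch(instance, stream));
    }
    CHECK(cudaStreamSynchronize(stream));

    // 4. If the topology stays the same but parameters changed, try an in-place update
    //    of the executable graph; on failure the backend above clears the error and
    //    re-instantiates instead.
    cudaGraphExecUpdateResultInfo result_info;
    cudaError_t stat = cudaGraphExecUpdate(instance, graph, &result_info);
    if (stat != cudaSuccess) {
        cudaGetLastError();                             // clear the error, then rebuild
        CHECK(cudaGraphExecDestroy(instance));
        CHECK(cudaGraphInstantiateWithFlags(&instance, graph, 0));
    }

    CHECK(cudaGraphExecDestroy(instance));
    CHECK(cudaGraphDestroy(graph));
    CHECK(cudaFree(buf));
    CHECK(cudaStreamDestroy(stream));
    return 0;
}
```

The backend code above adds one more layer on top of this pattern: it tracks per-node properties between tokens so it only re-captures when the ggml graph's shape actually changes, and it rewrites just the copy-kernel argument that changes every token.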
@@ -2557,7 +2830,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
        case GGML_OP_IM2COL:
        case GGML_OP_POOL_2D:
        case GGML_OP_SUM_ROWS:
@@ -2569,8 +2841,16 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_LEAKY_RELU:
-        case GGML_OP_FLASH_ATTN_EXT:
            return true;
+        case GGML_OP_FLASH_ATTN_EXT:
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+            return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+#else
+            if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
+                return true;
+            }
+            return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        default:
            return false;
        }