llama_cpp 0.15.0 → 0.15.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: b6da808ddaadd304ab376b4726de19087422194ef32c9e5006272569f1c4a76a
- data.tar.gz: faf5c6ed3421cacb24a11c0d126c852d38f1a0b3edb43768133a321269958730
+ metadata.gz: 30dd4c29b86098faf7c78de5fa8e57021b631bb5eb3d14c93f63f1d186383ab8
+ data.tar.gz: b011d891f1cd725f84821428a8db24004b52c9614e785f493f721f7abde71029
  SHA512:
- metadata.gz: 9a83cb7da94d4672418440361d78b230f6560a97b90924c389c958a6f91b2ecded2f5e53dcbf596845687cd332ecc8126c1a7f79c33fad9b9ff20ac1ce4f8759
- data.tar.gz: 55001246afe1615d8d8262c2f74dccbe819b4942cdb6517f5aa6e5d3e98fb2ea628db5c8e5b94a19052afff88236f003a15e7f792473b0c10660cbcf58ecab45
+ metadata.gz: 6c1628f93762747688f802db8593946e8581c869f63c610669b45759f644b3d19b061825b788e328b6b984977112837586ed398b6118a8f8e5f0c7f6fd0eb2dd
+ data.tar.gz: 2f8c3d9f1e6c0f6db7e0682995c8d34179d5405d32784bf00f04a3408cb5bf4c95557bfa1692026f8d3dc9e672d6b15dec5d33cbd76ddc1d94e5ec964a9d0409
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
+ ## [[0.15.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.1...v0.15.2)] - 2024-05-18
+
+ - Bump llama.cpp from b2839 to b2917.
+
+ Implementation binding for rpc_servers in llama_model_params has been skipped.
+
+ ## [[0.15.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.0...v0.15.1)] - 2024-05-11
+
+ - Bump llama.cpp from b2781 to b2839.
+ - Add constants for pre-tokenization types.
+ - Add constant for model file type.
+
  ## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03

  - Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
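For reference, the version strings and constants mentioned in these entries are plain constants on the LLaMACpp module once 0.15.2 is installed; a minimal Ruby sketch (illustrative only, not part of the packaged files in this diff):

```ruby
require 'llama_cpp'

# Gem and bundled llama.cpp versions after this release.
puts LLaMACpp::VERSION            # => "0.15.2"
puts LLaMACpp::LLAMA_CPP_VERSION  # => "b2917"

# Pre-tokenization type constants added in 0.15.1 (registered in the Init_llama_cpp hunk below).
[
  LLaMACpp::LLAMA_VOCAB_PRE_TYPE_REFACT,
  LLaMACpp::LLAMA_VOCAB_PRE_TYPE_COMMAND_R,
  LLaMACpp::LLAMA_VOCAB_PRE_TYPE_QWEN2,
  LLaMACpp::LLAMA_VOCAB_PRE_TYPE_OLMO,
  LLaMACpp::LLAMA_VOCAB_PRE_TYPE_DBRX
].each { |value| puts value }

# Model file type constant added for BF16 weights.
puts LLaMACpp::LLAMA_FTYPE_MOSTLY_BF16
```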
@@ -3428,6 +3428,11 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));

  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
@@ -3465,6 +3470,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_BF16", INT2NUM(LLAMA_FTYPE_MOSTLY_BF16));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.15.0'
+ VERSION = '0.15.2'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2781'
+ LLAMA_CPP_VERSION = 'b2917'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -24,6 +24,11 @@ module LLaMACpp
  LLAMA_VOCAB_PRE_TYPE_MPT: Integer
  LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
  LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
+ LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
+ LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
+ LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
+ LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
+ LLAMA_VOCAB_PRE_TYPE_DBRX: Integer

  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
@@ -53,6 +58,7 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
  LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
  LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
+ LLAMA_FTYPE_MOSTLY_BF16: Integer

  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
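The new LLAMA_FTYPE_MOSTLY_BF16 signature above is the piece user code is most likely to touch. A hedged sketch of how it could be passed to the gem's quantization binding; the ModelQuantizeParams / LLaMACpp.model_quantize API and keyword names are assumed from the gem's pre-existing usage (they are not changed by this diff), and the file paths are placeholders:

```ruby
require 'llama_cpp'

# Assumption: the gem's existing quantization binding (ModelQuantizeParams,
# LLaMACpp.model_quantize) is available and unchanged by this release.
params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_BF16 # request BF16 output weights

LLaMACpp.model_quantize(
  input_path: 'model-f16.gguf',    # placeholder input path
  output_path: 'model-bf16.gguf',  # placeholder output path
  params: params
)
```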
@@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
  ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
- ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
- ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
  elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
  continue; \
  elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
@@ -436,7 +435,7 @@ ifdef LLAMA_CUDA
  else
  CUDA_PATH ?= /usr/local/cuda
  endif
- MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@@ -563,10 +562,10 @@ endif # LLAMA_VULKAN
  ifdef LLAMA_HIPBLAS
  ifeq ($(wildcard /opt/rocm),)
  ROCM_PATH ?= /usr
- GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
+ AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
  else
  ROCM_PATH ?= /opt/rocm
- GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+ AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
  endif
  HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
  LLAMA_CUDA_DMMV_X ?= 32
@@ -578,7 +577,7 @@ ifdef LLAMA_HIP_UMA
  endif # LLAMA_HIP_UMA
  MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
  MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
- HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
+ HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
  HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
  HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
  HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
@@ -761,7 +760,7 @@ lib: llama.o ggml.o $(OBJS)
  ar rcs libllama.a $^

  clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
  rm -vrf ggml-cuda/*.o

  #
@@ -1182,9 +1182,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
  static char * fmt_size(size_t size) {
  static char buffer[128];
  if (size >= 1024*1024) {
- sprintf(buffer, "%zuM", size/1024/1024);
+ snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
  } else {
- sprintf(buffer, "%zuK", size/1024);
+ snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
  }
  return buffer;
  }
@@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t

  tensor->buffer = buffer;
  tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
- tensor->backend = tensor->view_src->backend;
  ggml_backend_buffer_init_tensor(buffer, tensor);
  }

@@ -4,7 +4,6 @@

  #include "ggml-cuda/common.cuh"
  #include "ggml-cuda/acc.cuh"
- #include "ggml-cuda/alibi.cuh"
  #include "ggml-cuda/arange.cuh"
  #include "ggml-cuda/argsort.cuh"
  #include "ggml-cuda/binbcast.cuh"
@@ -113,7 +112,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
  for (int id = 0; id < info.device_count; ++id) {
  int device_vmm = 0;

- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  CUdevice device;
  CU_CHECK(cuDeviceGet(&device, id));
  CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -259,7 +258,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
  };

  // pool with virtual memory
- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
  static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

@@ -356,7 +355,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
  #endif // !defined(GGML_USE_HIPBLAS)

  std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  if (ggml_cuda_info().devices[device].vmm) {
  return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
  }
@@ -1647,7 +1646,7 @@ static void ggml_cuda_op_mul_mat(
  }
  }

- static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
  GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -1670,7 +1669,7 @@ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const gg
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
  }

- static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(!ggml_is_transposed(src0));
  GGML_ASSERT(!ggml_is_transposed(src1));
  GGML_ASSERT(!ggml_is_permuted(src0));
@@ -2205,6 +2204,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
  case GGML_UNARY_OP_RELU:
  ggml_cuda_op_relu(ctx, dst);
  break;
+ case GGML_UNARY_OP_SIGMOID:
+ ggml_cuda_op_sigmoid(ctx, dst);
+ break;
  case GGML_UNARY_OP_HARDSIGMOID:
  ggml_cuda_op_hardsigmoid(ctx, dst);
  break;
@@ -2277,9 +2279,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
  case GGML_OP_ROPE:
  ggml_cuda_op_rope(ctx, dst);
  break;
- case GGML_OP_ALIBI:
- ggml_cuda_op_alibi(ctx, dst);
- break;
  case GGML_OP_IM2COL:
  ggml_cuda_op_im2col(ctx, dst);
  break;
@@ -2410,44 +2409,318 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
  GGML_UNUSED(backend);
  }

+ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+ graph_node_properties->node_address = node->data;
+ graph_node_properties->node_op = node->op;
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ graph_node_properties->ne[i] = node->ne[i];
+ graph_node_properties->nb[i] = node->nb[i];
+ }
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
+ }
+ }
+
+ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+ if (node->data != graph_node_properties->node_address &&
+ node->op != GGML_OP_CPY &&
+ node->op != GGML_OP_VIEW) {
+ return false;
+ }
+
+ if (node->op != graph_node_properties->node_op) {
+ return false;
+ }
+
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ if (node->ne[i] != graph_node_properties->ne[i]) {
+ return false;
+ }
+ if (node->nb[i] != graph_node_properties->nb[i]) {
+ return false;
+ }
+ }
+
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (node->src[i] &&
+ node->src[i]->data != graph_node_properties->src_address[i] &&
+ node->op != GGML_OP_CPY &&
+ node->op != GGML_OP_VIEW
+ ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
  GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

  ggml_cuda_set_device(cuda_ctx->device);

- for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_tensor * node = cgraph->nodes[i];
+ #ifdef USE_CUDA_GRAPH
+ static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
- continue;
+ // Objects required for CUDA Graph
+ if (cuda_ctx->cuda_graph == nullptr) {
+ cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+ }
+
+ bool use_cuda_graph = true;
+ bool cuda_graph_update_required = false;
+ // pointer to CUDA cpy kernel, which is required to identify
+ // kernel parameters which need updated in the graph for each token
+ void * ggml_cuda_cpy_fn_ptr = nullptr;
+
+ if (cuda_ctx->cuda_graph->graph == nullptr) {
+ if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
+ cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+ #endif
+ }
+ }
+
+ // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
+ // or previous graph capture failure.
+ // Also disable for multi-gpu for now. TO DO investigate
+ if (disable_cuda_graphs_due_to_env
+ || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
+ || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
+ || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
+ use_cuda_graph = false;
+ }
+
+ if (use_cuda_graph) {
+ if (cuda_ctx->cuda_graph->instance == nullptr) {
+ cuda_graph_update_required = true;
+ }
+
+ // Check if the graph size has changed
+ if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+ cuda_graph_update_required = true;
+ cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+ }
+
+ // Loop over nodes in GGML graph to determine if CUDA graph update is required
+ // and store properties to allow this comparison for the next token
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ bool has_matching_properties = true;
+ if (!cuda_graph_update_required) {
+ has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+ }
+ if (!has_matching_properties) {
+ cuda_graph_update_required = true;
+ }
+ set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+ }
+
+ // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+ cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+
+ if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
+ use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+ #endif
+ }
+
+ if (node->op == GGML_OP_MUL_MAT_ID) {
+ use_cuda_graph = false; // This node type is not supported by CUDA graph capture
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+ #endif
+ }
+
+ if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+ // disable CUDA graphs for batch size > 1 for now.
+ // Changes in batch size or context size can cause changes to the grid size of some kernels.
+ use_cuda_graph = false;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+ #endif
+ }
+
+ if (node->op == GGML_OP_CPY) {
+ // store the copy op parameter which changes with each token.
+ cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+ if (ggml_cuda_cpy_fn_ptr == nullptr) {
+ // store a pointer to the copy op CUDA kernel to identify it later
+ ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+ }
+ }
+
+ if (!use_cuda_graph) {
+ break;
+ }
+ }
+
+ // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+ if (use_cuda_graph && cuda_graph_update_required) {
+ cuda_ctx->cuda_graph->number_consecutive_updates++;
+ } else {
+ cuda_ctx->cuda_graph->number_consecutive_updates = 0;
  }

+ if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
+ cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
  #ifndef NDEBUG
- assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- if (node->src[j] != nullptr) {
- assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+ fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+ #endif
+ }
+ }
+
+ if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+ CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+ }
+
+ #else
+ bool use_cuda_graph = false;
+ bool cuda_graph_update_required = false;
+ #endif // USE_CUDA_GRAPH
+
+ bool graph_evaluated_or_captured = false;
+
+ while (!graph_evaluated_or_captured) {
+ // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
+ // With the use of CUDA graphs, the execution will be performed by the graph launch.
+ if (!use_cuda_graph || cuda_graph_update_required) {
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ continue;
+ }
+
+ #ifndef NDEBUG
+ assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ if (node->src[j] != nullptr) {
+ assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+ }
+ }
+ #endif
+
+ bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
+ if (!ok) {
+ fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ }
+ GGML_ASSERT(ok);
  }
  }
+
+ #ifdef USE_CUDA_GRAPH
+ if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
+ if (cuda_ctx->cuda_graph->graph != nullptr) {
+ CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
+ cuda_ctx->cuda_graph->graph = nullptr;
+ }
+ CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
+
+ #if 0
+ if (disable_cuda_graphs_due_to_failed_capture) {
+ use_cuda_graph = false;
+ cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
  #endif
+ } else {
+ graph_evaluated_or_captured = true; // CUDA graph has been captured
+ }
+ #endif
+ graph_evaluated_or_captured = true; // CUDA graph has been captured
+ } else {
+ graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
+ }
+ }
+
+ if (use_cuda_graph) {
+ if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+ CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+ }
+
+ // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+
+ if (cuda_graph_update_required) {
+ // Extract nodes from graph
+ if (cuda_ctx->cuda_graph->num_nodes == 0) {
+ // First call with null argument gets number of nodes in graph
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+ }
+ // Subsequent call with non-null argument gets nodes
+ cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+ cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+ if (cuda_ctx->cuda_graph->num_nodes > 0) {
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+ // Loop over nodes, and extract kernel parameters from each node
+ for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+ cudaGraphNodeType node_type;
+ CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+ if (node_type == cudaGraphNodeTypeKernel) {
+ cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+ if (stat == cudaErrorInvalidDeviceFunction) {
+ // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+ // We don't need to update blas nodes, so clear error and move on.
+ cudaGetLastError();
+ } else {
+ GGML_ASSERT(stat == cudaSuccess);
+ }
+ }
+ }
+ }
+ }

- bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
- if (!ok) {
- fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ // One of the arguments to the copy kernel is updated for each token, hence we need to
+ // replace that argument with the updated value in the CUDA graph
+ if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
+ int k = 0;
+ for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+ if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
+ char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+ cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
+ CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+ }
+ }
+ }
+
+ // Update graph executable
+ cudaGraphExecUpdateResultInfo result_info;
+ cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+ if (stat == cudaErrorGraphExecUpdateFailure) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+ #endif
+ // The pre-existing graph exec cannot be updated due to violated constraints
+ // so instead clear error and re-instantiate
+ cudaGetLastError();
+ CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
+ cuda_ctx->cuda_graph->instance = nullptr;
+ CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+ } else {
+ GGML_ASSERT(stat == cudaSuccess);
  }
- GGML_ASSERT(ok);
+ // Launch graph
+ CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
+ #else
+ graph_evaluated_or_captured = true;
+ #endif // USE_CUDA_GRAPH
  }

  return GGML_STATUS_SUCCESS;
  }

  GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
  switch (op->op) {
  case GGML_OP_UNARY:
  switch (ggml_get_unary_op(op)) {
  case GGML_UNARY_OP_GELU:
  case GGML_UNARY_OP_SILU:
  case GGML_UNARY_OP_RELU:
+ case GGML_UNARY_OP_SIGMOID:
  case GGML_UNARY_OP_HARDSIGMOID:
  case GGML_UNARY_OP_HARDSWISH:
  case GGML_UNARY_OP_GELU_QUICK:
@@ -2557,7 +2830,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  case GGML_OP_DIAG_MASK_INF:
  case GGML_OP_SOFT_MAX:
  case GGML_OP_ROPE:
- case GGML_OP_ALIBI:
  case GGML_OP_IM2COL:
  case GGML_OP_POOL_2D:
  case GGML_OP_SUM_ROWS:
@@ -2569,8 +2841,16 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  case GGML_OP_ARANGE:
  case GGML_OP_TIMESTEP_EMBEDDING:
  case GGML_OP_LEAKY_RELU:
- case GGML_OP_FLASH_ATTN_EXT:
  return true;
+ case GGML_OP_FLASH_ATTN_EXT:
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+ #else
+ if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
+ return true;
+ }
+ return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
  default:
  return false;
  }