llama_cpp 0.15.0 → 0.15.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: b6da808ddaadd304ab376b4726de19087422194ef32c9e5006272569f1c4a76a
-   data.tar.gz: faf5c6ed3421cacb24a11c0d126c852d38f1a0b3edb43768133a321269958730
+   metadata.gz: 30dd4c29b86098faf7c78de5fa8e57021b631bb5eb3d14c93f63f1d186383ab8
+   data.tar.gz: b011d891f1cd725f84821428a8db24004b52c9614e785f493f721f7abde71029
  SHA512:
-   metadata.gz: 9a83cb7da94d4672418440361d78b230f6560a97b90924c389c958a6f91b2ecded2f5e53dcbf596845687cd332ecc8126c1a7f79c33fad9b9ff20ac1ce4f8759
-   data.tar.gz: 55001246afe1615d8d8262c2f74dccbe819b4942cdb6517f5aa6e5d3e98fb2ea628db5c8e5b94a19052afff88236f003a15e7f792473b0c10660cbcf58ecab45
+   metadata.gz: 6c1628f93762747688f802db8593946e8581c869f63c610669b45759f644b3d19b061825b788e328b6b984977112837586ed398b6118a8f8e5f0c7f6fd0eb2dd
+   data.tar.gz: 2f8c3d9f1e6c0f6db7e0682995c8d34179d5405d32784bf00f04a3408cb5bf4c95557bfa1692026f8d3dc9e672d6b15dec5d33cbd76ddc1d94e5ec964a9d0409
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
+ ## [[0.15.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.1...v0.15.2)] - 2024-05-18
+
+ - Bump llama.cpp from b2839 to b2917.
+
+   Implementation binding for rpc_servers in llama_model_params has been skipped.
+
+ ## [[0.15.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.0...v0.15.1)] - 2024-05-11
+
+ - Bump llama.cpp from b2781 to b2839.
+ - Add constants for pre-tokenization types.
+ - Add constant for model file type.
+
  ## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03

  - Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
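For reference, the pre-tokenization and model file type constants listed in the 0.15.1 entry surface as plain Integer constants on the LLaMACpp module once the gem is built. A minimal sketch, with constant names taken from the binding code further below; the usage itself is only illustrative:

    require 'llama_cpp'

    # Each constant mirrors the corresponding llama.cpp enum value.
    puts LLaMACpp::LLAMA_VOCAB_PRE_TYPE_DBRX   # pre-tokenization type added in 0.15.1
    puts LLaMACpp::LLAMA_FTYPE_MOSTLY_BF16     # model file type added in 0.15.1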
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -3428,6 +3428,11 @@ extern "C" void Init_llama_cpp(void) {
    rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
    rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
    rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
+   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
+   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
+   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
+   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
+   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));

    rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
    rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
@@ -3465,6 +3470,7 @@ extern "C" void Init_llama_cpp(void) {
    rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
    rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
    rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
+   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_BF16", INT2NUM(LLAMA_FTYPE_MOSTLY_BF16));

    rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.15.0'
+   VERSION = '0.15.2'

    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'b2781'
+   LLAMA_CPP_VERSION = 'b2917'
  end
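After upgrading, the two constants above give a quick sanity check that the expected llama.cpp build is bundled. A small sketch, with the expected output taken from the diff above:

    require 'llama_cpp'

    # Gem version and the llama.cpp build it bundles.
    puts LLaMACpp::VERSION            # => "0.15.2"
    puts LLaMACpp::LLAMA_CPP_VERSION  # => "b2917"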
data/sig/llama_cpp.rbs CHANGED
@@ -24,6 +24,11 @@ module LLaMACpp
    LLAMA_VOCAB_PRE_TYPE_MPT: Integer
    LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
    LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
+   LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
+   LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
+   LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
+   LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
+   LLAMA_VOCAB_PRE_TYPE_DBRX: Integer

    LLAMA_FTYPE_ALL_F32: Integer
    LLAMA_FTYPE_MOSTLY_F16: Integer
@@ -53,6 +58,7 @@ module LLaMACpp
    LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
    LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
    LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
+   LLAMA_FTYPE_MOSTLY_BF16: Integer

    LLAMA_KV_OVERRIDE_TYPE_INT: Integer
    LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
@@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
  ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
- ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
- ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
  elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
  continue; \
  elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
@@ -436,7 +435,7 @@ ifdef LLAMA_CUDA
  else
  CUDA_PATH ?= /usr/local/cuda
  endif
- MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@@ -563,10 +562,10 @@ endif # LLAMA_VULKAN
  ifdef LLAMA_HIPBLAS
  ifeq ($(wildcard /opt/rocm),)
  ROCM_PATH ?= /usr
- GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
+ AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
  else
  ROCM_PATH ?= /opt/rocm
- GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+ AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
  endif
  HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
  LLAMA_CUDA_DMMV_X ?= 32
@@ -578,7 +577,7 @@ ifdef LLAMA_HIP_UMA
  endif # LLAMA_HIP_UMA
  MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
  MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
- HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
+ HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
  HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
  HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
  HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
@@ -761,7 +760,7 @@ lib: llama.o ggml.o $(OBJS)
  ar rcs libllama.a $^

  clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
  rm -vrf ggml-cuda/*.o

  #
@@ -1182,9 +1182,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
  static char * fmt_size(size_t size) {
      static char buffer[128];
      if (size >= 1024*1024) {
-         sprintf(buffer, "%zuM", size/1024/1024);
+         snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
      } else {
-         sprintf(buffer, "%zuK", size/1024);
+         snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
      }
      return buffer;
  }
@@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t

      tensor->buffer = buffer;
      tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-     tensor->backend = tensor->view_src->backend;
      ggml_backend_buffer_init_tensor(buffer, tensor);
  }

@@ -4,7 +4,6 @@

  #include "ggml-cuda/common.cuh"
  #include "ggml-cuda/acc.cuh"
- #include "ggml-cuda/alibi.cuh"
  #include "ggml-cuda/arange.cuh"
  #include "ggml-cuda/argsort.cuh"
  #include "ggml-cuda/binbcast.cuh"
@@ -113,7 +112,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
      for (int id = 0; id < info.device_count; ++id) {
          int device_vmm = 0;

- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
          CUdevice device;
          CU_CHECK(cuDeviceGet(&device, id));
          CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -259,7 +258,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
  };

  // pool with virtual memory
- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
      static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

@@ -356,7 +355,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
  #endif // !defined(GGML_USE_HIPBLAS)

  std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
      if (ggml_cuda_info().devices[device].vmm) {
          return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
      }
@@ -1647,7 +1646,7 @@ static void ggml_cuda_op_mul_mat(
          }
      }

- static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
      GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
      GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -1670,7 +1669,7 @@ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const gg
      ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
  }

- static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      GGML_ASSERT(!ggml_is_transposed(src0));
      GGML_ASSERT(!ggml_is_transposed(src1));
      GGML_ASSERT(!ggml_is_permuted(src0));
@@ -2205,6 +2204,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
          case GGML_UNARY_OP_RELU:
              ggml_cuda_op_relu(ctx, dst);
              break;
+         case GGML_UNARY_OP_SIGMOID:
+             ggml_cuda_op_sigmoid(ctx, dst);
+             break;
          case GGML_UNARY_OP_HARDSIGMOID:
              ggml_cuda_op_hardsigmoid(ctx, dst);
              break;
@@ -2277,9 +2279,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
          case GGML_OP_ROPE:
              ggml_cuda_op_rope(ctx, dst);
              break;
-         case GGML_OP_ALIBI:
-             ggml_cuda_op_alibi(ctx, dst);
-             break;
          case GGML_OP_IM2COL:
              ggml_cuda_op_im2col(ctx, dst);
              break;
@@ -2410,44 +2409,318 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
      GGML_UNUSED(backend);
  }

+ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+     graph_node_properties->node_address = node->data;
+     graph_node_properties->node_op = node->op;
+     for (int i = 0; i < GGML_MAX_DIMS; i++) {
+         graph_node_properties->ne[i] = node->ne[i];
+         graph_node_properties->nb[i] = node->nb[i];
+     }
+     for (int i = 0; i < GGML_MAX_SRC; i++) {
+         graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
+     }
+ }
+
+ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+     if (node->data != graph_node_properties->node_address &&
+         node->op != GGML_OP_CPY &&
+         node->op != GGML_OP_VIEW) {
+         return false;
+     }
+
+     if (node->op != graph_node_properties->node_op) {
+         return false;
+     }
+
+     for (int i = 0; i < GGML_MAX_DIMS; i++) {
+         if (node->ne[i] != graph_node_properties->ne[i]) {
+             return false;
+         }
+         if (node->nb[i] != graph_node_properties->nb[i]) {
+             return false;
+         }
+     }
+
+     for (int i = 0; i < GGML_MAX_SRC; i++) {
+         if (node->src[i] &&
+             node->src[i]->data != graph_node_properties->src_address[i] &&
+             node->op != GGML_OP_CPY &&
+             node->op != GGML_OP_VIEW
+         ) {
+             return false;
+         }
+     }
+     return true;
+ }
+
  GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
      ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

      ggml_cuda_set_device(cuda_ctx->device);

-     for (int i = 0; i < cgraph->n_nodes; i++) {
-         ggml_tensor * node = cgraph->nodes[i];
+ #ifdef USE_CUDA_GRAPH
+     static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

-         if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
-             continue;
+     // Objects required for CUDA Graph
+     if (cuda_ctx->cuda_graph == nullptr) {
+         cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+     }
+
+     bool use_cuda_graph = true;
+     bool cuda_graph_update_required = false;
+     // pointer to CUDA cpy kernel, which is required to identify
+     // kernel parameters which need updated in the graph for each token
+     void * ggml_cuda_cpy_fn_ptr = nullptr;
+
+     if (cuda_ctx->cuda_graph->graph == nullptr) {
+         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
+             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
+ #ifndef NDEBUG
+             fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+ #endif
+         }
+     }
+
+     // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
+     // or previous graph capture failure.
+     // Also disable for multi-gpu for now. TO DO investigate
+     if (disable_cuda_graphs_due_to_env
+         || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
+         || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
+         || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
+         use_cuda_graph = false;
+     }
+
+     if (use_cuda_graph) {
+         if (cuda_ctx->cuda_graph->instance == nullptr) {
+             cuda_graph_update_required = true;
+         }
+
+         // Check if the graph size has changed
+         if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+             cuda_graph_update_required = true;
+             cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+         }
+
+         // Loop over nodes in GGML graph to determine if CUDA graph update is required
+         // and store properties to allow this comparison for the next token
+         for (int i = 0; i < cgraph->n_nodes; i++) {
+             bool has_matching_properties = true;
+             if (!cuda_graph_update_required) {
+                 has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+             }
+             if (!has_matching_properties) {
+                 cuda_graph_update_required = true;
+             }
+             set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+         }
+
+         // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+         cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+         for (int i = 0; i < cgraph->n_nodes; i++) {
+             ggml_tensor * node = cgraph->nodes[i];
+
+             if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
+                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
+ #ifndef NDEBUG
+                 fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+ #endif
+             }
+
+             if (node->op == GGML_OP_MUL_MAT_ID) {
+                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
+ #ifndef NDEBUG
+                 fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+ #endif
+             }
+
+             if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+                 // disable CUDA graphs for batch size > 1 for now.
+                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
+                 use_cuda_graph = false;
+ #ifndef NDEBUG
+                 fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+ #endif
+             }
+
+             if (node->op == GGML_OP_CPY) {
+                 // store the copy op parameter which changes with each token.
+                 cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+                 if (ggml_cuda_cpy_fn_ptr == nullptr) {
+                     // store a pointer to the copy op CUDA kernel to identify it later
+                     ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+                 }
+             }
+
+             if (!use_cuda_graph) {
+                 break;
+             }
+         }
+
+         // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+         if (use_cuda_graph && cuda_graph_update_required) {
+             cuda_ctx->cuda_graph->number_consecutive_updates++;
+         } else {
+             cuda_ctx->cuda_graph->number_consecutive_updates = 0;
          }

+         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
+             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
  #ifndef NDEBUG
-         assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
-         for (int j = 0; j < GGML_MAX_SRC; j++) {
-             if (node->src[j] != nullptr) {
-                 assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+             fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+ #endif
+         }
+     }
+
+     if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+     }
+
+ #else
+     bool use_cuda_graph = false;
+     bool cuda_graph_update_required = false;
+ #endif // USE_CUDA_GRAPH
+
+     bool graph_evaluated_or_captured = false;
+
+     while (!graph_evaluated_or_captured) {
+         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
+         // With the use of CUDA graphs, the execution will be performed by the graph launch.
+         if (!use_cuda_graph || cuda_graph_update_required) {
+             for (int i = 0; i < cgraph->n_nodes; i++) {
+                 ggml_tensor * node = cgraph->nodes[i];
+
+                 if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+                     continue;
+                 }
+
+ #ifndef NDEBUG
+                 assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                 for (int j = 0; j < GGML_MAX_SRC; j++) {
+                     if (node->src[j] != nullptr) {
+                         assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+                     }
+                 }
+ #endif
+
+                 bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
+                 if (!ok) {
+                     fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                 }
+                 GGML_ASSERT(ok);
              }
          }
+
+ #ifdef USE_CUDA_GRAPH
+         if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
+             if (cuda_ctx->cuda_graph->graph != nullptr) {
+                 CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
+                 cuda_ctx->cuda_graph->graph = nullptr;
+             }
+             CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
+
+ #if 0
+             if (disable_cuda_graphs_due_to_failed_capture) {
+                 use_cuda_graph = false;
+                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
+ #ifndef NDEBUG
+                 fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
  #endif
+             } else {
+                 graph_evaluated_or_captured = true; // CUDA graph has been captured
+             }
+ #endif
+             graph_evaluated_or_captured = true; // CUDA graph has been captured
+         } else {
+             graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
+         }
+     }
+
+     if (use_cuda_graph) {
+         if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+             CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+         }
+
+         // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+
+         if (cuda_graph_update_required) {
+             // Extract nodes from graph
+             if (cuda_ctx->cuda_graph->num_nodes == 0) {
+                 // First call with null argument gets number of nodes in graph
+                 CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+             }
+             // Subsequent call with non-null argument gets nodes
+             cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+             cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+             if (cuda_ctx->cuda_graph->num_nodes > 0) {
+                 CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+                 // Loop over nodes, and extract kernel parameters from each node
+                 for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+                     cudaGraphNodeType node_type;
+                     CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+                     if (node_type == cudaGraphNodeTypeKernel) {
+                         cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+                         if (stat == cudaErrorInvalidDeviceFunction) {
+                             // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+                             // We don't need to update blas nodes, so clear error and move on.
+                             cudaGetLastError();
+                         } else {
+                             GGML_ASSERT(stat == cudaSuccess);
+                         }
+                     }
+                 }
+             }
+         }

-         bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
-         if (!ok) {
-             fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+         // One of the arguments to the copy kernel is updated for each token, hence we need to
+         // replace that argument with the updated value in the CUDA graph
+         if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
+             int k = 0;
+             for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+                 if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
+                     char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+                     cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
+                     CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+                 }
+             }
+         }
+
+         // Update graph executable
+         cudaGraphExecUpdateResultInfo result_info;
+         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+         if (stat == cudaErrorGraphExecUpdateFailure) {
+ #ifndef NDEBUG
+             fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+ #endif
+             // The pre-existing graph exec cannot be updated due to violated constraints
+             // so instead clear error and re-instantiate
+             cudaGetLastError();
+             CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
+             cuda_ctx->cuda_graph->instance = nullptr;
+             CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+         } else {
+             GGML_ASSERT(stat == cudaSuccess);
          }
-         GGML_ASSERT(ok);
+         // Launch graph
+         CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
+ #else
+         graph_evaluated_or_captured = true;
+ #endif // USE_CUDA_GRAPH
      }

      return GGML_STATUS_SUCCESS;
  }

  GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
      switch (op->op) {
          case GGML_OP_UNARY:
              switch (ggml_get_unary_op(op)) {
                  case GGML_UNARY_OP_GELU:
                  case GGML_UNARY_OP_SILU:
                  case GGML_UNARY_OP_RELU:
+                 case GGML_UNARY_OP_SIGMOID:
                  case GGML_UNARY_OP_HARDSIGMOID:
                  case GGML_UNARY_OP_HARDSWISH:
                  case GGML_UNARY_OP_GELU_QUICK:
@@ -2557,7 +2830,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
          case GGML_OP_DIAG_MASK_INF:
          case GGML_OP_SOFT_MAX:
          case GGML_OP_ROPE:
-         case GGML_OP_ALIBI:
          case GGML_OP_IM2COL:
          case GGML_OP_POOL_2D:
          case GGML_OP_SUM_ROWS:
@@ -2569,8 +2841,16 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
          case GGML_OP_ARANGE:
          case GGML_OP_TIMESTEP_EMBEDDING:
          case GGML_OP_LEAKY_RELU:
-         case GGML_OP_FLASH_ATTN_EXT:
              return true;
+         case GGML_OP_FLASH_ATTN_EXT:
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+             return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+ #else
+             if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
+                 return true;
+             }
+             return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
          default:
              return false;
      }