llama_cpp 0.15.0 → 0.15.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: b6da808ddaadd304ab376b4726de19087422194ef32c9e5006272569f1c4a76a
- data.tar.gz: faf5c6ed3421cacb24a11c0d126c852d38f1a0b3edb43768133a321269958730
+ metadata.gz: ce6d72aeb5fb9aff775d44284bf934e164f8470973619507ef6e6eb1ac0bec4d
+ data.tar.gz: 7c1ae823c90f957219b3edbc20f091b65a50caa984c1a6f4d137a46c376b2f0c
  SHA512:
- metadata.gz: 9a83cb7da94d4672418440361d78b230f6560a97b90924c389c958a6f91b2ecded2f5e53dcbf596845687cd332ecc8126c1a7f79c33fad9b9ff20ac1ce4f8759
- data.tar.gz: 55001246afe1615d8d8262c2f74dccbe819b4942cdb6517f5aa6e5d3e98fb2ea628db5c8e5b94a19052afff88236f003a15e7f792473b0c10660cbcf58ecab45
+ metadata.gz: d23cb6a63b7734df2547c5e61a699fa206878c747e274e004c829b77335a7cc7434e92168a55d8ab0a617b11eddb5d45d5057a91b92e848735fd9e852b2476cd
+ data.tar.gz: f54b09de3cc60de81be977e9706a9beb3bf28e7740a19a57f6add543fe10cd6dc4101cbbe22dd5b62870c78a1ad4d10f57dd29b7c3e3e12b950e6575cf67b0c7
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+ ## [[0.15.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.0...v0.15.1)] - 2024-05-11
+
+ - Bump llama.cpp from b2781 to b2839.
+ - Add constants for pre-tokenization types.
+ - Add constant for model file type.
+
  ## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03
 
  - Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
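The pre-tokenization and file-type constants added in 0.15.1 are exposed as plain Integer constants on the LLaMACpp module (see the rb_define_const hunks below). A minimal sketch of reading them from Ruby; the printed values depend on the bundled llama.cpp enums, so they are not shown here:

require 'llama_cpp'

# Each constant mirrors the corresponding llama.cpp enum value as an Integer.
[
  LLaMACpp::LLAMA_VOCAB_PRE_TYPE_REFACT,
  LLaMACpp::LLAMA_VOCAB_PRE_TYPE_COMMAND_R,
  LLaMACpp::LLAMA_VOCAB_PRE_TYPE_QWEN2,
  LLaMACpp::LLAMA_VOCAB_PRE_TYPE_OLMO,
  LLaMACpp::LLAMA_VOCAB_PRE_TYPE_DBRX,
  LLaMACpp::LLAMA_FTYPE_MOSTLY_BF16
].each { |c| puts c }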
@@ -3428,6 +3428,11 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
 
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
@@ -3465,6 +3470,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_BF16", INT2NUM(LLAMA_FTYPE_MOSTLY_BF16));
 
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.15.0'
+ VERSION = '0.15.1'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2781'
+ LLAMA_CPP_VERSION = 'b2839'
  end
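A quick way to confirm which binding version and bundled llama.cpp build are loaded, using the two constants changed above (minimal sketch):

require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.15.1"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2839"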
data/sig/llama_cpp.rbs CHANGED
@@ -24,6 +24,11 @@ module LLaMACpp
  LLAMA_VOCAB_PRE_TYPE_MPT: Integer
  LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
  LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
+ LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
+ LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
+ LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
+ LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
+ LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
 
  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
@@ -53,6 +58,7 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
  LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
  LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
+ LLAMA_FTYPE_MOSTLY_BF16: Integer
 
  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
@@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
  ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
- ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
- ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
  elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
  continue; \
  elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
@@ -436,7 +435,7 @@ ifdef LLAMA_CUDA
  else
  CUDA_PATH ?= /usr/local/cuda
  endif
- MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@@ -761,7 +760,7 @@ lib: llama.o ggml.o $(OBJS)
  ar rcs libllama.a $^
 
  clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
  rm -vrf ggml-cuda/*.o
 
  #
@@ -113,7 +113,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
  for (int id = 0; id < info.device_count; ++id) {
  int device_vmm = 0;
 
- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  CUdevice device;
  CU_CHECK(cuDeviceGet(&device, id));
  CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -259,7 +259,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
  };
 
  // pool with virtual memory
- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
  static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
 
@@ -356,7 +356,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
  #endif // !defined(GGML_USE_HIPBLAS)
 
  std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  if (ggml_cuda_info().devices[device].vmm) {
  return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
  }
@@ -1647,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(
  }
  }
 
- static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
  GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -1670,7 +1670,7 @@ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const gg
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
  }
 
- static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(!ggml_is_transposed(src0));
  GGML_ASSERT(!ggml_is_transposed(src1));
  GGML_ASSERT(!ggml_is_permuted(src0));
@@ -2410,32 +2410,304 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
  GGML_UNUSED(backend);
  }
 
+ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+ graph_node_properties->node_address = node->data;
+ graph_node_properties->node_op = node->op;
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ graph_node_properties->ne[i] = node->ne[i];
+ graph_node_properties->nb[i] = node->nb[i];
+ }
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
+ }
+ }
+
+ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+ if (node->data != graph_node_properties->node_address &&
+ node->op != GGML_OP_CPY &&
+ node->op != GGML_OP_VIEW) {
+ return false;
+ }
+
+ if (node->op != graph_node_properties->node_op) {
+ return false;
+ }
+
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ if (node->ne[i] != graph_node_properties->ne[i]) {
+ return false;
+ }
+ if (node->nb[i] != graph_node_properties->nb[i]) {
+ return false;
+ }
+ }
+
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (node->src[i] &&
+ node->src[i]->data != graph_node_properties->src_address[i] &&
+ node->op != GGML_OP_CPY &&
+ node->op != GGML_OP_VIEW
+ ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
  GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
  ggml_cuda_set_device(cuda_ctx->device);
 
- for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_tensor * node = cgraph->nodes[i];
+ #ifdef USE_CUDA_GRAPH
+ static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
 
- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
- continue;
+ // Objects required for CUDA Graph
+ if (cuda_ctx->cuda_graph == nullptr) {
+ cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+ }
+
+ bool use_cuda_graph = true;
+ bool cuda_graph_update_required = false;
+ // pointer to CUDA cpy kernel, which is required to identify
+ // kernel parameters which need updated in the graph for each token
+ void * ggml_cuda_cpy_fn_ptr = nullptr;
+
+ if (cuda_ctx->cuda_graph->graph == nullptr) {
+ if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
+ cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+ #endif
+ }
+ }
+
+ // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
+ // or previous graph capture failure.
+ // Also disable for multi-gpu for now. TO DO investigate
+ if (disable_cuda_graphs_due_to_env
+ || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
+ || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
+ || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
+ use_cuda_graph = false;
+ }
+
+ if (use_cuda_graph) {
+ if (cuda_ctx->cuda_graph->instance == nullptr) {
+ cuda_graph_update_required = true;
  }
 
+ // Check if the graph size has changed
+ if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+ cuda_graph_update_required = true;
+ cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+ }
+
+ // Loop over nodes in GGML graph to determine if CUDA graph update is required
+ // and store properties to allow this comparison for the next token
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ bool has_matching_properties = true;
+ if (!cuda_graph_update_required) {
+ has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+ }
+ if (!has_matching_properties) {
+ cuda_graph_update_required = true;
+ }
+ set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+ }
+
+ // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+ cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+
+ if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
+ use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
  #ifndef NDEBUG
- assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- if (node->src[j] != nullptr) {
- assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+ fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+ #endif
+ }
+
+ if (node->op == GGML_OP_MUL_MAT_ID) {
+ use_cuda_graph = false; // This node type is not supported by CUDA graph capture
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+ #endif
+ }
+
+ if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+ // disable CUDA graphs for batch size > 1 for now.
+ // Changes in batch size or context size can cause changes to the grid size of some kernels.
+ use_cuda_graph = false;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+ #endif
+ }
+
+ if (node->op == GGML_OP_CPY) {
+ // store the copy op parameter which changes with each token.
+ cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+ if (ggml_cuda_cpy_fn_ptr == nullptr) {
+ // store a pointer to the copy op CUDA kernel to identify it later
+ ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+ }
+ }
+
+ if (!use_cuda_graph) {
+ break;
+ }
+ }
+
+ // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+ if (cuda_graph_update_required) {
+ cuda_ctx->cuda_graph->number_consecutive_updates++;
+ } else {
+ cuda_ctx->cuda_graph->number_consecutive_updates = 0;
+ }
+
+ if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
+ cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+ #endif
+ }
+ }
+
+ if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+ CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+ }
+
+ #else
+ bool use_cuda_graph = false;
+ bool cuda_graph_update_required = false;
+ #endif // USE_CUDA_GRAPH
+
+ bool graph_evaluated_or_captured = false;
+
+ while (!graph_evaluated_or_captured) {
+ // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
+ // With the use of CUDA graphs, the execution will be performed by the graph launch.
+ if (!use_cuda_graph || cuda_graph_update_required) {
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ continue;
+ }
+
+ #ifndef NDEBUG
+ assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ if (node->src[j] != nullptr) {
+ assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+ }
+ }
+ #endif
+
+ bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
+ if (!ok) {
+ fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ }
+ GGML_ASSERT(ok);
  }
  }
+
+ #ifdef USE_CUDA_GRAPH
+ if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
+ if (cuda_ctx->cuda_graph->graph != nullptr) {
+ CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
+ cuda_ctx->cuda_graph->graph = nullptr;
+ }
+ CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
+
+ #if 0
+ if (disable_cuda_graphs_due_to_failed_capture) {
+ use_cuda_graph = false;
+ cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
  #endif
+ } else {
+ graph_evaluated_or_captured = true; // CUDA graph has been captured
+ }
+ #endif
+ graph_evaluated_or_captured = true; // CUDA graph has been captured
+ } else {
+ graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
+ }
+ }
+
+ if (use_cuda_graph) {
+ if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+ CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+ }
+
+ // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+
+ if (cuda_graph_update_required) {
+ // Extract nodes from graph
+ if (cuda_ctx->cuda_graph->num_nodes == 0) {
+ // First call with null argument gets number of nodes in graph
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+ }
+ // Subsequent call with non-null argument gets nodes
+ cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+ cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+ if (cuda_ctx->cuda_graph->num_nodes > 0) {
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+ // Loop over nodes, and extract kernel parameters from each node
+ for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+ cudaGraphNodeType node_type;
+ CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+ if (node_type == cudaGraphNodeTypeKernel) {
+ cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+ if (stat == cudaErrorInvalidDeviceFunction) {
+ // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+ // We don't need to update blas nodes, so clear error and move on.
+ cudaGetLastError();
+ } else {
+ GGML_ASSERT(stat == cudaSuccess);
+ }
+ }
+ }
+ }
+ }
+
+ // One of the arguments to the copy kernel is updated for each token, hence we need to
+ // replace that argument with the updated value in the CUDA graph
+ if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
+ int k = 0;
+ for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+ if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
+ char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+ cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
+ CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+ }
+ }
+ }
 
- bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
- if (!ok) {
- fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ // Update graph executable
+ cudaGraphExecUpdateResultInfo result_info;
+ cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+ if (stat == cudaErrorGraphExecUpdateFailure) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+ #endif
+ // The pre-existing graph exec cannot be updated due to violated constraints
+ // so instead clear error and re-instantiate
+ cudaGetLastError();
+ CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
+ cuda_ctx->cuda_graph->instance = nullptr;
+ CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+ } else {
+ GGML_ASSERT(stat == cudaSuccess);
  }
- GGML_ASSERT(ok);
+ // Launch graph
+ CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
+ #else
+ graph_evaluated_or_captured = true;
+ #endif // USE_CUDA_GRAPH
  }
 
  return GGML_STATUS_SUCCESS;
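The core of the CUDA-graph reuse heuristic above is a per-node property cache: as long as every node keeps the same op, shape, strides, and source addresses as on the previous token, the captured graph can be replayed or patched instead of re-captured. Purely for illustration, here is a simplified Ruby sketch of that check (it omits the CPY/VIEW exceptions of the real C++ code; the node Hashes and helper names are hypothetical):

# Cache a property tuple per node and request a graph update whenever any tuple changes.
NodeProps = Struct.new(:address, :op, :ne, :nb, :src_addresses)

def snapshot(node)
  NodeProps.new(node[:address], node[:op], node[:ne].dup, node[:nb].dup, node[:src_addresses].dup)
end

def update_required?(nodes, cache)
  return true if cache.size != nodes.size
  nodes.each_with_index.any? { |node, i| snapshot(node) != cache[i] }
end

# Per token: decide whether to re-capture, then refresh the cache for the next token.
def plan_graph_launch(nodes, cache)
  needs_update = update_required?(nodes, cache)
  cache.replace(nodes.map { |n| snapshot(n) })
  needs_update ? :recapture_or_update_graph : :replay_existing_graph
end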
@@ -17,6 +17,83 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+ /**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ * ┌sign
+ * │
+ * │ ┌exponent
+ * │ │
+ * │ │ ┌mantissa
+ * │ │ │
+ * │┌──┴───┐┌─┴───┐
+ * 0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ * ┌sign
+ * │
+ * │ ┌exponent
+ * │ │
+ * │ │ ┌mantissa
+ * │ │ │
+ * │┌──┴───┐┌─┴───────────────────┐
+ * 0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ * ┌sign
+ * │
+ * │ ┌exponent
+ * │ │
+ * │ │ ┌mantissa
+ * │ │ │
+ * │┌─┴─┐┌─┴──────┐
+ * 0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+ union {
+ float f;
+ uint32_t i;
+ } u;
+ u.i = (uint32_t)h.bits << 16;
+ return u.f;
+ }
+
+ /**
+ * Converts float32 to brain16.
+ *
+ * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
+ * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This code should vectorize nicely if using modern compilers.
+ */
+ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+ ggml_bf16_t h;
+ union {
+ float f;
+ uint32_t i;
+ } u;
+ u.f = s;
+ if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+ h.bits = (u.i >> 16) | 64; /* force to quiet */
+ return h;
+ }
+ if (!(u.i & 0x7f800000)) { /* subnormal */
+ h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
+ return h;
+ }
+ h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+ return h;
+ }
+
+ #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+ #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+
  #ifdef __cplusplus
  extern "C" {
  #endif
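The conversion added above is a truncation of the float32 bit pattern to its top 16 bits with round-to-nearest-even, plus special handling for NaNs and subnormals. A minimal Ruby sketch of the same bit manipulation, for illustration only (not part of the gem's API; assumes the platform packs Float as IEEE 754 binary32):

# Mirrors ggml_compute_fp32_to_bf16 / ggml_compute_bf16_to_fp32 at the bit level.
def fp32_to_bf16_bits(x)
  u = [x].pack('f').unpack1('L')                            # reinterpret float32 as uint32
  return (u >> 16) | 64 if (u & 0x7fffffff) > 0x7f800000    # NaN: keep top bits, force quiet
  return (u & 0x80000000) >> 16 if (u & 0x7f800000).zero?   # subnormal: flush to signed zero
  (u + (0x7fff + ((u >> 16) & 1))) >> 16                    # round to nearest even, keep top 16 bits
end

def bf16_bits_to_fp32(bits)
  [bits << 16].pack('L').unpack1('f')                       # shift back into the high half of a float32
end

p fp32_to_bf16_bits(1.0)                         # => 16256 (0x3f80)
p bf16_bits_to_fp32(fp32_to_bf16_bits(3.14159))  # => 3.140625 (only the top 7 mantissa bits survive)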
@@ -265,11 +265,20 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
 
  static void * ggml_metal_host_malloc(size_t n) {
  void * data = NULL;
+
+ #if TARGET_OS_OSX
+ kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
+ if (err != KERN_SUCCESS) {
+ GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
+ return NULL;
+ }
+ #else
  const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
  if (result != 0) {
  GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
  return NULL;
  }
+ #endif
 
  return data;
  }
@@ -803,7 +812,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
  case GGML_OP_DIAG_MASK_INF:
  case GGML_OP_GET_ROWS:
  {
- return op->ne[3] == 1;
+ return op->src[0]->type != GGML_TYPE_BF16 && op->ne[3] == 1;
  }
  default:
  return false;
@@ -2840,7 +2849,11 @@ GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_
  ggml_backend_metal_free_device();
 
  if (ctx->owned) {
+ #if TARGET_OS_OSX
+ vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size);
+ #else
  free(ctx->all_data);
+ #endif
  }
 
  free(ctx);
@@ -2944,14 +2957,16 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
  ctx->owned = true;
  ctx->n_buffers = 1;
 
- ctx->buffers[0].data = ctx->all_data;
- ctx->buffers[0].size = size;
- ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
- length:size_aligned
- options:MTLResourceStorageModeShared
- deallocator:nil];
+ if (ctx->all_data != NULL) {
+ ctx->buffers[0].data = ctx->all_data;
+ ctx->buffers[0].size = size;
+ ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
+ length:size_aligned
+ options:MTLResourceStorageModeShared
+ deallocator:nil];
+ }
 
- if (ctx->buffers[0].metal == nil) {
+ if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) {
  GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
  free(ctx);
  ggml_backend_metal_free_device();
@@ -2175,7 +2175,7 @@ kernel void kernel_flash_attn_ext_f16(
 
  const short D4 = D/4;
  const short D8 = D/8;
- const short Q8 = Q/8;
+ //const short Q8 = Q/8;
  const short NW = N_SIMDWIDTH;
  const short SH = (C + Q); // shared memory per simdgroup in (half)
 
@@ -2119,6 +2119,7 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
  if (alignment == (cl_uint)-1) {
  ggml_cl_init();
  clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
+ alignment /= 8; // bits to bytes
  }
  return alignment;
 
@@ -12450,6 +12450,24 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
  const size_t nb = nbytes/ggml_type_size(type);
 
  switch (type) {
+ case GGML_TYPE_BF16:
+ {
+ int nans = 0;
+ int infs = 0;
+ const unsigned short * f = (const unsigned short *) data;
+ for (size_t i = 0; i < nb; ++i) {
+ nans += (f[i] & 0x7fff) > 0x7f80;
+ infs += (f[i] & 0x7fff) == 0x7f80;
+ }
+ if (nans) {
+ fprintf(stderr, "%s: found %d NaNs in row of %zu BF16 values\n", __func__, nans, nb);
+ return false;
+ }
+ if (infs) {
+ fprintf(stderr, "%s: found %d infinities in row of %zu BF16 values\n", __func__, infs, nb);
+ return false;
+ }
+ } break;
  case GGML_TYPE_F16:
  {
  const ggml_fp16_t * f = (const ggml_fp16_t *) data;
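The BF16 validation added above relies on the bit layout described earlier: after masking off the sign bit, 0x7f80 (all exponent bits set, zero mantissa) is infinity, and anything larger is a NaN. A tiny Ruby sketch of the same classification, for illustration only (not part of the gem):

# Classify a raw bfloat16 bit pattern the same way the BF16 case above does.
def bf16_class(bits)
  masked = bits & 0x7fff              # drop the sign bit
  return :nan if masked >  0x7f80     # exponent all ones, non-zero mantissa
  return :inf if masked == 0x7f80     # exponent all ones, zero mantissa
  :finite
end

p bf16_class(0x7f80) # => :inf    (+infinity)
p bf16_class(0xffc0) # => :nan    (a quiet NaN with the sign bit set)
p bf16_class(0x3f80) # => :finite (1.0)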