llama_cpp 0.15.0 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: b6da808ddaadd304ab376b4726de19087422194ef32c9e5006272569f1c4a76a
- data.tar.gz: faf5c6ed3421cacb24a11c0d126c852d38f1a0b3edb43768133a321269958730
+ metadata.gz: ce6d72aeb5fb9aff775d44284bf934e164f8470973619507ef6e6eb1ac0bec4d
+ data.tar.gz: 7c1ae823c90f957219b3edbc20f091b65a50caa984c1a6f4d137a46c376b2f0c
  SHA512:
- metadata.gz: 9a83cb7da94d4672418440361d78b230f6560a97b90924c389c958a6f91b2ecded2f5e53dcbf596845687cd332ecc8126c1a7f79c33fad9b9ff20ac1ce4f8759
- data.tar.gz: 55001246afe1615d8d8262c2f74dccbe819b4942cdb6517f5aa6e5d3e98fb2ea628db5c8e5b94a19052afff88236f003a15e7f792473b0c10660cbcf58ecab45
+ metadata.gz: d23cb6a63b7734df2547c5e61a699fa206878c747e274e004c829b77335a7cc7434e92168a55d8ab0a617b11eddb5d45d5057a91b92e848735fd9e852b2476cd
+ data.tar.gz: f54b09de3cc60de81be977e9706a9beb3bf28e7740a19a57f6add543fe10cd6dc4101cbbe22dd5b62870c78a1ad4d10f57dd29b7c3e3e12b950e6575cf67b0c7
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+ ## [[0.15.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.0...v0.15.1)] - 2024-05-11
+
+ - Bump llama.cpp from b2781 to b2839.
+ - Add constants for pre-tokenization types.
+ - Add constant for model file type.
+
  ## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03

  - Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
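For reference, a minimal sketch (not part of the diff) of how the additions listed above surface in Ruby once 0.15.1 is installed; the printed integer values depend on the bundled llama.cpp headers:

```ruby
require 'llama_cpp'

# Gem and bundled llama.cpp versions updated in this release.
puts LLaMACpp::VERSION            # => "0.15.1"
puts LLaMACpp::LLAMA_CPP_VERSION  # => "b2839"

# Newly exposed pre-tokenization type constants.
[LLaMACpp::LLAMA_VOCAB_PRE_TYPE_REFACT,
 LLaMACpp::LLAMA_VOCAB_PRE_TYPE_COMMAND_R,
 LLaMACpp::LLAMA_VOCAB_PRE_TYPE_QWEN2,
 LLaMACpp::LLAMA_VOCAB_PRE_TYPE_OLMO,
 LLaMACpp::LLAMA_VOCAB_PRE_TYPE_DBRX].each { |c| puts c }

# Newly exposed model file type constant for bfloat16 weights.
puts LLaMACpp::LLAMA_FTYPE_MOSTLY_BF16
```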
@@ -3428,6 +3428,11 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));

  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
@@ -3465,6 +3470,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_BF16", INT2NUM(LLAMA_FTYPE_MOSTLY_BF16));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.15.0'
+ VERSION = '0.15.1'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2781'
+ LLAMA_CPP_VERSION = 'b2839'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -24,6 +24,11 @@ module LLaMACpp
  LLAMA_VOCAB_PRE_TYPE_MPT: Integer
  LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
  LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
+ LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
+ LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
+ LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
+ LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
+ LLAMA_VOCAB_PRE_TYPE_DBRX: Integer

  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
@@ -53,6 +58,7 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
  LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
  LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
+ LLAMA_FTYPE_MOSTLY_BF16: Integer

  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
@@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
  ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
- ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
- ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
  elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
  continue; \
  elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
@@ -436,7 +435,7 @@ ifdef LLAMA_CUDA
  else
  CUDA_PATH ?= /usr/local/cuda
  endif
- MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@@ -761,7 +760,7 @@ lib: llama.o ggml.o $(OBJS)
  ar rcs libllama.a $^

  clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
  rm -vrf ggml-cuda/*.o

  #
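A hedged usage note (not from the diff): the new `-DGGML_CUDA_USE_GRAPHS` compile flag turns on the CUDA-graph execution path added in ggml-cuda.cu below, and that path checks the `GGML_CUDA_DISABLE_GRAPHS` environment variable, so it can still be switched off at runtime, assuming the gem was built with the CUDA backend:

```ruby
# Disable CUDA graph capture at runtime; set before the first graph evaluation.
ENV['GGML_CUDA_DISABLE_GRAPHS'] = '1'
require 'llama_cpp'
```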
@@ -113,7 +113,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
  for (int id = 0; id < info.device_count; ++id) {
  int device_vmm = 0;

- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  CUdevice device;
  CU_CHECK(cuDeviceGet(&device, id));
  CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -259,7 +259,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
  };

  // pool with virtual memory
- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
  static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

@@ -356,7 +356,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
  #endif // !defined(GGML_USE_HIPBLAS)

  std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  if (ggml_cuda_info().devices[device].vmm) {
  return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
  }
@@ -1647,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(
  }
  }

- static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
  GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -1670,7 +1670,7 @@ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const gg
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
  }

- static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(!ggml_is_transposed(src0));
  GGML_ASSERT(!ggml_is_transposed(src1));
  GGML_ASSERT(!ggml_is_permuted(src0));
@@ -2410,32 +2410,304 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
  GGML_UNUSED(backend);
  }

+ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+ graph_node_properties->node_address = node->data;
+ graph_node_properties->node_op = node->op;
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ graph_node_properties->ne[i] = node->ne[i];
+ graph_node_properties->nb[i] = node->nb[i];
+ }
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
+ }
+ }
+
+ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+ if (node->data != graph_node_properties->node_address &&
+ node->op != GGML_OP_CPY &&
+ node->op != GGML_OP_VIEW) {
+ return false;
+ }
+
+ if (node->op != graph_node_properties->node_op) {
+ return false;
+ }
+
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ if (node->ne[i] != graph_node_properties->ne[i]) {
+ return false;
+ }
+ if (node->nb[i] != graph_node_properties->nb[i]) {
+ return false;
+ }
+ }
+
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (node->src[i] &&
+ node->src[i]->data != graph_node_properties->src_address[i] &&
+ node->op != GGML_OP_CPY &&
+ node->op != GGML_OP_VIEW
+ ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
  GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

  ggml_cuda_set_device(cuda_ctx->device);

- for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_tensor * node = cgraph->nodes[i];
+ #ifdef USE_CUDA_GRAPH
+ static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
- continue;
+ // Objects required for CUDA Graph
+ if (cuda_ctx->cuda_graph == nullptr) {
+ cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+ }
+
+ bool use_cuda_graph = true;
+ bool cuda_graph_update_required = false;
+ // pointer to CUDA cpy kernel, which is required to identify
+ // kernel parameters which need updated in the graph for each token
+ void * ggml_cuda_cpy_fn_ptr = nullptr;
+
+ if (cuda_ctx->cuda_graph->graph == nullptr) {
+ if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
+ cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+ #endif
+ }
+ }
+
+ // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
+ // or previous graph capture failure.
+ // Also disable for multi-gpu for now. TO DO investigate
+ if (disable_cuda_graphs_due_to_env
+ || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
+ || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
+ || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
+ use_cuda_graph = false;
+ }
+
+ if (use_cuda_graph) {
+ if (cuda_ctx->cuda_graph->instance == nullptr) {
+ cuda_graph_update_required = true;
  }

+ // Check if the graph size has changed
+ if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+ cuda_graph_update_required = true;
+ cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+ }
+
+ // Loop over nodes in GGML graph to determine if CUDA graph update is required
+ // and store properties to allow this comparison for the next token
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ bool has_matching_properties = true;
+ if (!cuda_graph_update_required) {
+ has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+ }
+ if (!has_matching_properties) {
+ cuda_graph_update_required = true;
+ }
+ set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+ }
+
+ // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+ cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+
+ if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
+ use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
  #ifndef NDEBUG
- assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- if (node->src[j] != nullptr) {
- assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+ fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+ #endif
+ }
+
+ if (node->op == GGML_OP_MUL_MAT_ID) {
+ use_cuda_graph = false; // This node type is not supported by CUDA graph capture
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+ #endif
+ }
+
+ if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+ // disable CUDA graphs for batch size > 1 for now.
+ // Changes in batch size or context size can cause changes to the grid size of some kernels.
+ use_cuda_graph = false;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+ #endif
+ }
+
+ if (node->op == GGML_OP_CPY) {
+ // store the copy op parameter which changes with each token.
+ cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+ if (ggml_cuda_cpy_fn_ptr == nullptr) {
+ // store a pointer to the copy op CUDA kernel to identify it later
+ ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+ }
+ }
+
+ if (!use_cuda_graph) {
+ break;
+ }
+ }
+
+ // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+ if (cuda_graph_update_required) {
+ cuda_ctx->cuda_graph->number_consecutive_updates++;
+ } else {
+ cuda_ctx->cuda_graph->number_consecutive_updates = 0;
+ }
+
+ if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
+ cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+ #endif
+ }
+ }
+
+ if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+ CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+ }
+
+ #else
+ bool use_cuda_graph = false;
+ bool cuda_graph_update_required = false;
+ #endif // USE_CUDA_GRAPH
+
+ bool graph_evaluated_or_captured = false;
+
+ while (!graph_evaluated_or_captured) {
+ // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
+ // With the use of CUDA graphs, the execution will be performed by the graph launch.
+ if (!use_cuda_graph || cuda_graph_update_required) {
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ continue;
+ }
+
+ #ifndef NDEBUG
+ assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ if (node->src[j] != nullptr) {
+ assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+ }
+ }
+ #endif
+
+ bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
+ if (!ok) {
+ fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ }
+ GGML_ASSERT(ok);
  }
  }
+
+ #ifdef USE_CUDA_GRAPH
+ if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
+ if (cuda_ctx->cuda_graph->graph != nullptr) {
+ CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
+ cuda_ctx->cuda_graph->graph = nullptr;
+ }
+ CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
+
+ #if 0
+ if (disable_cuda_graphs_due_to_failed_capture) {
+ use_cuda_graph = false;
+ cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
  #endif
+ } else {
+ graph_evaluated_or_captured = true; // CUDA graph has been captured
+ }
+ #endif
+ graph_evaluated_or_captured = true; // CUDA graph has been captured
+ } else {
+ graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
+ }
+ }
+
+ if (use_cuda_graph) {
+ if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+ CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+ }
+
+ // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+
+ if (cuda_graph_update_required) {
+ // Extract nodes from graph
+ if (cuda_ctx->cuda_graph->num_nodes == 0) {
+ // First call with null argument gets number of nodes in graph
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+ }
+ // Subsequent call with non-null argument gets nodes
+ cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+ cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+ if (cuda_ctx->cuda_graph->num_nodes > 0) {
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+ // Loop over nodes, and extract kernel parameters from each node
+ for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+ cudaGraphNodeType node_type;
+ CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+ if (node_type == cudaGraphNodeTypeKernel) {
+ cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+ if (stat == cudaErrorInvalidDeviceFunction) {
+ // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+ // We don't need to update blas nodes, so clear error and move on.
+ cudaGetLastError();
+ } else {
+ GGML_ASSERT(stat == cudaSuccess);
+ }
+ }
+ }
+ }
+ }
+
+ // One of the arguments to the copy kernel is updated for each token, hence we need to
+ // replace that argument with the updated value in the CUDA graph
+ if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
+ int k = 0;
+ for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+ if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
+ char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+ cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
+ CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+ }
+ }
+ }

- bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
- if (!ok) {
- fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ // Update graph executable
+ cudaGraphExecUpdateResultInfo result_info;
+ cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+ if (stat == cudaErrorGraphExecUpdateFailure) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+ #endif
+ // The pre-existing graph exec cannot be updated due to violated constraints
+ // so instead clear error and re-instantiate
+ cudaGetLastError();
+ CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
+ cuda_ctx->cuda_graph->instance = nullptr;
+ CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+ } else {
+ GGML_ASSERT(stat == cudaSuccess);
  }
- GGML_ASSERT(ok);
+ // Launch graph
+ CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
+ #else
+ graph_evaluated_or_captured = true;
+ #endif // USE_CUDA_GRAPH
  }

  return GGML_STATUS_SUCCESS;
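To make the update check above easier to follow, here is a purely illustrative Ruby sketch (not part of the gem or of llama.cpp) of the same idea: cache each node's data address, op and shape, and only re-capture the CUDA graph when a node stops matching its cached properties, with copy/view ops allowed to change their pointer each token:

```ruby
# Illustrative only: mirrors ggml_graph_node_has_matching_properties above.
NodeProps = Struct.new(:address, :op, :ne, :nb, :src_addresses)

def matching_properties?(node, cached)
  # Copy and view ops may legitimately change their data pointer per token.
  return false if node.address != cached.address && !%i[cpy view].include?(node.op)
  return false if node.op != cached.op
  return false if node.ne != cached.ne || node.nb != cached.nb
  node.src_addresses.each_with_index.all? do |addr, i|
    addr.nil? || addr == cached.src_addresses[i] || %i[cpy view].include?(node.op)
  end
end

a = NodeProps.new(0x1000, :mul_mat, [4096, 1, 1, 1], [2, 8192, 8192, 8192], [0x2000, nil])
p matching_properties?(a, a.dup) # => true, so no graph re-capture would be needed
```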
@@ -17,6 +17,83 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

+ /**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ * ┌sign
+ * │
+ * │ ┌exponent
+ * │ │
+ * │ │ ┌mantissa
+ * │ │ │
+ * │┌──┴───┐┌─┴───┐
+ * 0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ * ┌sign
+ * │
+ * │ ┌exponent
+ * │ │
+ * │ │ ┌mantissa
+ * │ │ │
+ * │┌──┴───┐┌─┴───────────────────┐
+ * 0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ * ┌sign
+ * │
+ * │ ┌exponent
+ * │ │
+ * │ │ ┌mantissa
+ * │ │ │
+ * │┌─┴─┐┌─┴──────┐
+ * 0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+ union {
+ float f;
+ uint32_t i;
+ } u;
+ u.i = (uint32_t)h.bits << 16;
+ return u.f;
+ }
+
+ /**
+ * Converts float32 to brain16.
+ *
+ * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
+ * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This code should vectorize nicely if using modern compilers.
+ */
+ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+ ggml_bf16_t h;
+ union {
+ float f;
+ uint32_t i;
+ } u;
+ u.f = s;
+ if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+ h.bits = (u.i >> 16) | 64; /* force to quiet */
+ return h;
+ }
+ if (!(u.i & 0x7f800000)) { /* subnormal */
+ h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
+ return h;
+ }
+ h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+ return h;
+ }
+
+ #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+ #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+
  #ifdef __cplusplus
  extern "C" {
  #endif
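The new bf16 helpers above are plain bit manipulation, so the round trip can be reproduced outside C. A minimal Ruby sketch (illustrative only, not shipped with the gem) of the same truncate-with-round-to-nearest-even conversion:

```ruby
# Illustrative re-implementation of ggml_compute_fp32_to_bf16 / ggml_compute_bf16_to_fp32.
def fp32_to_bf16(x)
  u = [x].pack('e').unpack1('L<')                            # IEEE binary32 bit pattern
  return ((u >> 16) | 64) if (u & 0x7fffffff) > 0x7f800000   # NaN -> quiet NaN
  return (u & 0x80000000) >> 16 if (u & 0x7f800000).zero?    # subnormal -> signed zero
  (u + (0x7fff + ((u >> 16) & 1))) >> 16                     # round to nearest even, keep top 16 bits
end

def bf16_to_fp32(bits)
  [bits << 16].pack('L<').unpack1('e')                       # re-expand to binary32
end

p bf16_to_fp32(fp32_to_bf16(3.14159)) # => 3.140625 (bf16 keeps only 7 mantissa bits)
```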
@@ -265,11 +265,20 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){

  static void * ggml_metal_host_malloc(size_t n) {
  void * data = NULL;
+
+ #if TARGET_OS_OSX
+ kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
+ if (err != KERN_SUCCESS) {
+ GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
+ return NULL;
+ }
+ #else
  const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
  if (result != 0) {
  GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
  return NULL;
  }
+ #endif

  return data;
  }
@@ -803,7 +812,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
  case GGML_OP_DIAG_MASK_INF:
  case GGML_OP_GET_ROWS:
  {
- return op->ne[3] == 1;
+ return op->src[0]->type != GGML_TYPE_BF16 && op->ne[3] == 1;
  }
  default:
  return false;
@@ -2840,7 +2849,11 @@ GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_
  ggml_backend_metal_free_device();

  if (ctx->owned) {
+ #if TARGET_OS_OSX
+ vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size);
+ #else
  free(ctx->all_data);
+ #endif
  }

  free(ctx);
@@ -2944,14 +2957,16 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
  ctx->owned = true;
  ctx->n_buffers = 1;

- ctx->buffers[0].data = ctx->all_data;
- ctx->buffers[0].size = size;
- ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
- length:size_aligned
- options:MTLResourceStorageModeShared
- deallocator:nil];
+ if (ctx->all_data != NULL) {
+ ctx->buffers[0].data = ctx->all_data;
+ ctx->buffers[0].size = size;
+ ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
+ length:size_aligned
+ options:MTLResourceStorageModeShared
+ deallocator:nil];
+ }

- if (ctx->buffers[0].metal == nil) {
+ if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) {
  GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
  free(ctx);
  ggml_backend_metal_free_device();
@@ -2175,7 +2175,7 @@ kernel void kernel_flash_attn_ext_f16(

  const short D4 = D/4;
  const short D8 = D/8;
- const short Q8 = Q/8;
+ //const short Q8 = Q/8;
  const short NW = N_SIMDWIDTH;
  const short SH = (C + Q); // shared memory per simdgroup in (half)

@@ -2119,6 +2119,7 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
  if (alignment == (cl_uint)-1) {
  ggml_cl_init();
  clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
+ alignment /= 8; // bits to bytes
  }
  return alignment;

@@ -12450,6 +12450,24 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
  const size_t nb = nbytes/ggml_type_size(type);

  switch (type) {
+ case GGML_TYPE_BF16:
+ {
+ int nans = 0;
+ int infs = 0;
+ const unsigned short * f = (const unsigned short *) data;
+ for (size_t i = 0; i < nb; ++i) {
+ nans += (f[i] & 0x7fff) > 0x7f80;
+ infs += (f[i] & 0x7fff) == 0x7f80;
+ }
+ if (nans) {
+ fprintf(stderr, "%s: found %d NaNs in row of %zu BF16 values\n", __func__, nans, nb);
+ return false;
+ }
+ if (infs) {
+ fprintf(stderr, "%s: found %d infinities in row of %zu BF16 values\n", __func__, infs, nb);
+ return false;
+ }
+ } break;
  case GGML_TYPE_F16:
  {
  const ggml_fp16_t * f = (const ggml_fp16_t *) data;
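The BF16 validation added above relies on the exponent field: after masking off the sign bit, 0x7f80 (all-ones exponent, zero mantissa) marks an infinity and anything greater marks a NaN. A small Ruby sketch (illustrative only) of the same test over a row of raw 16-bit values:

```ruby
# Illustrative: count NaN/Inf bf16 values in a packed row, as ggml_validate_row_data does.
def bf16_row_valid?(raw_bytes)
  values = raw_bytes.unpack('S<*')                   # 16-bit little-endian words
  nans = values.count { |v| (v & 0x7fff) >  0x7f80 }
  infs = values.count { |v| (v & 0x7fff) == 0x7f80 }
  warn "found #{nans} NaNs and #{infs} infinities" if nans.positive? || infs.positive?
  nans.zero? && infs.zero?
end

p bf16_row_valid?([0x3f80, 0x7f80, 0x7fc0].pack('S<*')) # 1.0, +Inf, NaN => false
```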