llama_cpp 0.12.7 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1106,7 +1106,9 @@ void ggml_vk_instance_init() {
 
     const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
     const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
+#ifdef __APPLE__
     const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
+#endif
 
     std::vector<const char*> layers;
 
@@ -1117,13 +1119,17 @@ void ggml_vk_instance_init() {
     if (validation_ext) {
         extensions.push_back("VK_EXT_validation_features");
     }
+#ifdef __APPLE__
     if (portability_enumeration_ext) {
         extensions.push_back("VK_KHR_portability_enumeration");
     }
+#endif
     vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
+#ifdef __APPLE__
     if (portability_enumeration_ext) {
         instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
     }
+#endif
 
     std::vector<vk::ValidationFeatureEnableEXT> features_enable;
     vk::ValidationFeaturesEXT validation_features;
@@ -2320,8 +2326,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
-    const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
 
     const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
     const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@@ -2453,7 +2459,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     // compute
     ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT
 
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) ((char *) dst->data);
         ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
@@ -2506,8 +2512,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
-    const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
 
     const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
     const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@@ -2630,7 +2636,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
 
-        if (dst->backend == GGML_BACKEND_CPU) {
+        if (dst->backend == GGML_BACKEND_TYPE_CPU) {
            // copy dst to host
            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
            ggml_vk_sync_buffers(subctx);
@@ -2647,7 +2653,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -2679,7 +2685,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
 
     const uint64_t x_ne = ne00 * ne01 * ne02;
     const uint64_t y_ne = ne10 * ne11 * ne12;
@@ -2721,7 +2727,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
 
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) dst->data;
         ggml_vk_sync_buffers(subctx);
@@ -2738,7 +2744,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
@@ -2771,7 +2777,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
 
     const uint64_t d_ne = ne01 * ne11 * ne12;
 
@@ -2814,7 +2820,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
 
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) dst->data;
         ggml_vk_sync_buffers(subctx);
@@ -2832,7 +2838,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr
     return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
            (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
            dst->type == GGML_TYPE_F32 &&
-           ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU);
+           ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
 }
 
 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
@@ -2880,8 +2886,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     // TODO: support for transposed / permuted tensors
     GGML_ASSERT(nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
-    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
+    GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -3110,8 +3116,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         }
     }
 
-    const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma;
-    const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
+    const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
 
     uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
     uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0;
@@ -3120,7 +3126,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     vk_buffer d_D = extra->buffer_gpu.lock();
 
     // Workaround for tiny tensor inputs on ROPE
-    if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) {
+    if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
         y_sz = VK_WHOLE_SIZE;
     }
 
@@ -3209,9 +3215,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     }
-    if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
         ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
-    } else if(dst->backend == GGML_BACKEND_CPU) {
+    } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) dst->data;
         ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
@@ -3253,7 +3259,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             ggml_vk_sync_buffers(subctx);
             ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
         }
-        if (dst->backend == GGML_BACKEND_CPU) {
+        if (dst->backend == GGML_BACKEND_TYPE_CPU) {
             // copy dst to host
             ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
         }
@@ -3359,7 +3365,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
 
 static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     // If backend is CPU, data from src0 has to be copied off the device
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
         vk_buffer d_D = extra_src0->buffer_gpu.lock();
         ggml_vk_sync_buffers(subctx);
@@ -3994,9 +4000,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
 #endif
-    const bool any_on_device = node->backend == GGML_BACKEND_GPU
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU));
+    const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
+        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
 
     if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) {
         return;
@@ -4215,9 +4221,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 }
 
 static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-    const bool any_on_device = node->backend == GGML_BACKEND_GPU
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU);
+    const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
+        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
 
     if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
         return;
@@ -4371,7 +4377,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif
 
-    if (node->backend == GGML_BACKEND_CPU || last_node) {
+    if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
         ggml_vk_ctx_end(ctx->compute_ctx);
         ctx->compute_ctx->exit_tensor = node;
         ctx->compute_ctx = nullptr;
@@ -4379,9 +4385,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 }
 
 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
 
     if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) {
         return false;
@@ -4442,7 +4448,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     if (params->ith != 0) {
         return true;
     }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return true;
     }
 
@@ -4745,7 +4751,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
     }
 
-    tensor->backend = GGML_BACKEND_GPU;
+    tensor->backend = GGML_BACKEND_TYPE_GPU;
     tensor->extra = extra;
 }
 
@@ -4753,7 +4759,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
@@ -4768,7 +4774,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
@@ -4999,7 +5005,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 #endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
@@ -5020,7 +5026,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 #endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
@@ -5097,7 +5103,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
     int last_node = cgraph->n_nodes - 1;
 
     // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
-    while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
+    while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) {
         last_node -= 1;
     }
 
@@ -5106,7 +5112,7 @@
     }
 
     ggml_compute_params params = {};
-    params.type = GGML_TASK_COMPUTE;
+    params.type = GGML_TASK_TYPE_COMPUTE;
     params.ith = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -5244,6 +5250,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .supports_op = */ ggml_backend_vk_supports_op,
 };
 
+static ggml_guid_t ggml_backend_vk_guid() {
+    static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
+    return &guid;
+}
+
 GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
     if (vk_instance.initialized[idx]) {
         return vk_instance.backends[idx];
@@ -5262,6 +5273,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
     vk_instance.initialized[idx] = true;
 
     ggml_backend_t vk_backend = new ggml_backend {
+        /* .guid = */ ggml_backend_vk_guid(),
         /* .interface = */ ggml_backend_vk_interface,
         /* .context = */ &vk_instance.contexts[ctx->idx],
     };
@@ -5272,7 +5284,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
 }
 
 GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_vk_name;
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
 }
 
 GGML_CALL int ggml_backend_vk_get_device_count() {
@@ -5410,7 +5422,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
 static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
     void * tensor_data = tensor->data;
 
-    if (tensor->backend == GGML_BACKEND_GPU) {
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         const size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
 
@@ -5436,14 +5448,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
     std::vector<const ggml_tensor *> done;
     ggml_vk_print_graph_origin(tensor, done);
 
-    if (tensor->backend == GGML_BACKEND_GPU) {
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         free(tensor_data);
     }
 }
 
 static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
     return;
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
     if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
         return;
     }
@@ -5481,7 +5493,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
     if (params->ith != 0) {
         return;
     }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
         return;
     }
 
@@ -5518,10 +5530,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 
         src0_buffer = malloc(src0_size);
         src0_clone->data = src0_buffer;
-        if (src0->backend == GGML_BACKEND_CPU) {
+        if (src0->backend == GGML_BACKEND_TYPE_CPU) {
             memcpy(src0_clone->data, src0->data, src0_size);
             memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
-        } else if (src0->backend == GGML_BACKEND_GPU) {
+        } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
             uint64_t offset = extra->offset;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
@@ -5561,10 +5573,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 
         src1_buffer = malloc(src1_size);
         src1_clone->data = src1_buffer;
-        if (src1->backend == GGML_BACKEND_CPU) {
+        if (src1->backend == GGML_BACKEND_TYPE_CPU) {
             memcpy(src1_clone->data, src1->data, src1_size);
             memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
-        } else if (src1->backend == GGML_BACKEND_GPU) {
+        } else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
             uint64_t offset = extra->offset;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
@@ -5723,7 +5735,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
     if (params->ith != 0) {
         return;
     }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
         return;
     }
     if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
@@ -5735,7 +5747,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 
     void * tensor_data = tensor->data;
 
-    if (tensor->backend == GGML_BACKEND_GPU) {
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
 
@@ -5868,7 +5880,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
     comp_result = nullptr;
     comp_size = 0;
 
-    if (tensor->backend == GGML_BACKEND_GPU) {
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
        free(tensor_data);
    }
 }
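
Editor's note (not part of the package diff): besides the mechanical enum renames (GGML_BACKEND_* to GGML_BACKEND_TYPE_*, GGML_TASK_* to GGML_TASK_TYPE_*) and the __APPLE__ guards around Vulkan portability enumeration, 0.13.0 changes how ggml_backend_is_vk identifies the backend: it now matches a per-backend GUID via ggml_guid_matches instead of comparing the iface.get_name function pointer. The sketch below illustrates that pattern only; all example_* names are hypothetical stand-ins, not ggml's real types (ggml provides its own ggml_guid / ggml_guid_matches), and only the 16-byte GUID value is copied from the hunk above.

    // Minimal sketch of GUID-based backend identification, under the
    // assumption of hypothetical example_* types (not ggml's API).
    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    struct example_guid { uint8_t bytes[16]; };      // stand-in for a backend GUID
    typedef example_guid * example_guid_t;

    struct example_backend { example_guid_t guid; }; // backend carries its GUID

    // Analogous to ggml_backend_vk_guid() in the hunk above; GUID value copied from it.
    static example_guid_t example_backend_vk_guid() {
        static example_guid guid = { { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02,
                                       0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b } };
        return &guid;
    }

    // Byte-wise comparison of two GUIDs.
    static bool example_guid_matches(example_guid_t a, example_guid_t b) {
        return std::memcmp(a->bytes, b->bytes, sizeof(a->bytes)) == 0;
    }

    // Mirrors the one-line body added to ggml_backend_is_vk: null check, then GUID compare.
    static bool example_backend_is_vk(const example_backend * backend) {
        return backend != nullptr && example_guid_matches(backend->guid, example_backend_vk_guid());
    }

    int main() {
        example_backend vk_backend = { example_backend_vk_guid() };
        std::printf("is_vk: %d\n", example_backend_is_vk(&vk_backend)); // prints 1
        return 0;
    }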