llama_cpp 0.12.6 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -707,9 +707,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
707
707
  q.cmd_buffer_idx = 0;
708
708
  }
709
709
 
710
- static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
710
+ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
711
+ for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
712
+ vk::MemoryType memory_type = mem_props->memoryTypes[i];
713
+ if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
714
+ (flags & memory_type.propertyFlags) == flags &&
715
+ mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
716
+ return static_cast<int32_t>(i);
717
+ }
718
+ }
719
+ return UINT32_MAX;
720
+ }
721
+
722
+ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
711
723
  #ifdef GGML_VULKAN_DEBUG
712
- std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl;
724
+ std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
713
725
  #endif
714
726
  vk_buffer buf = std::make_shared<vk_buffer_struct>();
715
727
 
@@ -736,15 +748,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
736
748
 
737
749
  uint32_t memory_type_index = UINT32_MAX;
738
750
 
739
- for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
740
- vk::MemoryType memory_type = mem_props.memoryTypes[i];
741
- if ((mem_req.memoryTypeBits & ((uint64_t)1 << i)) && (req_flags & memory_type.propertyFlags) == req_flags && mem_props.memoryHeaps[memory_type.heapIndex].size >= mem_req.size) {
742
- memory_type_index = i;
743
- break;
744
- }
751
+ memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
752
+ buf->memory_property_flags = req_flags;
753
+
754
+ if (memory_type_index == UINT32_MAX && fallback_flags) {
755
+ memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
756
+ buf->memory_property_flags = fallback_flags;
745
757
  }
746
758
 
747
- if (memory_type_index >= mem_props.memoryTypeCount) {
759
+ if (memory_type_index == UINT32_MAX) {
748
760
  ctx->device.lock()->device.destroyBuffer(buf->buffer);
749
761
  buf->size = 0;
750
762
  throw vk::OutOfDeviceMemoryError("No suitable memory type found");
@@ -758,10 +770,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
758
770
  buf->size = 0;
759
771
  throw e;
760
772
  }
761
- buf->memory_property_flags = req_flags;
762
773
  buf->ptr = nullptr;
763
774
 
764
- if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
775
+ if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
765
776
  buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
766
777
  }
767
778
 
@@ -778,9 +789,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
778
789
  return buf;
779
790
  }
780
791
 
781
- static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
792
+ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
782
793
  try {
783
- return ggml_vk_create_buffer(ctx, size, req_flags);
794
+ return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
784
795
  } catch (const vk::SystemError& e) {
785
796
  std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
786
797
  std::cerr << "ggml_vulkan: " << e.what() << std::endl;
@@ -791,16 +802,16 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
791
802
  static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
792
803
  vk_buffer buf;
793
804
  try {
794
- buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
795
- } catch (const vk::SystemError& e) {
796
805
  if (ctx->device.lock()->uma) {
797
806
  // Fall back to host memory type
798
- buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
807
+ buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
799
808
  } else {
800
- std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
801
- std::cerr << "ggml_vulkan: " << e.what() << std::endl;
802
- throw e;
809
+ buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
803
810
  }
811
+ } catch (const vk::SystemError& e) {
812
+ std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
813
+ std::cerr << "ggml_vulkan: " << e.what() << std::endl;
814
+ throw e;
804
815
  }
805
816
 
806
817
  return buf;
@@ -1080,6 +1091,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
1080
1091
  }
1081
1092
  }
1082
1093
 
1094
+ static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
1095
+ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
1096
+
1083
1097
  void ggml_vk_instance_init() {
1084
1098
  if (vk_instance_initialized) {
1085
1099
  return;
@@ -1089,28 +1103,48 @@ void ggml_vk_instance_init() {
1089
1103
  #endif
1090
1104
 
1091
1105
  vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
1092
- const std::vector<const char*> layers = {
1093
- #ifdef GGML_VULKAN_VALIDATE
1094
- "VK_LAYER_KHRONOS_validation",
1095
- #endif
1096
- };
1097
- const std::vector<const char*> extensions = {
1098
- #ifdef GGML_VULKAN_VALIDATE
1099
- "VK_EXT_validation_features",
1106
+
1107
+ const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
1108
+ const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
1109
+ #ifdef __APPLE__
1110
+ const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
1100
1111
  #endif
1101
- };
1102
- vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
1103
- #ifdef GGML_VULKAN_VALIDATE
1104
- const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
1105
- vk::ValidationFeaturesEXT validation_features = {
1106
- features_enable,
1107
- {},
1108
- };
1109
- validation_features.setPNext(nullptr);
1110
- instance_create_info.setPNext(&validation_features);
1111
1112
 
1112
- std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
1113
+ std::vector<const char*> layers;
1114
+
1115
+ if (validation_ext) {
1116
+ layers.push_back("VK_LAYER_KHRONOS_validation");
1117
+ }
1118
+ std::vector<const char*> extensions;
1119
+ if (validation_ext) {
1120
+ extensions.push_back("VK_EXT_validation_features");
1121
+ }
1122
+ #ifdef __APPLE__
1123
+ if (portability_enumeration_ext) {
1124
+ extensions.push_back("VK_KHR_portability_enumeration");
1125
+ }
1113
1126
  #endif
1127
+ vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
1128
+ #ifdef __APPLE__
1129
+ if (portability_enumeration_ext) {
1130
+ instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
1131
+ }
1132
+ #endif
1133
+
1134
+ std::vector<vk::ValidationFeatureEnableEXT> features_enable;
1135
+ vk::ValidationFeaturesEXT validation_features;
1136
+
1137
+ if (validation_ext) {
1138
+ features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
1139
+ validation_features = {
1140
+ features_enable,
1141
+ {},
1142
+ };
1143
+ validation_features.setPNext(nullptr);
1144
+ instance_create_info.setPNext(&validation_features);
1145
+
1146
+ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
1147
+ }
1114
1148
  vk_instance.instance = vk::createInstance(instance_create_info);
1115
1149
 
1116
1150
  memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
@@ -1139,7 +1173,7 @@ void ggml_vk_instance_init() {
1139
1173
  vk_instance_initialized = true;
1140
1174
  }
1141
1175
 
1142
- void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
1176
+ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
1143
1177
  GGML_ASSERT(idx < vk_instance.device_indices.size());
1144
1178
  size_t dev_num = vk_instance.device_indices[idx];
1145
1179
  #ifdef GGML_VULKAN_DEBUG
@@ -1157,12 +1191,12 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
1157
1191
  vk_instance.devices[idx] = std::make_shared<vk_device>();
1158
1192
  ctx->device = vk_instance.devices[idx];
1159
1193
  ctx->device.lock()->physical_device = devices[dev_num];
1160
- std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
1194
+ const std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
1161
1195
 
1162
1196
  bool maintenance4_support = false;
1163
1197
 
1164
1198
  // Check if maintenance4 is supported
1165
- for (auto properties : ext_props) {
1199
+ for (const auto& properties : ext_props) {
1166
1200
  if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
1167
1201
  maintenance4_support = true;
1168
1202
  }
@@ -1193,7 +1227,7 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
1193
1227
  bool fp16_storage = false;
1194
1228
  bool fp16_compute = false;
1195
1229
 
1196
- for (auto properties : ext_props) {
1230
+ for (const auto& properties : ext_props) {
1197
1231
  if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
1198
1232
  fp16_storage = true;
1199
1233
  } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
@@ -1422,7 +1456,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
1422
1456
  #ifdef GGML_VULKAN_DEBUG
1423
1457
  std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
1424
1458
  #endif
1425
- vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
1459
+ vk_buffer buf = ggml_vk_create_buffer(ctx, size,
1460
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
1461
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
1426
1462
 
1427
1463
  if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
1428
1464
  fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
@@ -1568,7 +1604,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
1568
1604
  static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
1569
1605
  if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
1570
1606
  ggml_vk_destroy_buffer(ctx->sync_staging);
1571
- ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
1607
+ ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
1608
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
1609
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
1572
1610
  }
1573
1611
  }
1574
1612
 
@@ -2288,8 +2326,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2288
2326
  src1_uma = d_Qy != nullptr;
2289
2327
  }
2290
2328
 
2291
- const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
2292
- const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
2329
+ const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
2330
+ const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
2293
2331
 
2294
2332
  const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
2295
2333
  const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@@ -2421,7 +2459,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2421
2459
  // compute
2422
2460
  ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT
2423
2461
 
2424
- if (dst->backend == GGML_BACKEND_CPU) {
2462
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
2425
2463
  // copy dst to host
2426
2464
  float * d = (float *) ((char *) dst->data);
2427
2465
  ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
@@ -2474,8 +2512,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
2474
2512
  src1_uma = d_Qy != nullptr;
2475
2513
  }
2476
2514
 
2477
- const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
2478
- const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
2515
+ const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
2516
+ const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
2479
2517
 
2480
2518
  const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
2481
2519
  const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@@ -2598,7 +2636,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
2598
2636
  ggml_vk_sync_buffers(subctx);
2599
2637
  ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
2600
2638
 
2601
- if (dst->backend == GGML_BACKEND_CPU) {
2639
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
2602
2640
  // copy dst to host
2603
2641
  float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
2604
2642
  ggml_vk_sync_buffers(subctx);
@@ -2615,7 +2653,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
2615
2653
  std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
2616
2654
  #endif
2617
2655
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
2618
- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
2656
+ GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
2619
2657
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
2620
2658
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
2621
2659
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -2647,7 +2685,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
2647
2685
  src1_uma = d_Qy != nullptr;
2648
2686
  }
2649
2687
 
2650
- const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
2688
+ const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
2651
2689
 
2652
2690
  const uint64_t x_ne = ne00 * ne01 * ne02;
2653
2691
  const uint64_t y_ne = ne10 * ne11 * ne12;
@@ -2689,7 +2727,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
2689
2727
  ggml_vk_sync_buffers(subctx);
2690
2728
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
2691
2729
 
2692
- if (dst->backend == GGML_BACKEND_CPU) {
2730
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
2693
2731
  // copy dst to host
2694
2732
  float * d = (float *) dst->data;
2695
2733
  ggml_vk_sync_buffers(subctx);
@@ -2706,7 +2744,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
2706
2744
  GGML_ASSERT(!ggml_is_transposed(src0));
2707
2745
  GGML_ASSERT(!ggml_is_transposed(src1));
2708
2746
  GGML_ASSERT(!ggml_is_permuted(src0));
2709
- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
2747
+ GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
2710
2748
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
2711
2749
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
2712
2750
 
@@ -2739,7 +2777,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
2739
2777
  src1_uma = d_Qy != nullptr;
2740
2778
  }
2741
2779
 
2742
- const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
2780
+ const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
2743
2781
 
2744
2782
  const uint64_t d_ne = ne01 * ne11 * ne12;
2745
2783
 
@@ -2782,7 +2820,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
2782
2820
  ggml_vk_sync_buffers(subctx);
2783
2821
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
2784
2822
 
2785
- if (dst->backend == GGML_BACKEND_CPU) {
2823
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
2786
2824
  // copy dst to host
2787
2825
  float * d = (float *) dst->data;
2788
2826
  ggml_vk_sync_buffers(subctx);
@@ -2800,7 +2838,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr
2800
2838
  return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
2801
2839
  (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
2802
2840
  dst->type == GGML_TYPE_F32 &&
2803
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU);
2841
+ ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
2804
2842
  }
2805
2843
 
2806
2844
  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
@@ -2848,8 +2886,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
2848
2886
  // TODO: support for transposed / permuted tensors
2849
2887
  GGML_ASSERT(nb0 == sizeof(float));
2850
2888
  GGML_ASSERT(nb00 == sizeof(float));
2851
- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
2852
- GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
2889
+ GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
2890
+ GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
2853
2891
 
2854
2892
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
2855
2893
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -3078,8 +3116,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3078
3116
  }
3079
3117
  }
3080
3118
 
3081
- const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma;
3082
- const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma;
3119
+ const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
3120
+ const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
3083
3121
 
3084
3122
  uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
3085
3123
  uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0;
@@ -3088,7 +3126,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3088
3126
  vk_buffer d_D = extra->buffer_gpu.lock();
3089
3127
 
3090
3128
  // Workaround for tiny tensor inputs on ROPE
3091
- if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) {
3129
+ if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
3092
3130
  y_sz = VK_WHOLE_SIZE;
3093
3131
  }
3094
3132
 
@@ -3177,9 +3215,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3177
3215
  ggml_vk_sync_buffers(subctx);
3178
3216
  ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
3179
3217
  }
3180
- if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) {
3218
+ if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
3181
3219
  ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
3182
- } else if(dst->backend == GGML_BACKEND_CPU) {
3220
+ } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
3183
3221
  // copy dst to host
3184
3222
  float * d = (float *) dst->data;
3185
3223
  ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
@@ -3221,7 +3259,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3221
3259
  ggml_vk_sync_buffers(subctx);
3222
3260
  ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
3223
3261
  }
3224
- if (dst->backend == GGML_BACKEND_CPU) {
3262
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
3225
3263
  // copy dst to host
3226
3264
  ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
3227
3265
  }
@@ -3327,7 +3365,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
3327
3365
 
3328
3366
  static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
3329
3367
  // If backend is CPU, data from src0 has to be copied off the device
3330
- if (dst->backend == GGML_BACKEND_CPU) {
3368
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
3331
3369
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3332
3370
  vk_buffer d_D = extra_src0->buffer_gpu.lock();
3333
3371
  ggml_vk_sync_buffers(subctx);
@@ -3962,9 +4000,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
3962
4000
  #ifdef GGML_VULKAN_DEBUG
3963
4001
  std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
3964
4002
  #endif
3965
- const bool any_on_device = node->backend == GGML_BACKEND_GPU
3966
- || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
3967
- || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU));
4003
+ const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
4004
+ || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
4005
+ || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
3968
4006
 
3969
4007
  if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) {
3970
4008
  return;
@@ -4082,7 +4120,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
4082
4120
  std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
4083
4121
  #endif
4084
4122
  #if defined(GGML_VULKAN_RUN_TESTS)
4085
- ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
4123
+ ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
4124
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached
4125
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
4086
4126
  ggml_vk_test_transfer(ctx, 8192 * 1000, false);
4087
4127
  ggml_vk_test_transfer(ctx, 8192 * 1000, true);
4088
4128
 
@@ -4174,14 +4214,16 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
4174
4214
  if (ctx->staging != nullptr) {
4175
4215
  ggml_vk_destroy_buffer(ctx->staging);
4176
4216
  }
4177
- ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
4217
+ ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
4218
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
4219
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
4178
4220
  }
4179
4221
  }
4180
4222
 
4181
4223
  static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
4182
- const bool any_on_device = node->backend == GGML_BACKEND_GPU
4183
- || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
4184
- || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU);
4224
+ const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
4225
+ || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
4226
+ || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
4185
4227
 
4186
4228
  if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
4187
4229
  return;
@@ -4335,7 +4377,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
4335
4377
  last_node = true;
4336
4378
  #endif
4337
4379
 
4338
- if (node->backend == GGML_BACKEND_CPU || last_node) {
4380
+ if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
4339
4381
  ggml_vk_ctx_end(ctx->compute_ctx);
4340
4382
  ctx->compute_ctx->exit_tensor = node;
4341
4383
  ctx->compute_ctx = nullptr;
@@ -4343,9 +4385,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
4343
4385
  }
4344
4386
 
4345
4387
  static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
4346
- const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
4347
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
4348
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
4388
+ const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
4389
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
4390
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
4349
4391
 
4350
4392
  if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) {
4351
4393
  return false;
@@ -4406,7 +4448,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
4406
4448
  if (params->ith != 0) {
4407
4449
  return true;
4408
4450
  }
4409
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
4451
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
4410
4452
  return true;
4411
4453
  }
4412
4454
 
@@ -4537,13 +4579,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
4537
4579
  }
4538
4580
  }
4539
4581
 
4540
- GGML_CALL int ggml_vk_get_device_count() {
4582
+ GGML_CALL static int ggml_vk_get_device_count() {
4541
4583
  ggml_vk_instance_init();
4542
4584
 
4543
4585
  return vk_instance.device_indices.size();
4544
4586
  }
4545
4587
 
4546
- GGML_CALL void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
4588
+ GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
4547
4589
  ggml_vk_instance_init();
4548
4590
 
4549
4591
  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -4561,7 +4603,7 @@ void ggml_vk_init_cpu_assist() {
4561
4603
 
4562
4604
  std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
4563
4605
 
4564
- for (size_t i = 0; i < ggml_vk_get_device_count(); i++) {
4606
+ for (int i = 0; i < ggml_vk_get_device_count(); i++) {
4565
4607
  ggml_vk_print_gpu_info(i);
4566
4608
  }
4567
4609
  // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
@@ -4709,7 +4751,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
4709
4751
  extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
4710
4752
  }
4711
4753
 
4712
- tensor->backend = GGML_BACKEND_GPU;
4754
+ tensor->backend = GGML_BACKEND_TYPE_GPU;
4713
4755
  tensor->extra = extra;
4714
4756
  }
4715
4757
 
@@ -4717,7 +4759,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
4717
4759
  #ifdef GGML_VULKAN_DEBUG
4718
4760
  std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
4719
4761
  #endif
4720
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
4762
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
4721
4763
 
4722
4764
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
4723
4765
 
@@ -4732,7 +4774,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
4732
4774
  #ifdef GGML_VULKAN_DEBUG
4733
4775
  std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
4734
4776
  #endif
4735
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
4777
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
4736
4778
 
4737
4779
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
4738
4780
 
@@ -4963,7 +5005,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
4963
5005
  #endif
4964
5006
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
4965
5007
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
4966
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
5008
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
4967
5009
 
4968
5010
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
4969
5011
 
@@ -4984,7 +5026,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
4984
5026
  #endif
4985
5027
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
4986
5028
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
4987
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
5029
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
4988
5030
 
4989
5031
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
4990
5032
 
@@ -5061,7 +5103,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
5061
5103
  int last_node = cgraph->n_nodes - 1;
5062
5104
 
5063
5105
  // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
5064
- while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
5106
+ while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) {
5065
5107
  last_node -= 1;
5066
5108
  }
5067
5109
 
@@ -5070,7 +5112,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
5070
5112
  }
5071
5113
 
5072
5114
  ggml_compute_params params = {};
5073
- params.type = GGML_TASK_COMPUTE;
5115
+ params.type = GGML_TASK_TYPE_COMPUTE;
5074
5116
  params.ith = 0;
5075
5117
  for (int i = 0; i < cgraph->n_nodes; i++) {
5076
5118
  ggml_tensor * node = cgraph->nodes[i];
@@ -5208,6 +5250,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
5208
5250
  /* .supports_op = */ ggml_backend_vk_supports_op,
5209
5251
  };
5210
5252
 
5253
+ static ggml_guid_t ggml_backend_vk_guid() {
5254
+ static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
5255
+ return &guid;
5256
+ }
5257
+
5211
5258
  GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
5212
5259
  if (vk_instance.initialized[idx]) {
5213
5260
  return vk_instance.backends[idx];
@@ -5226,6 +5273,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
5226
5273
  vk_instance.initialized[idx] = true;
5227
5274
 
5228
5275
  ggml_backend_t vk_backend = new ggml_backend {
5276
+ /* .guid = */ ggml_backend_vk_guid(),
5229
5277
  /* .interface = */ ggml_backend_vk_interface,
5230
5278
  /* .context = */ &vk_instance.contexts[ctx->idx],
5231
5279
  };
@@ -5236,7 +5284,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
5236
5284
  }
5237
5285
 
5238
5286
  GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) {
5239
- return backend && backend->iface.get_name == ggml_backend_vk_name;
5287
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
5240
5288
  }
5241
5289
 
5242
5290
  GGML_CALL int ggml_backend_vk_get_device_count() {
@@ -5248,7 +5296,7 @@ GGML_CALL void ggml_backend_vk_get_device_description(int device, char * descrip
5248
5296
  }
5249
5297
 
5250
5298
  GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
5251
- GGML_ASSERT(device < vk_instance.device_indices.size());
5299
+ GGML_ASSERT(device < (int) vk_instance.device_indices.size());
5252
5300
 
5253
5301
  vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
5254
5302
 
@@ -5282,6 +5330,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
5282
5330
  return vk_instance.device_indices.size();
5283
5331
  }
5284
5332
 
5333
+ // Extension availability
5334
+ static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
5335
+ #ifdef GGML_VULKAN_VALIDATE
5336
+ bool portability_enumeration_ext = false;
5337
+ // Check for portability enumeration extension for MoltenVK support
5338
+ for (const auto& properties : instance_extensions) {
5339
+ if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
5340
+ return true;
5341
+ }
5342
+ }
5343
+ if (!portability_enumeration_ext) {
5344
+ std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
5345
+ }
5346
+ #endif
5347
+ return false;
5348
+
5349
+ UNUSED(instance_extensions);
5350
+ }
5351
+ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
5352
+ #ifdef __APPLE__
5353
+ bool portability_enumeration_ext = false;
5354
+ // Check for portability enumeration extension for MoltenVK support
5355
+ for (const auto& properties : instance_extensions) {
5356
+ if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
5357
+ return true;
5358
+ }
5359
+ }
5360
+ if (!portability_enumeration_ext) {
5361
+ std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
5362
+ }
5363
+ #endif
5364
+ return false;
5365
+
5366
+ UNUSED(instance_extensions);
5367
+ }
5368
+
5285
5369
  // checks
5286
5370
 
5287
5371
  #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5338,7 +5422,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
5338
5422
  static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
5339
5423
  void * tensor_data = tensor->data;
5340
5424
 
5341
- if (tensor->backend == GGML_BACKEND_GPU) {
5425
+ if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
5342
5426
  const size_t tensor_size = ggml_nbytes(tensor);
5343
5427
  tensor_data = malloc(tensor_size);
5344
5428
 
@@ -5364,14 +5448,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
5364
5448
  std::vector<const ggml_tensor *> done;
5365
5449
  ggml_vk_print_graph_origin(tensor, done);
5366
5450
 
5367
- if (tensor->backend == GGML_BACKEND_GPU) {
5451
+ if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
5368
5452
  free(tensor_data);
5369
5453
  }
5370
5454
  }
5371
5455
 
5372
5456
  static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
5373
5457
  return;
5374
- GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU);
5458
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
5375
5459
  if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
5376
5460
  return;
5377
5461
  }
@@ -5409,7 +5493,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
5409
5493
  if (params->ith != 0) {
5410
5494
  return;
5411
5495
  }
5412
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
5496
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
5413
5497
  return;
5414
5498
  }
5415
5499
 
@@ -5446,10 +5530,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
5446
5530
 
5447
5531
  src0_buffer = malloc(src0_size);
5448
5532
  src0_clone->data = src0_buffer;
5449
- if (src0->backend == GGML_BACKEND_CPU) {
5533
+ if (src0->backend == GGML_BACKEND_TYPE_CPU) {
5450
5534
  memcpy(src0_clone->data, src0->data, src0_size);
5451
5535
  memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
5452
- } else if (src0->backend == GGML_BACKEND_GPU) {
5536
+ } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
5453
5537
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
5454
5538
  uint64_t offset = extra->offset;
5455
5539
  if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
@@ -5489,10 +5573,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
5489
5573
 
5490
5574
  src1_buffer = malloc(src1_size);
5491
5575
  src1_clone->data = src1_buffer;
5492
- if (src1->backend == GGML_BACKEND_CPU) {
5576
+ if (src1->backend == GGML_BACKEND_TYPE_CPU) {
5493
5577
  memcpy(src1_clone->data, src1->data, src1_size);
5494
5578
  memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
5495
- } else if (src1->backend == GGML_BACKEND_GPU) {
5579
+ } else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
5496
5580
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
5497
5581
  uint64_t offset = extra->offset;
5498
5582
  if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
@@ -5651,7 +5735,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
5651
5735
  if (params->ith != 0) {
5652
5736
  return;
5653
5737
  }
5654
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
5738
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
5655
5739
  return;
5656
5740
  }
5657
5741
  if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
@@ -5663,7 +5747,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
5663
5747
 
5664
5748
  void * tensor_data = tensor->data;
5665
5749
 
5666
- if (tensor->backend == GGML_BACKEND_GPU) {
5750
+ if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
5667
5751
  size_t tensor_size = ggml_nbytes(tensor);
5668
5752
  tensor_data = malloc(tensor_size);
5669
5753
 
@@ -5796,7 +5880,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
5796
5880
  comp_result = nullptr;
5797
5881
  comp_size = 0;
5798
5882
 
5799
- if (tensor->backend == GGML_BACKEND_GPU) {
5883
+ if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
5800
5884
  free(tensor_data);
5801
5885
  }
5802
5886
  }