llama_cpp 0.12.5 → 0.12.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,6 +27,7 @@
 #define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
 
 #define VK_VENDOR_ID_AMD 0x1002
+#define VK_VENDOR_ID_APPLE 0x106b
 #define VK_VENDOR_ID_INTEL 0x8086
 #define VK_VENDOR_ID_NVIDIA 0x10de
 
@@ -706,9 +707,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
     q.cmd_buffer_idx = 0;
 }
 
-static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
+    for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
+        vk::MemoryType memory_type = mem_props->memoryTypes[i];
+        if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
+            (flags & memory_type.propertyFlags) == flags &&
+            mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
+            return static_cast<int32_t>(i);
+        }
+    }
+    return UINT32_MAX;
+}
+
+static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl;
+    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
 #endif
     vk_buffer buf = std::make_shared<vk_buffer_struct>();
 
@@ -735,15 +748,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 
     uint32_t memory_type_index = UINT32_MAX;
 
-    for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
-        vk::MemoryType memory_type = mem_props.memoryTypes[i];
-        if ((mem_req.memoryTypeBits & ((uint64_t)1 << i)) && (req_flags & memory_type.propertyFlags) == req_flags && mem_props.memoryHeaps[memory_type.heapIndex].size >= mem_req.size) {
-            memory_type_index = i;
-            break;
-        }
+    memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
+    buf->memory_property_flags = req_flags;
+
+    if (memory_type_index == UINT32_MAX && fallback_flags) {
+        memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
+        buf->memory_property_flags = fallback_flags;
     }
 
-    if (memory_type_index >= mem_props.memoryTypeCount) {
+    if (memory_type_index == UINT32_MAX) {
         ctx->device.lock()->device.destroyBuffer(buf->buffer);
         buf->size = 0;
         throw vk::OutOfDeviceMemoryError("No suitable memory type found");
@@ -757,10 +770,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
         buf->size = 0;
         throw e;
     }
-    buf->memory_property_flags = req_flags;
     buf->ptr = nullptr;
 
-    if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+    if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
         buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
     }
 
@@ -777,9 +789,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
     return buf;
 }
 
-static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
     try {
-        return ggml_vk_create_buffer(ctx, size, req_flags);
+        return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
     } catch (const vk::SystemError& e) {
         std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
         std::cerr << "ggml_vulkan: " << e.what() << std::endl;
@@ -790,16 +802,16 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
 static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
     vk_buffer buf;
     try {
-        buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    } catch (const vk::SystemError& e) {
         if (ctx->device.lock()->uma) {
             // Fall back to host memory type
-            buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
         } else {
-            std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
-            std::cerr << "ggml_vulkan: " << e.what() << std::endl;
-            throw e;
+            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
         }
+    } catch (const vk::SystemError& e) {
+        std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
+        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
+        throw e;
     }
 
     return buf;
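
The hunks above pull the memory-type search out into find_properties() and give ggml_vk_create_buffer() an optional fallback_flags set: when no memory type satisfies the preferred req_flags, the search runs once more with the weaker set before the allocation is abandoned, and buf->memory_property_flags records whichever set actually won. A minimal standalone sketch of that selection logic, with plain structs standing in for the Vulkan-Hpp types (all names here are illustrative, nothing is from the package):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct MemoryType  { uint32_t property_flags; uint32_t heap_index; };
    struct MemoryHeap  { uint64_t size; };
    struct MemoryProps { std::vector<MemoryType> types; std::vector<MemoryHeap> heaps; };
    struct MemoryReq   { uint32_t memory_type_bits; uint64_t size; };

    // Same rule as find_properties(): the type must be allowed by the
    // requirement mask, contain every requested flag, and sit on a heap
    // large enough for the allocation.
    static uint32_t find_mem_type(const MemoryProps& props, const MemoryReq& req, uint32_t flags) {
        for (uint32_t i = 0; i < props.types.size(); ++i) {
            if ((req.memory_type_bits & (1u << i)) &&
                (flags & props.types[i].property_flags) == flags &&
                props.heaps[props.types[i].heap_index].size >= req.size) {
                return i;
            }
        }
        return UINT32_MAX;
    }

    int main() {
        const uint32_t HOST_VISIBLE = 1u, HOST_COHERENT = 2u, HOST_CACHED = 4u;
        // A single memory type that is visible and coherent but not cached.
        MemoryProps props{ { { HOST_VISIBLE | HOST_COHERENT, 0 } }, { { 1ull << 30 } } };
        MemoryReq   req{ 0x1u, 64u * 1024u };

        // Preferred flags first, then the weaker fallback set.
        uint32_t idx = find_mem_type(props, req, HOST_VISIBLE | HOST_COHERENT | HOST_CACHED);
        if (idx == UINT32_MAX) {
            idx = find_mem_type(props, req, HOST_VISIBLE | HOST_COHERENT);
        }
        std::printf("chosen memory type: %u\n", idx); // the fallback hits type 0
        return idx == UINT32_MAX ? 1 : 0;
    }

The same preferred-then-fallback shape is what ggml_vk_create_buffer_device() now uses on UMA devices: eDeviceLocal with a host-visible fallback inside one try block, instead of the old catch-and-retry.
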
@@ -1079,6 +1091,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     }
 }
 
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+
 void ggml_vk_instance_init() {
     if (vk_instance_initialized) {
         return;
@@ -1088,28 +1103,42 @@ void ggml_vk_instance_init() {
 #endif
 
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
-    const std::vector<const char*> layers = {
-#ifdef GGML_VULKAN_VALIDATE
-        "VK_LAYER_KHRONOS_validation",
-#endif
-    };
-    const std::vector<const char*> extensions = {
-#ifdef GGML_VULKAN_VALIDATE
-        "VK_EXT_validation_features",
-#endif
-    };
-    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
-#ifdef GGML_VULKAN_VALIDATE
-    const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
-    vk::ValidationFeaturesEXT validation_features = {
-        features_enable,
-        {},
-    };
-    validation_features.setPNext(nullptr);
-    instance_create_info.setPNext(&validation_features);
 
-    std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
-#endif
+    const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
+    const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
+    const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
+
+    std::vector<const char*> layers;
+
+    if (validation_ext) {
+        layers.push_back("VK_LAYER_KHRONOS_validation");
+    }
+    std::vector<const char*> extensions;
+    if (validation_ext) {
+        extensions.push_back("VK_EXT_validation_features");
+    }
+    if (portability_enumeration_ext) {
+        extensions.push_back("VK_KHR_portability_enumeration");
+    }
+    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
+    if (portability_enumeration_ext) {
+        instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
+    }
+
+    std::vector<vk::ValidationFeatureEnableEXT> features_enable;
+    vk::ValidationFeaturesEXT validation_features;
+
+    if (validation_ext) {
+        features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
+        validation_features = {
+            features_enable,
+            {},
+        };
+        validation_features.setPNext(nullptr);
+        instance_create_info.setPNext(&validation_features);
+
+        std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
+    }
     vk_instance.instance = vk::createInstance(instance_create_info);
 
     memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
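
The rewritten ggml_vk_instance_init() above probes the loader's instance extensions first and only requests validation layers and VK_KHR_portability_enumeration (needed for MoltenVK on macOS) when they are actually available, instead of enabling them unconditionally at compile time. A self-contained sketch of that probe-then-enable pattern, assuming a Vulkan SDK recent enough to define vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR (error handling omitted):

    #include <cstring>
    #include <iostream>
    #include <vector>
    #include <vulkan/vulkan.hpp>

    int main() {
        // Ask the loader what it offers before requesting anything.
        const std::vector<vk::ExtensionProperties> exts = vk::enumerateInstanceExtensionProperties();
        bool portability = false;
        for (const auto& e : exts) {
            if (std::strcmp("VK_KHR_portability_enumeration", e.extensionName) == 0) {
                portability = true;
            }
        }

        vk::ApplicationInfo app_info{ "ext-probe", 1, nullptr, 0, VK_API_VERSION_1_1 };
        std::vector<const char*> layers;     // none requested in this sketch
        std::vector<const char*> extensions;
        if (portability) {
            extensions.push_back("VK_KHR_portability_enumeration");
        }
        vk::InstanceCreateInfo info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
        if (portability) {
            // Also required so MoltenVK's non-conformant device is enumerated.
            info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
        }

        vk::Instance instance = vk::createInstance(info);
        std::cout << "portability enumeration: " << (portability ? "on" : "off") << std::endl;
        instance.destroy();
        return 0;
    }
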
@@ -1138,7 +1167,7 @@ void ggml_vk_instance_init() {
     vk_instance_initialized = true;
 }
 
-void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
+static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
 #ifdef GGML_VULKAN_DEBUG
@@ -1156,12 +1185,12 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     vk_instance.devices[idx] = std::make_shared<vk_device>();
     ctx->device = vk_instance.devices[idx];
     ctx->device.lock()->physical_device = devices[dev_num];
-    std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
+    const std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
 
     bool maintenance4_support = false;
 
     // Check if maintenance4 is supported
-    for (auto properties : ext_props) {
+    for (const auto& properties : ext_props) {
         if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
             maintenance4_support = true;
         }
@@ -1192,7 +1221,7 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     bool fp16_storage = false;
     bool fp16_compute = false;
 
-    for (auto properties : ext_props) {
+    for (const auto& properties : ext_props) {
         if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
             fp16_storage = true;
         } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
@@ -1421,7 +1450,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
 #endif
-    vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+    vk_buffer buf = ggml_vk_create_buffer(ctx, size,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
 
     if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
@@ -1567,7 +1598,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
     if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
         ggml_vk_destroy_buffer(ctx->sync_staging);
-        ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+        ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     }
 }
 
@@ -2034,18 +2067,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
     return ctx->pipeline_matmul_f32_aligned_l.align;
 }
 
+static vk_pipeline* ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
+    if (bit16_x && bit16_y) {
+        if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+            std::cerr << " S" << std::endl;
+#endif
+            return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
+        }
+#ifdef GGML_VULKAN_DEBUG
+        std::cerr << " M" << std::endl;
+#endif
+        return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
+    }
+    if (bit16_x && !bit16_y) {
+        if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+            std::cerr << " S" << std::endl;
+#endif
+            return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
+        }
+#ifdef GGML_VULKAN_DEBUG
+        std::cerr << " M" << std::endl;
+#endif
+        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
+    }
+    if (!bit16_x && bit16_y) {
+        GGML_ASSERT(false);
+    }
+
+    if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+        std::cerr << " S" << std::endl;
+#endif
+        return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
+    }
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << " M" << std::endl;
+#endif
+    return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
+}
+
+static vk_pipeline* ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << " M" << std::endl;
+#endif
+    if (bit16_x && bit16_y) {
+        return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
+    }
+    if (bit16_x && !bit16_y) {
+        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
+    }
+    if (!bit16_x && bit16_y) {
+        GGML_ASSERT(false);
+    }
+    return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
+}
+
+static vk_pipeline* ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << " S" << std::endl;
+#endif
+    if (bit16_x && bit16_y) {
+        return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
+    }
+    if (bit16_x && !bit16_y) {
+        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
+    }
+    if (!bit16_x && bit16_y) {
+        GGML_ASSERT(false);
+    }
+    return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
+}
+
 static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
 #endif
+    switch (ctx->device.lock()->vendor_id) {
+    case VK_VENDOR_ID_AMD:
+        return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned);
+    case VK_VENDOR_ID_APPLE:
+        return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned);
+    case VK_VENDOR_ID_INTEL:
+        return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned);
+    }
+
     if (bit16_x && bit16_y) {
-        if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+        if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
             std::cerr << " S" << std::endl;
 #endif
             return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
         }
-        if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+        if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
             std::cerr << " M" << std::endl;
 #endif
@@ -2057,13 +2172,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
         return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
     }
     if (bit16_x && !bit16_y) {
-        if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+        if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
             std::cerr << " S" << std::endl;
 #endif
             return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
         }
-        if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+        if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
             std::cerr << " M" << std::endl;
 #endif
@@ -2078,13 +2193,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
         GGML_ASSERT(false);
     }
 
-    if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+    if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
         std::cerr << " S" << std::endl;
 #endif
         return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
     }
-    if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+    if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
         std::cerr << " M" << std::endl;
 #endif
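
The three hunks above move the vendor-specific choices out of the generic tile ladder: AMD, Apple, and Intel now get dedicated pickers (AMD keeps the m/n heuristic but never takes the large tile, Apple always takes the medium one, Intel always the small one), and the old VK_VENDOR_ID_INTEL and subgroup_size == 64 special cases disappear from the default path. A runnable distillation of the dispatch, with pipeline handles reduced to tile-size labels (the heuristics are copied from the diff; everything else is illustrative):

    #include <cstdint>
    #include <cstdio>

    #define VK_VENDOR_ID_AMD    0x1002
    #define VK_VENDOR_ID_APPLE  0x106b
    #define VK_VENDOR_ID_INTEL  0x8086

    static const char* guess_tile(uint32_t vendor_id, int m, int n) {
        switch (vendor_id) {
        case VK_VENDOR_ID_AMD:
            return (m <= 32 || n <= 32) ? "S" : "M"; // per-size, but never L
        case VK_VENDOR_ID_APPLE:
            return "M";                              // always medium
        case VK_VENDOR_ID_INTEL:
            return "S";                              // always small
        }
        // Other vendors keep the full S/M/L ladder.
        if (m <= 32 || n <= 32) return "S";
        if (m <= 64 || n <= 64) return "M";
        return "L";
    }

    int main() {
        std::printf("AMD    512x512 -> %s\n", guess_tile(VK_VENDOR_ID_AMD,   512, 512));
        std::printf("Apple  512x512 -> %s\n", guess_tile(VK_VENDOR_ID_APPLE, 512, 512));
        std::printf("Intel  512x512 -> %s\n", guess_tile(VK_VENDOR_ID_INTEL, 512, 512));
        std::printf("NVIDIA 512x512 -> %s\n", guess_tile(0x10de,             512, 512));
        return 0;
    }
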
@@ -3999,7 +4114,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
 #endif
 #if defined(GGML_VULKAN_RUN_TESTS)
-    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     ggml_vk_test_transfer(ctx, 8192 * 1000, false);
     ggml_vk_test_transfer(ctx, 8192 * 1000, true);
 
@@ -4091,7 +4208,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         if (ctx->staging != nullptr) {
             ggml_vk_destroy_buffer(ctx->staging);
         }
-        ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+        ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     }
 }
 
@@ -4454,13 +4573,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     }
 }
 
-GGML_CALL int ggml_vk_get_device_count() {
+GGML_CALL static int ggml_vk_get_device_count() {
     ggml_vk_instance_init();
 
     return vk_instance.device_indices.size();
 }
 
-GGML_CALL void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
     ggml_vk_instance_init();
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -4478,7 +4597,7 @@ void ggml_vk_init_cpu_assist() {
 
     std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
 
-    for (size_t i = 0; i < ggml_vk_get_device_count(); i++) {
+    for (int i = 0; i < ggml_vk_get_device_count(); i++) {
         ggml_vk_print_gpu_info(i);
     }
     // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
@@ -5165,7 +5284,7 @@ GGML_CALL void ggml_backend_vk_get_device_description(int device, char * descrip
 }
 
 GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
-    GGML_ASSERT(device < vk_instance.device_indices.size());
+    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
 
     vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
 
@@ -5199,6 +5318,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
     return vk_instance.device_indices.size();
 }
 
+// Extension availability
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
+#ifdef GGML_VULKAN_VALIDATE
+    bool portability_enumeration_ext = false;
+    // Check for portability enumeration extension for MoltenVK support
+    for (const auto& properties : instance_extensions) {
+        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
+            return true;
+        }
+    }
+    if (!portability_enumeration_ext) {
+        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
+    }
+#endif
+    return false;
+
+    UNUSED(instance_extensions);
+}
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
+#ifdef __APPLE__
+    bool portability_enumeration_ext = false;
+    // Check for portability enumeration extension for MoltenVK support
+    for (const auto& properties : instance_extensions) {
+        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
+            return true;
+        }
+    }
+    if (!portability_enumeration_ext) {
+        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
+    }
+#endif
+    return false;
+
+    UNUSED(instance_extensions);
+}
+
 // checks
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
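
The two helpers added above back the probing in ggml_vk_instance_init(): each scans the enumerated instance extensions and returns true on a match, otherwise warns and returns false (the local portability_enumeration_ext flag is never set, so its guard is effectively redundant). Note that as shipped, the validation helper also searches for "VK_KHR_portability_enumeration" and prints the portability warning, which reads like a copy-paste leftover; one would expect it to look for a validation-related name. A generic, standalone form of the intended pattern (requires the Vulkan SDK; the helper name is illustrative):

    #include <cstring>
    #include <iostream>
    #include <vector>
    #include <vulkan/vulkan.hpp>

    // Return whether the named instance extension is offered by the loader,
    // warning when it is not.
    static bool instance_ext_available(const std::vector<vk::ExtensionProperties>& exts,
                                       const char* name) {
        for (const auto& props : exts) {
            if (std::strcmp(name, props.extensionName) == 0) {
                return true;
            }
        }
        std::cerr << "warning: instance extension " << name << " not found" << std::endl;
        return false;
    }

    int main() {
        const std::vector<vk::ExtensionProperties> exts = vk::enumerateInstanceExtensionProperties();
        instance_ext_available(exts, "VK_KHR_portability_enumeration");
        instance_ext_available(exts, "VK_EXT_validation_features");
        return 0;
    }
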