llama_cpp 0.12.6 → 0.12.7

@@ -707,9 +707,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
     q.cmd_buffer_idx = 0;
 }
 
-static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
+    for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
+        vk::MemoryType memory_type = mem_props->memoryTypes[i];
+        if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
+            (flags & memory_type.propertyFlags) == flags &&
+            mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
+            return static_cast<int32_t>(i);
+        }
+    }
+    return UINT32_MAX;
+}
+
+static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl;
+    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
 #endif
     vk_buffer buf = std::make_shared<vk_buffer_struct>();
 
@@ -736,15 +748,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 
     uint32_t memory_type_index = UINT32_MAX;
 
-    for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
-        vk::MemoryType memory_type = mem_props.memoryTypes[i];
-        if ((mem_req.memoryTypeBits & ((uint64_t)1 << i)) && (req_flags & memory_type.propertyFlags) == req_flags && mem_props.memoryHeaps[memory_type.heapIndex].size >= mem_req.size) {
-            memory_type_index = i;
-            break;
-        }
+    memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
+    buf->memory_property_flags = req_flags;
+
+    if (memory_type_index == UINT32_MAX && fallback_flags) {
+        memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
+        buf->memory_property_flags = fallback_flags;
     }
 
-    if (memory_type_index >= mem_props.memoryTypeCount) {
+    if (memory_type_index == UINT32_MAX) {
         ctx->device.lock()->device.destroyBuffer(buf->buffer);
         buf->size = 0;
         throw vk::OutOfDeviceMemoryError("No suitable memory type found");
@@ -758,10 +770,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
         buf->size = 0;
         throw e;
     }
-    buf->memory_property_flags = req_flags;
     buf->ptr = nullptr;
 
-    if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+    if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
         buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
     }
 
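The hunks above factor the first-fit memory-type scan into find_properties and give ggml_vk_create_buffer an optional fallback_flags set: the preferred flags are tried first, and buf->memory_property_flags records whichever set actually matched. A minimal standalone sketch of that selection policy, using plain structs in place of the Vulkan types (all names here are hypothetical):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct MemType { uint32_t property_flags; uint32_t heap_index; };
    struct MemHeap { uint64_t size; };

    // First type whose bit is set in type_bits, whose flags contain all of
    // `wanted`, and whose backing heap can hold the allocation; UINT32_MAX if none.
    static uint32_t pick_type(const std::vector<MemType>& types, const std::vector<MemHeap>& heaps,
                              uint32_t type_bits, uint32_t wanted, uint64_t alloc_size) {
        for (uint32_t i = 0; i < types.size(); ++i) {
            if ((type_bits & (1u << i)) &&
                (types[i].property_flags & wanted) == wanted &&
                heaps[types[i].heap_index].size >= alloc_size) {
                return i;
            }
        }
        return UINT32_MAX;
    }

    int main() {
        const uint32_t DEVICE_LOCAL = 1, HOST_VISIBLE = 2, HOST_COHERENT = 4;
        std::vector<MemHeap> heaps = { { 8ull << 30 }, { 16ull << 30 } };
        std::vector<MemType> types = { { DEVICE_LOCAL, 0 }, { HOST_VISIBLE | HOST_COHERENT, 1 } };
        // Try the preferred flags first, then the fallback set, as the patch does.
        uint32_t idx = pick_type(types, heaps, 0b11, DEVICE_LOCAL | HOST_VISIBLE, 1ull << 20);
        if (idx == UINT32_MAX) {
            idx = pick_type(types, heaps, 0b11, HOST_VISIBLE | HOST_COHERENT, 1ull << 20);
        }
        std::printf("chose memory type %u\n", idx); // prints 1: the fallback matched
        return 0;
    }
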
@@ -778,9 +789,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
     return buf;
 }
 
-static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
     try {
-        return ggml_vk_create_buffer(ctx, size, req_flags);
+        return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
     } catch (const vk::SystemError& e) {
         std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
         std::cerr << "ggml_vulkan: " << e.what() << std::endl;
@@ -791,16 +802,16 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
 static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
     vk_buffer buf;
     try {
-        buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    } catch (const vk::SystemError& e) {
         if (ctx->device.lock()->uma) {
             // Fall back to host memory type
-            buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
         } else {
-            std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
-            std::cerr << "ggml_vulkan: " << e.what() << std::endl;
-            throw e;
+            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
         }
+    } catch (const vk::SystemError& e) {
+        std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
+        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
+        throw e;
     }
 
     return buf;
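ggml_vk_create_buffer_device now decides up front whether to request the host-visible fallback (UMA devices) and wraps both branches in a single try block, so a failure on either path is logged once and rethrown. A standalone sketch of that control flow with mock types in place of the Vulkan ones (all names hypothetical):

    #include <iostream>
    #include <memory>
    #include <stdexcept>

    struct MockBuffer { bool host_visible = false; };
    using buffer_ptr = std::shared_ptr<MockBuffer>;

    // Stand-in for ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags).
    static buffer_ptr create_buffer(bool device_local_ok, bool fallback_to_host) {
        auto buf = std::make_shared<MockBuffer>();
        if (device_local_ok) { return buf; }
        if (fallback_to_host) { buf->host_visible = true; return buf; }
        throw std::runtime_error("No suitable memory type found");
    }

    static buffer_ptr create_buffer_device(bool uma, bool device_local_ok) {
        try {
            // UMA: device-local preferred, host-visible fallback. Discrete: no fallback.
            return uma ? create_buffer(device_local_ok, true)
                       : create_buffer(device_local_ok, false);
        } catch (const std::exception& e) {
            // One catch now covers both branches, as in the patched function.
            std::cerr << "ggml_vulkan: Device memory allocation failed: " << e.what() << '\n';
            throw;
        }
    }

    int main() {
        auto buf = create_buffer_device(/*uma=*/true, /*device_local_ok=*/false);
        std::cout << "UMA fallback used host-visible memory: " << buf->host_visible << '\n';
        return 0;
    }
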
@@ -1080,6 +1091,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     }
 }
 
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+
 void ggml_vk_instance_init() {
     if (vk_instance_initialized) {
         return;
@@ -1089,28 +1103,42 @@ void ggml_vk_instance_init() {
 #endif
 
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
-    const std::vector<const char*> layers = {
-#ifdef GGML_VULKAN_VALIDATE
-        "VK_LAYER_KHRONOS_validation",
-#endif
-    };
-    const std::vector<const char*> extensions = {
-#ifdef GGML_VULKAN_VALIDATE
-        "VK_EXT_validation_features",
-#endif
-    };
-    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
-#ifdef GGML_VULKAN_VALIDATE
-    const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
-    vk::ValidationFeaturesEXT validation_features = {
-        features_enable,
-        {},
-    };
-    validation_features.setPNext(nullptr);
-    instance_create_info.setPNext(&validation_features);
 
-    std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
-#endif
+    const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
+    const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
+    const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
+
+    std::vector<const char*> layers;
+
+    if (validation_ext) {
+        layers.push_back("VK_LAYER_KHRONOS_validation");
+    }
+    std::vector<const char*> extensions;
+    if (validation_ext) {
+        extensions.push_back("VK_EXT_validation_features");
+    }
+    if (portability_enumeration_ext) {
+        extensions.push_back("VK_KHR_portability_enumeration");
+    }
+    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
+    if (portability_enumeration_ext) {
+        instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
+    }
+
+    std::vector<vk::ValidationFeatureEnableEXT> features_enable;
+    vk::ValidationFeaturesEXT validation_features;
+
+    if (validation_ext) {
+        features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
+        validation_features = {
+            features_enable,
+            {},
+        };
+        validation_features.setPNext(nullptr);
+        instance_create_info.setPNext(&validation_features);
+
+        std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
+    }
+
     vk_instance.instance = vk::createInstance(instance_create_info);
 
     memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
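Instance setup now probes the loader at runtime instead of hard-wiring layers and extensions behind #ifdefs; when VK_KHR_portability_enumeration is present (MoltenVK), it also sets the eEnumeratePortabilityKHR flag that the extension requires. A hedged sketch of the same probe-then-enable pattern against vulkan.hpp (error handling omitted; has_instance_ext is a hypothetical helper, not part of the patch):

    #include <cstring>
    #include <iostream>
    #include <vector>
    #include <vulkan/vulkan.hpp>

    static bool has_instance_ext(const std::vector<vk::ExtensionProperties>& exts, const char* name) {
        for (const auto& p : exts) {
            if (std::strcmp(name, p.extensionName) == 0) { return true; }
        }
        return false;
    }

    int main() {
        const auto exts = vk::enumerateInstanceExtensionProperties();
        std::vector<const char*> enabled;
        vk::InstanceCreateInfo info{};
        if (has_instance_ext(exts, "VK_KHR_portability_enumeration")) {
            enabled.push_back("VK_KHR_portability_enumeration");
            // Enabling the extension without this flag is an error on MoltenVK.
            info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
        }
        info.setPEnabledExtensionNames(enabled);
        vk::Instance instance = vk::createInstance(info);
        std::cout << "created instance with " << enabled.size() << " optional extension(s)\n";
        instance.destroy();
        return 0;
    }
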
@@ -1139,7 +1167,7 @@ void ggml_vk_instance_init() {
     vk_instance_initialized = true;
 }
 
-void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
+static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
 #ifdef GGML_VULKAN_DEBUG
@@ -1157,12 +1185,12 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     vk_instance.devices[idx] = std::make_shared<vk_device>();
     ctx->device = vk_instance.devices[idx];
     ctx->device.lock()->physical_device = devices[dev_num];
-    std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
+    const std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
 
     bool maintenance4_support = false;
 
     // Check if maintenance4 is supported
-    for (auto properties : ext_props) {
+    for (const auto& properties : ext_props) {
         if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
             maintenance4_support = true;
         }
@@ -1193,7 +1221,7 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     bool fp16_storage = false;
     bool fp16_compute = false;
 
-    for (auto properties : ext_props) {
+    for (const auto& properties : ext_props) {
         if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
             fp16_storage = true;
         } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
@@ -1422,7 +1450,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
 #endif
-    vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+    vk_buffer buf = ggml_vk_create_buffer(ctx, size,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
 
     if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
@@ -1568,7 +1598,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
     if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
         ggml_vk_destroy_buffer(ctx->sync_staging);
-        ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+        ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     }
 }
 
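The pinned-host and staging allocations in this release all pass the same pair of flag sets: prefer host-cached memory (faster CPU-side reads) and fall back to plain host-coherent when no cached type exists. The subset test find_properties applies to these sets is worth spelling out; a header-only check, assuming nothing beyond vulkan.hpp:

    #include <iostream>
    #include <vulkan/vulkan.hpp>

    int main() {
        const vk::MemoryPropertyFlags preferred =
            vk::MemoryPropertyFlagBits::eHostVisible |
            vk::MemoryPropertyFlagBits::eHostCoherent |
            vk::MemoryPropertyFlagBits::eHostCached;
        const vk::MemoryPropertyFlags fallback =
            vk::MemoryPropertyFlagBits::eHostVisible |
            vk::MemoryPropertyFlagBits::eHostCoherent;
        // find_properties accepts a type when (flags & type.propertyFlags) == flags,
        // i.e. every requested bit is present; the fallback set is a strict subset.
        std::cout << std::boolalpha
                  << ((preferred & fallback) == fallback) << '\n'; // true
        return 0;
    }
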
@@ -4082,7 +4114,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
 #endif
 #if defined(GGML_VULKAN_RUN_TESTS)
-    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     ggml_vk_test_transfer(ctx, 8192 * 1000, false);
     ggml_vk_test_transfer(ctx, 8192 * 1000, true);
 
@@ -4174,7 +4208,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         if (ctx->staging != nullptr) {
             ggml_vk_destroy_buffer(ctx->staging);
         }
-        ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+        ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     }
 }
 
@@ -4537,13 +4573,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     }
 }
 
-GGML_CALL int ggml_vk_get_device_count() {
+GGML_CALL static int ggml_vk_get_device_count() {
     ggml_vk_instance_init();
 
     return vk_instance.device_indices.size();
 }
 
-GGML_CALL void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
     ggml_vk_instance_init();
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -4561,7 +4597,7 @@ void ggml_vk_init_cpu_assist() {
 
     std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
 
-    for (size_t i = 0; i < ggml_vk_get_device_count(); i++) {
+    for (int i = 0; i < ggml_vk_get_device_count(); i++) {
         ggml_vk_print_gpu_info(i);
     }
     // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
@@ -5248,7 +5284,7 @@ GGML_CALL void ggml_backend_vk_get_device_description(int device, char * descrip
 }
 
 GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
-    GGML_ASSERT(device < vk_instance.device_indices.size());
+    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
 
     vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
 
@@ -5282,6 +5318,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
     return vk_instance.device_indices.size();
 }
 
+// Extension availability
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
+#ifdef GGML_VULKAN_VALIDATE
+    bool portability_enumeration_ext = false;
+    // Check for portability enumeration extension for MoltenVK support
+    for (const auto& properties : instance_extensions) {
+        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
+            return true;
+        }
+    }
+    if (!portability_enumeration_ext) {
+        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
+    }
+#endif
+    return false;
+
+    UNUSED(instance_extensions);
+}
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
+#ifdef __APPLE__
+    bool portability_enumeration_ext = false;
+    // Check for portability enumeration extension for MoltenVK support
+    for (const auto& properties : instance_extensions) {
+        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
+            return true;
+        }
+    }
+    if (!portability_enumeration_ext) {
+        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
+    }
+#endif
+    return false;
+
+    UNUSED(instance_extensions);
+}
+
 // checks
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
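Both new availability helpers scan the enumerated instance extensions the same way, and both are compiled out (returning false) unless GGML_VULKAN_VALIDATE or __APPLE__ is defined; note that, as shipped, ggml_vk_instance_validation_ext_available scans for VK_KHR_portability_enumeration rather than VK_EXT_validation_features. A hedged sketch of the shared loop as one helper; ggml_vk_instance_ext_available is a hypothetical name, not part of this release:

    #include <cstring>
    #include <iostream>
    #include <vector>
    #include <vulkan/vulkan.hpp>

    // Hypothetical consolidation of the two scan loops above.
    static bool ggml_vk_instance_ext_available(const std::vector<vk::ExtensionProperties>& exts,
                                               const char* name) {
        for (const auto& properties : exts) {
            if (std::strcmp(name, properties.extensionName) == 0) {
                return true;
            }
        }
        return false;
    }

    int main() {
        const auto exts = vk::enumerateInstanceExtensionProperties();
        std::cout << "portability_enumeration available: " << std::boolalpha
                  << ggml_vk_instance_ext_available(exts, "VK_KHR_portability_enumeration") << '\n';
        return 0;
    }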