llama_cpp 0.12.6 → 0.12.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -707,9 +707,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
707
707
  q.cmd_buffer_idx = 0;
708
708
  }
709
709
 
710
- static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
710
+ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
711
+ for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
712
+ vk::MemoryType memory_type = mem_props->memoryTypes[i];
713
+ if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
714
+ (flags & memory_type.propertyFlags) == flags &&
715
+ mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
716
+ return static_cast<uint32_t>(i);
717
+ }
718
+ }
719
+ return UINT32_MAX;
720
+ }
721
+
722
+ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
711
723
  #ifdef GGML_VULKAN_DEBUG
712
- std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl;
724
+ std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
713
725
  #endif
714
726
  vk_buffer buf = std::make_shared<vk_buffer_struct>();
715
727
 
@@ -736,15 +748,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
736
748
 
737
749
  uint32_t memory_type_index = UINT32_MAX;
738
750
 
739
- for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
740
- vk::MemoryType memory_type = mem_props.memoryTypes[i];
741
- if ((mem_req.memoryTypeBits & ((uint64_t)1 << i)) && (req_flags & memory_type.propertyFlags) == req_flags && mem_props.memoryHeaps[memory_type.heapIndex].size >= mem_req.size) {
742
- memory_type_index = i;
743
- break;
744
- }
751
+ memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
752
+ buf->memory_property_flags = req_flags;
753
+
754
+ if (memory_type_index == UINT32_MAX && fallback_flags) {
755
+ memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
756
+ buf->memory_property_flags = fallback_flags;
745
757
  }
746
758
 
747
- if (memory_type_index >= mem_props.memoryTypeCount) {
759
+ if (memory_type_index == UINT32_MAX) {
748
760
  ctx->device.lock()->device.destroyBuffer(buf->buffer);
749
761
  buf->size = 0;
750
762
  throw vk::OutOfDeviceMemoryError("No suitable memory type found");
@@ -758,10 +770,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
758
770
  buf->size = 0;
759
771
  throw e;
760
772
  }
761
- buf->memory_property_flags = req_flags;
762
773
  buf->ptr = nullptr;
763
774
 
764
- if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
775
+ if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
765
776
  buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
766
777
  }
767
778
 
@@ -778,9 +789,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
778
789
  return buf;
779
790
  }
780
791
 
781
- static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
792
+ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
782
793
  try {
783
- return ggml_vk_create_buffer(ctx, size, req_flags);
794
+ return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
784
795
  } catch (const vk::SystemError& e) {
785
796
  std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
786
797
  std::cerr << "ggml_vulkan: " << e.what() << std::endl;
@@ -791,16 +802,16 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
791
802
  static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
792
803
  vk_buffer buf;
793
804
  try {
794
- buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
795
- } catch (const vk::SystemError& e) {
796
805
  if (ctx->device.lock()->uma) {
797
806
  // Fall back to host memory type
798
- buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
807
+ buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
799
808
  } else {
800
- std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
801
- std::cerr << "ggml_vulkan: " << e.what() << std::endl;
802
- throw e;
809
+ buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
803
810
  }
811
+ } catch (const vk::SystemError& e) {
812
+ std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
813
+ std::cerr << "ggml_vulkan: " << e.what() << std::endl;
814
+ throw e;
804
815
  }
805
816
 
806
817
  return buf;
@@ -1080,6 +1091,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
1080
1091
  }
1081
1092
  }
1082
1093
 
1094
+ static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
1095
+ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
1096
+
1083
1097
  void ggml_vk_instance_init() {
1084
1098
  if (vk_instance_initialized) {
1085
1099
  return;
@@ -1089,28 +1103,42 @@ void ggml_vk_instance_init() {
1089
1103
  #endif
1090
1104
 
1091
1105
  vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
1092
- const std::vector<const char*> layers = {
1093
- #ifdef GGML_VULKAN_VALIDATE
1094
- "VK_LAYER_KHRONOS_validation",
1095
- #endif
1096
- };
1097
- const std::vector<const char*> extensions = {
1098
- #ifdef GGML_VULKAN_VALIDATE
1099
- "VK_EXT_validation_features",
1100
- #endif
1101
- };
1102
- vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
1103
- #ifdef GGML_VULKAN_VALIDATE
1104
- const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
1105
- vk::ValidationFeaturesEXT validation_features = {
1106
- features_enable,
1107
- {},
1108
- };
1109
- validation_features.setPNext(nullptr);
1110
- instance_create_info.setPNext(&validation_features);
1111
1106
 
1112
- std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
1113
- #endif
1107
+ const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
1108
+ const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
1109
+ const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
1110
+
1111
+ std::vector<const char*> layers;
1112
+
1113
+ if (validation_ext) {
1114
+ layers.push_back("VK_LAYER_KHRONOS_validation");
1115
+ }
1116
+ std::vector<const char*> extensions;
1117
+ if (validation_ext) {
1118
+ extensions.push_back("VK_EXT_validation_features");
1119
+ }
1120
+ if (portability_enumeration_ext) {
1121
+ extensions.push_back("VK_KHR_portability_enumeration");
1122
+ }
1123
+ vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
1124
+ if (portability_enumeration_ext) {
1125
+ instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
1126
+ }
1127
+
1128
+ std::vector<vk::ValidationFeatureEnableEXT> features_enable;
1129
+ vk::ValidationFeaturesEXT validation_features;
1130
+
1131
+ if (validation_ext) {
1132
+ features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
1133
+ validation_features = {
1134
+ features_enable,
1135
+ {},
1136
+ };
1137
+ validation_features.setPNext(nullptr);
1138
+ instance_create_info.setPNext(&validation_features);
1139
+
1140
+ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
1141
+ }
1114
1142
  vk_instance.instance = vk::createInstance(instance_create_info);
1115
1143
 
1116
1144
  memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
@@ -1139,7 +1167,7 @@ void ggml_vk_instance_init() {
1139
1167
  vk_instance_initialized = true;
1140
1168
  }
1141
1169
 
1142
- void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
1170
+ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
1143
1171
  GGML_ASSERT(idx < vk_instance.device_indices.size());
1144
1172
  size_t dev_num = vk_instance.device_indices[idx];
1145
1173
  #ifdef GGML_VULKAN_DEBUG
@@ -1157,12 +1185,12 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
1157
1185
  vk_instance.devices[idx] = std::make_shared<vk_device>();
1158
1186
  ctx->device = vk_instance.devices[idx];
1159
1187
  ctx->device.lock()->physical_device = devices[dev_num];
1160
- std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
1188
+ const std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
1161
1189
 
1162
1190
  bool maintenance4_support = false;
1163
1191
 
1164
1192
  // Check if maintenance4 is supported
1165
- for (auto properties : ext_props) {
1193
+ for (const auto& properties : ext_props) {
1166
1194
  if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
1167
1195
  maintenance4_support = true;
1168
1196
  }
@@ -1193,7 +1221,7 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
1193
1221
  bool fp16_storage = false;
1194
1222
  bool fp16_compute = false;
1195
1223
 
1196
- for (auto properties : ext_props) {
1224
+ for (const auto& properties : ext_props) {
1197
1225
  if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
1198
1226
  fp16_storage = true;
1199
1227
  } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
@@ -1422,7 +1450,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
1422
1450
  #ifdef GGML_VULKAN_DEBUG
1423
1451
  std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
1424
1452
  #endif
1425
- vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
1453
+ vk_buffer buf = ggml_vk_create_buffer(ctx, size,
1454
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
1455
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
1426
1456
 
1427
1457
  if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
1428
1458
  fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
@@ -1568,7 +1598,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
1568
1598
  static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
1569
1599
  if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
1570
1600
  ggml_vk_destroy_buffer(ctx->sync_staging);
1571
- ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
1601
+ ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
1602
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
1603
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
1572
1604
  }
1573
1605
  }
1574
1606
 
@@ -4082,7 +4114,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
4082
4114
  std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
4083
4115
  #endif
4084
4116
  #if defined(GGML_VULKAN_RUN_TESTS)
4085
- ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
4117
+ ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
4118
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
4119
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
4086
4120
  ggml_vk_test_transfer(ctx, 8192 * 1000, false);
4087
4121
  ggml_vk_test_transfer(ctx, 8192 * 1000, true);
4088
4122
 
@@ -4174,7 +4208,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
4174
4208
  if (ctx->staging != nullptr) {
4175
4209
  ggml_vk_destroy_buffer(ctx->staging);
4176
4210
  }
4177
- ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
4211
+ ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
4212
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
4213
+ vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
4178
4214
  }
4179
4215
  }
4180
4216
 
@@ -4537,13 +4573,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
4537
4573
  }
4538
4574
  }
4539
4575
 
4540
- GGML_CALL int ggml_vk_get_device_count() {
4576
+ GGML_CALL static int ggml_vk_get_device_count() {
4541
4577
  ggml_vk_instance_init();
4542
4578
 
4543
4579
  return vk_instance.device_indices.size();
4544
4580
  }
4545
4581
 
4546
- GGML_CALL void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
4582
+ GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
4547
4583
  ggml_vk_instance_init();
4548
4584
 
4549
4585
  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -4561,7 +4597,7 @@ void ggml_vk_init_cpu_assist() {
4561
4597
 
4562
4598
  std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
4563
4599
 
4564
- for (size_t i = 0; i < ggml_vk_get_device_count(); i++) {
4600
+ for (int i = 0; i < ggml_vk_get_device_count(); i++) {
4565
4601
  ggml_vk_print_gpu_info(i);
4566
4602
  }
4567
4603
  // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
@@ -5248,7 +5284,7 @@ GGML_CALL void ggml_backend_vk_get_device_description(int device, char * descrip
5248
5284
  }
5249
5285
 
5250
5286
  GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
5251
- GGML_ASSERT(device < vk_instance.device_indices.size());
5287
+ GGML_ASSERT(device < (int) vk_instance.device_indices.size());
5252
5288
 
5253
5289
  vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
5254
5290
 
@@ -5282,6 +5318,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
5282
5318
  return vk_instance.device_indices.size();
5283
5319
  }
5284
5320
 
5321
+ // Extension availability
5322
+ static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
5323
+ #ifdef GGML_VULKAN_VALIDATE
5324
+ bool portability_enumeration_ext = false;
5325
+ // Check for portability enumeration extension for MoltenVK support
5326
+ for (const auto& properties : instance_extensions) {
5327
+ if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
5328
+ return true;
5329
+ }
5330
+ }
5331
+ if (!portability_enumeration_ext) {
5332
+ std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
5333
+ }
5334
+ #endif
5335
+ return false;
5336
+
5337
+ UNUSED(instance_extensions);
5338
+ }
5339
+ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
5340
+ #ifdef __APPLE__
5341
+ bool portability_enumeration_ext = false;
5342
+ // Check for portability enumeration extension for MoltenVK support
5343
+ for (const auto& properties : instance_extensions) {
5344
+ if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
5345
+ return true;
5346
+ }
5347
+ }
5348
+ if (!portability_enumeration_ext) {
5349
+ std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
5350
+ }
5351
+ #endif
5352
+ return false;
5353
+
5354
+ UNUSED(instance_extensions);
5355
+ }
5356
+
5285
5357
  // checks
5286
5358
 
5287
5359
  #ifdef GGML_VULKAN_CHECK_RESULTS