llama_cpp 0.12.6 → 0.12.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp CHANGED (+124 -52)

@@ -707,9 +707,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
     q.cmd_buffer_idx = 0;
 }
 
-static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
+    for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
+        vk::MemoryType memory_type = mem_props->memoryTypes[i];
+        if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
+            (flags & memory_type.propertyFlags) == flags &&
+            mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
+            return static_cast<int32_t>(i);
+        }
+    }
+    return UINT32_MAX;
+}
+
+static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl;
+    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
 #endif
     vk_buffer buf = std::make_shared<vk_buffer_struct>();
 
@@ -736,15 +748,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 
     uint32_t memory_type_index = UINT32_MAX;
 
-    for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
-        vk::MemoryType memory_type = mem_props.memoryTypes[i];
-        if ((mem_req.memoryTypeBits & ((uint64_t)1 << i)) && (req_flags & memory_type.propertyFlags) == req_flags && mem_props.memoryHeaps[memory_type.heapIndex].size >= mem_req.size) {
-            memory_type_index = i;
-            break;
-        }
+    memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
+    buf->memory_property_flags = req_flags;
+
+    if (memory_type_index == UINT32_MAX && fallback_flags) {
+        memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
+        buf->memory_property_flags = fallback_flags;
     }
 
-    if (memory_type_index == UINT32_MAX) {
+    if (memory_type_index == UINT32_MAX) {
         ctx->device.lock()->device.destroyBuffer(buf->buffer);
         buf->size = 0;
         throw vk::OutOfDeviceMemoryError("No suitable memory type found");
@@ -758,10 +770,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
         buf->size = 0;
         throw e;
     }
-    buf->memory_property_flags = req_flags;
     buf->ptr = nullptr;
 
-    if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+    if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
         buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
     }
 
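Taken together, the three hunks above replace the inline memory-type search with the new find_properties helper and a required-then-fallback cascade: try req_flags first, and only if no memory type matches, retry with fallback_flags before giving up. A minimal standalone sketch of that cascade, using mock memory types instead of a live Vulkan device (all names and values below are illustrative, not part of the gem):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-ins for vk::MemoryType / vk::PhysicalDeviceMemoryProperties so the
// selection logic can run without a Vulkan device.
using Flags = uint32_t;
constexpr Flags DEVICE_LOCAL  = 1 << 0;
constexpr Flags HOST_VISIBLE  = 1 << 1;
constexpr Flags HOST_COHERENT = 1 << 2;

struct MemoryType { Flags propertyFlags; uint64_t heapSize; };

// Same shape as find_properties(): a type qualifies only if memoryTypeBits
// allows it, it carries every requested flag, and its heap can hold the
// allocation.
static uint32_t find_type(const std::vector<MemoryType>& types,
                          uint32_t type_bits, uint64_t size, Flags flags) {
    for (uint32_t i = 0; i < (uint32_t) types.size(); ++i) {
        if ((type_bits & (1u << i)) &&
            (flags & types[i].propertyFlags) == flags &&
            types[i].heapSize >= size) {
            return i;
        }
    }
    return UINT32_MAX;
}

int main() {
    // A device exposing one device-local and one host-visible memory type.
    std::vector<MemoryType> types = {
        { DEVICE_LOCAL,                 8ull << 30 },
        { HOST_VISIBLE | HOST_COHERENT, 16ull << 30 },
    };
    // Preferred flags first, then the fallback, mirroring
    // ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags).
    const Flags req      = DEVICE_LOCAL | HOST_VISIBLE;   // unavailable combo
    const Flags fallback = HOST_VISIBLE | HOST_COHERENT;
    uint32_t idx = find_type(types, 0b11, 1ull << 20, req);
    if (idx == UINT32_MAX) {
        idx = find_type(types, 0b11, 1ull << 20, fallback);
    }
    printf("selected memory type: %u\n", idx); // prints 1 (the fallback)
    return 0;
}
```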
@@ -778,9 +789,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
     return buf;
 }
 
-static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
     try {
-        return ggml_vk_create_buffer(ctx, size, req_flags);
+        return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
     } catch (const vk::SystemError& e) {
         std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
         std::cerr << "ggml_vulkan: " << e.what() << std::endl;
@@ -791,16 +802,16 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
 static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
     vk_buffer buf;
     try {
-        buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    } catch (const vk::SystemError& e) {
         if (ctx->device.lock()->uma) {
             // Fall back to host memory type
-            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
         } else {
-            std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
-            std::cerr << "ggml_vulkan: " << e.what() << std::endl;
-            throw e;
+            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
         }
+    } catch (const vk::SystemError& e) {
+        std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
+        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
+        throw e;
     }
 
     return buf;
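The reworked ggml_vk_create_buffer_device moves the UMA special case out of the catch block: on unified-memory devices the host flags now ride along as fallback_flags in a single call, and the catch only logs and rethrows. A rough sketch of the new control flow against a stubbed allocator (the stub and flag values are invented for illustration):

```cpp
#include <cstddef>
#include <iostream>
#include <stdexcept>

enum : unsigned { DEVICE_LOCAL = 1, HOST_VISIBLE = 2, HOST_COHERENT = 4 };

// Invented stub for ggml_vk_create_buffer: this fake device only offers
// host memory, so a device-local request succeeds only via the fallback.
static unsigned create_buffer(size_t size, unsigned req, unsigned fallback = 0) {
    (void) size;
    const unsigned available = HOST_VISIBLE | HOST_COHERENT;
    if ((available & req) == req) return req;
    if (fallback && (available & fallback) == fallback) return fallback;
    throw std::runtime_error("no suitable memory type");
}

// Mirrors the reworked ggml_vk_create_buffer_device: the UMA branch passes
// host flags as the fallback in one call; the catch only logs and rethrows.
static unsigned create_buffer_device(size_t size, bool uma) {
    unsigned buf;
    try {
        if (uma) {
            buf = create_buffer(size, DEVICE_LOCAL, HOST_VISIBLE | HOST_COHERENT);
        } else {
            buf = create_buffer(size, DEVICE_LOCAL);
        }
    } catch (const std::runtime_error& e) {
        std::cerr << "device allocation of " << size << " bytes failed: "
                  << e.what() << std::endl;
        throw;
    }
    return buf;
}

int main() {
    std::cout << "uma result flags: " << create_buffer_device(1024, true) << "\n";
    try {
        create_buffer_device(1024, false);
    } catch (const std::runtime_error&) {
        // expected with this stub: no real device-local memory exists
    }
    return 0;
}
```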
@@ -1080,6 +1091,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     }
 }
 
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+
 void ggml_vk_instance_init() {
     if (vk_instance_initialized) {
         return;
@@ -1089,28 +1103,42 @@ void ggml_vk_instance_init() {
 #endif
 
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
-    const std::vector<const char*> layers = {
-#ifdef GGML_VULKAN_VALIDATE
-        "VK_LAYER_KHRONOS_validation",
-#endif
-    };
-    const std::vector<const char*> extensions = {
-#ifdef GGML_VULKAN_VALIDATE
-        "VK_EXT_validation_features",
-#endif
-    };
-    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
-#ifdef GGML_VULKAN_VALIDATE
-    const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
-    vk::ValidationFeaturesEXT validation_features = {
-        features_enable,
-        {},
-    };
-    validation_features.setPNext(nullptr);
-    instance_create_info.setPNext(&validation_features);
 
-    std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
-#endif
+    const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
+    const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
+    const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
+
+    std::vector<const char*> layers;
+
+    if (validation_ext) {
+        layers.push_back("VK_LAYER_KHRONOS_validation");
+    }
+    std::vector<const char*> extensions;
+    if (validation_ext) {
+        extensions.push_back("VK_EXT_validation_features");
+    }
+    if (portability_enumeration_ext) {
+        extensions.push_back("VK_KHR_portability_enumeration");
+    }
+    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
+    if (portability_enumeration_ext) {
+        instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
+    }
+
+    std::vector<vk::ValidationFeatureEnableEXT> features_enable;
+    vk::ValidationFeaturesEXT validation_features;
+
+    if (validation_ext) {
+        features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
+        validation_features = {
+            features_enable,
+            {},
+        };
+        validation_features.setPNext(nullptr);
+        instance_create_info.setPNext(&validation_features);
+
+        std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
+    }
     vk_instance.instance = vk::createInstance(instance_create_info);
 
     memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
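Instance initialization now queries the Vulkan loader at runtime instead of relying on compile-time #ifdefs, enabling the validation layer and VK_KHR_portability_enumeration (needed for MoltenVK on macOS to enumerate devices) only when they are actually available. A standalone sketch of that detection pattern using the same vulkan.hpp calls (requires a Vulkan SDK and loader to build and run; has_extension and the application name are illustrative, not part of the gem):

```cpp
#include <cstring>
#include <iostream>
#include <vector>
#include <vulkan/vulkan.hpp>

// Hypothetical helper mirroring the availability checks added in this diff.
static bool has_extension(const std::vector<vk::ExtensionProperties>& exts, const char* name) {
    for (const auto& props : exts) {
        if (strcmp(name, props.extensionName) == 0) {
            return true;
        }
    }
    return false;
}

int main() {
    // Ask the loader what it supports instead of deciding at compile time.
    const auto exts = vk::enumerateInstanceExtensionProperties();
    std::vector<const char*> extensions;
    const bool portability = has_extension(exts, "VK_KHR_portability_enumeration");
    if (portability) {
        extensions.push_back("VK_KHR_portability_enumeration");
    }
    vk::ApplicationInfo app_info{ "extension-probe", 1, nullptr, 0, VK_API_VERSION_1_1 };
    vk::InstanceCreateInfo info(vk::InstanceCreateFlags{}, &app_info, {}, extensions);
    if (portability) {
        // MoltenVK only reports devices when portability enumeration is on.
        info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
    }
    vk::Instance instance = vk::createInstance(info);
    std::cout << "instance created with " << extensions.size() << " extension(s)\n";
    instance.destroy();
    return 0;
}
```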
@@ -1139,7 +1167,7 @@ void ggml_vk_instance_init() {
     vk_instance_initialized = true;
 }
 
-void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
+static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
 #ifdef GGML_VULKAN_DEBUG
@@ -1157,12 +1185,12 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     vk_instance.devices[idx] = std::make_shared<vk_device>();
     ctx->device = vk_instance.devices[idx];
     ctx->device.lock()->physical_device = devices[dev_num];
-    std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
+    const std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
 
     bool maintenance4_support = false;
 
     // Check if maintenance4 is supported
-    for (auto properties : ext_props) {
+    for (const auto& properties : ext_props) {
         if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
             maintenance4_support = true;
         }
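This hunk and the next switch the ext_props loops to const auto&; vk::ExtensionProperties carries a 256-byte name array, so a by-value auto loop variable copies it on every iteration. A small illustration with a stand-in struct:

```cpp
#include <cstdio>
#include <vector>

struct ExtensionProperties {   // stand-in for vk::ExtensionProperties
    char extensionName[256];
    unsigned specVersion;
};

int main() {
    std::vector<ExtensionProperties> ext_props(8);
    // By value: every iteration copies the ~260-byte struct.
    for (auto properties : ext_props) { (void) properties; }
    // By const reference: no copies, and the element cannot be mutated.
    for (const auto& properties : ext_props) { (void) properties; }
    printf("iterated %zu extensions\n", ext_props.size());
    return 0;
}
```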
@@ -1193,7 +1221,7 @@
     bool fp16_storage = false;
     bool fp16_compute = false;
 
-    for (auto properties : ext_props) {
+    for (const auto& properties : ext_props) {
         if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
             fp16_storage = true;
         } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
@@ -1422,7 +1450,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
 #endif
-    vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+    vk_buffer buf = ggml_vk_create_buffer(ctx, size,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
 
     if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
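This and the following three hunks apply the same preferred/fallback pair to every host-side staging allocation: host-cached memory when the device offers it (much faster CPU reads of data copied back from the GPU), plain host-visible/host-coherent otherwise. A small sketch of just the two flag sets, using vulkan.hpp types only (no instance or device needed):

```cpp
#include <iostream>
#include <vulkan/vulkan.hpp>

int main() {
    // Preferred: cached host memory, which makes CPU reads of data copied
    // back from the GPU much cheaper than reads from uncached memory.
    const vk::MemoryPropertyFlags preferred =
        vk::MemoryPropertyFlagBits::eHostVisible |
        vk::MemoryPropertyFlagBits::eHostCoherent |
        vk::MemoryPropertyFlagBits::eHostCached;
    // Fallback: plain mappable, coherent host memory, available everywhere.
    const vk::MemoryPropertyFlags fallback =
        vk::MemoryPropertyFlagBits::eHostVisible |
        vk::MemoryPropertyFlagBits::eHostCoherent;
    std::cout << vk::to_string(preferred) << " -> " << vk::to_string(fallback) << "\n";
    return 0;
}
```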
@@ -1568,7 +1598,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
     if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
         ggml_vk_destroy_buffer(ctx->sync_staging);
-        ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+        ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     }
 }
 
@@ -4082,7 +4114,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
 #endif
 #if defined(GGML_VULKAN_RUN_TESTS)
-    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     ggml_vk_test_transfer(ctx, 8192 * 1000, false);
     ggml_vk_test_transfer(ctx, 8192 * 1000, true);
 
@@ -4174,7 +4208,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     if (ctx->staging != nullptr) {
         ggml_vk_destroy_buffer(ctx->staging);
     }
-    ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+    ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     }
 }
 
@@ -4537,13 +4573,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     }
 }
 
-GGML_CALL int ggml_vk_get_device_count() {
+GGML_CALL static int ggml_vk_get_device_count() {
     ggml_vk_instance_init();
 
     return vk_instance.device_indices.size();
 }
 
-GGML_CALL void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
     ggml_vk_instance_init();
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -4561,7 +4597,7 @@ void ggml_vk_init_cpu_assist() {
 
     std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
 
-    for (size_t i = 0; i < ggml_vk_get_device_count(); i++) {
+    for (int i = 0; i < ggml_vk_get_device_count(); i++) {
         ggml_vk_print_gpu_info(i);
     }
     // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
@@ -5248,7 +5284,7 @@ GGML_CALL void ggml_backend_vk_get_device_description(int device, char * descrip
 }
 
 GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
-    GGML_ASSERT(device < vk_instance.device_indices.size());
+    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
 
     vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
 
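The added (int) cast fixes a signed/unsigned mismatch: device is an int while device_indices.size() is a size_t, so the bare comparison promotes the int to unsigned and trips -Wsign-compare. A tiny illustration (values invented; the first comparison deliberately reproduces the warning):

```cpp
#include <cstdio>
#include <vector>

int main() {
    std::vector<size_t> device_indices = { 0, 1 };
    int device = -1; // hypothetical out-of-range index from a C caller
    // Mixed comparison converts the int to size_t: -1 wraps to SIZE_MAX,
    // and compilers flag this line with -Wsign-compare.
    printf("unsigned compare: %d\n", device < device_indices.size());
    // The cast from the fix keeps the comparison signed and warning-free.
    printf("signed compare:   %d\n", device < (int) device_indices.size());
    return 0;
}
```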
@@ -5282,6 +5318,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
     return vk_instance.device_indices.size();
 }
 
+// Extension availability
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
+#ifdef GGML_VULKAN_VALIDATE
+    bool portability_enumeration_ext = false;
+    // Check for portability enumeration extension for MoltenVK support
+    for (const auto& properties : instance_extensions) {
+        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
+            return true;
+        }
+    }
+    if (!portability_enumeration_ext) {
+        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
+    }
+#endif
+    return false;
+
+    UNUSED(instance_extensions);
+}
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
+#ifdef __APPLE__
+    bool portability_enumeration_ext = false;
+    // Check for portability enumeration extension for MoltenVK support
+    for (const auto& properties : instance_extensions) {
+        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
+            return true;
+        }
+    }
+    if (!portability_enumeration_ext) {
+        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
+    }
+#endif
+    return false;
+
+    UNUSED(instance_extensions);
+}
+
 // checks
 
 #ifdef GGML_VULKAN_CHECK_RESULTS