llama_cpp 0.12.6 → 0.12.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
The hunks below are from data/vendor/tmp/llama.cpp/ggml-vulkan.cpp (+124 -52):

```diff
@@ -707,9 +707,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
     q.cmd_buffer_idx = 0;
 }
 
-static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
+    for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
+        vk::MemoryType memory_type = mem_props->memoryTypes[i];
+        if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
+            (flags & memory_type.propertyFlags) == flags &&
+            mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
+            return static_cast<int32_t>(i);
+        }
+    }
+    return UINT32_MAX;
+}
+
+static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl;
+    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
 #endif
     vk_buffer buf = std::make_shared<vk_buffer_struct>();
 
```
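The new `find_properties` helper is the canonical Vulkan memory-type search. As a standalone sketch against the raw Vulkan C API (illustrative only; `find_memory_type` is not part of the diff): bit `i` of `VkMemoryRequirements::memoryTypeBits` marks memory type `i` as usable for the allocation, the type must carry every requested property flag, and its backing heap must be large enough for the request.

```cpp
#include <vulkan/vulkan.h>
#include <cstdint>

// Illustrative re-statement of the search that find_properties() performs.
static uint32_t find_memory_type(const VkPhysicalDeviceMemoryProperties& props,
                                 const VkMemoryRequirements& req,
                                 VkMemoryPropertyFlags flags) {
    for (uint32_t i = 0; i < props.memoryTypeCount; ++i) {
        const VkMemoryType& type = props.memoryTypes[i];
        if ((req.memoryTypeBits & (1u << i)) &&                    // type is usable for this resource
            (type.propertyFlags & flags) == flags &&               // type has all requested properties
            props.memoryHeaps[type.heapIndex].size >= req.size) {  // backing heap is large enough
            return i;
        }
    }
    return UINT32_MAX;  // no match; the caller may retry with weaker flags
}
```

Note the heap-size check: a memory type can match the requested flags while its heap is too small for the allocation, which is why the helper also compares against the required size.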
```diff
@@ -736,15 +748,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 
     uint32_t memory_type_index = UINT32_MAX;
 
-    for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
-        vk::MemoryType memory_type = mem_props.memoryTypes[i];
-        if ((mem_req.memoryTypeBits & ((uint64_t)1 << i)) && (req_flags & memory_type.propertyFlags) == req_flags && mem_props.memoryHeaps[memory_type.heapIndex].size >= mem_req.size) {
-            memory_type_index = i;
-            break;
-        }
+    memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
+    buf->memory_property_flags = req_flags;
+
+    if (memory_type_index == UINT32_MAX && fallback_flags) {
+        memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
+        buf->memory_property_flags = fallback_flags;
     }
 
-    if (memory_type_index >= mem_props.memoryTypeCount) {
+    if (memory_type_index == UINT32_MAX) {
         ctx->device.lock()->device.destroyBuffer(buf->buffer);
         buf->size = 0;
         throw vk::OutOfDeviceMemoryError("No suitable memory type found");
```
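The new `fallback_flags` default parameter lets an allocation site name a preferred memory type and a weaker one to retry with when the first search fails, which is how the later hunks use it. A minimal sketch of the calling pattern, reusing the types from ggml-vulkan.cpp (`alloc_with_fallback` is a hypothetical name, not from the diff):

```cpp
// Prefer device-local memory; only if no suitable memory type exists,
// retry the search with host-visible, host-coherent memory instead.
static vk_buffer alloc_with_fallback(ggml_backend_vk_context * ctx, size_t size) {
    return ggml_vk_create_buffer(ctx, size,
        vk::MemoryPropertyFlagBits::eDeviceLocal,
        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
}
```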
```diff
@@ -758,10 +770,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
         buf->size = 0;
         throw e;
     }
-    buf->memory_property_flags = req_flags;
     buf->ptr = nullptr;
 
-    if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+    if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
         buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
     }
 
```
```diff
@@ -778,9 +789,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
     return buf;
 }
 
-static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
     try {
-        return ggml_vk_create_buffer(ctx, size, req_flags);
+        return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
     } catch (const vk::SystemError& e) {
         std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
         std::cerr << "ggml_vulkan: " << e.what() << std::endl;
```
```diff
@@ -791,16 +802,16 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
 static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
     vk_buffer buf;
     try {
-        buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    } catch (const vk::SystemError& e) {
         if (ctx->device.lock()->uma) {
             // Fall back to host memory type
-            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
         } else {
-            std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
-            std::cerr << "ggml_vulkan: " << e.what() << std::endl;
-            throw e;
+            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
         }
+    } catch (const vk::SystemError& e) {
+        std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
+        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
+        throw e;
     }
 
     return buf;
```
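This hunk restructures `ggml_vk_create_buffer_device`: UMA devices now request device-local memory with a host-visible fallback up front, instead of catching an allocation failure and retrying. The diff does not show how `ctx->device.lock()->uma` is set; one plausible heuristic (hypothetical sketch, assuming vulkan.hpp) is to treat integrated GPUs, which share physical memory with the CPU, as UMA:

```cpp
#include <vulkan/vulkan.hpp>

// Hypothetical UMA check: on integrated GPUs "device local" and host memory
// come from the same physical pool, so host-visible memory is an acceptable
// stand-in for device-local allocations.
static bool is_uma(const vk::PhysicalDevice& dev) {
    return dev.getProperties().deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
}
```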
```diff
@@ -1080,6 +1091,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     }
 }
 
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+
 void ggml_vk_instance_init() {
     if (vk_instance_initialized) {
         return;
```
```diff
@@ -1089,28 +1103,42 @@ void ggml_vk_instance_init() {
 #endif
 
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
-    const std::vector<const char*> layers = {
-#ifdef GGML_VULKAN_VALIDATE
-        "VK_LAYER_KHRONOS_validation",
-#endif
-    };
-    const std::vector<const char*> extensions = {
-#ifdef GGML_VULKAN_VALIDATE
-        "VK_EXT_validation_features",
-#endif
-    };
-    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
-#ifdef GGML_VULKAN_VALIDATE
-    const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
-    vk::ValidationFeaturesEXT validation_features = {
-        features_enable,
-        {},
-    };
-    validation_features.setPNext(nullptr);
-    instance_create_info.setPNext(&validation_features);
 
-    std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
-#endif
+    const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
+    const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
+    const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
+
+    std::vector<const char*> layers;
+
+    if (validation_ext) {
+        layers.push_back("VK_LAYER_KHRONOS_validation");
+    }
+    std::vector<const char*> extensions;
+    if (validation_ext) {
+        extensions.push_back("VK_EXT_validation_features");
+    }
+    if (portability_enumeration_ext) {
+        extensions.push_back("VK_KHR_portability_enumeration");
+    }
+    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
+    if (portability_enumeration_ext) {
+        instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
+    }
+
+    std::vector<vk::ValidationFeatureEnableEXT> features_enable;
+    vk::ValidationFeaturesEXT validation_features;
+
+    if (validation_ext) {
+        features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
+        validation_features = {
+            features_enable,
+            {},
+        };
+        validation_features.setPNext(nullptr);
+        instance_create_info.setPNext(&validation_features);
+
+        std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
+    }
     vk_instance.instance = vk::createInstance(instance_create_info);
 
     memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
```
```diff
@@ -1139,7 +1167,7 @@ void ggml_vk_instance_init() {
     vk_instance_initialized = true;
 }
 
-void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
+static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
 #ifdef GGML_VULKAN_DEBUG
```
```diff
@@ -1157,12 +1185,12 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     vk_instance.devices[idx] = std::make_shared<vk_device>();
     ctx->device = vk_instance.devices[idx];
     ctx->device.lock()->physical_device = devices[dev_num];
-    std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
+    const std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
 
     bool maintenance4_support = false;
 
     // Check if maintenance4 is supported
-    for (auto properties : ext_props) {
+    for (const auto& properties : ext_props) {
         if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
             maintenance4_support = true;
         }
```
```diff
@@ -1193,7 +1221,7 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     bool fp16_storage = false;
     bool fp16_compute = false;
 
-    for (auto properties : ext_props) {
+    for (const auto& properties : ext_props) {
         if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
             fp16_storage = true;
         } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
```
```diff
@@ -1422,7 +1450,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
 #endif
-    vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+    vk_buffer buf = ggml_vk_create_buffer(ctx, size,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
 
     if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
```
```diff
@@ -1568,7 +1598,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
     if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
         ggml_vk_destroy_buffer(ctx->sync_staging);
-        ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+        ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     }
 }
 
```
```diff
@@ -4082,7 +4114,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
 #endif
 #if defined(GGML_VULKAN_RUN_TESTS)
-    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     ggml_vk_test_transfer(ctx, 8192 * 1000, false);
     ggml_vk_test_transfer(ctx, 8192 * 1000, true);
 
```
```diff
@@ -4174,7 +4208,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         if (ctx->staging != nullptr) {
             ggml_vk_destroy_buffer(ctx->staging);
         }
-        ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+        ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     }
 }
 
```
```diff
@@ -4537,13 +4573,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     }
 }
 
-GGML_CALL int ggml_vk_get_device_count() {
+GGML_CALL static int ggml_vk_get_device_count() {
     ggml_vk_instance_init();
 
     return vk_instance.device_indices.size();
 }
 
-GGML_CALL void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
     ggml_vk_instance_init();
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
```
```diff
@@ -4561,7 +4597,7 @@ void ggml_vk_init_cpu_assist() {
 
     std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
 
-    for (size_t i = 0; i < ggml_vk_get_device_count(); i++) {
+    for (int i = 0; i < ggml_vk_get_device_count(); i++) {
         ggml_vk_print_gpu_info(i);
     }
     // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
```
```diff
@@ -5248,7 +5284,7 @@ GGML_CALL void ggml_backend_vk_get_device_description(int device, char * descrip
 }
 
 GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
-    GGML_ASSERT(device < vk_instance.device_indices.size());
+    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
 
     vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
 
```
```diff
@@ -5282,6 +5318,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
     return vk_instance.device_indices.size();
 }
 
+// Extension availability
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
+#ifdef GGML_VULKAN_VALIDATE
+    bool portability_enumeration_ext = false;
+    // Check for portability enumeration extension for MoltenVK support
+    for (const auto& properties : instance_extensions) {
+        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
+            return true;
+        }
+    }
+    if (!portability_enumeration_ext) {
+        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
+    }
+#endif
+    return false;
+
+    UNUSED(instance_extensions);
+}
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
+#ifdef __APPLE__
+    bool portability_enumeration_ext = false;
+    // Check for portability enumeration extension for MoltenVK support
+    for (const auto& properties : instance_extensions) {
+        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
+            return true;
+        }
+    }
+    if (!portability_enumeration_ext) {
+        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
+    }
+#endif
+    return false;
+
+    UNUSED(instance_extensions);
+}
+
 // checks
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
```