llama_cpp 0.12.6 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/ext/llama_cpp/llama_cpp.cpp +90 -269
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +28 -23
- data/vendor/tmp/llama.cpp/Makefile +51 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -11
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +191 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +2472 -862
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +3176 -667
- data/vendor/tmp/llama.cpp/ggml-quants.h +77 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +373 -424
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +186 -102
- data/vendor/tmp/llama.cpp/ggml.c +1266 -699
- data/vendor/tmp/llama.cpp/ggml.h +59 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1517 -717
- data/vendor/tmp/llama.cpp/llama.h +87 -63
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
|
@@ -707,9 +707,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
|
|
|
707
707
|
q.cmd_buffer_idx = 0;
|
|
708
708
|
}
|
|
709
709
|
|
|
710
|
-
static
|
|
710
|
+
static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
|
|
711
|
+
for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
|
|
712
|
+
vk::MemoryType memory_type = mem_props->memoryTypes[i];
|
|
713
|
+
if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
|
|
714
|
+
(flags & memory_type.propertyFlags) == flags &&
|
|
715
|
+
mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
|
|
716
|
+
return static_cast<int32_t>(i);
|
|
717
|
+
}
|
|
718
|
+
}
|
|
719
|
+
return UINT32_MAX;
|
|
720
|
+
}
|
|
721
|
+
|
|
722
|
+
static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
|
|
711
723
|
#ifdef GGML_VULKAN_DEBUG
|
|
712
|
-
std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl;
|
|
724
|
+
std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
|
|
713
725
|
#endif
|
|
714
726
|
vk_buffer buf = std::make_shared<vk_buffer_struct>();
|
|
715
727
|
|
|
@@ -736,15 +748,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
|
|
736
748
|
|
|
737
749
|
uint32_t memory_type_index = UINT32_MAX;
|
|
738
750
|
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
751
|
+
memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
|
|
752
|
+
buf->memory_property_flags = req_flags;
|
|
753
|
+
|
|
754
|
+
if (memory_type_index == UINT32_MAX && fallback_flags) {
|
|
755
|
+
memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
|
|
756
|
+
buf->memory_property_flags = fallback_flags;
|
|
745
757
|
}
|
|
746
758
|
|
|
747
|
-
if (memory_type_index
|
|
759
|
+
if (memory_type_index == UINT32_MAX) {
|
|
748
760
|
ctx->device.lock()->device.destroyBuffer(buf->buffer);
|
|
749
761
|
buf->size = 0;
|
|
750
762
|
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
|
@@ -758,10 +770,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
|
|
758
770
|
buf->size = 0;
|
|
759
771
|
throw e;
|
|
760
772
|
}
|
|
761
|
-
buf->memory_property_flags = req_flags;
|
|
762
773
|
buf->ptr = nullptr;
|
|
763
774
|
|
|
764
|
-
if (
|
|
775
|
+
if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
|
765
776
|
buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
|
|
766
777
|
}
|
|
767
778
|
|
|
@@ -778,9 +789,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
|
|
778
789
|
return buf;
|
|
779
790
|
}
|
|
780
791
|
|
|
781
|
-
static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
|
|
792
|
+
static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
|
|
782
793
|
try {
|
|
783
|
-
return ggml_vk_create_buffer(ctx, size, req_flags);
|
|
794
|
+
return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
|
|
784
795
|
} catch (const vk::SystemError& e) {
|
|
785
796
|
std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
|
|
786
797
|
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
|
@@ -791,16 +802,16 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
|
|
|
791
802
|
static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
|
|
792
803
|
vk_buffer buf;
|
|
793
804
|
try {
|
|
794
|
-
buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
|
795
|
-
} catch (const vk::SystemError& e) {
|
|
796
805
|
if (ctx->device.lock()->uma) {
|
|
797
806
|
// Fall back to host memory type
|
|
798
|
-
buf =
|
|
807
|
+
buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
|
799
808
|
} else {
|
|
800
|
-
|
|
801
|
-
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
|
802
|
-
throw e;
|
|
809
|
+
buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
|
803
810
|
}
|
|
811
|
+
} catch (const vk::SystemError& e) {
|
|
812
|
+
std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
|
|
813
|
+
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
|
814
|
+
throw e;
|
|
804
815
|
}
|
|
805
816
|
|
|
806
817
|
return buf;
|
|
@@ -1080,6 +1091,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
|
|
1080
1091
|
}
|
|
1081
1092
|
}
|
|
1082
1093
|
|
|
1094
|
+
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
|
|
1095
|
+
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
|
|
1096
|
+
|
|
1083
1097
|
void ggml_vk_instance_init() {
|
|
1084
1098
|
if (vk_instance_initialized) {
|
|
1085
1099
|
return;
|
|
@@ -1089,28 +1103,48 @@ void ggml_vk_instance_init() {
|
|
|
1089
1103
|
#endif
|
|
1090
1104
|
|
|
1091
1105
|
vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
#
|
|
1096
|
-
|
|
1097
|
-
const std::vector<const char*> extensions = {
|
|
1098
|
-
#ifdef GGML_VULKAN_VALIDATE
|
|
1099
|
-
"VK_EXT_validation_features",
|
|
1106
|
+
|
|
1107
|
+
const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
|
|
1108
|
+
const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
|
|
1109
|
+
#ifdef __APPLE__
|
|
1110
|
+
const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
|
|
1100
1111
|
#endif
|
|
1101
|
-
};
|
|
1102
|
-
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
|
|
1103
|
-
#ifdef GGML_VULKAN_VALIDATE
|
|
1104
|
-
const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
|
|
1105
|
-
vk::ValidationFeaturesEXT validation_features = {
|
|
1106
|
-
features_enable,
|
|
1107
|
-
{},
|
|
1108
|
-
};
|
|
1109
|
-
validation_features.setPNext(nullptr);
|
|
1110
|
-
instance_create_info.setPNext(&validation_features);
|
|
1111
1112
|
|
|
1112
|
-
std::
|
|
1113
|
+
std::vector<const char*> layers;
|
|
1114
|
+
|
|
1115
|
+
if (validation_ext) {
|
|
1116
|
+
layers.push_back("VK_LAYER_KHRONOS_validation");
|
|
1117
|
+
}
|
|
1118
|
+
std::vector<const char*> extensions;
|
|
1119
|
+
if (validation_ext) {
|
|
1120
|
+
extensions.push_back("VK_EXT_validation_features");
|
|
1121
|
+
}
|
|
1122
|
+
#ifdef __APPLE__
|
|
1123
|
+
if (portability_enumeration_ext) {
|
|
1124
|
+
extensions.push_back("VK_KHR_portability_enumeration");
|
|
1125
|
+
}
|
|
1113
1126
|
#endif
|
|
1127
|
+
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
|
|
1128
|
+
#ifdef __APPLE__
|
|
1129
|
+
if (portability_enumeration_ext) {
|
|
1130
|
+
instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
|
|
1131
|
+
}
|
|
1132
|
+
#endif
|
|
1133
|
+
|
|
1134
|
+
std::vector<vk::ValidationFeatureEnableEXT> features_enable;
|
|
1135
|
+
vk::ValidationFeaturesEXT validation_features;
|
|
1136
|
+
|
|
1137
|
+
if (validation_ext) {
|
|
1138
|
+
features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
|
|
1139
|
+
validation_features = {
|
|
1140
|
+
features_enable,
|
|
1141
|
+
{},
|
|
1142
|
+
};
|
|
1143
|
+
validation_features.setPNext(nullptr);
|
|
1144
|
+
instance_create_info.setPNext(&validation_features);
|
|
1145
|
+
|
|
1146
|
+
std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
|
1147
|
+
}
|
|
1114
1148
|
vk_instance.instance = vk::createInstance(instance_create_info);
|
|
1115
1149
|
|
|
1116
1150
|
memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
|
|
@@ -1139,7 +1173,7 @@ void ggml_vk_instance_init() {
|
|
|
1139
1173
|
vk_instance_initialized = true;
|
|
1140
1174
|
}
|
|
1141
1175
|
|
|
1142
|
-
void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
|
1176
|
+
static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
|
1143
1177
|
GGML_ASSERT(idx < vk_instance.device_indices.size());
|
|
1144
1178
|
size_t dev_num = vk_instance.device_indices[idx];
|
|
1145
1179
|
#ifdef GGML_VULKAN_DEBUG
|
|
@@ -1157,12 +1191,12 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
|
|
1157
1191
|
vk_instance.devices[idx] = std::make_shared<vk_device>();
|
|
1158
1192
|
ctx->device = vk_instance.devices[idx];
|
|
1159
1193
|
ctx->device.lock()->physical_device = devices[dev_num];
|
|
1160
|
-
std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
|
|
1194
|
+
const std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
|
|
1161
1195
|
|
|
1162
1196
|
bool maintenance4_support = false;
|
|
1163
1197
|
|
|
1164
1198
|
// Check if maintenance4 is supported
|
|
1165
|
-
for (auto properties : ext_props) {
|
|
1199
|
+
for (const auto& properties : ext_props) {
|
|
1166
1200
|
if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
|
|
1167
1201
|
maintenance4_support = true;
|
|
1168
1202
|
}
|
|
@@ -1193,7 +1227,7 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
|
|
1193
1227
|
bool fp16_storage = false;
|
|
1194
1228
|
bool fp16_compute = false;
|
|
1195
1229
|
|
|
1196
|
-
for (auto properties : ext_props) {
|
|
1230
|
+
for (const auto& properties : ext_props) {
|
|
1197
1231
|
if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
|
|
1198
1232
|
fp16_storage = true;
|
|
1199
1233
|
} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
|
|
@@ -1422,7 +1456,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
|
|
|
1422
1456
|
#ifdef GGML_VULKAN_DEBUG
|
|
1423
1457
|
std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
|
|
1424
1458
|
#endif
|
|
1425
|
-
vk_buffer buf = ggml_vk_create_buffer(ctx, size,
|
|
1459
|
+
vk_buffer buf = ggml_vk_create_buffer(ctx, size,
|
|
1460
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
|
1461
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
|
1426
1462
|
|
|
1427
1463
|
if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
|
|
1428
1464
|
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
|
|
@@ -1568,7 +1604,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
|
|
|
1568
1604
|
static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
|
|
1569
1605
|
if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
|
|
1570
1606
|
ggml_vk_destroy_buffer(ctx->sync_staging);
|
|
1571
|
-
ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
|
|
1607
|
+
ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
|
|
1608
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
|
1609
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
|
1572
1610
|
}
|
|
1573
1611
|
}
|
|
1574
1612
|
|
|
@@ -2288,8 +2326,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
|
|
2288
2326
|
src1_uma = d_Qy != nullptr;
|
|
2289
2327
|
}
|
|
2290
2328
|
|
|
2291
|
-
const bool load_x = src0->backend !=
|
|
2292
|
-
const bool load_y = src1->backend !=
|
|
2329
|
+
const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
|
|
2330
|
+
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
|
2293
2331
|
|
|
2294
2332
|
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
|
2295
2333
|
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
|
@@ -2421,7 +2459,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
|
|
2421
2459
|
// compute
|
|
2422
2460
|
ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT
|
|
2423
2461
|
|
|
2424
|
-
if (dst->backend ==
|
|
2462
|
+
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
|
2425
2463
|
// copy dst to host
|
|
2426
2464
|
float * d = (float *) ((char *) dst->data);
|
|
2427
2465
|
ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
|
|
@@ -2474,8 +2512,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
|
|
2474
2512
|
src1_uma = d_Qy != nullptr;
|
|
2475
2513
|
}
|
|
2476
2514
|
|
|
2477
|
-
const bool load_x = src0->backend !=
|
|
2478
|
-
const bool load_y = src1->backend !=
|
|
2515
|
+
const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
|
|
2516
|
+
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
|
2479
2517
|
|
|
2480
2518
|
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
|
2481
2519
|
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
|
@@ -2598,7 +2636,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
|
|
2598
2636
|
ggml_vk_sync_buffers(subctx);
|
|
2599
2637
|
ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
|
|
2600
2638
|
|
|
2601
|
-
if (dst->backend ==
|
|
2639
|
+
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
|
2602
2640
|
// copy dst to host
|
|
2603
2641
|
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
|
2604
2642
|
ggml_vk_sync_buffers(subctx);
|
|
@@ -2615,7 +2653,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
|
2615
2653
|
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
|
|
2616
2654
|
#endif
|
|
2617
2655
|
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
|
2618
|
-
GGML_ASSERT(src0->backend ==
|
|
2656
|
+
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
|
2619
2657
|
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
|
|
2620
2658
|
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
|
|
2621
2659
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
@@ -2647,7 +2685,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
|
2647
2685
|
src1_uma = d_Qy != nullptr;
|
|
2648
2686
|
}
|
|
2649
2687
|
|
|
2650
|
-
const bool load_y = src1->backend !=
|
|
2688
|
+
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
|
2651
2689
|
|
|
2652
2690
|
const uint64_t x_ne = ne00 * ne01 * ne02;
|
|
2653
2691
|
const uint64_t y_ne = ne10 * ne11 * ne12;
|
|
@@ -2689,7 +2727,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
|
2689
2727
|
ggml_vk_sync_buffers(subctx);
|
|
2690
2728
|
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
|
|
2691
2729
|
|
|
2692
|
-
if (dst->backend ==
|
|
2730
|
+
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
|
2693
2731
|
// copy dst to host
|
|
2694
2732
|
float * d = (float *) dst->data;
|
|
2695
2733
|
ggml_vk_sync_buffers(subctx);
|
|
@@ -2706,7 +2744,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
|
2706
2744
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
|
2707
2745
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
|
2708
2746
|
GGML_ASSERT(!ggml_is_permuted(src0));
|
|
2709
|
-
GGML_ASSERT(src0->backend ==
|
|
2747
|
+
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
|
2710
2748
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
2711
2749
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
2712
2750
|
|
|
@@ -2739,7 +2777,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
|
2739
2777
|
src1_uma = d_Qy != nullptr;
|
|
2740
2778
|
}
|
|
2741
2779
|
|
|
2742
|
-
const bool load_y = src1->backend !=
|
|
2780
|
+
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
|
2743
2781
|
|
|
2744
2782
|
const uint64_t d_ne = ne01 * ne11 * ne12;
|
|
2745
2783
|
|
|
@@ -2782,7 +2820,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
|
2782
2820
|
ggml_vk_sync_buffers(subctx);
|
|
2783
2821
|
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
|
|
2784
2822
|
|
|
2785
|
-
if (dst->backend ==
|
|
2823
|
+
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
|
2786
2824
|
// copy dst to host
|
|
2787
2825
|
float * d = (float *) dst->data;
|
|
2788
2826
|
ggml_vk_sync_buffers(subctx);
|
|
@@ -2800,7 +2838,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr
|
|
|
2800
2838
|
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
|
2801
2839
|
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
|
|
2802
2840
|
dst->type == GGML_TYPE_F32 &&
|
|
2803
|
-
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend ==
|
|
2841
|
+
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
|
|
2804
2842
|
}
|
|
2805
2843
|
|
|
2806
2844
|
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
|
@@ -2848,8 +2886,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
|
|
|
2848
2886
|
// TODO: support for transposed / permuted tensors
|
|
2849
2887
|
GGML_ASSERT(nb0 == sizeof(float));
|
|
2850
2888
|
GGML_ASSERT(nb00 == sizeof(float));
|
|
2851
|
-
GGML_ASSERT(src0->backend ==
|
|
2852
|
-
GGML_ASSERT(dst->backend ==
|
|
2889
|
+
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
|
2890
|
+
GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
|
|
2853
2891
|
|
|
2854
2892
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
|
2855
2893
|
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
|
@@ -3078,8 +3116,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
|
3078
3116
|
}
|
|
3079
3117
|
}
|
|
3080
3118
|
|
|
3081
|
-
const bool transfer_src0 = src0->backend !=
|
|
3082
|
-
const bool transfer_src1 = use_src1 && src1->backend !=
|
|
3119
|
+
const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
|
|
3120
|
+
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
|
3083
3121
|
|
|
3084
3122
|
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
|
|
3085
3123
|
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
|
@@ -3088,7 +3126,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
|
3088
3126
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
|
3089
3127
|
|
|
3090
3128
|
// Workaround for tiny tensor inputs on ROPE
|
|
3091
|
-
if (use_src1 && src1->backend ==
|
|
3129
|
+
if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
|
|
3092
3130
|
y_sz = VK_WHOLE_SIZE;
|
|
3093
3131
|
}
|
|
3094
3132
|
|
|
@@ -3177,9 +3215,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
|
3177
3215
|
ggml_vk_sync_buffers(subctx);
|
|
3178
3216
|
ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
|
3179
3217
|
}
|
|
3180
|
-
if (dst->backend ==
|
|
3218
|
+
if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
|
|
3181
3219
|
ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
|
|
3182
|
-
} else if(dst->backend ==
|
|
3220
|
+
} else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
|
|
3183
3221
|
// copy dst to host
|
|
3184
3222
|
float * d = (float *) dst->data;
|
|
3185
3223
|
ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
|
|
@@ -3221,7 +3259,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
|
3221
3259
|
ggml_vk_sync_buffers(subctx);
|
|
3222
3260
|
ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
|
|
3223
3261
|
}
|
|
3224
|
-
if (dst->backend ==
|
|
3262
|
+
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
|
3225
3263
|
// copy dst to host
|
|
3226
3264
|
ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
|
|
3227
3265
|
}
|
|
@@ -3327,7 +3365,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
|
|
|
3327
3365
|
|
|
3328
3366
|
static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
3329
3367
|
// If backend is CPU, data from src0 has to be copied off the device
|
|
3330
|
-
if (dst->backend ==
|
|
3368
|
+
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
|
3331
3369
|
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
|
3332
3370
|
vk_buffer d_D = extra_src0->buffer_gpu.lock();
|
|
3333
3371
|
ggml_vk_sync_buffers(subctx);
|
|
@@ -3962,9 +4000,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
|
|
|
3962
4000
|
#ifdef GGML_VULKAN_DEBUG
|
|
3963
4001
|
std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
|
|
3964
4002
|
#endif
|
|
3965
|
-
const bool any_on_device = node->backend ==
|
|
3966
|
-
|| (node->src[0] != nullptr && (node->src[0]->backend ==
|
|
3967
|
-
|| (node->src[1] != nullptr && (node->src[1]->backend ==
|
|
4003
|
+
const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
|
|
4004
|
+
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
|
4005
|
+
|| (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
|
|
3968
4006
|
|
|
3969
4007
|
if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) {
|
|
3970
4008
|
return;
|
|
@@ -4082,7 +4120,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
|
|
4082
4120
|
std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
|
|
4083
4121
|
#endif
|
|
4084
4122
|
#if defined(GGML_VULKAN_RUN_TESTS)
|
|
4085
|
-
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
|
|
4123
|
+
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
|
|
4124
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached
|
|
4125
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
|
4086
4126
|
ggml_vk_test_transfer(ctx, 8192 * 1000, false);
|
|
4087
4127
|
ggml_vk_test_transfer(ctx, 8192 * 1000, true);
|
|
4088
4128
|
|
|
@@ -4174,14 +4214,16 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
|
|
4174
4214
|
if (ctx->staging != nullptr) {
|
|
4175
4215
|
ggml_vk_destroy_buffer(ctx->staging);
|
|
4176
4216
|
}
|
|
4177
|
-
ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
|
|
4217
|
+
ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
|
|
4218
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
|
4219
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
|
4178
4220
|
}
|
|
4179
4221
|
}
|
|
4180
4222
|
|
|
4181
4223
|
static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
|
|
4182
|
-
const bool any_on_device = node->backend ==
|
|
4183
|
-
|| (node->src[0] != nullptr && (node->src[0]->backend ==
|
|
4184
|
-
|| (node->src[1] != nullptr && node->src[1]->backend ==
|
|
4224
|
+
const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
|
|
4225
|
+
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
|
4226
|
+
|| (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
|
|
4185
4227
|
|
|
4186
4228
|
if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
|
|
4187
4229
|
return;
|
|
@@ -4335,7 +4377,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
4335
4377
|
last_node = true;
|
|
4336
4378
|
#endif
|
|
4337
4379
|
|
|
4338
|
-
if (node->backend ==
|
|
4380
|
+
if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
|
|
4339
4381
|
ggml_vk_ctx_end(ctx->compute_ctx);
|
|
4340
4382
|
ctx->compute_ctx->exit_tensor = node;
|
|
4341
4383
|
ctx->compute_ctx = nullptr;
|
|
@@ -4343,9 +4385,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
4343
4385
|
}
|
|
4344
4386
|
|
|
4345
4387
|
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
|
|
4346
|
-
const bool any_on_device = tensor->backend ==
|
|
4347
|
-
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend ==
|
|
4348
|
-
|| (tensor->src[1] != nullptr && tensor->src[1]->backend ==
|
|
4388
|
+
const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
|
|
4389
|
+
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
|
4390
|
+
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
|
|
4349
4391
|
|
|
4350
4392
|
if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) {
|
|
4351
4393
|
return false;
|
|
@@ -4406,7 +4448,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
4406
4448
|
if (params->ith != 0) {
|
|
4407
4449
|
return true;
|
|
4408
4450
|
}
|
|
4409
|
-
if (params->type ==
|
|
4451
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
4410
4452
|
return true;
|
|
4411
4453
|
}
|
|
4412
4454
|
|
|
@@ -4537,13 +4579,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
|
|
4537
4579
|
}
|
|
4538
4580
|
}
|
|
4539
4581
|
|
|
4540
|
-
GGML_CALL int ggml_vk_get_device_count() {
|
|
4582
|
+
GGML_CALL static int ggml_vk_get_device_count() {
|
|
4541
4583
|
ggml_vk_instance_init();
|
|
4542
4584
|
|
|
4543
4585
|
return vk_instance.device_indices.size();
|
|
4544
4586
|
}
|
|
4545
4587
|
|
|
4546
|
-
GGML_CALL void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
|
|
4588
|
+
GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
|
|
4547
4589
|
ggml_vk_instance_init();
|
|
4548
4590
|
|
|
4549
4591
|
std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
|
|
@@ -4561,7 +4603,7 @@ void ggml_vk_init_cpu_assist() {
|
|
|
4561
4603
|
|
|
4562
4604
|
std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
|
|
4563
4605
|
|
|
4564
|
-
for (
|
|
4606
|
+
for (int i = 0; i < ggml_vk_get_device_count(); i++) {
|
|
4565
4607
|
ggml_vk_print_gpu_info(i);
|
|
4566
4608
|
}
|
|
4567
4609
|
// Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
|
|
@@ -4709,7 +4751,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
|
|
|
4709
4751
|
extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
|
|
4710
4752
|
}
|
|
4711
4753
|
|
|
4712
|
-
tensor->backend =
|
|
4754
|
+
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
|
4713
4755
|
tensor->extra = extra;
|
|
4714
4756
|
}
|
|
4715
4757
|
|
|
@@ -4717,7 +4759,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
|
|
|
4717
4759
|
#ifdef GGML_VULKAN_DEBUG
|
|
4718
4760
|
std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
|
|
4719
4761
|
#endif
|
|
4720
|
-
GGML_ASSERT(tensor->backend ==
|
|
4762
|
+
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
|
4721
4763
|
|
|
4722
4764
|
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
|
4723
4765
|
|
|
@@ -4732,7 +4774,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
|
|
|
4732
4774
|
#ifdef GGML_VULKAN_DEBUG
|
|
4733
4775
|
std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
|
|
4734
4776
|
#endif
|
|
4735
|
-
GGML_ASSERT(tensor->backend ==
|
|
4777
|
+
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
|
4736
4778
|
|
|
4737
4779
|
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
|
4738
4780
|
|
|
@@ -4963,7 +5005,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
|
|
|
4963
5005
|
#endif
|
|
4964
5006
|
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
|
4965
5007
|
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
|
4966
|
-
GGML_ASSERT(tensor->backend ==
|
|
5008
|
+
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
|
4967
5009
|
|
|
4968
5010
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
|
4969
5011
|
|
|
@@ -4984,7 +5026,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
|
|
|
4984
5026
|
#endif
|
|
4985
5027
|
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
|
4986
5028
|
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
|
4987
|
-
GGML_ASSERT(tensor->backend ==
|
|
5029
|
+
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
|
4988
5030
|
|
|
4989
5031
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
|
4990
5032
|
|
|
@@ -5061,7 +5103,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
|
|
|
5061
5103
|
int last_node = cgraph->n_nodes - 1;
|
|
5062
5104
|
|
|
5063
5105
|
// If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
|
|
5064
|
-
while (last_node > 0 && cgraph->nodes[last_node]->backend !=
|
|
5106
|
+
while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) {
|
|
5065
5107
|
last_node -= 1;
|
|
5066
5108
|
}
|
|
5067
5109
|
|
|
@@ -5070,7 +5112,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
|
|
|
5070
5112
|
}
|
|
5071
5113
|
|
|
5072
5114
|
ggml_compute_params params = {};
|
|
5073
|
-
params.type =
|
|
5115
|
+
params.type = GGML_TASK_TYPE_COMPUTE;
|
|
5074
5116
|
params.ith = 0;
|
|
5075
5117
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
5076
5118
|
ggml_tensor * node = cgraph->nodes[i];
|
|
@@ -5208,6 +5250,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
|
|
|
5208
5250
|
/* .supports_op = */ ggml_backend_vk_supports_op,
|
|
5209
5251
|
};
|
|
5210
5252
|
|
|
5253
|
+
static ggml_guid_t ggml_backend_vk_guid() {
|
|
5254
|
+
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
|
|
5255
|
+
return &guid;
|
|
5256
|
+
}
|
|
5257
|
+
|
|
5211
5258
|
GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
|
|
5212
5259
|
if (vk_instance.initialized[idx]) {
|
|
5213
5260
|
return vk_instance.backends[idx];
|
|
@@ -5226,6 +5273,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
|
|
|
5226
5273
|
vk_instance.initialized[idx] = true;
|
|
5227
5274
|
|
|
5228
5275
|
ggml_backend_t vk_backend = new ggml_backend {
|
|
5276
|
+
/* .guid = */ ggml_backend_vk_guid(),
|
|
5229
5277
|
/* .interface = */ ggml_backend_vk_interface,
|
|
5230
5278
|
/* .context = */ &vk_instance.contexts[ctx->idx],
|
|
5231
5279
|
};
|
|
@@ -5236,7 +5284,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
|
|
|
5236
5284
|
}
|
|
5237
5285
|
|
|
5238
5286
|
GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) {
|
|
5239
|
-
return backend && backend->
|
|
5287
|
+
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
|
|
5240
5288
|
}
|
|
5241
5289
|
|
|
5242
5290
|
GGML_CALL int ggml_backend_vk_get_device_count() {
|
|
@@ -5248,7 +5296,7 @@ GGML_CALL void ggml_backend_vk_get_device_description(int device, char * descrip
|
|
|
5248
5296
|
}
|
|
5249
5297
|
|
|
5250
5298
|
GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
|
5251
|
-
GGML_ASSERT(device < vk_instance.device_indices.size());
|
|
5299
|
+
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
|
5252
5300
|
|
|
5253
5301
|
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
|
5254
5302
|
|
|
@@ -5282,6 +5330,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
|
|
|
5282
5330
|
return vk_instance.device_indices.size();
|
|
5283
5331
|
}
|
|
5284
5332
|
|
|
5333
|
+
// Extension availability
|
|
5334
|
+
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
|
|
5335
|
+
#ifdef GGML_VULKAN_VALIDATE
|
|
5336
|
+
bool portability_enumeration_ext = false;
|
|
5337
|
+
// Check for portability enumeration extension for MoltenVK support
|
|
5338
|
+
for (const auto& properties : instance_extensions) {
|
|
5339
|
+
if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
|
|
5340
|
+
return true;
|
|
5341
|
+
}
|
|
5342
|
+
}
|
|
5343
|
+
if (!portability_enumeration_ext) {
|
|
5344
|
+
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
|
|
5345
|
+
}
|
|
5346
|
+
#endif
|
|
5347
|
+
return false;
|
|
5348
|
+
|
|
5349
|
+
UNUSED(instance_extensions);
|
|
5350
|
+
}
|
|
5351
|
+
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
|
|
5352
|
+
#ifdef __APPLE__
|
|
5353
|
+
bool portability_enumeration_ext = false;
|
|
5354
|
+
// Check for portability enumeration extension for MoltenVK support
|
|
5355
|
+
for (const auto& properties : instance_extensions) {
|
|
5356
|
+
if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
|
|
5357
|
+
return true;
|
|
5358
|
+
}
|
|
5359
|
+
}
|
|
5360
|
+
if (!portability_enumeration_ext) {
|
|
5361
|
+
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
|
|
5362
|
+
}
|
|
5363
|
+
#endif
|
|
5364
|
+
return false;
|
|
5365
|
+
|
|
5366
|
+
UNUSED(instance_extensions);
|
|
5367
|
+
}
|
|
5368
|
+
|
|
5285
5369
|
// checks
|
|
5286
5370
|
|
|
5287
5371
|
#ifdef GGML_VULKAN_CHECK_RESULTS
|
|
@@ -5338,7 +5422,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
|
|
|
5338
5422
|
static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
|
|
5339
5423
|
void * tensor_data = tensor->data;
|
|
5340
5424
|
|
|
5341
|
-
if (tensor->backend ==
|
|
5425
|
+
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
|
5342
5426
|
const size_t tensor_size = ggml_nbytes(tensor);
|
|
5343
5427
|
tensor_data = malloc(tensor_size);
|
|
5344
5428
|
|
|
@@ -5364,14 +5448,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
|
|
|
5364
5448
|
std::vector<const ggml_tensor *> done;
|
|
5365
5449
|
ggml_vk_print_graph_origin(tensor, done);
|
|
5366
5450
|
|
|
5367
|
-
if (tensor->backend ==
|
|
5451
|
+
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
|
5368
5452
|
free(tensor_data);
|
|
5369
5453
|
}
|
|
5370
5454
|
}
|
|
5371
5455
|
|
|
5372
5456
|
static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
|
|
5373
5457
|
return;
|
|
5374
|
-
GGML_ASSERT(tensor->backend ==
|
|
5458
|
+
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
|
|
5375
5459
|
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
|
5376
5460
|
return;
|
|
5377
5461
|
}
|
|
@@ -5409,7 +5493,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
5409
5493
|
if (params->ith != 0) {
|
|
5410
5494
|
return;
|
|
5411
5495
|
}
|
|
5412
|
-
if (params->type ==
|
|
5496
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
|
|
5413
5497
|
return;
|
|
5414
5498
|
}
|
|
5415
5499
|
|
|
@@ -5446,10 +5530,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
5446
5530
|
|
|
5447
5531
|
src0_buffer = malloc(src0_size);
|
|
5448
5532
|
src0_clone->data = src0_buffer;
|
|
5449
|
-
if (src0->backend ==
|
|
5533
|
+
if (src0->backend == GGML_BACKEND_TYPE_CPU) {
|
|
5450
5534
|
memcpy(src0_clone->data, src0->data, src0_size);
|
|
5451
5535
|
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
|
5452
|
-
} else if (src0->backend ==
|
|
5536
|
+
} else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
|
|
5453
5537
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
|
|
5454
5538
|
uint64_t offset = extra->offset;
|
|
5455
5539
|
if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
|
|
@@ -5489,10 +5573,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
5489
5573
|
|
|
5490
5574
|
src1_buffer = malloc(src1_size);
|
|
5491
5575
|
src1_clone->data = src1_buffer;
|
|
5492
|
-
if (src1->backend ==
|
|
5576
|
+
if (src1->backend == GGML_BACKEND_TYPE_CPU) {
|
|
5493
5577
|
memcpy(src1_clone->data, src1->data, src1_size);
|
|
5494
5578
|
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
|
5495
|
-
} else if (src1->backend ==
|
|
5579
|
+
} else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
|
5496
5580
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
|
|
5497
5581
|
uint64_t offset = extra->offset;
|
|
5498
5582
|
if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
|
|
@@ -5651,7 +5735,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
5651
5735
|
if (params->ith != 0) {
|
|
5652
5736
|
return;
|
|
5653
5737
|
}
|
|
5654
|
-
if (params->type ==
|
|
5738
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
|
|
5655
5739
|
return;
|
|
5656
5740
|
}
|
|
5657
5741
|
if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
|
|
@@ -5663,7 +5747,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
5663
5747
|
|
|
5664
5748
|
void * tensor_data = tensor->data;
|
|
5665
5749
|
|
|
5666
|
-
if (tensor->backend ==
|
|
5750
|
+
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
|
5667
5751
|
size_t tensor_size = ggml_nbytes(tensor);
|
|
5668
5752
|
tensor_data = malloc(tensor_size);
|
|
5669
5753
|
|
|
@@ -5796,7 +5880,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
5796
5880
|
comp_result = nullptr;
|
|
5797
5881
|
comp_size = 0;
|
|
5798
5882
|
|
|
5799
|
-
if (tensor->backend ==
|
|
5883
|
+
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
|
5800
5884
|
free(tensor_data);
|
|
5801
5885
|
}
|
|
5802
5886
|
}
|