llama_cpp 0.12.5 → 0.12.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +67 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +51 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +595 -492
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +268 -271
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +101 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +1255 -94
- data/vendor/tmp/llama.cpp/ggml-quants.h +39 -16
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +95 -264
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +213 -58
- data/vendor/tmp/llama.cpp/ggml.c +1082 -564
- data/vendor/tmp/llama.cpp/ggml.h +50 -17
- data/vendor/tmp/llama.cpp/llama.cpp +1329 -280
- data/vendor/tmp/llama.cpp/llama.h +43 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
@@ -27,6 +27,7 @@
|
|
27
27
|
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
|
28
28
|
|
29
29
|
#define VK_VENDOR_ID_AMD 0x1002
|
30
|
+
#define VK_VENDOR_ID_APPLE 0x106b
|
30
31
|
#define VK_VENDOR_ID_INTEL 0x8086
|
31
32
|
#define VK_VENDOR_ID_NVIDIA 0x10de
|
32
33
|
|
@@ -706,9 +707,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
|
|
706
707
|
q.cmd_buffer_idx = 0;
|
707
708
|
}
|
708
709
|
|
709
|
-
static
|
710
|
+
static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
|
711
|
+
for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
|
712
|
+
vk::MemoryType memory_type = mem_props->memoryTypes[i];
|
713
|
+
if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
|
714
|
+
(flags & memory_type.propertyFlags) == flags &&
|
715
|
+
mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
|
716
|
+
return static_cast<int32_t>(i);
|
717
|
+
}
|
718
|
+
}
|
719
|
+
return UINT32_MAX;
|
720
|
+
}
|
721
|
+
|
722
|
+
static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
|
710
723
|
#ifdef GGML_VULKAN_DEBUG
|
711
|
-
std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl;
|
724
|
+
std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
|
712
725
|
#endif
|
713
726
|
vk_buffer buf = std::make_shared<vk_buffer_struct>();
|
714
727
|
|
@@ -735,15 +748,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
|
735
748
|
|
736
749
|
uint32_t memory_type_index = UINT32_MAX;
|
737
750
|
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
751
|
+
memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
|
752
|
+
buf->memory_property_flags = req_flags;
|
753
|
+
|
754
|
+
if (memory_type_index == UINT32_MAX && fallback_flags) {
|
755
|
+
memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
|
756
|
+
buf->memory_property_flags = fallback_flags;
|
744
757
|
}
|
745
758
|
|
746
|
-
if (memory_type_index
|
759
|
+
if (memory_type_index == UINT32_MAX) {
|
747
760
|
ctx->device.lock()->device.destroyBuffer(buf->buffer);
|
748
761
|
buf->size = 0;
|
749
762
|
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
@@ -757,10 +770,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
|
757
770
|
buf->size = 0;
|
758
771
|
throw e;
|
759
772
|
}
|
760
|
-
buf->memory_property_flags = req_flags;
|
761
773
|
buf->ptr = nullptr;
|
762
774
|
|
763
|
-
if (
|
775
|
+
if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
764
776
|
buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
|
765
777
|
}
|
766
778
|
|
@@ -777,9 +789,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
|
777
789
|
return buf;
|
778
790
|
}
|
779
791
|
|
780
|
-
static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
|
792
|
+
static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
|
781
793
|
try {
|
782
|
-
return ggml_vk_create_buffer(ctx, size, req_flags);
|
794
|
+
return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
|
783
795
|
} catch (const vk::SystemError& e) {
|
784
796
|
std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
|
785
797
|
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
@@ -790,16 +802,16 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
|
|
790
802
|
static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
|
791
803
|
vk_buffer buf;
|
792
804
|
try {
|
793
|
-
buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
794
|
-
} catch (const vk::SystemError& e) {
|
795
805
|
if (ctx->device.lock()->uma) {
|
796
806
|
// Fall back to host memory type
|
797
|
-
buf =
|
807
|
+
buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
798
808
|
} else {
|
799
|
-
|
800
|
-
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
801
|
-
throw e;
|
809
|
+
buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
802
810
|
}
|
811
|
+
} catch (const vk::SystemError& e) {
|
812
|
+
std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
|
813
|
+
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
814
|
+
throw e;
|
803
815
|
}
|
804
816
|
|
805
817
|
return buf;
|
@@ -1079,6 +1091,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
|
1079
1091
|
}
|
1080
1092
|
}
|
1081
1093
|
|
1094
|
+
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
|
1095
|
+
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
|
1096
|
+
|
1082
1097
|
void ggml_vk_instance_init() {
|
1083
1098
|
if (vk_instance_initialized) {
|
1084
1099
|
return;
|
@@ -1088,28 +1103,42 @@ void ggml_vk_instance_init() {
|
|
1088
1103
|
#endif
|
1089
1104
|
|
1090
1105
|
vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
|
1091
|
-
const std::vector<const char*> layers = {
|
1092
|
-
#ifdef GGML_VULKAN_VALIDATE
|
1093
|
-
"VK_LAYER_KHRONOS_validation",
|
1094
|
-
#endif
|
1095
|
-
};
|
1096
|
-
const std::vector<const char*> extensions = {
|
1097
|
-
#ifdef GGML_VULKAN_VALIDATE
|
1098
|
-
"VK_EXT_validation_features",
|
1099
|
-
#endif
|
1100
|
-
};
|
1101
|
-
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
|
1102
|
-
#ifdef GGML_VULKAN_VALIDATE
|
1103
|
-
const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
|
1104
|
-
vk::ValidationFeaturesEXT validation_features = {
|
1105
|
-
features_enable,
|
1106
|
-
{},
|
1107
|
-
};
|
1108
|
-
validation_features.setPNext(nullptr);
|
1109
|
-
instance_create_info.setPNext(&validation_features);
|
1110
1106
|
|
1111
|
-
std::
|
1112
|
-
|
1107
|
+
const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
|
1108
|
+
const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
|
1109
|
+
const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
|
1110
|
+
|
1111
|
+
std::vector<const char*> layers;
|
1112
|
+
|
1113
|
+
if (validation_ext) {
|
1114
|
+
layers.push_back("VK_LAYER_KHRONOS_validation");
|
1115
|
+
}
|
1116
|
+
std::vector<const char*> extensions;
|
1117
|
+
if (validation_ext) {
|
1118
|
+
extensions.push_back("VK_EXT_validation_features");
|
1119
|
+
}
|
1120
|
+
if (portability_enumeration_ext) {
|
1121
|
+
extensions.push_back("VK_KHR_portability_enumeration");
|
1122
|
+
}
|
1123
|
+
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
|
1124
|
+
if (portability_enumeration_ext) {
|
1125
|
+
instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
|
1126
|
+
}
|
1127
|
+
|
1128
|
+
std::vector<vk::ValidationFeatureEnableEXT> features_enable;
|
1129
|
+
vk::ValidationFeaturesEXT validation_features;
|
1130
|
+
|
1131
|
+
if (validation_ext) {
|
1132
|
+
features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
|
1133
|
+
validation_features = {
|
1134
|
+
features_enable,
|
1135
|
+
{},
|
1136
|
+
};
|
1137
|
+
validation_features.setPNext(nullptr);
|
1138
|
+
instance_create_info.setPNext(&validation_features);
|
1139
|
+
|
1140
|
+
std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
1141
|
+
}
|
1113
1142
|
vk_instance.instance = vk::createInstance(instance_create_info);
|
1114
1143
|
|
1115
1144
|
memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
|
@@ -1138,7 +1167,7 @@ void ggml_vk_instance_init() {
|
|
1138
1167
|
vk_instance_initialized = true;
|
1139
1168
|
}
|
1140
1169
|
|
1141
|
-
void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
1170
|
+
static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
1142
1171
|
GGML_ASSERT(idx < vk_instance.device_indices.size());
|
1143
1172
|
size_t dev_num = vk_instance.device_indices[idx];
|
1144
1173
|
#ifdef GGML_VULKAN_DEBUG
|
@@ -1156,12 +1185,12 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
|
1156
1185
|
vk_instance.devices[idx] = std::make_shared<vk_device>();
|
1157
1186
|
ctx->device = vk_instance.devices[idx];
|
1158
1187
|
ctx->device.lock()->physical_device = devices[dev_num];
|
1159
|
-
std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
|
1188
|
+
const std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
|
1160
1189
|
|
1161
1190
|
bool maintenance4_support = false;
|
1162
1191
|
|
1163
1192
|
// Check if maintenance4 is supported
|
1164
|
-
for (auto properties : ext_props) {
|
1193
|
+
for (const auto& properties : ext_props) {
|
1165
1194
|
if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
|
1166
1195
|
maintenance4_support = true;
|
1167
1196
|
}
|
@@ -1192,7 +1221,7 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
|
1192
1221
|
bool fp16_storage = false;
|
1193
1222
|
bool fp16_compute = false;
|
1194
1223
|
|
1195
|
-
for (auto properties : ext_props) {
|
1224
|
+
for (const auto& properties : ext_props) {
|
1196
1225
|
if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
|
1197
1226
|
fp16_storage = true;
|
1198
1227
|
} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
|
@@ -1421,7 +1450,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
|
|
1421
1450
|
#ifdef GGML_VULKAN_DEBUG
|
1422
1451
|
std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
|
1423
1452
|
#endif
|
1424
|
-
vk_buffer buf = ggml_vk_create_buffer(ctx, size,
|
1453
|
+
vk_buffer buf = ggml_vk_create_buffer(ctx, size,
|
1454
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
1455
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
1425
1456
|
|
1426
1457
|
if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
|
1427
1458
|
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
|
@@ -1567,7 +1598,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
|
|
1567
1598
|
static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
|
1568
1599
|
if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
|
1569
1600
|
ggml_vk_destroy_buffer(ctx->sync_staging);
|
1570
|
-
ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
|
1601
|
+
ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
|
1602
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
1603
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
1571
1604
|
}
|
1572
1605
|
}
|
1573
1606
|
|
@@ -2034,18 +2067,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
|
|
2034
2067
|
return ctx->pipeline_matmul_f32_aligned_l.align;
|
2035
2068
|
}
|
2036
2069
|
|
2070
|
+
static vk_pipeline* ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
|
2071
|
+
if (bit16_x && bit16_y) {
|
2072
|
+
if (m <= 32 || n <= 32) {
|
2073
|
+
#ifdef GGML_VULKAN_DEBUG
|
2074
|
+
std::cerr << " S" << std::endl;
|
2075
|
+
#endif
|
2076
|
+
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
2077
|
+
}
|
2078
|
+
#ifdef GGML_VULKAN_DEBUG
|
2079
|
+
std::cerr << " M" << std::endl;
|
2080
|
+
#endif
|
2081
|
+
return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
|
2082
|
+
}
|
2083
|
+
if (bit16_x && !bit16_y) {
|
2084
|
+
if (m <= 32 || n <= 32) {
|
2085
|
+
#ifdef GGML_VULKAN_DEBUG
|
2086
|
+
std::cerr << " S" << std::endl;
|
2087
|
+
#endif
|
2088
|
+
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
2089
|
+
}
|
2090
|
+
#ifdef GGML_VULKAN_DEBUG
|
2091
|
+
std::cerr << " M" << std::endl;
|
2092
|
+
#endif
|
2093
|
+
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
|
2094
|
+
}
|
2095
|
+
if (!bit16_x && bit16_y) {
|
2096
|
+
GGML_ASSERT(false);
|
2097
|
+
}
|
2098
|
+
|
2099
|
+
if (m <= 32 || n <= 32) {
|
2100
|
+
#ifdef GGML_VULKAN_DEBUG
|
2101
|
+
std::cerr << " S" << std::endl;
|
2102
|
+
#endif
|
2103
|
+
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
2104
|
+
}
|
2105
|
+
#ifdef GGML_VULKAN_DEBUG
|
2106
|
+
std::cerr << " M" << std::endl;
|
2107
|
+
#endif
|
2108
|
+
return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
|
2109
|
+
}
|
2110
|
+
|
2111
|
+
static vk_pipeline* ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
|
2112
|
+
#ifdef GGML_VULKAN_DEBUG
|
2113
|
+
std::cerr << " M" << std::endl;
|
2114
|
+
#endif
|
2115
|
+
if (bit16_x && bit16_y) {
|
2116
|
+
return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
|
2117
|
+
}
|
2118
|
+
if (bit16_x && !bit16_y) {
|
2119
|
+
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
|
2120
|
+
}
|
2121
|
+
if (!bit16_x && bit16_y) {
|
2122
|
+
GGML_ASSERT(false);
|
2123
|
+
}
|
2124
|
+
return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
|
2125
|
+
}
|
2126
|
+
|
2127
|
+
static vk_pipeline* ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
|
2128
|
+
#ifdef GGML_VULKAN_DEBUG
|
2129
|
+
std::cerr << " S" << std::endl;
|
2130
|
+
#endif
|
2131
|
+
if (bit16_x && bit16_y) {
|
2132
|
+
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
2133
|
+
}
|
2134
|
+
if (bit16_x && !bit16_y) {
|
2135
|
+
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
2136
|
+
}
|
2137
|
+
if (!bit16_x && bit16_y) {
|
2138
|
+
GGML_ASSERT(false);
|
2139
|
+
}
|
2140
|
+
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
2141
|
+
}
|
2142
|
+
|
2037
2143
|
static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
|
2038
2144
|
#ifdef GGML_VULKAN_DEBUG
|
2039
2145
|
std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
|
2040
2146
|
#endif
|
2147
|
+
switch (ctx->device.lock()->vendor_id) {
|
2148
|
+
case VK_VENDOR_ID_AMD:
|
2149
|
+
return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned);
|
2150
|
+
case VK_VENDOR_ID_APPLE:
|
2151
|
+
return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned);
|
2152
|
+
case VK_VENDOR_ID_INTEL:
|
2153
|
+
return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned);
|
2154
|
+
}
|
2155
|
+
|
2041
2156
|
if (bit16_x && bit16_y) {
|
2042
|
-
if (
|
2157
|
+
if (m <= 32 || n <= 32) {
|
2043
2158
|
#ifdef GGML_VULKAN_DEBUG
|
2044
2159
|
std::cerr << " S" << std::endl;
|
2045
2160
|
#endif
|
2046
2161
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
2047
2162
|
}
|
2048
|
-
if (
|
2163
|
+
if (m <= 64 || n <= 64) {
|
2049
2164
|
#ifdef GGML_VULKAN_DEBUG
|
2050
2165
|
std::cerr << " M" << std::endl;
|
2051
2166
|
#endif
|
@@ -2057,13 +2172,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
|
|
2057
2172
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
|
2058
2173
|
}
|
2059
2174
|
if (bit16_x && !bit16_y) {
|
2060
|
-
if (
|
2175
|
+
if (m <= 32 || n <= 32) {
|
2061
2176
|
#ifdef GGML_VULKAN_DEBUG
|
2062
2177
|
std::cerr << " S" << std::endl;
|
2063
2178
|
#endif
|
2064
2179
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
2065
2180
|
}
|
2066
|
-
if (
|
2181
|
+
if (m <= 64 || n <= 64) {
|
2067
2182
|
#ifdef GGML_VULKAN_DEBUG
|
2068
2183
|
std::cerr << " M" << std::endl;
|
2069
2184
|
#endif
|
@@ -2078,13 +2193,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
|
|
2078
2193
|
GGML_ASSERT(false);
|
2079
2194
|
}
|
2080
2195
|
|
2081
|
-
if (
|
2196
|
+
if (m <= 32 || n <= 32) {
|
2082
2197
|
#ifdef GGML_VULKAN_DEBUG
|
2083
2198
|
std::cerr << " S" << std::endl;
|
2084
2199
|
#endif
|
2085
2200
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
2086
2201
|
}
|
2087
|
-
if (
|
2202
|
+
if (m <= 64 || n <= 64) {
|
2088
2203
|
#ifdef GGML_VULKAN_DEBUG
|
2089
2204
|
std::cerr << " M" << std::endl;
|
2090
2205
|
#endif
|
@@ -3999,7 +4114,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
|
3999
4114
|
std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
|
4000
4115
|
#endif
|
4001
4116
|
#if defined(GGML_VULKAN_RUN_TESTS)
|
4002
|
-
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
|
4117
|
+
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
|
4118
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached
|
4119
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
4003
4120
|
ggml_vk_test_transfer(ctx, 8192 * 1000, false);
|
4004
4121
|
ggml_vk_test_transfer(ctx, 8192 * 1000, true);
|
4005
4122
|
|
@@ -4091,7 +4208,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
|
4091
4208
|
if (ctx->staging != nullptr) {
|
4092
4209
|
ggml_vk_destroy_buffer(ctx->staging);
|
4093
4210
|
}
|
4094
|
-
ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
|
4211
|
+
ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
|
4212
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
4213
|
+
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
4095
4214
|
}
|
4096
4215
|
}
|
4097
4216
|
|
@@ -4454,13 +4573,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
|
4454
4573
|
}
|
4455
4574
|
}
|
4456
4575
|
|
4457
|
-
GGML_CALL int ggml_vk_get_device_count() {
|
4576
|
+
GGML_CALL static int ggml_vk_get_device_count() {
|
4458
4577
|
ggml_vk_instance_init();
|
4459
4578
|
|
4460
4579
|
return vk_instance.device_indices.size();
|
4461
4580
|
}
|
4462
4581
|
|
4463
|
-
GGML_CALL void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
|
4582
|
+
GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
|
4464
4583
|
ggml_vk_instance_init();
|
4465
4584
|
|
4466
4585
|
std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
|
@@ -4478,7 +4597,7 @@ void ggml_vk_init_cpu_assist() {
|
|
4478
4597
|
|
4479
4598
|
std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
|
4480
4599
|
|
4481
|
-
for (
|
4600
|
+
for (int i = 0; i < ggml_vk_get_device_count(); i++) {
|
4482
4601
|
ggml_vk_print_gpu_info(i);
|
4483
4602
|
}
|
4484
4603
|
// Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
|
@@ -5165,7 +5284,7 @@ GGML_CALL void ggml_backend_vk_get_device_description(int device, char * descrip
|
|
5165
5284
|
}
|
5166
5285
|
|
5167
5286
|
GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
5168
|
-
GGML_ASSERT(device < vk_instance.device_indices.size());
|
5287
|
+
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
5169
5288
|
|
5170
5289
|
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
5171
5290
|
|
@@ -5199,6 +5318,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
|
|
5199
5318
|
return vk_instance.device_indices.size();
|
5200
5319
|
}
|
5201
5320
|
|
5321
|
+
// Extension availability
|
5322
|
+
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
|
5323
|
+
#ifdef GGML_VULKAN_VALIDATE
|
5324
|
+
bool portability_enumeration_ext = false;
|
5325
|
+
// Check for portability enumeration extension for MoltenVK support
|
5326
|
+
for (const auto& properties : instance_extensions) {
|
5327
|
+
if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
|
5328
|
+
return true;
|
5329
|
+
}
|
5330
|
+
}
|
5331
|
+
if (!portability_enumeration_ext) {
|
5332
|
+
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
|
5333
|
+
}
|
5334
|
+
#endif
|
5335
|
+
return false;
|
5336
|
+
|
5337
|
+
UNUSED(instance_extensions);
|
5338
|
+
}
|
5339
|
+
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
|
5340
|
+
#ifdef __APPLE__
|
5341
|
+
bool portability_enumeration_ext = false;
|
5342
|
+
// Check for portability enumeration extension for MoltenVK support
|
5343
|
+
for (const auto& properties : instance_extensions) {
|
5344
|
+
if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
|
5345
|
+
return true;
|
5346
|
+
}
|
5347
|
+
}
|
5348
|
+
if (!portability_enumeration_ext) {
|
5349
|
+
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
|
5350
|
+
}
|
5351
|
+
#endif
|
5352
|
+
return false;
|
5353
|
+
|
5354
|
+
UNUSED(instance_extensions);
|
5355
|
+
}
|
5356
|
+
|
5202
5357
|
// checks
|
5203
5358
|
|
5204
5359
|
#ifdef GGML_VULKAN_CHECK_RESULTS
|