llama_cpp 0.12.7 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +72 -262
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -25
- data/vendor/tmp/llama.cpp/Makefile +8 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -2
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +96 -15
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1049 -38
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +1873 -218
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +292 -221
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +64 -52
- data/vendor/tmp/llama.cpp/ggml.c +318 -195
- data/vendor/tmp/llama.cpp/ggml.h +35 -19
- data/vendor/tmp/llama.cpp/llama.cpp +806 -531
- data/vendor/tmp/llama.cpp/llama.h +53 -65
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
@@ -1106,7 +1106,9 @@ void ggml_vk_instance_init() {
 
     const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
     const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
+#ifdef __APPLE__
     const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
+#endif
 
     std::vector<const char*> layers;
 
@@ -1117,13 +1119,17 @@ void ggml_vk_instance_init() {
     if (validation_ext) {
         extensions.push_back("VK_EXT_validation_features");
     }
+#ifdef __APPLE__
     if (portability_enumeration_ext) {
         extensions.push_back("VK_KHR_portability_enumeration");
     }
+#endif
     vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
+#ifdef __APPLE__
     if (portability_enumeration_ext) {
         instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
     }
+#endif
 
     std::vector<vk::ValidationFeatureEnableEXT> features_enable;
     vk::ValidationFeaturesEXT validation_features;
@@ -2320,8 +2326,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
-    const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
 
     const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
     const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@@ -2453,7 +2459,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     // compute
     ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT
 
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) ((char *) dst->data);
         ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
@@ -2506,8 +2512,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
-    const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
 
     const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
     const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@@ -2630,7 +2636,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
 
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
         ggml_vk_sync_buffers(subctx);
@@ -2647,7 +2653,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -2679,7 +2685,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
 
     const uint64_t x_ne = ne00 * ne01 * ne02;
     const uint64_t y_ne = ne10 * ne11 * ne12;
@@ -2721,7 +2727,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
 
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) dst->data;
         ggml_vk_sync_buffers(subctx);
@@ -2738,7 +2744,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
@@ -2771,7 +2777,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
 
     const uint64_t d_ne = ne01 * ne11 * ne12;
 
@@ -2814,7 +2820,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
 
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) dst->data;
         ggml_vk_sync_buffers(subctx);
@@ -2832,7 +2838,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr
     return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
            (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
            dst->type == GGML_TYPE_F32 &&
-           ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU);
+           ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
 }
 
 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
@@ -2880,8 +2886,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     // TODO: support for transposed / permuted tensors
     GGML_ASSERT(nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
-    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
+    GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -3110,8 +3116,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         }
     }
 
-    const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma;
-    const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
+    const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
 
     uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
     uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0;
@@ -3120,7 +3126,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     vk_buffer d_D = extra->buffer_gpu.lock();
 
     // Workaround for tiny tensor inputs on ROPE
-    if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) {
+    if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
         y_sz = VK_WHOLE_SIZE;
     }
 
@@ -3209,9 +3215,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     }
-    if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
         ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
-    } else if(dst->backend == GGML_BACKEND_CPU) {
+    } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) dst->data;
         ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
@@ -3253,7 +3259,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
     }
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
     }
@@ -3359,7 +3365,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
 
 static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     // If backend is CPU, data from src0 has to be copied off the device
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
         vk_buffer d_D = extra_src0->buffer_gpu.lock();
         ggml_vk_sync_buffers(subctx);
@@ -3994,9 +4000,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
 #endif
-    const bool any_on_device = node->backend == GGML_BACKEND_GPU
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU));
+    const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
+        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
 
     if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) {
         return;
@@ -4215,9 +4221,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 }
 
 static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-    const bool any_on_device = node->backend == GGML_BACKEND_GPU
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU);
+    const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
+        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
 
     if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
         return;
@@ -4371,7 +4377,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif
 
-    if (node->backend == GGML_BACKEND_CPU || last_node) {
+    if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
         ggml_vk_ctx_end(ctx->compute_ctx);
         ctx->compute_ctx->exit_tensor = node;
         ctx->compute_ctx = nullptr;
@@ -4379,9 +4385,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 }
 
 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
 
     if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) {
         return false;
@@ -4442,7 +4448,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     if (params->ith != 0) {
         return true;
     }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return true;
     }
 
@@ -4745,7 +4751,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
     }
 
-    tensor->backend = GGML_BACKEND_GPU;
+    tensor->backend = GGML_BACKEND_TYPE_GPU;
     tensor->extra = extra;
 }
 
@@ -4753,7 +4759,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
@@ -4768,7 +4774,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
@@ -4999,7 +5005,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 #endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
@@ -5020,7 +5026,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 #endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
@@ -5097,7 +5103,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
     int last_node = cgraph->n_nodes - 1;
 
     // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
-    while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
+    while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) {
         last_node -= 1;
     }
 
@@ -5106,7 +5112,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
     }
 
     ggml_compute_params params = {};
-    params.type = GGML_TASK_COMPUTE;
+    params.type = GGML_TASK_TYPE_COMPUTE;
     params.ith = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -5244,6 +5250,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .supports_op = */ ggml_backend_vk_supports_op,
 };
 
+static ggml_guid_t ggml_backend_vk_guid() {
+    static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
+    return &guid;
+}
+
 GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
     if (vk_instance.initialized[idx]) {
         return vk_instance.backends[idx];
@@ -5262,6 +5273,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
     vk_instance.initialized[idx] = true;
 
     ggml_backend_t vk_backend = new ggml_backend {
+        /* .guid = */ ggml_backend_vk_guid(),
         /* .interface = */ ggml_backend_vk_interface,
         /* .context = */ &vk_instance.contexts[ctx->idx],
     };
@@ -5272,7 +5284,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
 }
 
 GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_vk_name;
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
 }
 
 GGML_CALL int ggml_backend_vk_get_device_count() {
@@ -5410,7 +5422,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
 static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
     void * tensor_data = tensor->data;
 
-    if (tensor->backend == GGML_BACKEND_GPU) {
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         const size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
 
@@ -5436,14 +5448,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
     std::vector<const ggml_tensor *> done;
     ggml_vk_print_graph_origin(tensor, done);
 
-    if (tensor->backend == GGML_BACKEND_GPU) {
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         free(tensor_data);
     }
 }
 
 static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
     return;
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
     if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
         return;
     }
@@ -5481,7 +5493,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
     if (params->ith != 0) {
         return;
     }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
         return;
     }
 
@@ -5518,10 +5530,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 
         src0_buffer = malloc(src0_size);
         src0_clone->data = src0_buffer;
-        if (src0->backend == GGML_BACKEND_CPU) {
+        if (src0->backend == GGML_BACKEND_TYPE_CPU) {
             memcpy(src0_clone->data, src0->data, src0_size);
             memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
-        } else if (src0->backend == GGML_BACKEND_GPU) {
+        } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
             uint64_t offset = extra->offset;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
@@ -5561,10 +5573,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 
         src1_buffer = malloc(src1_size);
         src1_clone->data = src1_buffer;
-        if (src1->backend == GGML_BACKEND_CPU) {
+        if (src1->backend == GGML_BACKEND_TYPE_CPU) {
             memcpy(src1_clone->data, src1->data, src1_size);
             memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
-        } else if (src1->backend == GGML_BACKEND_GPU) {
+        } else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
             uint64_t offset = extra->offset;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
@@ -5723,7 +5735,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
     if (params->ith != 0) {
         return;
     }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
         return;
     }
     if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
@@ -5735,7 +5747,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 
     void * tensor_data = tensor->data;
 
-    if (tensor->backend == GGML_BACKEND_GPU) {
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
 
@@ -5868,7 +5880,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
     comp_result = nullptr;
     comp_size = 0;
 
-    if (tensor->backend == GGML_BACKEND_GPU) {
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         free(tensor_data);
     }
 }