llama_cpp 0.12.7 → 0.14.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +131 -288
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +29 -29
- data/vendor/tmp/llama.cpp/Makefile +10 -6
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -23
- data/vendor/tmp/llama.cpp/ggml-backend.h +17 -16
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +159 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1195 -139
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +27 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +1971 -271
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3586 -1201
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1391 -825
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +545 -210
- data/vendor/tmp/llama.cpp/ggml.h +65 -23
- data/vendor/tmp/llama.cpp/llama.cpp +1458 -763
- data/vendor/tmp/llama.cpp/llama.h +81 -75
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp:

@@ -69,6 +69,33 @@ struct vk_queue {
     vk::PipelineStageFlags stage_flags;
 };
 
+struct vk_pipeline_struct {
+    std::string name;
+    vk::ShaderModule shader_module;
+    vk::DescriptorSetLayout dsl;
+    std::vector<vk::DescriptorPool> descriptor_pools;
+    std::vector<vk::DescriptorSet> descriptor_sets;
+    uint32_t descriptor_set_idx;
+    vk::PipelineLayout layout;
+    vk::Pipeline pipeline;
+    uint32_t push_constant_size;
+    uint32_t parameter_count;
+    std::array<uint32_t, 3> wg_denoms;
+    uint32_t align;
+};
+
+typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
+typedef std::weak_ptr<vk_pipeline_struct> vk_pipeline_ref;
+
+static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);
+
+struct vk_matmul_pipeline_struct {
+    vk_pipeline l, m, s;
+    vk_pipeline a_l, a_m, a_s;
+};
+
+typedef std::shared_ptr<vk_matmul_pipeline_struct> vk_matmul_pipeline;
+
 struct vk_device {
     vk::PhysicalDevice physical_device;
     vk::PhysicalDeviceProperties properties;
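The `vk_pipeline`/`vk_pipeline_ref` typedefs above replace the old by-value `vk_pipeline` struct (removed further down) with reference-counted handles: code that owns a pipeline holds a `shared_ptr`, while bookkeeping lists such as the `vk_device::pipelines` vector added in the next hunk hold `weak_ptr`s that never extend a pipeline's lifetime. A minimal standalone sketch of that ownership pattern — the `resource`/`registry` names are illustrative stand-ins, not the vendored types:

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Stand-ins for vk_pipeline_struct / vk_pipeline / vk_pipeline_ref.
struct resource {
    std::string name;
};
using resource_handle = std::shared_ptr<resource>; // owning handle, like vk_pipeline
using resource_ref    = std::weak_ptr<resource>;   // non-owning ref, like vk_pipeline_ref

int main() {
    std::vector<resource_ref> registry;            // like vk_device::pipelines

    auto owned = std::make_shared<resource>();
    owned->name = "matmul_f32_l";
    registry.push_back(owned);                     // registering does not extend lifetime

    // Sweep the registry, skipping entries whose owner already released them;
    // this is the same expired()/lock() dance the ~vk_device destructor performs.
    for (auto& ref : registry) {
        if (ref.expired()) {
            continue;
        }
        resource_handle r = ref.lock();            // temporary shared ownership
        std::cout << "destroying " << r->name << '\n';
    }
    registry.clear();
}
```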
@@ -84,10 +111,61 @@ struct vk_device {
     uint32_t subgroup_size;
     bool uma;
 
+    bool initialized;
+    size_t idx;
+
+    vk_matmul_pipeline pipeline_matmul_f32;
+    vk_matmul_pipeline pipeline_matmul_f16;
+    vk_matmul_pipeline pipeline_matmul_f16_f32;
+    vk_pipeline pipeline_matmul_split_k_reduce;
+
+    vk_matmul_pipeline pipeline_dequant_mul_mat_mat[VK_NUM_TYPES];
+
+    vk_pipeline pipeline_dequant[VK_NUM_TYPES];
+    vk_pipeline pipeline_dequant_mul_mat_vec_f32[VK_NUM_TYPES];
+
+    vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
+    vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
+    vk_pipeline pipeline_get_rows[VK_NUM_TYPES];
+    vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES];
+    vk_pipeline pipeline_mul_f32;
+    vk_pipeline pipeline_add_f32;
+    vk_pipeline pipeline_scale_f32;
+    vk_pipeline pipeline_sqr_f32;
+    vk_pipeline pipeline_clamp_f32;
+    vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
+    vk_pipeline pipeline_norm_f32;
+    vk_pipeline pipeline_rms_norm_f32;
+    vk_pipeline pipeline_gelu_f32;
+    vk_pipeline pipeline_silu_f32;
+    vk_pipeline pipeline_relu_f32;
+    vk_pipeline pipeline_diag_mask_inf_f32;
+    vk_pipeline pipeline_soft_max_f32;
+    vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
+    vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
+    vk_pipeline pipeline_argsort_f32;
+
+    std::vector<vk_pipeline_ref> pipelines;
+
     ~vk_device() {
 #ifdef GGML_VULKAN_DEBUG
         std::cerr << "destroy device " << name << std::endl;
 #endif
+        device.destroyCommandPool(compute_queue.pool);
+        if (!single_queue) {
+            device.destroyCommandPool(transfer_queue.pool);
+        }
+
+        for (auto& pipeline : pipelines) {
+            if (pipeline.expired()) {
+                continue;
+            }
+
+            vk_pipeline pl = pipeline.lock();
+            ggml_vk_destroy_pipeline(device, pl);
+        }
+        pipelines.clear();
+
         device.destroy();
     }
 };
@@ -125,21 +203,6 @@ struct vk_subbuffer {
     uint64_t size;
 };
 
-struct vk_pipeline {
-    std::string name;
-    vk::ShaderModule shader_module;
-    vk::DescriptorSetLayout dsl;
-    std::vector<vk::DescriptorPool> descriptor_pools;
-    std::vector<vk::DescriptorSet> descriptor_sets;
-    uint32_t descriptor_set_idx;
-    vk::PipelineLayout layout;
-    vk::Pipeline pipeline;
-    uint32_t push_constant_size;
-    uint32_t parameter_count;
-    std::array<uint32_t, 3> wg_denoms;
-    uint32_t align;
-};
-
 struct vk_semaphore {
     vk::Semaphore s;
     uint64_t value;
@@ -160,11 +223,21 @@ struct vk_op_push_constants {
     float param2;
 };
 
-struct vk_op_cpy_push_constants {
+struct vk_op_unary_push_constants {
+    uint32_t ne;
+    uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
+    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
+    uint32_t d_offset;
+    float param1; float param2;
+};
+
+struct vk_op_binary_push_constants {
     uint32_t ne;
-    uint32_t ne00; uint32_t ne01; uint32_t nb00; uint32_t nb01; uint32_t nb02;
-    uint32_t ne10; uint32_t ne11; uint32_t nb10; uint32_t nb11; uint32_t nb12;
+    uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
+    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
+    uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
     uint32_t d_offset;
+    float param1; float param2;
 };
 
 struct vk_op_diag_mask_push_constants {
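These push-constant structs are copied byte-for-byte into the shaders' `push_constant` blocks, so their layout must stay padding-free and within the 128-byte minimum the Vulkan spec guarantees for `maxPushConstantsSize`. A quick compile-time check of both properties for the widened binary struct (a sketch mirroring the declaration above, not vendored code):

```cpp
#include <cstdint>

// Mirror of vk_op_binary_push_constants from the hunk above.
struct vk_op_binary_push_constants {
    uint32_t ne;
    uint32_t ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03;
    uint32_t ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13;
    uint32_t ne20, ne21, ne22, ne23, nb20, nb21, nb22, nb23;
    uint32_t d_offset;
    float    param1, param2;
};

// 28 four-byte scalars: no implicit padding, and comfortably inside the
// 128 bytes every Vulkan implementation must offer for push constants.
static_assert(sizeof(vk_op_binary_push_constants) == 28 * sizeof(uint32_t),
              "unexpected padding would desync the C++ and GLSL layouts");
static_assert(sizeof(vk_op_binary_push_constants) <= 128,
              "exceeds the guaranteed maxPushConstantsSize budget");

int main() { return 0; }
```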
@@ -196,6 +269,22 @@ struct vk_op_rope_neox_push_constants {
     float inv_ndims;
 };
 
+struct vk_op_soft_max_push_constants {
+    uint32_t KX;
+    uint32_t KY;
+    uint32_t KZ;
+    float scale;
+    float max_bias;
+    float m0;
+    float m1;
+    uint32_t n_head_log2;
+};
+
+struct vk_op_argsort_push_constants {
+    uint32_t ncols;
+    bool ascending;
+};
+
 // Allow pre-recording command buffers
 struct vk_staging_memcpy {
     vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
@@ -236,7 +325,6 @@ struct ggml_tensor_extra_gpu {
 };
 
 struct ggml_vk_garbage_collector {
-    std::vector<vk_pipeline *> pipelines;
     std::vector<vk_semaphore> tl_semaphores;
     std::vector<vk_semaphore> semaphores;
     std::vector<vk::Event> events;
@@ -247,35 +335,7 @@ struct ggml_vk_garbage_collector {
 struct ggml_backend_vk_context {
     std::string name;
 
-    std::weak_ptr<vk_device> device;
-    vk_pipeline pipeline_matmul_f32_l, pipeline_matmul_f32_m, pipeline_matmul_f32_s;
-    vk_pipeline pipeline_matmul_f32_aligned_l, pipeline_matmul_f32_aligned_m, pipeline_matmul_f32_aligned_s;
-    vk_pipeline pipeline_matmul_f16_l, pipeline_matmul_f16_m, pipeline_matmul_f16_s;
-    vk_pipeline pipeline_matmul_f16_aligned_l, pipeline_matmul_f16_aligned_m, pipeline_matmul_f16_aligned_s;
-    vk_pipeline pipeline_matmul_f16_f32_l, pipeline_matmul_f16_f32_m, pipeline_matmul_f16_f32_s;
-    vk_pipeline pipeline_matmul_f16_f32_aligned_l, pipeline_matmul_f16_f32_aligned_m, pipeline_matmul_f16_f32_aligned_s;
-    vk_pipeline pipeline_matmul_split_k_reduce;
-    vk_pipeline pipeline_dequant[VK_NUM_TYPES];
-    vk_pipeline pipeline_dequant_mul_mat_vec_f32[VK_NUM_TYPES];
-    vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
-    vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
-    vk_pipeline pipeline_get_rows[VK_NUM_TYPES];
-    vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES];
-    vk_pipeline pipeline_mul_f32;
-    vk_pipeline pipeline_add_f32;
-    vk_pipeline pipeline_scale_f32;
-    vk_pipeline pipeline_sqr_f32;
-    vk_pipeline pipeline_clamp_f32;
-    vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
-    vk_pipeline pipeline_norm_f32;
-    vk_pipeline pipeline_rms_norm_f32;
-    vk_pipeline pipeline_gelu_f32;
-    vk_pipeline pipeline_silu_f32;
-    vk_pipeline pipeline_relu_f32;
-    vk_pipeline pipeline_diag_mask_inf_f32;
-    vk_pipeline pipeline_soft_max_f32;
-    vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
-    vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
+    std::shared_ptr<vk_device> device;
 
     size_t semaphore_idx, event_idx;
     ggml_vk_garbage_collector gc;
@@ -304,13 +364,31 @@ struct vk_instance {
 
     std::vector<size_t> device_indices;
 
-    std::shared_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];
     ggml_backend_t backends[GGML_VK_MAX_DEVICES];
     ggml_backend_vk_context contexts[GGML_VK_MAX_DEVICES];
     ggml_backend_buffer_type buffer_types[GGML_VK_MAX_DEVICES];
     bool initialized[GGML_VK_MAX_DEVICES];
 };
 
+static std::shared_ptr<vk_device> ggml_vk_get_device(size_t idx) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_get_device(" << idx << ")" << std::endl;
+#endif
+    static std::weak_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];
+
+    if (devices[idx].expired()) {
+#ifdef GGML_VULKAN_DEBUG
+        std::cerr << "Initializing new vk_device" << std::endl;
+#endif
+        std::shared_ptr<vk_device> device = std::make_shared<vk_device>();
+        device->initialized = false;
+        devices[idx] = device;
+        return device;
+    }
+
+    return devices[idx].lock();
+}
+
 #ifdef GGML_VULKAN_CHECK_RESULTS
 static size_t vk_skip_checks;
 static size_t vk_output_tensor;
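`ggml_vk_get_device` is a get-or-create cache: the function-local `static` array of `weak_ptr`s hands out `shared_ptr`s without owning the devices, so a `vk_device` is torn down as soon as the last backend context releases it and is transparently recreated on the next request. The same shape in miniature (the `device` struct and `MAX_DEVICES` constant here are illustrative stand-ins):

```cpp
#include <iostream>
#include <memory>

struct device {                     // stand-in for vk_device
    size_t idx = 0;
    bool initialized = false;
};

constexpr size_t MAX_DEVICES = 16;  // stand-in for GGML_VK_MAX_DEVICES

static std::shared_ptr<device> get_device(size_t idx) {
    static std::weak_ptr<device> cache[MAX_DEVICES];
    if (cache[idx].expired()) {     // no live owner -> create a fresh device
        auto d = std::make_shared<device>();
        d->idx = idx;
        cache[idx] = d;
        return d;
    }
    return cache[idx].lock();       // share the existing instance
}

int main() {
    auto a = get_device(0);
    auto b = get_device(0);
    std::cout << (a == b) << '\n';                   // 1: same cached device
    a.reset();
    b.reset();                                       // last owner gone -> entry expires
    std::cout << get_device(0)->initialized << '\n'; // 0: freshly created
}
```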
@@ -334,14 +412,15 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
 
-    pipeline.name = name;
-    pipeline.parameter_count = parameter_count;
-    pipeline.push_constant_size = push_constant_size;
-    pipeline.wg_denoms = wg_denoms;
-    pipeline.align = align;
+    pipeline = std::make_shared<vk_pipeline_struct>();
+    pipeline->name = name;
+    pipeline->parameter_count = parameter_count;
+    pipeline->push_constant_size = push_constant_size;
+    pipeline->wg_denoms = wg_denoms;
+    pipeline->align = align;
 
     vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
-    pipeline.shader_module = ctx->device.lock()->device.createShaderModule(shader_module_create_info);
+    pipeline->shader_module = ctx->device->device.createShaderModule(shader_module_create_info);
 
     std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
     std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
@@ -355,49 +434,49 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
     vk::PushConstantRange pcr(
         vk::ShaderStageFlagBits::eCompute,
         0,
-        pipeline.push_constant_size
+        pipeline->push_constant_size
     );
 
     vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
         {},
         dsl_binding);
     descriptor_set_layout_create_info.setPNext(&dslbfci);
-    pipeline.dsl = ctx->device.lock()->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
+    pipeline->dsl = ctx->device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
 
     // Check if device supports multiple descriptors per pool
-    if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) {
+    if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) {
         const uint32_t alloc_count = 2;
 
         // Try allocating multiple sets from one pool
         // This fails on AMD for some reason, so add a fall back to allocating one pool per set
-        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count);
+        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
         vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, alloc_count, descriptor_pool_size);
-        vk::DescriptorPool pool = ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info);
+        vk::DescriptorPool pool = ctx->device->device.createDescriptorPool(descriptor_pool_create_info);
 
         std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
         for (uint32_t i = 0; i < alloc_count; i++) {
-            layouts[i] = pipeline.dsl;
+            layouts[i] = pipeline->dsl;
         }
         try {
             vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pool, alloc_count, layouts.data());
-            std::vector<vk::DescriptorSet> sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info);
+            std::vector<vk::DescriptorSet> sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info);
         } catch(vk::OutOfPoolMemoryError const&) {
-            ctx->device.lock()->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE;
+            ctx->device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE;
        }
 
-        ctx->device.lock()->device.destroyDescriptorPool(pool);
+        ctx->device->device.destroyDescriptorPool(pool);
     }
 
-    if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
-        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count);
+    if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
+        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
         vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 128, descriptor_pool_size);
-        pipeline.descriptor_pools.push_back(ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info));
+        pipeline->descriptor_pools.push_back(ctx->device->device.createDescriptorPool(descriptor_pool_create_info));
     }
 
-    pipeline.descriptor_set_idx = 0;
+    pipeline->descriptor_set_idx = 0;
 
-    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline.dsl, pcr);
-    pipeline.layout = ctx->device.lock()->device.createPipelineLayout(pipeline_layout_create_info);
+    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr);
+    pipeline->layout = ctx->device->device.createPipelineLayout(pipeline_layout_create_info);
 
     std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());
 
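The `VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN` branch above is a one-time capability probe: it optimistically allocates two descriptor sets from a single pool and, if the driver throws `vk::OutOfPoolMemoryError` (observed on some AMD drivers, per the comment), permanently downgrades the device to one-pool-per-set mode. The control flow in isolation, with a fake driver call standing in for the Vulkan allocation:

```cpp
#include <iostream>
#include <stdexcept>

enum class pool_mode { unknown, multi, single };

// Fake driver call standing in for vk::Device::allocateDescriptorSets.
static void allocate_sets(int count, bool driver_supports_multi) {
    if (count > 1 && !driver_supports_multi) {
        throw std::runtime_error("out of pool memory");
    }
}

// Probe once, cache the answer: try the cheap multi-set path and fall back
// permanently if the driver refuses, mirroring the shape of the code above.
static pool_mode probe_pool_mode(bool driver_supports_multi) {
    pool_mode mode = pool_mode::multi;
    try {
        allocate_sets(2, driver_supports_multi);   // two sets from one pool
    } catch (const std::runtime_error&) {
        mode = pool_mode::single;                  // one pool per set from now on
    }
    return mode;
}

int main() {
    std::cout << (probe_pool_mode(true)  == pool_mode::multi)  << '\n'; // 1
    std::cout << (probe_pool_mode(false) == pool_mode::single) << '\n'; // 1
}
```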
@@ -417,72 +496,75 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
     vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
             vk::PipelineShaderStageCreateFlags(),
             vk::ShaderStageFlagBits::eCompute,
-            pipeline.shader_module,
+            pipeline->shader_module,
             entrypoint.c_str(),
             &specialization_info);
     vk::ComputePipelineCreateInfo compute_pipeline_create_info(
         vk::PipelineCreateFlags(),
         pipeline_shader_create_info,
-        pipeline.layout);
-    pipeline.pipeline = ctx->device.lock()->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
+        pipeline->layout);
+    pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
 
-    ctx->gc.pipelines.push_back(&pipeline);
+    ctx->device->pipelines.push_back(pipeline);
 }
 
-static void ggml_vk_destroy_pipeline(ggml_backend_vk_context * ctx, vk_pipeline * pipeline) {
+static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_pipeline_destroy_pipeline(" << pipeline->name << ")" << std::endl;
+#endif
     for (auto& pool : pipeline->descriptor_pools) {
-        ctx->device.lock()->device.destroyDescriptorPool(pool);
+        device.destroyDescriptorPool(pool);
     }
     pipeline->descriptor_pools.clear();
     pipeline->descriptor_sets.clear();
     pipeline->descriptor_set_idx = 0;
 
-    ctx->device.lock()->device.destroyDescriptorSetLayout(pipeline->dsl);
+    device.destroyDescriptorSetLayout(pipeline->dsl);
 
-    ctx->device.lock()->device.destroyPipelineLayout(pipeline->layout);
+    device.destroyPipelineLayout(pipeline->layout);
 
-    ctx->device.lock()->device.destroyShaderModule(pipeline->shader_module);
+    device.destroyShaderModule(pipeline->shader_module);
 
-    ctx->device.lock()->device.destroyPipeline(pipeline->pipeline);
+    device.destroyPipeline(pipeline->pipeline);
 }
 
 static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline.name << ", " << n << ")" << std::endl;
+    std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")" << std::endl;
 #endif
-    if (pipeline.descriptor_sets.size() >= pipeline.descriptor_set_idx + n) {
+    if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
         // Enough descriptors are available
         return;
     }
 
-    if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
-        const uint32_t alloc_count = pipeline.descriptor_set_idx + n - pipeline.descriptor_sets.size();
+    if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
+        const uint32_t alloc_count = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size();
 
         std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
         for (uint32_t i = 0; i < alloc_count; i++) {
-            layouts[i] = pipeline.dsl;
+            layouts[i] = pipeline->dsl;
         }
-        vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline.descriptor_pools[0], alloc_count, layouts.data());
-        std::vector<vk::DescriptorSet> sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info);
-        pipeline.descriptor_sets.insert(pipeline.descriptor_sets.end(), sets.begin(), sets.end());
+        vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[0], alloc_count, layouts.data());
+        std::vector<vk::DescriptorSet> sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info);
+        pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
     } else {
-        for (uint32_t i = pipeline.descriptor_sets.size(); i < pipeline.descriptor_set_idx + n; i++) {
-            vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count);
+        for (uint32_t i = pipeline->descriptor_sets.size(); i < pipeline->descriptor_set_idx + n; i++) {
+            vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
             vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 1, descriptor_pool_size);
-            pipeline.descriptor_pools.push_back(ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info));
+            pipeline->descriptor_pools.push_back(ctx->device->device.createDescriptorPool(descriptor_pool_create_info));
 
-            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline.descriptor_pools[i], 1, &pipeline.dsl);
-            std::vector<vk::DescriptorSet> sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info);
-            pipeline.descriptor_sets.push_back(sets[0]);
+            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[i], 1, &pipeline->dsl);
+            std::vector<vk::DescriptorSet> sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info);
+            pipeline->descriptor_sets.push_back(sets[0]);
         }
     }
 }
 
 static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_cleanup(" << pipeline.name << ")" << std::endl;
+    std::cerr << "ggml_pipeline_cleanup(" << pipeline->name << ")" << std::endl;
 #endif
-    pipeline.descriptor_set_idx = 0;
+    pipeline->descriptor_set_idx = 0;
 }
 
 static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) {
@@ -498,7 +580,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx
         q.pool,
         vk::CommandBufferLevel::ePrimary,
         1);
-    const std::vector<vk::CommandBuffer> cmd_buffers = ctx->device.lock()->device.allocateCommandBuffers(command_buffer_alloc_info);
+    const std::vector<vk::CommandBuffer> cmd_buffers = ctx->device->device.allocateCommandBuffers(command_buffer_alloc_info);
     auto buf = cmd_buffers.front();
 
     q.cmd_buffers.push_back(buf);
@@ -643,11 +725,11 @@ static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uin
     q.queue_family_index = queue_family_index;
 
     vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
-    q.pool = ctx->device.lock()->device.createCommandPool(command_pool_create_info_compute);
+    q.pool = ctx->device->device.createCommandPool(command_pool_create_info_compute);
 
     q.cmd_buffer_idx = 0;
 
-    q.queue = ctx->device.lock()->device.getQueue(queue_family_index, queue_index);
+    q.queue = ctx->device->device.getQueue(queue_family_index, queue_index);
 
     q.stage_flags = stage_flags;
 }
@@ -671,7 +753,7 @@ static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context *
     vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
     vk::SemaphoreCreateInfo ci{};
     ci.setPNext(&tci);
-    vk::Semaphore semaphore = ctx->device.lock()->device.createSemaphore(ci);
+    vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
     ctx->gc.semaphores.push_back({ semaphore, 0 });
     return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1];
 }
@@ -684,7 +766,7 @@ static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context
         vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
         vk::SemaphoreCreateInfo ci{};
         ci.setPNext(&tci);
-        vk::Semaphore semaphore = ctx->device.lock()->device.createSemaphore(ci);
+        vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
         ctx->gc.tl_semaphores.push_back({ semaphore, 0 });
     }
     return &ctx->gc.tl_semaphores[ctx->semaphore_idx++];
@@ -692,7 +774,7 @@ static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context
 
 static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
     if (ctx->event_idx >= ctx->gc.events.size()) {
-        ctx->gc.events.push_back(ctx->device.lock()->device.createEvent({}));
+        ctx->gc.events.push_back(ctx->device->device.createEvent({}));
     }
     return ctx->gc.events[ctx->event_idx++];
 }
@@ -703,7 +785,7 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
 #endif
     // Requires command buffers to be done
 
-    ctx->device.lock()->device.resetCommandPool(q.pool);
+    ctx->device->device.resetCommandPool(q.pool);
     q.cmd_buffer_idx = 0;
 }
 
@@ -740,11 +822,11 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
         nullptr,
     };
 
-    buf->buffer = ctx->device.lock()->device.createBuffer(buffer_create_info);
+    buf->buffer = ctx->device->device.createBuffer(buffer_create_info);
 
-    vk::MemoryRequirements mem_req = ctx->device.lock()->device.getBufferMemoryRequirements(buf->buffer);
+    vk::MemoryRequirements mem_req = ctx->device->device.getBufferMemoryRequirements(buf->buffer);
 
-    vk::PhysicalDeviceMemoryProperties mem_props = ctx->device.lock()->physical_device.getMemoryProperties();
+    vk::PhysicalDeviceMemoryProperties mem_props = ctx->device->physical_device.getMemoryProperties();
 
     uint32_t memory_type_index = UINT32_MAX;
 
@@ -757,30 +839,30 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
     }
 
     if (memory_type_index == UINT32_MAX) {
-        ctx->device.lock()->device.destroyBuffer(buf->buffer);
+        ctx->device->device.destroyBuffer(buf->buffer);
         buf->size = 0;
         throw vk::OutOfDeviceMemoryError("No suitable memory type found");
     }
 
     try {
-        buf->device_memory = ctx->device.lock()->device.allocateMemory({ mem_req.size, memory_type_index });
+        buf->device_memory = ctx->device->device.allocateMemory({ mem_req.size, memory_type_index });
     } catch (const vk::SystemError& e) {
         // Out of Host/Device memory, clean up buffer
-        ctx->device.lock()->device.destroyBuffer(buf->buffer);
+        ctx->device->device.destroyBuffer(buf->buffer);
         buf->size = 0;
         throw e;
     }
     buf->ptr = nullptr;
 
     if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
-        buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
+        buf->ptr = ctx->device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
     }
 
-    ctx->device.lock()->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
+    ctx->device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
 
     buf->ctx = ctx;
 
-    buf->device = ctx->device.lock();
+    buf->device = ctx->device;
 
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "Created buffer " << buf->buffer << std::endl;
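The `memory_type_index` scan this hunk touches is the canonical Vulkan buffer-allocation step: walk the device's memory types, keep the first one that is both allowed by the buffer's `memoryTypeBits` mask and carries every requested property flag, and treat `UINT32_MAX` as "no suitable type". The scan in isolation, with illustrative flag values rather than the real `VkMemoryPropertyFlagBits`:

```cpp
#include <cstdint>
#include <iostream>

struct memory_type { uint32_t property_flags; }; // minimal model of VkMemoryType

constexpr uint32_t DEVICE_LOCAL  = 1u << 0;      // illustrative bit values only
constexpr uint32_t HOST_VISIBLE  = 1u << 1;
constexpr uint32_t HOST_COHERENT = 1u << 2;

static uint32_t find_memory_type(const memory_type* types, uint32_t type_count,
                                 uint32_t memory_type_bits, uint32_t required_flags) {
    for (uint32_t i = 0; i < type_count; i++) {
        bool allowed  = (memory_type_bits & (1u << i)) != 0;  // buffer may live here
        bool suitable = (types[i].property_flags & required_flags) == required_flags;
        if (allowed && suitable) {
            return i;
        }
    }
    return UINT32_MAX;  // triggers the OutOfDeviceMemoryError path above
}

int main() {
    memory_type types[] = { { DEVICE_LOCAL }, { HOST_VISIBLE | HOST_COHERENT } };
    std::cout << find_memory_type(types, 2, 0b11, HOST_VISIBLE) << '\n';                 // 1
    std::cout << (find_memory_type(types, 2, 0b01, HOST_VISIBLE) == UINT32_MAX) << '\n'; // 1
}
```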
@@ -802,7 +884,7 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
 static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
     vk_buffer buf;
     try {
-        if (ctx->device.lock()->uma) {
+        if (ctx->device->uma) {
             // Fall back to host memory type
             buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
         } else {
@@ -883,10 +965,16 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl;
 #endif
 
+    const std::shared_ptr<vk_device> device = ctx->device;
+
     // mulmat
-    std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16,
-    std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16,
-    std::initializer_list<uint32_t> warptile_s = {
+    std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
+    std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
+    std::initializer_list<uint32_t> warptile_s = { device->subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
+
+    std::initializer_list<uint32_t> warptile_mmq_l = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
+    std::initializer_list<uint32_t> warptile_mmq_m = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
+    std::initializer_list<uint32_t> warptile_mmq_s = { device->subgroup_size, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
 
     std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
     std::array<uint32_t, 3> m_wg_denoms = { 64, 64, 1 };
@@ -896,126 +984,206 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
896
984
|
uint32_t m_align = 64;
|
897
985
|
uint32_t s_align = 32;
|
898
986
|
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
|
906
|
-
|
907
|
-
|
908
|
-
|
909
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
910
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
911
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
912
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
913
|
-
|
914
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
915
|
-
|
916
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
917
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
918
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
919
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
987
|
+
ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
988
|
+
ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
989
|
+
ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
|
990
|
+
ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
|
991
|
+
ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1] = std::make_shared<vk_matmul_pipeline_struct>();
|
992
|
+
ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
|
993
|
+
ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
|
994
|
+
ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
|
995
|
+
|
996
|
+
if (device->fp16) {
|
997
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
|
998
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
|
999
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
|
1000
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
|
1001
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
|
1002
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
|
1003
|
+
|
1004
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
|
1005
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
|
1006
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
|
1007
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_l, "matmul_f16_aligned_l", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
|
1008
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_m, "matmul_f16_aligned_m", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
|
1009
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_s, "matmul_f16_aligned_s", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
|
1010
|
+
|
1011
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->l, "matmul_f16_f32_l", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
|
1012
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->m, "matmul_f16_f32_m", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
|
1013
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->s, "matmul_f16_f32_s", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
|
1014
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
|
1015
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
|
1016
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
|
1017
|
+
|
1018
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->l, "matmul_q4_0_f32_l", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1019
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->m, "matmul_q4_0_f32_m", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1020
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->s, "matmul_q4_0_f32_s", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1021
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1022
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1023
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1024
|
+
|
1025
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "matmul_q4_0_f32_l", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1026
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "matmul_q4_0_f32_m", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1027
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "matmul_q4_0_f32_s", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1028
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1029
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1030
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1031
|
+
|
1032
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->l, "matmul_q5_0_f32_l", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1033
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->m, "matmul_q5_0_f32_m", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1034
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->s, "matmul_q5_0_f32_s", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1035
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_l, "matmul_q5_0_f32_aligned_l", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1036
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_m, "matmul_q5_0_f32_aligned_m", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1037
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_s, "matmul_q5_0_f32_aligned_s", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1038
|
+
|
1039
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->l, "matmul_q5_1_f32_l", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1040
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->m, "matmul_q5_1_f32_m", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1041
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->s, "matmul_q5_1_f32_s", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1042
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_l, "matmul_q5_1_f32_aligned_l", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1043
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_m, "matmul_q5_1_f32_aligned_m", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1044
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_s, "matmul_q5_1_f32_aligned_s", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1045
|
+
|
1046
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->l, "matmul_q8_0_f32_l", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1047
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->m, "matmul_q8_0_f32_m", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1048
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->s, "matmul_q8_0_f32_s", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1049
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1050
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1051
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
920
1052
|
} else {
|
921
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
922
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
923
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
924
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
925
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
926
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
927
|
-
|
928
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
929
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
930
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
931
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
932
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
933
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
934
|
-
|
935
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
936
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
937
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
938
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
939
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
940
|
-
ggml_vk_create_pipeline(ctx, ctx->
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
953
|
-
|
1053
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
|
1054
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
|
1055
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
|
1056
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
|
1057
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
|
1058
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
|
1059
|
+
|
1060
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
|
1061
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
|
1062
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
|
1063
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_l, "matmul_f16_aligned_l", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
|
1064
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_m, "matmul_f16_aligned_m", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
|
1065
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_s, "matmul_f16_aligned_s", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
|
1066
|
+
|
1067
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->l, "matmul_f16_f32_l", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
|
1068
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->m, "matmul_f16_f32_m", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
|
1069
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->s, "matmul_f16_f32_s", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
|
1070
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
|
1071
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
|
1072
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
|
1073
|
+
|
1074
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->l, "matmul_q4_0_f32_l", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1075
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->m, "matmul_q4_0_f32_m", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1076
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->s, "matmul_q4_0_f32_s", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1077
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1078
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1079
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1080
|
+
|
1081
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "matmul_q4_1_f32_l", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1082
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "matmul_q4_1_f32_m", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1083
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "matmul_q4_1_f32_s", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1084
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "matmul_q4_1_f32_aligned_l", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1085
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "matmul_q4_1_f32_aligned_m", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1086
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "matmul_q4_1_f32_aligned_s", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1087
|
+
|
1088
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->l, "matmul_q5_0_f32_l", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1089
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->m, "matmul_q5_0_f32_m", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1090
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->s, "matmul_q5_0_f32_s", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1091
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_l, "matmul_q5_0_f32_aligned_l", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1092
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_m, "matmul_q5_0_f32_aligned_m", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1093
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_s, "matmul_q5_0_f32_aligned_s", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1094
|
+
|
1095
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->l, "matmul_q5_1_f32_l", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1096
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->m, "matmul_q5_1_f32_m", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1097
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->s, "matmul_q5_1_f32_s", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1098
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_l, "matmul_q5_1_f32_aligned_l", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1099
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_m, "matmul_q5_1_f32_aligned_m", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1100
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_s, "matmul_q5_1_f32_aligned_s", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1101
|
+
|
1102
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->l, "matmul_q8_0_f32_l", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1103
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->m, "matmul_q8_0_f32_m", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1104
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->s, "matmul_q8_0_f32_s", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1105
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
1106
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
1107
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
1108
|
+
}
|
1109
|
+
|
1110
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
|
1111
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32", mul_mat_vec_q4_0_f32_len, mul_mat_vec_q4_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
|
1112
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32", mul_mat_vec_q4_1_f32_len, mul_mat_vec_q4_1_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
|
1113
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32", mul_mat_vec_q5_0_f32_len, mul_mat_vec_q5_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
|
1114
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32", mul_mat_vec_q5_1_f32_len, mul_mat_vec_q5_1_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
|
1115
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32", mul_mat_vec_q8_0_f32_len, mul_mat_vec_q8_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
|
1116
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32", mul_mat_vec_q2_K_f32_len, mul_mat_vec_q2_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
|
1117
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32", mul_mat_vec_q3_K_f32_len, mul_mat_vec_q3_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
|
1118
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32", mul_mat_vec_q4_K_f32_len, mul_mat_vec_q4_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
|
1119
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
|
1120
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
|
954
1121
|
|
     // dequant shaders
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16",
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
 
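The dequant shaders grew from four int push constants to five uint32 ones. A hypothetical struct showing what a five-field layout could look like; the field names are guesses for illustration only, the size is the only fact taken from the registrations above:

    // Hypothetical push-constant layout for the new dequant shaders. Only
    // the total size (5 * sizeof(uint32_t)) is from the diff; the fields are
    // illustrative.
    struct dequant_push_constants_sketch {
        uint32_t ne;        // total elements to dequantize
        uint32_t k;         // row length
        uint32_t stride_a;  // input stride
        uint32_t stride_b;  // output stride
        uint32_t pad;       // remainder/padding handling
    };
    static_assert(sizeof(dequant_push_constants_sketch) == 5 * sizeof(uint32_t),
                  "must match the size passed to ggml_vk_create_pipeline");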
     // get_rows
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
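For readers new to the op: get_rows is a row gather, used for embedding lookups. A scalar reference of what the shaders above compute (the GPU versions additionally dequantize on the fly for the quantized variants; this sketch covers the f32 case only):

    // CPU reference for get_rows, illustrative only.
    #include <algorithm>
    #include <cstdint>
    static void get_rows_reference(const float * src, const int32_t * row_ids,
                                   float * dst, int n_rows, int row_len) {
        for (int i = 0; i < n_rows; i++) {
            const float * src_row = src + (size_t) row_ids[i] * row_len;  // gather source row
            std::copy(src_row, src_row + row_len, dst + (size_t) i * row_len);
        }
    }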
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main",
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
 }
 
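The argsort_f32 pipeline is new in this release. What it computes, as a scalar reference: the permutation of indices that sorts the input values. The shader works in 1024-wide workgroups; the exact in-shader algorithm and tie-breaking are not visible in this diff, so this sketch only fixes the semantics:

    // CPU reference for argsort over floats, illustrative only.
    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>
    static std::vector<int32_t> argsort_reference(const float * x, int n) {
        std::vector<int32_t> idx(n);
        std::iota(idx.begin(), idx.end(), 0);                // 0, 1, ..., n-1
        std::sort(idx.begin(), idx.end(),
                  [&](int32_t a, int32_t b) { return x[a] < x[b]; });
        return idx;                                          // idx[0] points at the smallest value
    }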
 static void ggml_vk_print_gpu_info(size_t idx) {
@@ -1057,8 +1225,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
         }
     }
 
-    const char*
-    bool force_disable_f16 =
+    const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16");
+    bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr;
 
     bool fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
 
@@ -1106,7 +1274,9 @@ void ggml_vk_instance_init() {
 
     const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
     const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
+#ifdef __APPLE__
     const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
+#endif
 
     std::vector<const char*> layers;
 
@@ -1117,13 +1287,17 @@ void ggml_vk_instance_init() {
     if (validation_ext) {
         extensions.push_back("VK_EXT_validation_features");
     }
+#ifdef __APPLE__
     if (portability_enumeration_ext) {
         extensions.push_back("VK_KHR_portability_enumeration");
    }
+#endif
     vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
+#ifdef __APPLE__
     if (portability_enumeration_ext) {
         instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
     }
+#endif
 
     std::vector<vk::ValidationFeatureEnableEXT> features_enable;
     vk::ValidationFeaturesEXT validation_features;
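The new __APPLE__ guards exist because VK_KHR_portability_enumeration only matters on portability-subset implementations such as MoltenVK: without the extension plus the eEnumeratePortabilityKHR flag, such a device is hidden from physical-device enumeration. A stripped-down sketch of the same pattern in isolation, with error handling omitted:

    // Minimal portability-aware instance creation; mirrors the guarded code
    // above, illustrative only.
    vk::ApplicationInfo app_info("example", 1, nullptr, 0, VK_API_VERSION_1_2);
    std::vector<const char *> exts;
    #ifdef __APPLE__
    exts.push_back("VK_KHR_portability_enumeration");
    #endif
    vk::InstanceCreateInfo ci(vk::InstanceCreateFlags{}, &app_info, {}, exts);
    #ifdef __APPLE__
    ci.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;  // reveal MoltenVK devices
    #endif
    vk::Instance instance = vk::createInstance(ci);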
@@ -1182,140 +1356,152 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
         throw std::runtime_error("Device not found");
     }
 
-
-    ctx->device
-
-
+    ctx->device = ggml_vk_get_device(idx);
+    if (!ctx->device->initialized) {
+        ctx->device->physical_device = devices[dev_num];
+        const std::vector<vk::ExtensionProperties> ext_props = ctx->device->physical_device.enumerateDeviceExtensionProperties();
 
-
+        bool maintenance4_support = false;
 
-
-
-
-
+        // Check if maintenance4 is supported
+        for (const auto& properties : ext_props) {
+            if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
+                maintenance4_support = true;
+            }
         }
-    }
 
-
-
-
-
-
-
-
-
-
-
-
+        vk::PhysicalDeviceProperties2 props2;
+        vk::PhysicalDeviceMaintenance3Properties props3;
+        vk::PhysicalDeviceMaintenance4Properties props4;
+        vk::PhysicalDeviceSubgroupProperties subgroup_props;
+        props2.pNext = &props3;
+        props3.pNext = &subgroup_props;
+        if (maintenance4_support) {
+            subgroup_props.pNext = &props4;
+        }
+        ctx->device->physical_device.getProperties2(&props2);
+        ctx->device->properties = props2.properties;
 
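The property query above is a standard Vulkan pNext chain: one getProperties2 call walks the linked structures and fills each one it recognizes, with the maintenance4 struct appended only when the extension is present. Distilled to its essentials (a restatement of the code above, not new logic):

    // Shape of the chain: Properties2 -> Maintenance3 -> Subgroup [-> Maintenance4]
    vk::PhysicalDeviceProperties2 props2;
    vk::PhysicalDeviceMaintenance3Properties props3;
    vk::PhysicalDeviceSubgroupProperties subgroup_props;
    props2.pNext = &props3;           // the driver follows these pointers once...
    props3.pNext = &subgroup_props;   // ...filling every structure in the list
    physical_device.getProperties2(&props2);
    uint32_t sg = subgroup_props.subgroupSize;  // feeds the specialization constant used by the mul_mat_vec shaders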
-
-    ctx->device.lock()->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize);
-    } else {
-    ctx->device.lock()->max_memory_allocation_size = props3.maxMemoryAllocationSize;
-    }
+        const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
 
-
-
-
+        if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) {
+            ctx->device->max_memory_allocation_size = std::stoi(GGML_VK_FORCE_MAX_ALLOCATION_SIZE);
+        } else if (maintenance4_support) {
+            ctx->device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize);
+        } else {
+            ctx->device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
+        }
 
-
-
+        ctx->device->vendor_id = ctx->device->properties.vendorID;
+        ctx->device->subgroup_size = subgroup_props.subgroupSize;
+        ctx->device->uma = ctx->device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
 
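GGML_VK_FORCE_MAX_ALLOCATION_SIZE is a new escape hatch for drivers that misreport their allocation limits; the value is a plain byte count, e.g. GGML_VK_FORCE_MAX_ALLOCATION_SIZE=1073741824 caps allocations at 1 GiB. Note that the diff parses it with std::stoi, so the override is bounded by INT_MAX. A hypothetical wider-typed variant of the same pattern, for illustration only:

    // Hypothetical helper, not from the patch: same env-override pattern
    // with a 64-bit parse instead of std::stoi.
    #include <cstdlib>
    #include <string>
    static size_t read_max_alloc_override(size_t fallback) {
        const char * s = std::getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
        return s != nullptr ? (size_t) std::stoull(s) : fallback;
    }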
-
-
-
-
-
+        bool fp16_storage = false;
+        bool fp16_compute = false;
+
+        for (const auto& properties : ext_props) {
+            if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
+                fp16_storage = true;
+            } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
+                fp16_compute = true;
+            }
         }
-    }
 
-
-
+        const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16");
+        const bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr;
 
-
+        ctx->device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
 
-
+        std::vector<vk::QueueFamilyProperties> queue_family_props = ctx->device->physical_device.getQueueFamilyProperties();
 
-
-
-
+        // Try to find a non-graphics compute queue and transfer-focused queues
+        const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1);
+        const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1);
 
-
-
+        const float priorities[] = { 1.0f, 1.0f };
+        ctx->device->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1;
 
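ggml_vk_find_queue_family_index is given a required flag set and a flag set to avoid, so the calls above prefer a compute-only family over the graphics+compute family, and a pure transfer family (often the DMA engine) over both. A scalar sketch of that scoring logic under those assumptions; the real helper may differ in details such as the extra index and count parameters:

    // Hypothetical re-implementation of the queue-family search used above:
    // prefer families with `required` but without `avoid`, then fall back.
    static uint32_t find_queue_family_sketch(const std::vector<vk::QueueFamilyProperties>& props,
                                             vk::QueueFlags required, vk::QueueFlags avoid) {
        for (uint32_t i = 0; i < (uint32_t) props.size(); i++) {
            if ((props[i].queueFlags & required) && !(props[i].queueFlags & avoid)) {
                return i;  // dedicated family, best case
            }
        }
        for (uint32_t i = 0; i < (uint32_t) props.size(); i++) {
            if (props[i].queueFlags & required) {
                return i;  // shared family, acceptable fallback
            }
        }
        return 0;
    }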
-
-
-
-
-
-
-
-
-
-
-
-
+        std::vector<vk::DeviceQueueCreateInfo> device_queue_create_infos;
+        if (compute_queue_family_index != transfer_queue_family_index) {
+            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
+            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1});
+        } else if(!ctx->device->single_queue) {
+            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities});
+        } else {
+            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
+        }
+        vk::DeviceCreateInfo device_create_info;
+        std::vector<const char *> device_extensions;
+        vk::PhysicalDeviceFeatures device_features = ctx->device->physical_device.getFeatures();
 
-
-
-
-
+        VkPhysicalDeviceFeatures2 device_features2;
+        device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+        device_features2.pNext = nullptr;
+        device_features2.features = (VkPhysicalDeviceFeatures)device_features;
 
-
-
-
-
+        VkPhysicalDeviceVulkan11Features vk11_features;
+        vk11_features.pNext = nullptr;
+        vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
+        device_features2.pNext = &vk11_features;
 
-
-
-
-
+        VkPhysicalDeviceVulkan12Features vk12_features;
+        vk12_features.pNext = nullptr;
+        vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
+        vk11_features.pNext = &vk12_features;
 
-
+        vkGetPhysicalDeviceFeatures2(ctx->device->physical_device, &device_features2);
 
-
+        ctx->device->fp16 = ctx->device->fp16 && vk12_features.shaderFloat16;
 
-
-
-
-
+        if (!vk11_features.storageBuffer16BitAccess) {
+            std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl;
+            throw std::runtime_error("Unsupported device");
+        }
 
-
+        device_extensions.push_back("VK_KHR_16bit_storage");
 
 #ifdef GGML_VULKAN_VALIDATE
-
+        device_extensions.push_back("VK_KHR_shader_non_semantic_info");
 #endif
 
-
-
-
-
+        if (ctx->device->fp16) {
+            device_extensions.push_back("VK_KHR_shader_float16_int8");
+        }
+        ctx->device->name = ctx->device->properties.deviceName.data();
 
-
-
-
-
-
-
-
-
+        device_create_info = {
+            vk::DeviceCreateFlags(),
+            device_queue_create_infos,
+            {},
+            device_extensions
+        };
+        device_create_info.setPNext(&device_features2);
+        ctx->device->device = ctx->device->physical_device.createDevice(device_create_info);
 
-
+        ctx->device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN;
 
-
-
+        // Queues
+        ggml_vk_create_queue(ctx, ctx->device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer });
 
-
-
-
-
-
-
-
-
+        // Shaders
+        ggml_vk_load_shaders(ctx);
+
+        if (!ctx->device->single_queue) {
+            const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
+            ggml_vk_create_queue(ctx, ctx->device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer });
+        } else {
+            // TODO: Use pointer or reference to avoid copy
+            ctx->device->transfer_queue = ctx->device->compute_queue;
+        }
+
+        ctx->device->idx = dev_num;
+        ctx->device->initialized = true;
+    } else if (ctx->device->idx != dev_num) {
+        std::cerr << "ggml_vulkan: Device " << ctx->device->name << " already initialized with index " << ctx->device->idx << ", but trying to reinitialize with index " << dev_num << std::endl;
+        throw std::runtime_error("Device already initialized");
     }
 
-    ctx->fence = ctx->device
+    ctx->fence = ctx->device->device.createFence({});
 
     ctx->compute_ctx = nullptr;
     ctx->transfer_ctx = nullptr;
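The larger refactor visible in this hunk is that the vk_device is now fetched through ggml_vk_get_device(idx) and initialized exactly once, then shared by every backend context on the same GPU (the old code went through a weak_ptr and .lock()). A sketch of the lookup pattern this implies; the cache variable and GGML_VK_MAX_DEVICES constant here are assumptions for illustration:

    // Illustrative device cache; ggml_vk_get_device and the initialized/idx
    // fields are real, the rest is a guess at the surrounding plumbing.
    static std::shared_ptr<vk_device> g_devices[16 /* assumed max device count */];
    static std::shared_ptr<vk_device> get_device_sketch(size_t idx) {
        if (!g_devices[idx]) {
            g_devices[idx] = std::make_shared<vk_device>();
            g_devices[idx]->initialized = false;  // filled in lazily by ggml_vk_init()
        }
        return g_devices[idx];
    }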
@@ -1333,7 +1519,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
 #endif
 }
 
-static vk_pipeline
+static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_get_to_fp16()" << std::endl;
 #endif
@@ -1354,10 +1540,36 @@ static vk_pipeline* ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
         return nullptr;
     }
 
-    return
+    return ctx->device->pipeline_dequant[type];
+}
+
+static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_get_mul_mat_mat_pipeline()" << std::endl;
+#endif
+    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+        return ctx->device->pipeline_matmul_f32;
+    }
+    if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
+        return ctx->device->pipeline_matmul_f16_f32;
+    }
+    if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+        return ctx->device->pipeline_matmul_f16;
+    }
+
+    GGML_ASSERT(src1_type == GGML_TYPE_F32);
+
+    switch (src0_type) {
+        case GGML_TYPE_Q4_0:
+            break;
+        default:
+            return nullptr;
+    }
+
+    return ctx->device->pipeline_dequant_mul_mat_mat[src0_type];
 }
 
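A vk_matmul_pipeline bundles the l/m/s size classes and their aligned a_l/a_m/a_s variants behind one handle, so callers resolve the type combination once and choose a size class afterwards. The call sequence the rest of this diff implies, with m, n, k standing in for the usual matmul dimensions (illustrative arrangement; the individual functions are real):

    // Illustrative usage of the new selection helpers introduced by this diff.
    vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, src1->type);
    if (mmp == nullptr) {
        // no direct kernel for this type combination: dequantize to f16 first
        mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, GGML_TYPE_F32);
    }
    const uint32_t align = ggml_vk_guess_matmul_pipeline_align(ctx, mmp, m, n);
    const bool aligned   = k == ggml_vk_align_size(k, align);
    vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, aligned);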
-static vk_pipeline
+static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type type) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
 #endif
@@ -1378,7 +1590,7 @@ static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
         return nullptr;
     }
 
-    return
+    return ctx->device->pipeline_dequant_mul_mat_vec_f32[type];
 }
 
 static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
@@ -1457,8 +1669,8 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
     if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
                 size/1024.0/1024.0);
-        ctx->device
-        ctx->device
+        ctx->device->device.freeMemory(buf->device_memory);
+        ctx->device->device.destroyBuffer(buf->buffer);
         return nullptr;
     }
 
@@ -1522,30 +1734,30 @@ static vk_submission ggml_vk_begin_submission(ggml_backend_vk_context * ctx, vk_
 }
 
 static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
-    const uint32_t wg0 = CEIL_DIV(elements[0], pipeline
-    const uint32_t wg1 = CEIL_DIV(elements[1], pipeline
-    const uint32_t wg2 = CEIL_DIV(elements[2], pipeline
+    const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
+    const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
+    const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline
+    std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
 #endif
     std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
     std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
-    GGML_ASSERT(pipeline
-    GGML_ASSERT(buffers.size() == pipeline
-    vk::DescriptorSet& descriptor_set = pipeline
-    for (uint32_t i = 0; i < pipeline
+    GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
+    GGML_ASSERT(buffers.size() == pipeline->parameter_count);
+    vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
+    for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
         descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size});
     }
-    for (uint32_t i = 0; i < pipeline
+    for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
         write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
     }
 
-    ctx->device
+    ctx->device->device.updateDescriptorSets(write_descriptor_sets, {});
 
-    subctx->s->buffer.pushConstants(pipeline
-    subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline
+    subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
+    subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
     subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
-                pipeline
+                pipeline->layout,
                 0,
                 { descriptor_set },
                 {});
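The wg_denoms division above converts an element-space problem size into a workgroup count, rounding up so ragged edges are still covered (and masked off inside the shader). A worked example, assuming CEIL_DIV(a, b) is the usual (a + b - 1) / b:

    // Worked example of the workgroup math in ggml_vk_dispatch_pipeline.
    #define CEIL_DIV_SKETCH(a, b) (((a) + (b) - 1) / (b))
    // A pipeline registered with wg_denoms = {512, 1, 1}, asked to process
    // 1000 elements, dispatches two workgroups; the second one is partial:
    static_assert(CEIL_DIV_SKETCH(1000, 512) == 2, "rounds up");
    static_assert(CEIL_DIV_SKETCH(1024, 512) == 2, "exact fit");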
@@ -1804,7 +2016,7 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
             memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
         }
     } else {
-        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device
+        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
         ggml_vk_ctx_begin(ctx, subctx);
         ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, spitch, width, height, true);
         ggml_vk_ctx_end(subctx);
@@ -1814,8 +2026,9 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
         }
 
         ggml_vk_submit(subctx, ctx->fence);
-        VK_CHECK(ctx->device
-        ctx->device
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
+        ctx->device->device.resetFences({ ctx->fence });
+        ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
     }
 }
 
@@ -1900,18 +2113,19 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
 
         memcpy(dst, (uint8_t *) src->ptr + offset, size);
     } else {
-        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device
+        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
         ggml_vk_ctx_begin(ctx, subctx);
         ggml_vk_buffer_read_async(ctx, subctx, src, offset, dst, size, true);
         ggml_vk_ctx_end(subctx);
 
         ggml_vk_submit(subctx, ctx->fence);
-        VK_CHECK(ctx->device
-        ctx->device
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
+        ctx->device->device.resetFences({ ctx->fence });
 
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
+        ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
     }
 }
 
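Every synchronous transfer in these hunks follows the same submit / waitForFences / resetFences triple, now followed by a queue cleanup once the fence has signalled so command buffers can be recycled. The idiom isolated, as a restatement of the code above rather than anything new:

    // Synchronous-submit idiom used by the buffer helpers:
    ggml_vk_submit(subctx, ctx->fence);
    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "waitForFences");
    ctx->device->device.resetFences({ ctx->fence });          // fence becomes reusable
    ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);  // reclaim command buffers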
@@ -1935,15 +2149,13 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
         // Copy within the device
         ggml_backend_vk_context * ctx = src->ctx;
 
-
-
-        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
+        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
         ggml_vk_ctx_begin(ctx, subctx);
         ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
         ggml_vk_ctx_end(subctx);
         ggml_vk_submit(subctx, ctx->fence);
-        VK_CHECK(ctx->device
-        ctx->device
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
+        ctx->device->device.resetFences({ ctx->fence });
     } else {
 #ifdef GGML_VULKAN_DEBUG
         std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl;
@@ -1971,14 +2183,14 @@ static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst,
     // Make sure ctx owns the buffer
     GGML_ASSERT(dst->ctx == ctx);
 
-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device
+    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
     ggml_vk_ctx_begin(ctx, subctx);
     subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
     ggml_vk_ctx_end(subctx);
 
     ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device
-    ctx->device
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_memset waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
 }
 
 static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
@@ -2039,176 +2251,63 @@ static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
 
 static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")";
+    std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")" << std::endl;
 #endif
     if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << " = 4" << std::endl;
-#endif
         return 4;
     }
 
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " = 1" << std::endl;
-#endif
    return 1;
 }
 
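The split-k heuristic returns 4 only for deep-but-narrow products: k large while one output dimension is small, where splitting the reduction over k into four partial sums is the only way to keep enough workgroups busy. The partials are then summed by the split_k_reduce pipeline registered earlier. Worked examples against the condition above:

    // ggml_vk_guess_split_k(m, n, k), evaluated by hand:
    //   m = 32,   n = 4096, k = 4096  -> k > 128 and m < 128      -> split_k = 4
    //   m = 4096, n = 4096, k = 4096  -> neither m nor n below 128 -> split_k = 1
    //   m = 2,    n = 4096, k = 4096  -> m > 2 fails               -> split_k = 1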
-static
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
-#endif
+static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
     if (m <= 32 || n <= 32) {
-        return
+        return aligned ? mmp->a_s : mmp->s;
     }
-
-
-
-    return ctx->pipeline_matmul_f32_aligned_l.align;
+    return aligned ? mmp->a_m : mmp->m;
+
+    GGML_UNUSED(ctx);
 }
 
-static vk_pipeline
-
-    if (m <= 32 || n <= 32) {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << " S" << std::endl;
-#endif
-        return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
-    }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " M" << std::endl;
-#endif
-    return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
-    }
-    if (bit16_x && !bit16_y) {
-    if (m <= 32 || n <= 32) {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << " S" << std::endl;
-#endif
-        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
-    }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " M" << std::endl;
-#endif
-    return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
-    }
-    if (!bit16_x && bit16_y) {
-        GGML_ASSERT(false);
-    }
+static vk_pipeline ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, bool aligned) {
+    return aligned ? mmp->a_m : mmp->m;
 
-
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " S" << std::endl;
-#endif
-    return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
-    }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " M" << std::endl;
-#endif
-    return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
+    GGML_UNUSED(ctx);
 }
 
-static vk_pipeline
-
-    std::cerr << " M" << std::endl;
-#endif
-    if (bit16_x && bit16_y) {
-        return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
-    }
-    if (bit16_x && !bit16_y) {
-        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
-    }
-    if (!bit16_x && bit16_y) {
-        GGML_ASSERT(false);
-    }
-    return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
-}
+static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, bool aligned) {
+    return aligned ? mmp->a_s : mmp->s;
 
-
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " S" << std::endl;
-#endif
-    if (bit16_x && bit16_y) {
-        return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
-    }
-    if (bit16_x && !bit16_y) {
-        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
-    }
-    if (!bit16_x && bit16_y) {
-        GGML_ASSERT(false);
-    }
-    return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
+    GGML_UNUSED(ctx);
 }
 
-static vk_pipeline
+static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_matmul_pipeline(" <<
+    std::cerr << "ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")" << std::endl;
 #endif
-    switch (ctx->device
+    switch (ctx->device->vendor_id) {
     case VK_VENDOR_ID_AMD:
-        return ggml_vk_guess_matmul_pipeline_amd(ctx,
+        return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned);
     case VK_VENDOR_ID_APPLE:
-        return ggml_vk_guess_matmul_pipeline_apple(ctx,
+        return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned);
     case VK_VENDOR_ID_INTEL:
-        return ggml_vk_guess_matmul_pipeline_intel(ctx,
-    }
-
-    if (bit16_x && bit16_y) {
-        if (m <= 32 || n <= 32) {
-#ifdef GGML_VULKAN_DEBUG
-            std::cerr << " S" << std::endl;
-#endif
-            return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
-        }
-        if (m <= 64 || n <= 64) {
-#ifdef GGML_VULKAN_DEBUG
-            std::cerr << " M" << std::endl;
-#endif
-            return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
-        }
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << " L" << std::endl;
-#endif
-        return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
-    }
-    if (bit16_x && !bit16_y) {
-        if (m <= 32 || n <= 32) {
-#ifdef GGML_VULKAN_DEBUG
-            std::cerr << " S" << std::endl;
-#endif
-            return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
-        }
-        if (m <= 64 || n <= 64) {
-#ifdef GGML_VULKAN_DEBUG
-            std::cerr << " M" << std::endl;
-#endif
-            return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
-        }
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << " L" << std::endl;
-#endif
-        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_l : &ctx->pipeline_matmul_f16_f32_l;
-    }
-    if (!bit16_x && bit16_y) {
-        GGML_ASSERT(false);
+        return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned);
     }
 
     if (m <= 32 || n <= 32) {
-
-        std::cerr << " S" << std::endl;
-#endif
-        return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
+        return aligned ? mmp->a_s : mmp->s;
     }
     if (m <= 64 || n <= 64) {
-
-        std::cerr << " M" << std::endl;
-#endif
-        return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
+        return aligned ? mmp->a_m : mmp->m;
     }
+    return aligned ? mmp->a_l : mmp->l;
+}
+
|
+
static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
|
2208
2307
|
#ifdef GGML_VULKAN_DEBUG
|
2209
|
-
std::cerr << "
|
2308
|
+
std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
|
2210
2309
|
#endif
|
2211
|
-
return
|
2310
|
+
return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, false)->align;
|
2212
2311
|
}
|
2213
2312
|
|
2214
2313
|
static void ggml_vk_matmul(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d) {
|
@@ -2226,10 +2325,10 @@ static void ggml_vk_matmul(ggml_backend_vk_context * ctx, vk_context * subctx, v
|
|
2226
2325
|
|
2227
2326
|
const std::array<uint32_t, 14> pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d };
|
2228
2327
|
// Make sure enough workgroups get assigned for split k to work
|
2229
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline
|
2328
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
|
2230
2329
|
ggml_vk_sync_buffers(subctx);
|
2231
2330
|
const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
|
2232
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
|
2331
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
|
2233
2332
|
}
|
2234
2333
|
|
2235
2334
|
static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
|
@@ -2239,41 +2338,39 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
|
|
2239
2338
|
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
2240
2339
|
}
|
2241
2340
|
|
2242
|
-
static vk_pipeline
|
2341
|
+
static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
|
2243
2342
|
if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
|
2244
|
-
return
|
2343
|
+
return ctx->device->pipeline_cpy_f32_f32;
|
2245
2344
|
}
|
2246
2345
|
if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
|
2247
|
-
return
|
2346
|
+
return ctx->device->pipeline_cpy_f32_f16;
|
2248
2347
|
}
|
2249
2348
|
if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
|
2250
|
-
return
|
2349
|
+
return ctx->device->pipeline_cpy_f16_f16;
|
2251
2350
|
}
|
2252
2351
|
|
2253
2352
|
std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
|
2254
2353
|
GGML_ASSERT(false);
|
2255
2354
|
}
|
2256
2355
|
|
2257
|
-
static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline
|
2356
|
+
static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
|
2258
2357
|
#ifdef GGML_VULKAN_DEBUG
|
2259
2358
|
std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
|
2260
2359
|
std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
|
2261
2360
|
#endif
|
2262
2361
|
const int tensor_type_size = ggml_type_size(tensor->type);
|
2263
|
-
const int dst_type_size = ggml_type_size(buffer_type);
|
2264
2362
|
|
2265
|
-
const uint32_t ne = tensor
|
2363
|
+
const uint32_t ne = ggml_nelements(tensor);
|
2266
2364
|
|
2267
|
-
const
|
2268
|
-
|
2269
|
-
const vk_op_cpy_push_constants pc = {
|
2365
|
+
const vk_op_unary_push_constants pc = {
|
2270
2366
|
(uint32_t)ne,
|
2271
|
-
(uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size,
|
2272
|
-
(uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], 1 , (uint32_t)tensor->ne[0] ,
|
2367
|
+
(uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size,
|
2368
|
+
(uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], 1 , (uint32_t)tensor->ne[0] , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]),
|
2273
2369
|
0,
|
2370
|
+
0.0f, 0.0f,
|
2274
2371
|
};
|
2275
2372
|
ggml_vk_sync_buffers(subctx);
|
2276
|
-
ggml_vk_dispatch_pipeline(ctx, subctx,
|
2373
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
|
2277
2374
|
}
|
2278
2375
|
|
2279
2376
|
static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
@@ -2313,23 +2410,30 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
|
2313
2410
|
bool src0_uma = false;
|
2314
2411
|
bool src1_uma = false;
|
2315
2412
|
|
2316
|
-
if (ctx->device
|
2413
|
+
if (ctx->device->uma) {
|
2317
2414
|
ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
|
2318
2415
|
ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
|
2319
2416
|
src0_uma = d_Qx != nullptr;
|
2320
2417
|
src1_uma = d_Qy != nullptr;
|
2321
2418
|
}
|
2322
2419
|
|
2323
|
-
const bool load_x = src0->backend !=
|
2324
|
-
const bool load_y = src1->backend !=
|
2420
|
+
const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
|
2421
|
+
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
2325
2422
|
|
2326
2423
|
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
2327
2424
|
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
2328
2425
|
|
2329
|
-
const bool
|
2426
|
+
const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
|
2330
2427
|
|
2331
|
-
|
2332
|
-
|
2428
|
+
vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type);
|
2429
|
+
|
2430
|
+
const bool qx_needs_dequant = mmp == nullptr || x_non_contig;
|
2431
|
+
const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
|
2432
|
+
|
2433
|
+
if (mmp == nullptr) {
|
2434
|
+
// Fall back to dequant + f16 mulmat
|
2435
|
+
mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16);
|
2436
|
+
}
|
2333
2437
|
|
2334
2438
|
// Not implemented
|
2335
2439
|
GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT
|
@@ -2338,17 +2442,17 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
|
2338
2442
|
const int y_ne = ne11 * ne10;
|
2339
2443
|
const int d_ne = ne11 * ne01;
|
2340
2444
|
|
2341
|
-
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, ne01, ne11));
|
2445
|
+
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11));
|
2342
2446
|
const bool aligned = ne10 == kpad;
|
2343
2447
|
|
2344
2448
|
const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
|
2345
2449
|
|
2346
|
-
vk_pipeline
|
2450
|
+
vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned);
|
2347
2451
|
|
2348
2452
|
const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
|
2349
2453
|
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
|
2350
|
-
const uint64_t x_sz = sizeof(ggml_fp16_t) * x_ne;
|
2351
|
-
const uint64_t y_sz =
|
2454
|
+
const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
|
2455
|
+
const uint64_t y_sz = y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
|
2352
2456
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
2353
2457
|
|
2354
2458
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
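The alignment check just above decides between the generic and the "aligned" kernel variant: kpad rounds the shared dimension up to the pipeline's alignment, and only an exact match allows the faster aligned path. A worked example, assuming ggml_vk_align_size rounds up to a multiple:

    // Alignment decision, spelled out. With a pipeline align of 32:
    //   ne10 = 4096 -> kpad = 4096 -> aligned, use the a_* variant
    //   ne10 = 4100 -> kpad = 4128 -> not aligned, use the generic kernel
    static uint32_t align_up_sketch(uint32_t x, uint32_t a) { return ((x + a - 1) / a) * a; }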
@@ -2379,7 +2483,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     } else {
         d_X = d_Qx;
         x_buf_offset = qx_buf_offset;
-        GGML_ASSERT(qx_sz == x_sz);
+        GGML_ASSERT(qx_sz == x_sz);
     }
     if (qy_needs_dequant) {
         d_Y = ctx->prealloc_y;
@@ -2390,8 +2494,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         GGML_ASSERT(qy_sz == y_sz);
     }

-    vk_pipeline
-    vk_pipeline
+    vk_pipeline to_fp16_vk_0 = nullptr;
+    vk_pipeline to_fp16_vk_1 = nullptr;

     if (x_non_contig) {
         to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
@@ -2407,19 +2511,19 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT

     // Allocate descriptor sets
-    ggml_pipeline_allocate_descriptor_sets(ctx,
+    ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
     if (qx_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx,
+        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1);
     }
     if (qy_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx,
+        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, 1);
     }
     if (split_k > 1) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_matmul_split_k_reduce,
+        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
     }

     if (x_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
     } else if (load_x || qx_needs_dequant) {
         if (load_x) {
             // copy data to device
@@ -2428,13 +2532,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         }

         if (qx_needs_dequant) {
-            const std::vector<
+            const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx,
+            ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
         }
     }
     if (y_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
     } else if (load_y) {
         ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
     }
@@ -2451,9 +2555,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     }

     // compute
-    ggml_vk_matmul(ctx, subctx,
+    ggml_vk_matmul(ctx, subctx, pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT

-    if (dst->backend ==
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) ((char *) dst->data);
         ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
@@ -2499,15 +2603,15 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     bool src0_uma = false;
     bool src1_uma = false;

-    if (ctx->device
+    if (ctx->device->uma) {
         ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
         ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
         src0_uma = d_Qx != nullptr;
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_x = src0->backend !=
-    const bool load_y = src1->backend !=
+    const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;

     const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
     const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@@ -2521,9 +2625,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     const uint64_t y_ne = ne11 * ne10;
     const uint64_t d_ne = ne11 * ne01;

-    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device
+    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
-    const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device
+    const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
     const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
     const uint64_t d_sz = sizeof(float) * d_ne;

@@ -2563,8 +2667,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
         GGML_ASSERT(qy_sz == y_sz);
     }

-    vk_pipeline
-    vk_pipeline
+    vk_pipeline to_fp16_vk_0 = nullptr;
+    vk_pipeline to_fp16_vk_1 = nullptr;
    if (x_non_contig) {
        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
    }
@@ -2573,30 +2677,30 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
-    vk_pipeline
+    vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type);
     GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
     GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
     GGML_ASSERT(dmmv != nullptr);

     // Allocate descriptor sets
     if (qx_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx,
+        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1);
     }
     if (qy_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx,
+        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13);
     }
-    ggml_pipeline_allocate_descriptor_sets(ctx,
+    ggml_pipeline_allocate_descriptor_sets(ctx, dmmv, ne12 * ne13);

     if (x_non_contig) {
-        GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }
+        GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
     } else if (load_x) {
         // copy data to device
         ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
     } else if (load_y) {
         ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
     }
@@ -2613,24 +2717,24 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
         const uint64_t y_offset = y_buf_offset + y_sz * it_idx1;
         const uint64_t d_offset = d_buf_offset + d_sz * it_idx1;

-        const uint64_t y_buffer_offset = (y_offset / ctx->device
+        const uint64_t y_buffer_offset = (y_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
         const uint64_t y_shader_offset = y_offset - y_buffer_offset;

-        const uint64_t d_buffer_offset = (d_offset / ctx->device
+        const uint64_t d_buffer_offset = (d_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
         const uint64_t d_shader_offset = d_offset - d_buffer_offset;

         if (!y_non_contig && qy_needs_dequant) {
-            const std::vector<
+            const std::vector<uint32_t> pc = { (uint32_t)ne11, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(y_ne / 32) };
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx,
+            ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_1, { { d_Qy, qy_offset, qy_sz }, { d_Y, y_offset, y_sz } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)y_ne, 1, 1});
         }

         // compute
-        const std::array<
+        const std::array<uint32_t, 3> pc = { (uint32_t)ne00, (uint32_t)(y_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type))};
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx,
+        ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});

-        if (dst->backend ==
+        if (dst->backend == GGML_BACKEND_TYPE_CPU) {
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             ggml_vk_sync_buffers(subctx);
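
Aside: the y_buffer_offset / y_shader_offset split in the hunk above is the standard workaround for Vulkan's minStorageBufferOffsetAlignment requirement: the raw element offset is divided into an aligned part bound through the descriptor and a remainder handed to the shader. A minimal standalone sketch of the same arithmetic (struct and helper names here are illustrative, not from the library):

#include <cstdint>

struct SplitOffset {
    uint64_t buffer_offset; // aligned portion, used when binding the buffer
    uint64_t shader_offset; // remainder, added to indices inside the shader
};

// Split a byte offset against the device's minimum storage-buffer alignment.
static SplitOffset split_offset_sketch(uint64_t offset, uint64_t min_align) {
    const uint64_t buffer_offset = (offset / min_align) * min_align;
    return { buffer_offset, offset - buffer_offset };
}

// e.g. split_offset_sketch(100, 64) -> { 64, 36 }: bind at byte 64, shader skips 36 more.
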
@@ -2647,7 +2751,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend ==
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -2674,18 +2778,18 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c

     bool src1_uma = false;

-    if (ctx->device
+    if (ctx->device->uma) {
         ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_y = src1->backend !=
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;

     const uint64_t x_ne = ne00 * ne01 * ne02;
     const uint64_t y_ne = ne10 * ne11 * ne12;
     const uint64_t d_ne = ne01 * ne11 * ne12;

-    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device
+    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
     const uint64_t d_sz = sizeof(float) * d_ne;

@@ -2704,12 +2808,12 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     }

     // Allocate descriptor sets
-    ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, 1);
+    ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1);

-    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device
+    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;

-    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device
+    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

     if (load_y) {
@@ -2719,9 +2823,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     // compute
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });

-    if (dst->backend ==
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) dst->data;
         ggml_vk_sync_buffers(subctx);
@@ -2738,7 +2842,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend ==
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -2766,12 +2870,12 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con

     bool src1_uma = false;

-    if (ctx->device
+    if (ctx->device->uma) {
         ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_y = src1->backend !=
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;

     const uint64_t d_ne = ne01 * ne11 * ne12;

@@ -2797,12 +2901,12 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     }

     // Allocate descriptor sets
-    ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, 1);
+    ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);

-    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device
+    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;

-    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device
+    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

     if (load_y) {
@@ -2812,9 +2916,9 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     // compute
     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });

-    if (dst->backend ==
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) dst->data;
         ggml_vk_sync_buffers(subctx);
@@ -2832,7 +2936,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr
     return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
         dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend ==
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
 }

 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
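
Aside: ggml_vk_can_mul_mat above gates Vulkan offload on problem size. A one-line recap of the decision, as an illustrative helper (not part of the library):

// Only claim the matmul when every dimension is at least 32, unless the
// weight tensor already lives on the GPU and transfer cost is moot.
static bool worth_offloading_sketch(int64_t ne0, int64_t ne1, int64_t ne10, bool weights_on_gpu) {
    return (ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || weights_on_gpu;
}
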
@@ -2850,6 +2954,10 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
     }
 }

+// static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+//
+// }
+
 static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     // guaranteed to be an integer due to the check in ggml_can_repeat
     const uint64_t ne0 = dst->ne[0];
@@ -2880,8 +2988,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     // TODO: support for transposed / permuted tensors
     GGML_ASSERT(nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(src0->backend ==
-    GGML_ASSERT(dst->backend ==
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
+    GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -2921,40 +3029,40 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
 }


-static vk_pipeline
+static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
     switch (op) {
     case GGML_OP_ADD:
         if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return
+            return ctx->device->pipeline_add_f32;
         }
         return nullptr;
     case GGML_OP_GET_ROWS:
         GGML_ASSERT(src1->type == GGML_TYPE_I32);
         if (dst->type == GGML_TYPE_F16) {
-            return
+            return ctx->device->pipeline_get_rows[src0->type];
         }
         if (dst->type == GGML_TYPE_F32) {
-            return
+            return ctx->device->pipeline_get_rows_f32[src0->type];
         }
         return nullptr;
     case GGML_OP_MUL:
         if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return
+            return ctx->device->pipeline_mul_f32;
         }
         return nullptr;
     case GGML_OP_SCALE:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return
+            return ctx->device->pipeline_scale_f32;
         }
         return nullptr;
     case GGML_OP_SQR:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return
+            return ctx->device->pipeline_sqr_f32;
         }
         return nullptr;
     case GGML_OP_CLAMP:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return
+            return ctx->device->pipeline_clamp_f32;
         }
         return nullptr;
     case GGML_OP_CPY:
@@ -2963,29 +3071,29 @@ static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type);
     case GGML_OP_NORM:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return
+            return ctx->device->pipeline_norm_f32;
         }
         return nullptr;
     case GGML_OP_RMS_NORM:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return
+            return ctx->device->pipeline_rms_norm_f32;
         }
         return nullptr;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(dst)) {
         case GGML_UNARY_OP_SILU:
             if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                return
+                return ctx->device->pipeline_silu_f32;
             }
             break;
         case GGML_UNARY_OP_GELU:
             if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                return
+                return ctx->device->pipeline_gelu_f32;
             }
             break;
         case GGML_UNARY_OP_RELU:
             if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                return
+                return ctx->device->pipeline_relu_f32;
             }
             break;
         default:
@@ -2994,12 +3102,12 @@ static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         return nullptr;
     case GGML_OP_DIAG_MASK_INF:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return
+            return ctx->device->pipeline_diag_mask_inf_f32;
         }
         return nullptr;
     case GGML_OP_SOFT_MAX:
-        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return
+        if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && (src2 == nullptr || src2->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_soft_max_f32;
         }
         return nullptr;
     case GGML_OP_ROPE:
@@ -3014,21 +3122,26 @@ static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const

         if (is_neox) {
             if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                return
+                return ctx->device->pipeline_rope_neox_f32;
             }
             if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-                return
+                return ctx->device->pipeline_rope_neox_f16;
             }
         } else {
             if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                return
+                return ctx->device->pipeline_rope_f32;
             }
             if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-                return
+                return ctx->device->pipeline_rope_f16;
             }
         }
         return nullptr;
     }
+    case GGML_OP_ARGSORT:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_I32) {
+            return ctx->device->pipeline_argsort_f32;
+        }
+        return nullptr;
     default:
         return nullptr;
     }
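
Aside: the op-to-pipeline mapping above now resolves through ctx->device rather than the context itself, so every context created for the same GPU shares one set of compiled pipelines. A minimal sketch of the selection pattern for a single op, assuming the device object owns one pipeline handle per op/type combination as the diff suggests (helper name illustrative):

static vk_pipeline pick_add_pipeline_sketch(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
    // all-f32 add is the only combination with a dedicated shader here
    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
        return ctx->device->pipeline_add_f32; // a shared_ptr, shared by every context on this device
    }
    return nullptr; // caller falls back or asserts
}
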
@@ -3044,17 +3157,19 @@ static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
 }

 template<typename PC>
-static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) {
+static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     if (src1 != nullptr) {
         std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     }
+    if (src2 != nullptr) {
+        std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", backend=" << src2->backend << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
+    }
     std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
 #endif
     GGML_ASSERT(!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))); // NOLINT
     GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
-    GGML_ASSERT(src1 == nullptr || ggml_vk_dim01_contiguous(src1)); // NOLINT
     GGML_ASSERT(dst->extra != nullptr);
     const uint64_t ne00 = src0->ne[0];
     const uint64_t ne01 = src0->ne[1];
@@ -3071,7 +3186,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     const uint64_t nb2 = dst->nb[2];
     const uint64_t nb3 = dst->nb[3];

-
+    const bool use_src2 = src2 != nullptr;
+    const uint64_t ne2 = use_src2 ? src2->ne[0] * src2->ne[1] : 0;
+
+    vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
     ggml_vk_func_t op_func;

     if (pipeline == nullptr) {
@@ -3092,40 +3210,50 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
     ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;

     vk_buffer d_X = nullptr;
     size_t x_buf_offset = 0;
     vk_buffer d_Y = nullptr;
     size_t y_buf_offset = 0;
+    vk_buffer d_Z = nullptr;
+    size_t z_buf_offset = 0;

     bool src0_uma = false;
     bool src1_uma = false;
+    bool src2_uma = false;

-    if (ctx->device
+    if (ctx->device->uma) {
         ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset);
         src0_uma = d_X != nullptr;
         if (use_src1) {
             ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset);
             src1_uma = d_Y != nullptr;
         }
+        if (use_src2) {
+            ggml_vk_host_get(ctx, src1->data, d_Z, z_buf_offset);
+            src2_uma = d_Z != nullptr;
+        }
     }

-    const bool transfer_src0 = src0->backend !=
-    const bool transfer_src1 = use_src1 && src1->backend !=
+    const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
+    const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
+    const bool transfer_src2 = use_src2 && src2->backend != GGML_BACKEND_TYPE_GPU && !src2_uma;

-    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device
-    uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device
+    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+    uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
+    uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
     uint64_t d_sz = ggml_type_size(dst->type) * ne0;

     vk_buffer d_D = extra->buffer_gpu.lock();

     // Workaround for tiny tensor inputs on ROPE
-    if (use_src1 && src1->backend ==
+    if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
         y_sz = VK_WHOLE_SIZE;
     }

     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = (extra->offset / ctx->device
+    uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
     if (transfer_src0) {
         d_X = ctx->prealloc_qx;
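
Aside: the UMA branch above maps host pointers directly on unified-memory devices, so an explicit host-to-device transfer is only scheduled when a tensor is neither GPU-resident nor UMA-visible. (The use_src2 branch passes src1->data to ggml_vk_host_get; that is reproduced verbatim from the upstream source.) The transfer decision, restated as an illustrative helper:

// A tensor needs a staging copy only if it is off-GPU and not reachable via UMA.
static bool needs_transfer_sketch(bool on_gpu, bool uma_visible) {
    return !on_gpu && !uma_visible;
}
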
@@ -3142,6 +3270,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         GGML_ASSERT(d_Y != nullptr);
     }

+    GGML_ASSERT(!transfer_src2);
+    if (use_src2 && !src2_uma) {
+        d_Z = extra_src2->buffer_gpu.lock();
+        z_buf_offset = extra_src2->offset;
+        GGML_ASSERT(d_Z != nullptr);
+    }
+
     if (op == GGML_OP_CPY) {
         GGML_ASSERT(!transfer_src0);
         GGML_ASSERT(!transfer_src1);
@@ -3169,7 +3304,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c

     // Single call if dimension 2 is contiguous
     if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
-        ggml_pipeline_allocate_descriptor_sets(ctx,
+        ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);

         switch (dst->op) {
         case GGML_OP_NORM:
@@ -3198,26 +3333,42 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             }
         }

-        if (
-            // Empty src1
+        if (op == GGML_OP_SOFT_MAX) {
+            // Empty src1 and src2 are possible on soft_max, but the shader needs buffers
+            vk_subbuffer subbuf_y;
+            if (use_src1) {
+                subbuf_y = { d_Y, y_buf_offset, y_sz };
+            } else {
+                subbuf_y = { ctx->prealloc_y, 0, ctx->prealloc_y->size };
+            }
+
+            vk_subbuffer subbuf_z;
+            if (use_src2) {
+                subbuf_z = { d_Z, z_buf_offset, z_sz };
+            } else {
+                subbuf_z = { ctx->prealloc_y, 0, ctx->prealloc_y->size };
+            }
+
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx,
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else if (use_src1) {
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx,
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else {
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx,
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         }
-        if (dst->backend ==
+        if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
             ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
-        } else if(dst->backend ==
+        } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
             // copy dst to host
             float * d = (float *) dst->data;
             ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
         }
     } else {
-
+        GGML_ASSERT(op != GGML_OP_SOFT_MAX);
+
+        ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);

         switch (dst->op) {
         case GGML_OP_NORM:
@@ -3242,18 +3393,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             const uint32_t y_offset = y_sz * it_idx1;
             const uint32_t d_offset = d_sz * it_idx0;

-            if (
-                // Empty src1 is possible on soft_max, but the shader needs a buffer
-                ggml_vk_sync_buffers(subctx);
-                ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
-            } else if (use_src1) {
+            if (use_src1) {
                 ggml_vk_sync_buffers(subctx);
-                ggml_vk_dispatch_pipeline(ctx, subctx,
+                ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
             } else {
                 ggml_vk_sync_buffers(subctx);
-                ggml_vk_dispatch_pipeline(ctx, subctx,
+                ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
             }
-            if (dst->backend ==
+            if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                 // copy dst to host
                 ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
             }
@@ -3263,69 +3410,141 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
 }

 static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
 }

 static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
 }

 static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ADD, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        0.0f, 0.0f,
+    });
 }

 static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_MUL, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        0.0f, 0.0f,
+    });
 }

 static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     float * op_params = (float *)dst->op_params;
-
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        op_params[0], 0.0f
+    });
 }

 static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        0.0f, 0.0f,
+    });
 }

 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     float * op_params = (float *)dst->op_params;
-
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        op_params[0], op_params[1],
+    });
 }

 static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    const
-    const
-    const uint32_t d_offset = (extra->offset % ctx->device
-
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+    const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size,
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         d_offset,
+        0.0f, 0.0f,
     });
 }

 static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f });
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f });
 }

 static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
 }

 static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
 }

 static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     int32_t * op_params = (int32_t *)dst->op_params;
-    ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
+    ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
 }

-static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
     float * op_params = (float *)dst->op_params;
-
+
+    float scale = op_params[0];
+    float max_bias = op_params[1];
+
+    const uint32_t ncols = (uint32_t)src0->ne[0];
+    const uint32_t nrows_x = (uint32_t)ggml_nrows(src0);
+    const uint32_t nrows_y = (uint32_t)src0->ne[1];
+
+    const uint32_t n_head_kv = nrows_x/nrows_y;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+
+    const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_SOFT_MAX, {
+        ncols,
+        nrows_y,
+        src2 != nullptr ? (uint32_t)1 : (uint32_t)0,
+        scale, max_bias,
+        m0, m1,
+        n_head_log2,
+    });
 }

 static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
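
Aside: the soft_max change above threads the ALiBi slope parameters (m0, m1, n_head_log2) through the push constants. A small standalone program reproducing the same slope math with illustrative values (not part of the diff):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    const float max_bias = 8.0f;    // illustrative op parameter
    const uint32_t n_head_kv = 12;  // illustrative head count

    // Largest power of two not exceeding the head count, as in ggml_vk_soft_max above.
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); // == 8

    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2); // slope base, first n_head_log2 heads -> 0.5
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); // slope base, remaining heads -> ~0.7071

    printf("n_head_log2=%u m0=%f m1=%f\n", n_head_log2, m0, m1);
    return 0;
}
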
@@ -3351,15 +3570,20 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
|
|
3351
3570
|
if (is_neox) {
|
3352
3571
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
3353
3572
|
const float inv_ndims = -1.0f / n_dims;
|
3354
|
-
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims });
|
3573
|
+
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims });
|
3355
3574
|
} else {
|
3356
|
-
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f });
|
3575
|
+
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f });
|
3357
3576
|
}
|
3358
3577
|
}
|
3359
3578
|
|
3579
|
+
static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
3580
|
+
int32_t * op_params = (int32_t *)dst->op_params;
|
3581
|
+
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
|
3582
|
+
}
|
3583
|
+
|
3360
3584
|
static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
3361
3585
|
// If backend is CPU, data from src0 has to be copied off the device
|
3362
|
-
if (dst->backend ==
|
3586
|
+
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
3363
3587
|
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
3364
3588
|
vk_buffer d_D = extra_src0->buffer_gpu.lock();
|
3365
3589
|
ggml_vk_sync_buffers(subctx);
|
@@ -3408,43 +3632,43 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
3408
3632
|
const size_t y_ne = k * n * batch;
|
3409
3633
|
const size_t d_ne = m * n * batch;
|
3410
3634
|
|
3411
|
-
vk_pipeline
|
3635
|
+
vk_pipeline p;
|
3412
3636
|
std::string shname;
|
3413
3637
|
if (shader_size == 0) {
|
3414
3638
|
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
3415
|
-
p =
|
3639
|
+
p = ctx->device->pipeline_matmul_f32->a_s;
|
3416
3640
|
shname = "F32_ALIGNED_S";
|
3417
3641
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
3418
|
-
p =
|
3642
|
+
p = ctx->device->pipeline_matmul_f16_f32->a_s;
|
3419
3643
|
shname = "F16_F32_ALIGNED_S";
|
3420
3644
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
3421
|
-
p =
|
3645
|
+
p = ctx->device->pipeline_matmul_f16->a_s;
|
3422
3646
|
shname = "F16_ALIGNED_S";
|
3423
3647
|
} else {
|
3424
3648
|
GGML_ASSERT(false);
|
3425
3649
|
}
|
3426
3650
|
} else if (shader_size == 1) {
|
3427
3651
|
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
3428
|
-
p =
|
3652
|
+
p = ctx->device->pipeline_matmul_f32->a_m;
|
3429
3653
|
shname = "F32_ALIGNED_M";
|
3430
3654
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
3431
|
-
p =
|
3655
|
+
p = ctx->device->pipeline_matmul_f16_f32->a_m;
|
3432
3656
|
shname = "F16_F32_ALIGNED_M";
|
3433
3657
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
3434
|
-
p =
|
3658
|
+
p = ctx->device->pipeline_matmul_f16->a_m;
|
3435
3659
|
shname = "F16_ALIGNED_M";
|
3436
3660
|
} else {
|
3437
3661
|
GGML_ASSERT(false);
|
3438
3662
|
}
|
3439
3663
|
} else if (shader_size == 2) {
|
3440
3664
|
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
3441
|
-
p =
|
3665
|
+
p = ctx->device->pipeline_matmul_f32->a_l;
|
3442
3666
|
shname = "F32_ALIGNED_L";
|
3443
3667
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
3444
|
-
p =
|
3668
|
+
p = ctx->device->pipeline_matmul_f16_f32->a_l;
|
3445
3669
|
shname = "F16_F32_ALIGNED_L";
|
3446
3670
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
3447
|
-
p =
|
3671
|
+
p = ctx->device->pipeline_matmul_f16->a_l;
|
3448
3672
|
shname = "F16_ALIGNED_L";
|
3449
3673
|
} else {
|
3450
3674
|
GGML_ASSERT(false);
|
@@ -3458,43 +3682,43 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
3458
3682
|
if (k != kpad) {
|
3459
3683
|
if (shader_size == 0) {
|
3460
3684
|
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
3461
|
-
p =
|
3685
|
+
p = ctx->device->pipeline_matmul_f32->s;
|
3462
3686
|
shname = "F32_S";
|
3463
3687
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
3464
|
-
p =
|
3688
|
+
p = ctx->device->pipeline_matmul_f16_f32->s;
|
3465
3689
|
shname = "F16_F32_S";
|
3466
3690
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
3467
|
-
p =
|
3691
|
+
p = ctx->device->pipeline_matmul_f16->s;
|
3468
3692
|
shname = "F16_S";
|
3469
3693
|
}
|
3470
3694
|
} else if (shader_size == 1) {
|
3471
3695
|
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
3472
|
-
p =
|
3696
|
+
p = ctx->device->pipeline_matmul_f32->m;
|
3473
3697
|
shname = "F32_M";
|
3474
3698
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
3475
|
-
p =
|
3699
|
+
p = ctx->device->pipeline_matmul_f16_f32->m;
|
3476
3700
|
shname = "F16_F32_M";
|
3477
3701
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
3478
|
-
p =
|
3702
|
+
p = ctx->device->pipeline_matmul_f16->m;
|
3479
3703
|
shname = "F16_M";
|
3480
3704
|
}
|
3481
3705
|
} else if (shader_size == 2) {
|
3482
3706
|
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
3483
|
-
p =
|
3707
|
+
p = ctx->device->pipeline_matmul_f32->l;
|
3484
3708
|
shname = "F32_L";
|
3485
3709
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
3486
|
-
p =
|
3710
|
+
p = ctx->device->pipeline_matmul_f16_f32->l;
|
3487
3711
|
shname = "F16_F32_L";
|
3488
3712
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
3489
|
-
p =
|
3713
|
+
p = ctx->device->pipeline_matmul_f16->l;
|
3490
3714
|
shname = "F16_L";
|
3491
3715
|
}
|
3492
3716
|
}
|
3493
3717
|
}
|
3494
3718
|
|
3495
|
-
ggml_pipeline_allocate_descriptor_sets(ctx,
|
3719
|
+
ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
|
3496
3720
|
if (split_k > 1) {
|
3497
|
-
ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_matmul_split_k_reduce, num_it);
|
3721
|
+
ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
3498
3722
|
|
3499
3723
|
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
|
3500
3724
|
// Resize buffer
|
@@ -3524,9 +3748,11 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     }
     for (size_t i = 0; i < y_ne; i++) {
         if (std::is_same<float, Y_TYPE>()) {
-            y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
+            // y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
+            y[i] = (i % k == i / k) ? 1.0f : 0.0f;
         } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
-            y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
+            // y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
+            y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f);
        } else {
            GGML_ASSERT(false);
        }
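A note on the change above: the Y operand is switched from random data to an identity matrix, so the expected product D = X·Y reproduces X directly and a bad output element can be traced straight back to a row/column of the input. A minimal sketch of the same fill pattern, assuming the test's flattened k x n layout (the helper name make_identity is illustrative, not from the source):

    // Sketch: fill a flattened k x n matrix with the identity pattern used above.
    // An element i lies on the diagonal when its row (i % k) equals its column (i / k).
    #include <vector>

    std::vector<float> make_identity(size_t k, size_t n) {
        std::vector<float> y(k * n);
        for (size_t i = 0; i < k * n; i++) {
            y[i] = (i % k == i / k) ? 1.0f : 0.0f; // 1 on the diagonal, 0 elsewhere
        }
        return y;
    }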
@@ -3535,17 +3761,17 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
     ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);

-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device
+    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
     for (size_t i = 0; i < num_it; i++) {
         ggml_vk_ctx_begin(ctx, subctx);
-        ggml_vk_matmul(ctx, subctx,
+        ggml_vk_matmul(ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, split_k, batch, batch, batch, 1, 1, k*m, k*n, m*n);
         ggml_vk_ctx_end(subctx);
     }

     auto begin = std::chrono::high_resolution_clock::now();
     ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device
-    ctx->device
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });

     auto end = std::chrono::high_resolution_clock::now();
     double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
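The hunk above also shows the benchmark's synchronization backbone: all num_it dispatches are recorded first, submitted once, and then the host blocks on a fence and resets it for reuse, so the measured interval brackets only the GPU work. A reduced sketch of that submit/wait/reset pattern in Vulkan-Hpp, assuming a prerecorded command buffer cb and queue q (names are illustrative):

    // Sketch: time a batch of prerecorded GPU work with a single fence.
    #include <chrono>
    #include <cstdint>
    #include <vulkan/vulkan.hpp>

    double time_submission(vk::Device device, vk::Queue q, vk::CommandBuffer cb, vk::Fence fence) {
        auto begin = std::chrono::high_resolution_clock::now();
        vk::SubmitInfo submit_info(0, nullptr, nullptr, 1, &cb);
        q.submit({ submit_info }, fence);
        (void) device.waitForFences({ fence }, true, UINT64_MAX); // block until the GPU finishes
        device.resetFences({ fence });                            // a fence must be reset before reuse
        auto end = std::chrono::high_resolution_clock::now();
        return std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0;
    }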
@@ -3624,6 +3850,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
     std::cerr << "Actual result: " << std::endl << std::endl;
     ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+    std::cerr << std::endl;
+    ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n + 15, first_err_b);
     std::cerr << "Expected result: " << std::endl << std::endl;
     ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);

@@ -3649,15 +3877,15 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t

     free(d_chk);

-    ggml_vk_queue_cleanup(ctx, ctx->device
-    ggml_vk_queue_cleanup(ctx, ctx->device
+    ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
+    ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue);

     ggml_vk_destroy_buffer(d_X);
     ggml_vk_destroy_buffer(d_Y);
     ggml_vk_destroy_buffer(d_D);

-    ggml_pipeline_cleanup(
-    ggml_pipeline_cleanup(ctx->pipeline_matmul_split_k_reduce);
+    ggml_pipeline_cleanup(p);
+    ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce);

     free(x);
     free(y);

@@ -3730,7 +3958,7 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_
         data[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
     }

-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device
+    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
     ggml_vk_ctx_begin(ctx, subctx);

     vk_buffer buffer = ggml_vk_create_buffer_check(ctx, ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal);

@@ -3739,8 +3967,8 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_

     ggml_vk_ctx_end(subctx);
     ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device
-    ctx->device
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_h2d_nc waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });

     ggml_vk_buffer_read(ctx, buffer, 0, result_data, ggml_nbytes(tensor));

@@ -3812,7 +4040,7 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool
         x[i] = rand() / (float)RAND_MAX;
     }

-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device
+    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
     ggml_vk_ctx_begin(ctx, subctx);

     auto begin = std::chrono::high_resolution_clock::now();

@@ -3826,8 +4054,8 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool

     ggml_vk_ctx_end(subctx);
     ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device
-    ctx->device
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });

     auto end = std::chrono::high_resolution_clock::now();

@@ -3841,8 +4069,8 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool

     ggml_vk_ctx_end(subctx);
     ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device
-    ctx->device
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });

     for (auto& cpy : subctx->out_memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);
@@ -3873,89 +4101,118 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool
     }
 }

-static void
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl;
-#endif
-    const size_t x_sz = sizeof(float) * ne;
-    const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
-    const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
-    float * x = (float *) malloc(x_sz);
-    void * qx = malloc(qx_sz);
-    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
-
-    for (size_t i = 0; i < ne; i++) {
-        x[i] = rand() / (float)RAND_MAX;
-    }
-
+static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
     std::vector<int64_t> hist_cur(1 << 4, 0);

-    vk_pipeline& p = ctx->pipeline_dequant[quant];
-
     switch(quant) {
+        case GGML_TYPE_F32:
+            memcpy(to, from, sizeof(float) * ne);
+            break;
         case GGML_TYPE_Q4_0:
-            ggml_quantize_q4_0(
+            ggml_quantize_q4_0(from, to, ne, ne, hist_cur.data());
             break;
         case GGML_TYPE_Q4_1:
-            ggml_quantize_q4_1(
+            ggml_quantize_q4_1(from, to, ne, ne, hist_cur.data());
             break;
         case GGML_TYPE_Q5_0:
-            ggml_quantize_q5_0(
+            ggml_quantize_q5_0(from, to, ne, ne, hist_cur.data());
             break;
         case GGML_TYPE_Q5_1:
-
+            ggml_quantize_q5_1(from, to, ne, ne, hist_cur.data());
             break;
         case GGML_TYPE_Q8_0:
-            ggml_quantize_q8_0(
+            ggml_quantize_q8_0(from, to, ne, ne, hist_cur.data());
             break;
         case GGML_TYPE_Q2_K:
-            ggml_quantize_q2_K(
+            ggml_quantize_q2_K(from, to, ne, ne, hist_cur.data());
             break;
         case GGML_TYPE_Q3_K:
-            ggml_quantize_q3_K(
+            ggml_quantize_q3_K(from, to, ne, ne, hist_cur.data());
            break;
         case GGML_TYPE_Q4_K:
-            ggml_quantize_q4_K(
+            ggml_quantize_q4_K(from, to, ne, ne, hist_cur.data());
             break;
         case GGML_TYPE_Q5_K:
-            ggml_quantize_q5_K(
+            ggml_quantize_q5_K(from, to, ne, ne, hist_cur.data());
             break;
         case GGML_TYPE_Q6_K:
-            ggml_quantize_q6_K(
+            ggml_quantize_q6_K(from, to, ne, ne, hist_cur.data());
             break;
         default:
             GGML_ASSERT(false);
     }
+}
+
+static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl;
+#endif
+    const size_t x_sz = sizeof(float) * ne;
+    const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
+    const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
+    float * x = (float *) malloc(x_sz);
+    void * qx = malloc(qx_sz);
+    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
+
+    for (size_t i = 0; i < ne; i++) {
+        x[i] = rand() / (float)RAND_MAX;
+    }
+
+    vk_pipeline p = ctx->device->pipeline_dequant[quant];
+
+    ggml_vk_quantize_data(x, qx, ne, quant);

     ggml_pipeline_allocate_descriptor_sets(ctx, p, 1);

     ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);

-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device
+    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
     ggml_vk_ctx_begin(ctx, subctx);
-    const std::vector<
+    const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
     ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
     ggml_vk_ctx_end(subctx);

     auto begin = std::chrono::high_resolution_clock::now();

     ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device
-    ctx->device
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });

     auto end = std::chrono::high_resolution_clock::now();

     double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
     ggml_vk_buffer_read(ctx, x_buf, 0, x_chk, x_sz_f16);

+    int first_err = -1;
+
     double avg_err = 0.0;
     for (size_t i = 0; i < ne; i++) {
-
+        double error = std::fabs(x[i] - ggml_fp16_to_fp32(x_chk[i]));
+        avg_err += error;
+
+        if (first_err < 0 && error > 0.05) {
+            first_err = i;
+        }
     }

-
+    avg_err /= ne;
+
+    std::cerr << "TEST DEQUANT " << ggml_type_name(quant) << " time=" << ms_dequant << "ms avg_err=" << avg_err << std::endl;
+
+    if (avg_err > 0.1) {
+        std::cerr << "first_error = " << first_err << std::endl;
+        std::cerr << "Actual result: " << std::endl << std::endl;
+        for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
+            std::cerr << ggml_fp16_to_fp32(x_chk[i]) << ", ";
+        }
+        std::cerr << std::endl << "Expected result: " << std::endl << std::endl;
+        for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
+            std::cerr << x[i] << ", ";
+        }
+        std::cerr << std::endl;
+    }

     ggml_vk_destroy_buffer(x_buf);
     ggml_vk_destroy_buffer(qx_buf);
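The refactor above pulls the per-type quantization switch out into ggml_vk_quantize_data so that both ggml_vk_test_dequant and the new matmul test below can share it, and the dequant test now also records where the error first exceeds a threshold. The error bookkeeping from that hunk, isolated as a standalone sketch (the struct and function names here are illustrative; the 0.05 first-error threshold mirrors the one above):

    // Sketch: accumulate the average round-trip error and remember the first
    // element whose error exceeds the threshold, as ggml_vk_test_dequant does.
    #include <cmath>
    #include <cstddef>

    struct dequant_stats { double avg_err; int first_err; };

    dequant_stats check_dequant(const float * x, const float * x_chk, size_t ne) {
        dequant_stats s = { 0.0, -1 };
        for (size_t i = 0; i < ne; i++) {
            double error = std::fabs(x[i] - x_chk[i]);
            s.avg_err += error;
            if (s.first_err < 0 && error > 0.05) {
                s.first_err = (int) i; // where the drift starts
            }
        }
        s.avg_err /= ne;
        return s;
    }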
@@ -3964,6 +4221,190 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
     free(qx);
     free(x_chk);
 }
+
+static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")" << std::endl;
+#endif
+    const size_t x_ne = m * k * batch;
+    const size_t y_ne = k * n * batch;
+    const size_t d_ne = m * n * batch;
+
+    vk_pipeline p;
+    std::string shname;
+    if (shader_size == 0) {
+        p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_s;
+        shname = std::string(ggml_type_name(quant)) + "_ALIGNED_S";
+    } else if (shader_size == 1) {
+        p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_m;
+        shname = std::string(ggml_type_name(quant)) + "_ALIGNED_M";
+    } else if (shader_size == 2) {
+        p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_l;
+        shname = std::string(ggml_type_name(quant)) + "_ALIGNED_L";
+    } else {
+        GGML_ASSERT(0);
+    }
+
+    const size_t kpad = ggml_vk_align_size(k, p->align);
+
+    if (k != kpad) {
+        if (shader_size == 0) {
+            p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->s;
+            shname = std::string(ggml_type_name(quant)) + "_S";
+        } else if (shader_size == 1) {
+            p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->m;
+            shname = std::string(ggml_type_name(quant)) + "_M";
+        } else if (shader_size == 2) {
+            p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->l;
+            shname = std::string(ggml_type_name(quant)) + "_L";
+        } else {
+            GGML_ASSERT(0);
+        }
+    }
+
+    const size_t x_sz = sizeof(float) * x_ne;
+    const size_t y_sz = sizeof(float) * y_ne;
+    const size_t qx_sz = x_ne * ggml_type_size(quant)/ggml_blck_size(quant);
+    const size_t d_sz = sizeof(float) * d_ne;
+    float * x = (float *) malloc(x_sz);
+    float * y = (float *) malloc(y_sz);
+    void * qx = malloc(qx_sz);
+    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer y_buf = ggml_vk_create_buffer_check(ctx, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_buf = ggml_vk_create_buffer_check(ctx, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    float * d = (float *) malloc(d_sz);
+    float * d_chk = (float *) malloc(d_sz);
+
+    for (size_t i = 0; i < x_ne; i++) {
+        x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
+    }
+
+    ggml_vk_quantize_data(x, qx, x_ne, quant);
+
+    for (size_t i = 0; i < y_ne; i++) {
+        // y[i] = rand() / (float)RAND_MAX;
+        y[i] = (i % k == i / k) ? 1.0f : 0.0f;
+    }
+
+    ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
+    if (split_k > 1) {
+        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
+
+        if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
+            // Resize buffer
+            if (ctx->prealloc_split_k != nullptr) {
+                ggml_vk_destroy_buffer(ctx->prealloc_split_k);
+            }
+            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
+        }
+    }
+
+    ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
+    ggml_vk_buffer_write(ctx, y_buf, 0, y, y_sz);
+
+    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+    for (size_t i = 0; i < num_it; i++) {
+        ggml_vk_ctx_begin(ctx, subctx);
+        ggml_vk_matmul(ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, split_k, batch, batch, batch, 1, 1, k*m, k*n, m*n);
+        ggml_vk_ctx_end(subctx);
+    }
+
+    auto begin = std::chrono::high_resolution_clock::now();
+
+    ggml_vk_submit(subctx, ctx->fence);
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
+
+    auto end = std::chrono::high_resolution_clock::now();
+
+    double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
+    ggml_vk_buffer_read(ctx, d_buf, 0, d, d_sz);
+
+    ggml_init_params iparams = {
+        /*.mem_size   =*/ 1024*1024*1024,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+
+    ggml_context * ggml_ctx = ggml_init(iparams);
+
+    ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, quant, k, m, batch);
+    ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, GGML_TYPE_F32, k, n, batch);
+    ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml);
+
+    src0_ggml->data = qx;
+    src1_ggml->data = y;
+    tensor_ggml->data = d_chk;
+
+    ctx->disable = true;
+
+    ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
+    ggml_build_forward_expand(cgraph, tensor_ggml);
+
+    ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
+
+    ctx->disable = false;
+
+    ggml_free(ggml_ctx);
+
+    double avg_err = 0.0;
+    int first_err_n = -1;
+    int first_err_m = -1;
+    int first_err_b = -1;
+
+    for (size_t i = 0; i < m*n*batch; i++) {
+        double err = std::fabs(d[i] - d_chk[i]);
+        avg_err += err;
+
+        if ((err > 0.05f || std::isnan(err)) && first_err_n == -1) {
+            first_err_b = i / (m * n);
+            first_err_n = (i % (m * n)) / m;
+            first_err_m = (i % (m * n)) % m;
+        }
+    }
+
+    avg_err /= m * n;
+
+    std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;
+
+    if (avg_err > 0.1 || std::isnan(avg_err)) {
+        std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
+        std::cerr << "Actual result: " << std::endl << std::endl;
+        ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+        std::cerr << std::endl;
+        std::cerr << "Expected result: " << std::endl << std::endl;
+        ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+        if (split_k > 1) {
+            float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
+            ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
+
+            std::cerr << "d_buf0: " << std::endl << std::endl;
+            ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+            std::cerr << "d_buf1: " << std::endl << std::endl;
+            ggml_vk_print_matrix_area(split_k_buf + d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+            std::cerr << "d_buf2: " << std::endl << std::endl;
+            ggml_vk_print_matrix_area(split_k_buf + 2 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+            std::cerr << "d_buf3: " << std::endl << std::endl;
+            ggml_vk_print_matrix_area(split_k_buf + 3 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+            free(split_k_buf);
+        }
+    }
+
+    ggml_vk_destroy_buffer(qx_buf);
+    ggml_vk_destroy_buffer(y_buf);
+    ggml_vk_destroy_buffer(d_buf);
+
+    free(x);
+    free(qx);
+    free(y);
+    free(d);
+    free(d_chk);
+}
 #endif

 static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
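When the new test above finds a bad element, it converts the flat index back into (batch, column, row) coordinates before printing. That decomposition, extracted as a small sketch for a column-major m x n result replicated over the batch dimension (the struct and helper names are illustrative):

    // Sketch: decompose a flat index into the (b, n, m) coordinates the test
    // prints, matching the layout d[b*m*n + n_idx*m + m_idx].
    #include <cstddef>

    struct coord { int b, n, m; };

    coord locate(size_t i, size_t m, size_t n) {
        coord c;
        c.b = (int)(i / (m * n));        // which batch slice
        c.n = (int)((i % (m * n)) / m);  // column within the slice
        c.m = (int)((i % (m * n)) % m);  // row within the column
        return c;
    }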
@@ -3976,29 +4417,19 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
     return extra;
 }

-static
-
-
-    for (int i = graph->n_nodes - 1; i >= 0; i--) {
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (graph->nodes[i]->src[j] == node) {
-                return graph->nodes[i];
-            }
-        }
-    }
-
-    return nullptr;
+static bool ggml_vk_cpu_assist_op(const ggml_tensor * node) {
+    return node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID;
 }

 static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
 #endif
-    const bool any_on_device = node->backend ==
-        || (node->src[0] != nullptr && (node->src[0]->backend ==
-        || (node->src[1] != nullptr && (node->src[1]->backend ==
+    const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
+        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));

-    if (ctx->disable || (!any_on_device && node
+    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node))) {
         return;
     }

@@ -4029,7 +4460,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
     const bool f16_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32;

     int split_k;
-    if (node->op == GGML_OP_MUL_MAT) {
+    if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
         split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
     } else {
         split_k = 1;

@@ -4038,11 +4469,11 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
     const uint32_t y_ne = ne10 * ne11;
     const uint32_t d_ne = ne20 * ne21;

-    const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device
-    const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device
-    const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device
-    const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device
-    uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device
+    const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
+    const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
+    const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
+    const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
+    uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
     const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;

     if (extra->buffer_gpu.expired()) {
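Each of the sizes above is rounded up to minStorageBufferOffsetAlignment so every tensor slice can be bound at a legal storage-buffer offset. The definition of ggml_vk_align_size is not part of this hunk; a plausible round-up helper with the usual semantics (an assumption for illustration, not the vendored definition) would be:

    // Sketch: round `size` up to the next multiple of `align`.
    // The division form works for any positive align; for power-of-two
    // alignments, (size + align - 1) & ~(align - 1) is an equivalent form.
    #include <cstdint>

    static uint64_t align_size(uint64_t size, uint64_t align) {
        return ((size + align - 1) / align) * align;
    }
    // e.g. align_size(100, 64) == 128, align_size(128, 64) == 128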
@@ -4070,6 +4501,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
+        case GGML_OP_ARGSORT:
             break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(node)) {

@@ -4082,6 +4514,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
             }
             break;
         case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
            if (ctx->prealloc_size_qx < qx_sz) {
                ctx->prealloc_size_qx = qx_sz;
            }
@@ -4115,21 +4548,66 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 #endif
 #if defined(GGML_VULKAN_RUN_TESTS)
     ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
-        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     ggml_vk_test_transfer(ctx, 8192 * 1000, false);
     ggml_vk_test_transfer(ctx, 8192 * 1000, true);

-    ggml_vk_test_dequant(ctx,
-    ggml_vk_test_dequant(ctx,
-    ggml_vk_test_dequant(ctx,
-    ggml_vk_test_dequant(ctx,
-    ggml_vk_test_dequant(ctx,
-    ggml_vk_test_dequant(ctx,
-    ggml_vk_test_dequant(ctx,
-    ggml_vk_test_dequant(ctx,
-    ggml_vk_test_dequant(ctx,
-    ggml_vk_test_dequant(ctx,
+    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
+    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
+    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
+    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_0);
+    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_1);
+    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q8_0);
+    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);
+
+    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
+    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
+    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
+    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
+    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
+    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
+
+    std::cerr << std::endl;

     const std::vector<size_t> vals {
         8, 8, 8,
@@ -4215,11 +4693,11 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 }

 static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-    const bool any_on_device = node->backend ==
-        || (node->src[0] != nullptr && (node->src[0]->backend ==
-        || (node->src[1] != nullptr && node->src[1]->backend ==
+    const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
+        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);

-    if (ctx->disable || (!any_on_device && node
+    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node)) || (ggml_vk_cpu_assist_op(node) && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
         return;
     }

@@ -4231,6 +4709,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod

     const ggml_tensor * src0 = node->src[0];
     const ggml_tensor * src1 = node->src[1];
+    const ggml_tensor * src2 = node->src[2];

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;

@@ -4265,7 +4744,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_SOFT_MAX:
     case GGML_OP_ROPE:
     case GGML_OP_MUL_MAT:
+    case GGML_OP_MUL_MAT_ID:
     case GGML_OP_NONE:
+    case GGML_OP_ARGSORT:
         break;
     default:
         if (any_on_device) {

@@ -4276,7 +4757,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     }

     if (ctx->compute_ctx == nullptr) {
-        ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device
+        ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
         ggml_vk_ctx_begin(ctx, ctx->compute_ctx);
     }
@@ -4347,16 +4828,25 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod

         break;
     case GGML_OP_SOFT_MAX:
-        ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);
+        ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, src2, node);

         break;
     case GGML_OP_ROPE:
         ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node);

+        break;
+    case GGML_OP_ARGSORT:
+        ggml_vk_argsort(ctx, ctx->compute_ctx, src0, node);
         break;
     case GGML_OP_MUL_MAT:
         ggml_vk_mul_mat(ctx, ctx->compute_ctx, src0, src1, node);

+        break;
+    case GGML_OP_MUL_MAT_ID:
+        //ggml_vk_mul_mat_id(ctx, ctx->compute_ctx, src0, src1, node);
+        std::cerr << "ggml_vulkan: GGML_OP_MUL_MAT_ID not implemented yet." << std::endl;
+        GGML_ASSERT(false);
+
         break;
     default:
         return;
@@ -4371,7 +4861,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif

-    if (node->backend ==
+    if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
         ggml_vk_ctx_end(ctx->compute_ctx);
         ctx->compute_ctx->exit_tensor = node;
         ctx->compute_ctx = nullptr;

@@ -4379,11 +4869,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 }

 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-    const bool any_on_device = tensor->backend ==
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend ==
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend ==
+    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);

-    if (ctx->disable || (!any_on_device && tensor
+    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(tensor))) {
         return false;
     }

@@ -4409,6 +4899,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
+    case GGML_OP_ARGSORT:
         extra = (ggml_tensor_extra_gpu *) tensor->extra;

         break;

@@ -4424,6 +4915,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         }
         break;
     case GGML_OP_MUL_MAT:
+    case GGML_OP_MUL_MAT_ID:
         if (!any_on_device && !ggml_vk_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
             return false;
         }

@@ -4442,7 +4934,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     if (params->ith != 0) {
         return true;
     }
-    if (params->type ==
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return true;
     }
@@ -4469,8 +4961,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     }

     if (tensor == subctx.exit_tensor) {
-        VK_CHECK(ctx->device
-        ctx->device
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
+        ctx->device->device.resetFences({ ctx->fence });

         // Do staging buffer copies
         for (auto& cpy : subctx.out_memcpys) {
@@ -4498,20 +4990,25 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
     }
     ctx->gc.temp_buffers.clear();

-    for (auto
-
+    for (auto& pipeline : ctx->device->pipelines) {
+        if (pipeline.expired()) {
+            continue;
+        }
+
+        vk_pipeline pl = pipeline.lock();
+        ggml_pipeline_cleanup(pl);
     }

-    ggml_vk_queue_cleanup(ctx, ctx->device
-    ggml_vk_queue_cleanup(ctx, ctx->device
+    ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue);
+    ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);

     for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
-        ctx->device
+        ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
     }
     ctx->gc.semaphores.clear();

     for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) {
-        ctx->device
+        ctx->device->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s });
     }
     ctx->gc.tl_semaphores.clear();
     ctx->semaphore_idx = 0;
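The cleanup loop above reflects the ownership change in this release: the device keeps weak references to its pipelines (vk_pipeline_ref), so dead entries are skipped and live ones are pinned into a shared_ptr for the duration of the call. The same pattern in isolation, as a sketch with an illustrative pipeline type:

    // Sketch: iterate weak references, skipping expired ones and locking the
    // rest so they stay alive while being cleaned up.
    #include <memory>
    #include <vector>

    struct pipeline { /* descriptor pools, sets, ... */ };

    void cleanup_all(std::vector<std::weak_ptr<pipeline>>& pipelines) {
        for (auto& wp : pipelines) {
            if (wp.expired()) {
                continue;                            // already destroyed elsewhere
            }
            std::shared_ptr<pipeline> p = wp.lock(); // pin for the cleanup call
            // ... per-pipeline cleanup would go here ...
        }
    }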
@@ -4519,7 +5016,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
     ctx->event_idx = 0;

     for (auto& event : ctx->gc.events) {
-        ctx->device
+        ctx->device->device.resetEvent(event);
     }

     ctx->staging_offset = 0;

@@ -4556,21 +5053,11 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     ctx->staging_size = 0;

     for (auto& event : ctx->gc.events) {
-        ctx->device
+        ctx->device->device.destroyEvent(event);
     }
     ctx->gc.events.clear();

-
-        ggml_vk_destroy_pipeline(ctx, pipeline);
-    }
-    ctx->gc.pipelines.clear();
-
-    ctx->device.lock()->device.destroyFence(ctx->fence);
-
-    ctx->device.lock()->device.destroyCommandPool(ctx->device.lock()->compute_queue.pool);
-    if (!ctx->device.lock()->single_queue) {
-        ctx->device.lock()->device.destroyCommandPool(ctx->device.lock()->transfer_queue.pool);
-    }
+    ctx->device->device.destroyFence(ctx->fence);
 }

 GGML_CALL static int ggml_vk_get_device_count() {
@@ -4745,7 +5232,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
     }

-    tensor->backend =
+    tensor->backend = GGML_BACKEND_TYPE_GPU;
     tensor->extra = extra;
 }

@@ -4753,7 +5240,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend ==
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

@@ -4768,7 +5255,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend ==
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

@@ -4781,7 +5268,6 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu

 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
     if (ggml_backend_buffer_is_vk(src->buffer)) {
-        ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
         ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
         ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

@@ -4793,6 +5279,8 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
         return true;
     }
     return false;
+
+    UNUSED(buffer);
 }

 GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {

@@ -4839,12 +5327,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(

 GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->ctx->device
+    return ctx->ctx->device->properties.limits.minStorageBufferOffsetAlignment;
 }

 GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->ctx->device
+    return ctx->ctx->device->max_memory_allocation_size;
 }

 GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
@@ -4930,7 +5418,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
 }

 GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return vk_instance.contexts[0].device
+    return vk_instance.contexts[0].device->properties.limits.minMemoryMapAlignment;

     UNUSED(buft);
 }

@@ -4975,8 +5463,7 @@ GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {

     ggml_vk_cleanup(ctx);

-
-    vk_instance.devices[ctx->idx].reset();
+    ctx->device.reset();
     ctx->initialized = false;

     vk_instance.initialized[idx] = false;

@@ -4999,13 +5486,13 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 #endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend ==
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

     if (ctx->transfer_ctx == nullptr) {
         // Initialize new transfer context
-        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device
+        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
         ggml_vk_ctx_begin(ctx, ctx->transfer_ctx);
     }

@@ -5020,13 +5507,13 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 #endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend ==
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

     if (ctx->transfer_ctx == nullptr) {
         // Initialize new transfer context
-        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device
+        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
         ggml_vk_ctx_begin(ctx, ctx->transfer_ctx);
     }

@@ -5046,7 +5533,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c

     if (ctx->transfer_ctx == nullptr) {
         // Initialize new transfer context
-        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device
+        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
         ggml_vk_ctx_begin(ctx, ctx->transfer_ctx);
     }

@@ -5076,8 +5563,8 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
     }

     ggml_vk_submit(ctx->transfer_ctx, ctx->fence);
-    VK_CHECK(ctx->device
-    ctx->device
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });

     for (auto& cpy : ctx->transfer_ctx->out_memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);
@@ -5086,7 +5573,7 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
     ctx->transfer_ctx = nullptr;
 }

-GGML_CALL static
+GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

     for (int i = 0; i < cgraph->n_nodes; i++) {

@@ -5097,7 +5584,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
     int last_node = cgraph->n_nodes - 1;

     // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
-    while (last_node > 0 && cgraph->nodes[last_node]->backend !=
+    while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) {
         last_node -= 1;
     }

@@ -5106,7 +5593,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
     }

     ggml_compute_params params = {};
-    params.type =
+    params.type = GGML_TASK_TYPE_COMPUTE;
     params.ith = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];

@@ -5129,7 +5616,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml

     ggml_vk_graph_cleanup(ctx);

-    return
+    return GGML_STATUS_SUCCESS;

     UNUSED(backend);
 }

@@ -5147,6 +5634,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         }
         break;
     case GGML_OP_MUL_MAT:
+    case GGML_OP_MUL_MAT_ID:
         {
             struct ggml_tensor * a;
             struct ggml_tensor * b;

@@ -5220,6 +5708,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
     case GGML_OP_CONT:
     case GGML_OP_DIAG_MASK_INF:
     case GGML_OP_SOFT_MAX:
+    case GGML_OP_ARGSORT:
         return true;
     default:
         return false;
@@ -5244,6 +5733,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .supports_op = */ ggml_backend_vk_supports_op,
 };

+static ggml_guid_t ggml_backend_vk_guid() {
+    static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
+    return &guid;
+}
+
 GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
     if (vk_instance.initialized[idx]) {
         return vk_instance.backends[idx];
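The new GUID gives the Vulkan backend a stable identity: ggml_backend_is_vk (changed further below) compares the backend's guid field against this constant instead of inspecting function pointers. A minimal sketch of byte-wise GUID matching; ggml's own ggml_guid_matches lives in the ggml core, so this stands in for it under the assumption that it is a plain 16-byte comparison:

    // Sketch: identify a backend by comparing 16-byte GUIDs, as the new
    // ggml_backend_is_vk does via ggml_guid_matches.
    #include <cstdint>
    #include <cstring>

    typedef uint8_t guid_t[16];

    static bool guid_matches(const guid_t a, const guid_t b) {
        return memcmp(a, b, sizeof(guid_t)) == 0; // equal iff all 16 bytes match
    }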
@@ -5262,6 +5756,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
     vk_instance.initialized[idx] = true;

     ggml_backend_t vk_backend = new ggml_backend {
+        /* .guid = */ ggml_backend_vk_guid(),
         /* .interface = */ ggml_backend_vk_interface,
         /* .context = */ &vk_instance.contexts[ctx->idx],
     };

@@ -5272,7 +5767,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
 }

 GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) {
-    return backend && backend->
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
 }

 GGML_CALL int ggml_backend_vk_get_device_count() {

@@ -5410,13 +5905,14 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
 static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
     void * tensor_data = tensor->data;

-    if (tensor->backend ==
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         const size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);

         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

-
+        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
     }

     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -5436,14 +5932,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
     std::vector<const ggml_tensor *> done;
     ggml_vk_print_graph_origin(tensor, done);

-    if (tensor->backend ==
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         free(tensor_data);
     }
 }

 static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
     return;
-    GGML_ASSERT(tensor->backend ==
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
     if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
         return;
     }

@@ -5481,7 +5977,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
     if (params->ith != 0) {
         return;
     }
-    if (params->type ==
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
         return;
     }

@@ -5492,6 +5988,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_

     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
+    ggml_tensor * src2 = tensor->src[2];

     struct ggml_init_params iparams = {
         /*.mem_size =*/ 1024*1024*1024,

@@ -5503,13 +6000,16 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_

     struct ggml_tensor * src0_clone = nullptr;
     struct ggml_tensor * src1_clone = nullptr;
+    struct ggml_tensor * src2_clone = nullptr;
     struct ggml_tensor * tensor_clone = nullptr;

     size_t src0_size;
     size_t src1_size;
+    size_t src2_size;

     void * src0_buffer;
     void * src1_buffer;
+    void * src2_buffer;

     if (src0 != nullptr) {
         src0_clone = ggml_dup_tensor(ggml_ctx, src0);
@@ -5518,17 +6018,18 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
5518
6018
|
|
5519
6019
|
src0_buffer = malloc(src0_size);
|
5520
6020
|
src0_clone->data = src0_buffer;
|
5521
|
-
if (src0->backend ==
|
6021
|
+
if (src0->backend == GGML_BACKEND_TYPE_CPU) {
|
5522
6022
|
memcpy(src0_clone->data, src0->data, src0_size);
|
5523
6023
|
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
5524
|
-
} else if (src0->backend ==
|
6024
|
+
} else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
|
5525
6025
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6026
|
+
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
5526
6027
|
uint64_t offset = extra->offset;
|
5527
6028
|
if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
|
5528
6029
|
for (int i3 = 0; i3 < src0->ne[3]; i3++) {
|
5529
6030
|
for (int i2 = 0; i2 < src0->ne[2]; i2++) {
|
5530
6031
|
const int idx = i3*src0->ne[2] + i2;
|
5531
|
-
ggml_vk_buffer_read(ctx,
|
6032
|
+
ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]);
|
5532
6033
|
}
|
5533
6034
|
}
|
5534
6035
|
|
@@ -5538,10 +6039,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
5538
6039
|
src0_clone->nb[i] = src0_clone->nb[i - 1]*src0_clone->ne[i - 1];
|
5539
6040
|
}
|
5540
6041
|
} else {
|
5541
|
-
if (offset + src0_size >=
|
5542
|
-
src0_size =
|
6042
|
+
if (offset + src0_size >= buffer_gpu->size) {
|
6043
|
+
src0_size = buffer_gpu->size - offset;
|
5543
6044
|
}
|
5544
|
-
ggml_vk_buffer_read(ctx,
|
6045
|
+
ggml_vk_buffer_read(ctx, buffer_gpu, offset, src0_clone->data, src0_size);
|
5545
6046
|
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
5546
6047
|
}
|
5547
6048
|
} else {
|
@@ -5561,17 +6062,18 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 
         src1_buffer = malloc(src1_size);
         src1_clone->data = src1_buffer;
-        if (src1->backend == GGML_BACKEND_CPU) {
+        if (src1->backend == GGML_BACKEND_TYPE_CPU) {
             memcpy(src1_clone->data, src1->data, src1_size);
             memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
-        } else if (src1->backend == GGML_BACKEND_GPU) {
+        } else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
+            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
             uint64_t offset = extra->offset;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                 for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src1->ne[2]; i2++) {
                         const int idx = i3*src1->ne[2] + i2;
-                        ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]);
+                        ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]);
                     }
                 }
 
@@ -5581,10 +6083,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
                     src1_clone->nb[i] = src1_clone->nb[i - 1]*src1_clone->ne[i - 1];
                 }
             } else {
-                if (offset + src1_size >= extra->buffer_gpu.size) {
-                    src1_size = extra->buffer_gpu.size - offset;
+                if (offset + src1_size >= buffer_gpu->size) {
+                    src1_size = buffer_gpu->size - offset;
                 }
-                ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset, src1_clone->data, src1_size);
+                ggml_vk_buffer_read(ctx, buffer_gpu, offset, src1_clone->data, src1_size);
                 memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
             }
         } else {
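The fallback branch repeated for src0 and src1 (and for src2 below) clamps the read size so that `offset + size` never runs past the end of the GPU buffer. A self-contained restatement of just that arithmetic; the function name is illustrative, not from the diff:

    #include <cassert>
    #include <cstdint>

    // Mirrors `if (offset + size >= buffer_size) size = buffer_size - offset;`
    // from the hunks above.
    static uint64_t clamp_read_size(uint64_t offset, uint64_t size, uint64_t buffer_size) {
        assert(offset <= buffer_size);   // a read starting past the end is a caller bug
        if (offset + size >= buffer_size) {
            size = buffer_size - offset;
        }
        return size;
    }

    int main() {
        assert(clamp_read_size(0, 100, 1024) == 100);    // fits: unchanged
        assert(clamp_read_size(1000, 100, 1024) == 24);  // clamped to the buffer tail
        return 0;
    }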
@@ -5613,6 +6115,66 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 
         ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone);
     }
+    if (src2 != nullptr) {
+        src2_clone = ggml_dup_tensor(ggml_ctx, src2);
+
+        src2_size = ggml_nbytes(src2);
+
+        src2_buffer = malloc(src2_size);
+        src2_clone->data = src2_buffer;
+        if (src2->backend == GGML_BACKEND_TYPE_CPU) {
+            memcpy(src2_clone->data, src2->data, src2_size);
+            memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
+        } else if (src2->backend == GGML_BACKEND_TYPE_GPU) {
+            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
+            vk_buffer buf = extra->buffer_gpu.lock();
+            uint64_t offset = extra->offset;
+            if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
+                for (int i3 = 0; i3 < src2->ne[3]; i3++) {
+                    for (int i2 = 0; i2 < src2->ne[2]; i2++) {
+                        const int idx = i3*src2->ne[2] + i2;
+                        ggml_vk_buffer_read(ctx, buf, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
+                    }
+                }
+
+                src2_clone->nb[0] = src2->nb[0];
+                src2_clone->nb[1] = src2->nb[1];
+                for (int i = 2; i < GGML_MAX_DIMS; i++) {
+                    src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
+                }
+            } else {
+                if (offset + src2_size >= buf->size) {
+                    src2_size = buf->size - offset;
+                }
+                ggml_vk_buffer_read(ctx, buf, offset, src2_clone->data, src2_size);
+                memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
+            }
+        } else {
+            GGML_ASSERT(false);
+        }
+
+        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
+            ggml_vk_print_tensor(ctx, src2, "src2");
+            std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
+            std::cerr << "src2_clone=" << tensor << " src2_clone->backend: " << src2_clone->backend << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
+            if (src2->src[0] != nullptr) {
+                std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " backend=" << src2->src[0]->backend << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
+            }
+            if (src2->src[1] != nullptr) {
+                std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " backend=" << src2->src[1]->backend << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
+            }
+            std::cerr << std::endl << "Result:" << std::endl;
+            ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
+            std::cerr << std::endl;
+            std::cerr << std::endl << "Result:" << std::endl;
+            ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0);
+            std::cerr << std::endl;
+            std::vector<const ggml_tensor *> done;
+            ggml_vk_print_graph_origin(src2_clone, done);
+        }
+
+        ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src2", src2_clone);
+    }
 
     if (tensor->op == GGML_OP_MUL_MAT) {
         tensor_clone = ggml_mul_mat(ggml_ctx, src0_clone, src1_clone);
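For tensors that are contiguous only in their first two dimensions, the clone above is filled slice by slice and its outer strides are then recomputed as fully packed (`nb[i] = nb[i-1] * ne[i-1]`). A small sketch of just that stride recomputation, under assumed 4-dimension bookkeeping:

    #include <cassert>
    #include <cstddef>

    #define GGML_MAX_DIMS 4   // matches ggml's dimension limit

    // Recompute byte strides nb[] so dims >= 2 are densely packed, as the copy
    // loops above do after reading each (i2, i3) slice; a sketch, not the
    // library function.
    static void make_outer_dims_contiguous(const size_t ne[GGML_MAX_DIMS], size_t nb[GGML_MAX_DIMS]) {
        for (int i = 2; i < GGML_MAX_DIMS; i++) {
            nb[i] = nb[i - 1] * ne[i - 1];
        }
    }

    int main() {
        size_t ne[GGML_MAX_DIMS] = { 8, 4, 2, 2 };    // elements per dimension
        size_t nb[GGML_MAX_DIMS] = { 4, 32, 0, 0 };   // f32: nb[0]=4, nb[1]=ne[0]*4
        make_outer_dims_contiguous(ne, nb);
        assert(nb[2] == nb[1] * ne[1] && nb[3] == nb[2] * ne[2]);   // 128 and 256
        return 0;
    }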
@@ -5632,7 +6194,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
     } else if (tensor->op == GGML_OP_SOFT_MAX) {
         if (src1 != nullptr) {
-            tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, *(float *)tensor->op_params);
+            tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
         } else {
             tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
         }
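The `ggml_soft_max_ext` call now passes `src2_clone` as a third tensor argument and unpacks two floats from `op_params` instead of one; judging from this release's `ggml.h`, these correspond to the scale and ALiBi max-bias parameters, though that naming is an assumption here. A sketch of the op_params unpacking the hunk above relies on:

    #include <cstring>
    #include <iostream>

    int main() {
        // Stand-in for ggml_tensor::op_params, which is a small raw byte array.
        char op_params[16] = {};

        // Pack the two parameters the way graph-building code would
        // (names `scale` and `max_bias` assumed from ggml_soft_max_ext).
        const float packed[2] = { 0.125f, 8.0f };
        std::memcpy(op_params, packed, sizeof(packed));

        // Unpack exactly as the hunk above does.
        const float scale    = ((const float *) op_params)[0];
        const float max_bias = ((const float *) op_params)[1];
        std::cout << "scale=" << scale << " max_bias=" << max_bias << "\n";
        return 0;
    }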
@@ -5715,6 +6277,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
     if (src1 != nullptr) {
         free(src1_buffer);
     }
+    if (src2 != nullptr) {
+        free(src2_buffer);
+    }
 
     ggml_free(ggml_ctx);
 }
@@ -5723,7 +6288,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
     if (params->ith != 0) {
         return;
     }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
         return;
     }
     if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
@@ -5735,17 +6300,18 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 
     void * tensor_data = tensor->data;
 
-    if (tensor->backend == GGML_BACKEND_GPU) {
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
 
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
-        if (extra->offset + tensor_size >= extra->buffer_gpu.size) {
-            tensor_size = extra->buffer_gpu.size - (extra->offset);
+        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
+        if (extra->offset + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - (extra->offset);
         }
 
-        ggml_vk_buffer_read(ctx, extra->buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
     }
 
     float first_error_result = -1.0f;
|
@@ -5868,7 +6434,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
5868
6434
|
comp_result = nullptr;
|
5869
6435
|
comp_size = 0;
|
5870
6436
|
|
5871
|
-
if (tensor->backend ==
|
6437
|
+
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
5872
6438
|
free(tensor_data);
|
5873
6439
|
}
|
5874
6440
|
}
|