llama_cpp 0.15.1 → 0.15.3

@@ -114,6 +114,7 @@ struct vk_device {
  size_t idx;

  vk_matmul_pipeline pipeline_matmul_f32;
+ vk_matmul_pipeline pipeline_matmul_f32_f16;
  vk_matmul_pipeline pipeline_matmul_f16;
  vk_matmul_pipeline pipeline_matmul_f16_f32;
  vk_pipeline pipeline_matmul_split_k_reduce;
@@ -289,12 +290,12 @@ struct vk_op_rope_neox_push_constants {
  float corr_dims[4];
  float theta_scale;
  float inv_ndims;
+ uint32_t has_freq_facs;
  };

  struct vk_op_soft_max_push_constants {
  uint32_t KX;
  uint32_t KY;
- uint32_t KZ;
  float scale;
  float max_bias;
  float m0;
@@ -304,7 +305,8 @@ struct vk_op_soft_max_push_constants {

  struct vk_op_argsort_push_constants {
  uint32_t ncols;
- bool ascending;
+ uint32_t ncols_pad;
+ int32_t order;
  };

  // Allow pre-recording command buffers
@@ -375,13 +377,12 @@ struct ggml_backend_vk_context {
  vk_context * compute_ctx;
  vk_context * transfer_ctx;

- bool disable;
  bool initialized;

  size_t idx;
  };

- struct vk_instance {
+ struct vk_instance_t {
  vk::Instance instance;

  std::vector<size_t> device_indices;
@@ -423,7 +424,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
  typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);

  static bool vk_instance_initialized = false;
- static vk_instance vk_instance;
+ static vk_instance_t vk_instance;

  GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);

@@ -1013,6 +1014,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  uint32_t s_align = 32;

  ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+ ctx->device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
  ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
  ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
@@ -1048,6 +1050,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1230,6 +1239,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1501,14 +1517,14 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);

- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);

- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
  }
@@ -1859,7 +1875,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
  ctx->compute_ctx = nullptr;
  ctx->transfer_ctx = nullptr;

- ctx->disable = false;
  ctx->initialized = true;

  ctx->idx = idx;
@@ -1903,6 +1918,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_f32;
  }
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+ return ctx->device->pipeline_matmul_f32_f16;
+ }
  if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_f16_f32;
  }
@@ -2722,7 +2740,7 @@ static void ggml_vk_matmul(
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
  uint32_t expert_stride_b, uint32_t expert_stride_d, uint32_t idx, uint32_t nbi1, uint32_t n_as) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
+ std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
  #endif
  ggml_vk_sync_buffers(subctx);
  if (split_k == 1) {
@@ -2792,7 +2810,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_

  static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
+ std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
  std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
  #endif
  const int tensor_type_size = ggml_type_size(tensor->type);
@@ -2812,9 +2830,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context

  static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
+ std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
  #endif
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -2982,19 +3000,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
  ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21,
  0, 0, 0, 0, 1
  ); // NOLINT
-
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
- // copy dst to host
- float * d = (float *) ((char *) dst->data);
- ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
- }
  }

  static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
+ std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
  #endif
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -3147,12 +3159,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context

  static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
+ std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
  #endif
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
- GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -3217,25 +3228,17 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
  const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
  ggml_vk_sync_buffers(subctx);
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
- // copy dst to host
- float * d = (float *) dst->data;
- ggml_vk_sync_buffers(subctx);
- ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
- }
  }

  static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
+ std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
  #endif
  GGML_ASSERT(!ggml_is_transposed(src0));
  GGML_ASSERT(!ggml_is_transposed(src1));
  GGML_ASSERT(!ggml_is_permuted(src0));
- GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -3302,26 +3305,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
  const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
  ggml_vk_sync_buffers(subctx);
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
- // copy dst to host
- float * d = (float *) dst->data;
- ggml_vk_sync_buffers(subctx);
- ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
- }
- }
-
- static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
- const uint64_t ne10 = src1->ne[0];
-
- const uint64_t ne0 = dst->ne[0];
- const uint64_t ne1 = dst->ne[1];
-
- // TODO: find the optimal values for these
- return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
- (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
- dst->type == GGML_TYPE_F32 &&
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
  }

  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3711,8 +3694,6 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
  // TODO: support for transposed / permuted tensors
  GGML_ASSERT(nb0 == sizeof(float));
  GGML_ASSERT(nb00 == sizeof(float));
- GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
- GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -3830,12 +3811,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
  return nullptr;
  case GGML_OP_SOFT_MAX:
  GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
- GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32 || src2->type == GGML_TYPE_F16);

- if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && (src2 == nullptr || src2->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
+ if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
  return ctx->device->pipeline_soft_max_f32;
  }
- if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && src2->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
  return ctx->device->pipeline_soft_max_f32_f16;
  }
  return nullptr;
@@ -3874,6 +3854,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
  default:
  return nullptr;
  }
+
+ GGML_UNUSED(src2);
  }

  static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
@@ -3903,14 +3885,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
  template<typename PC>
  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+ std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
  if (src1 != nullptr) {
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
  }
  if (src2 != nullptr) {
- std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", backend=" << src2->backend << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
+ std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
  }
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
  #endif
  GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
  GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
@@ -3920,6 +3902,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  const uint64_t ne02 = src0->ne[2];
  const uint64_t ne03 = src0->ne[3];
  const uint64_t ne0 = ne00 * ne01;
+
  const bool use_src1 = src1 != nullptr;
  const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
  const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
@@ -3927,11 +3910,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
  const uint64_t ne1 = ne10 * ne11;
  // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
- const uint64_t nb2 = dst->nb[2];
- const uint64_t nb3 = dst->nb[3];

  const bool use_src2 = src2 != nullptr;
- const uint64_t ne2 = use_src2 ? src2->ne[0] * src2->ne[1] : 0;
+ const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
+ const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
+ const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
+ const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
+ const uint64_t ne2 = ne20 * ne21;

  vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
  ggml_vk_func_t op_func;
@@ -3977,7 +3962,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  src1_uma = d_Y != nullptr;
  }
  if (use_src2) {
- ggml_vk_host_get(ctx, src1->data, d_Z, z_buf_offset);
+ ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset);
  src2_uma = d_Z != nullptr;
  }
  }
@@ -3990,7 +3975,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  vk_buffer d_D = extra->buffer_gpu.lock();

  // Workaround for tiny tensor inputs on ROPE
- if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
+ if (use_src1 && y_sz > d_D->size) {
  y_sz = VK_WHOLE_SIZE;
  }

@@ -4007,7 +3992,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  y_buf_offset = extra_src1->offset;
  GGML_ASSERT(d_Y != nullptr);
  }
-
  if (use_src2 && !src2_uma) {
  d_Z = extra_src2->buffer_gpu.lock();
  z_buf_offset = extra_src2->offset;
@@ -4017,6 +4001,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  if (op_supports_incontiguous) {
  x_sz = ggml_nbytes(src0);
  y_sz = use_src1 ? ggml_nbytes(src1) : 0;
+ z_sz = use_src2 ? ggml_nbytes(src2) : 0;
  d_sz = ggml_nbytes(dst);

  if (x_buf_offset + x_sz >= d_X->size) {
@@ -4025,6 +4010,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
  y_sz = VK_WHOLE_SIZE;
  }
+ if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
+ z_sz = VK_WHOLE_SIZE;
+ }
  if (d_buf_offset + d_sz >= d_D->size) {
  d_sz = VK_WHOLE_SIZE;
  }
@@ -4047,7 +4035,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
  break;
  case GGML_OP_GET_ROWS:
- elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+ elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+ break;
+ case GGML_OP_ARGSORT:
+ elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
  break;
  default:
  elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
@@ -4061,13 +4052,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  if (use_src1 && y_sz != VK_WHOLE_SIZE) {
  y_sz *= ne12 * ne13;
  }
+ if (use_src2 && z_sz != VK_WHOLE_SIZE) {
+ z_sz *= ne22 * ne23;
+ }
  if (d_sz != VK_WHOLE_SIZE) {
  d_sz *= ne02 * ne03;
  }
  }

  if (op == GGML_OP_SOFT_MAX) {
- // Empty src1 and src2 are possible on soft_max, but the shader needs buffers
+ // Empty src1 is possible in soft_max, but the shader needs a buffer
  vk_subbuffer subbuf_y;
  if (use_src1) {
  subbuf_y = { d_Y, y_buf_offset, y_sz };
@@ -4075,15 +4069,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  subbuf_y = { d_X, 0, d_X->size };
  }

- vk_subbuffer subbuf_z;
- if (use_src2) {
- subbuf_z = { d_Z, z_buf_offset, z_sz };
+ ggml_vk_sync_buffers(subctx);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ } else if (op == GGML_OP_ROPE) {
+ const int mode = ((int32_t *) dst->op_params)[2];
+ const bool is_neox = mode & 2;
+
+ if (is_neox) {
+ // Empty src2 is possible in rope, but the shader needs a buffer
+ vk_subbuffer subbuf_z;
+ if (use_src2) {
+ subbuf_z = { d_Z, z_buf_offset, z_sz };
+ } else {
+ subbuf_z = { d_X, 0, d_X->size };
+ }
+
+ ggml_vk_sync_buffers(subctx);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
  } else {
- subbuf_z = { d_X, 0, d_X->size };
+ ggml_vk_sync_buffers(subctx);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
  }
-
+ } else if (use_src2) {
  ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
  } else if (use_src1) {
  ggml_vk_sync_buffers(subctx);
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
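Annotation (not part of the diff, hedged): the rebinding in the ROPE branch above exists because a Vulkan descriptor set must have a valid buffer bound at every binding the shader declares. When the optional NeoX frequency-factor tensor (src2) is absent, an already-resident buffer is bound as a harmless stand-in and the has_freq_facs push constant tells the shader not to read it. The idea in isolation, assuming the vk_subbuffer aggregate used above:

    // Hedged sketch of the placeholder-binding pattern; it mirrors the branch above.
    vk_subbuffer subbuf_z;
    if (use_src2) {
        subbuf_z = { d_Z, z_buf_offset, z_sz };   // real freq_factors buffer
    } else {
        subbuf_z = { d_X, 0, d_X->size };         // placeholder, never read when has_freq_facs == 0
    }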
@@ -4091,22 +4100,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  ggml_vk_sync_buffers(subctx);
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
  }
- if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
- ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
- } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
- // copy dst to host
- float * d = (float *) dst->data;
- ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
- }
  } else {
  GGML_ASSERT(op != GGML_OP_SOFT_MAX);
+ GGML_ASSERT(op != GGML_OP_ARGSORT);
+ GGML_ASSERT(!use_src2);

  ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);

  switch (dst->op) {
  case GGML_OP_NORM:
  case GGML_OP_RMS_NORM:
- case GGML_OP_SOFT_MAX:
  elements = { (uint32_t)ne01, 1, 1 };
  break;
  case GGML_OP_DIAG_MASK_INF:
@@ -4136,10 +4139,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  ggml_vk_sync_buffers(subctx);
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
  }
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
- // copy dst to host
- ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
- }
  }
  }
  }
@@ -4270,7 +4269,7 @@ static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * su
  ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
  }

- static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  float * op_params = (float *)dst->op_params;

  float scale = op_params[0];
@@ -4286,17 +4285,16 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
  const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

- ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_SOFT_MAX, {
+ ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
  ncols,
  src1 != nullptr ? nrows_y : (uint32_t)0,
- src2 != nullptr ? (uint32_t)1 : (uint32_t)0,
  scale, max_bias,
  m0, m1,
  n_head_log2,
  });
  }

- static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
  // const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -4319,15 +4317,40 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
  if (is_neox) {
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
  const float inv_ndims = -1.0f / n_dims;
- ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
+ ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+ (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+ freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
+ src2 != nullptr,
+ });
  } else {
- ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
+ ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+ (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
+ freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
+ });
  }
  }

  static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
  int32_t * op_params = (int32_t *)dst->op_params;
- ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
+
+ uint32_t ncols = src0->ne[0];
+
+ uint32_t ncols_pad = 1;
+ while (ncols_pad < ncols) {
+ ncols_pad *= 2;
+ }
+
+ GGML_ASSERT(ncols_pad <= 1024);
+
+ std::cerr << "ncols=" << ncols << " ncols_pad=" << ncols_pad << " ascending=" << op_params[0] << std::endl;
+
+ std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
+
+ ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
+ ncols,
+ ncols_pad,
+ op_params[0],
+ });
  }

  #ifdef GGML_VULKAN_RUN_TESTS
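Annotation (not part of the diff, hedged): the ncols_pad loop above rounds the row width up to the next power of two, presumably because the argsort shader is a bitonic-style sorting network, which only operates on power-of-two sequence lengths; the GGML_ASSERT keeps the padded width within the 1024-wide workgroup declared for argsort_f32 earlier in this diff. The rounding is equivalent to this self-contained helper:

    #include <cstdint>

    // Round n up to the next power of two (for n >= 1); same result as the while-loop above.
    static uint32_t next_pow2(uint32_t n) {
        uint32_t p = 1;
        while (p < n) {
            p *= 2;
        }
        return p;
    }

    // Example: next_pow2(1000) == 1024, next_pow2(1024) == 1024.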
@@ -4379,6 +4402,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->a_s;
  shname = "F32_ALIGNED_S";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->a_s;
+ shname = "F32_F16_ALIGNED_S";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->a_s;
  shname = "F16_F32_ALIGNED_S";
@@ -4392,6 +4418,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->a_m;
  shname = "F32_ALIGNED_M";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->a_m;
+ shname = "F32_F16_ALIGNED_M";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->a_m;
  shname = "F16_F32_ALIGNED_M";
@@ -4405,6 +4434,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->a_l;
  shname = "F32_ALIGNED_L";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->a_l;
+ shname = "F32_F16_ALIGNED_L";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->a_l;
  shname = "F16_F32_ALIGNED_L";
@@ -4425,6 +4457,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->s;
  shname = "F32_S";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->s;
+ shname = "F32_F16_S";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->s;
  shname = "F16_F32_S";
@@ -4436,6 +4471,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->m;
  shname = "F32_M";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->m;
+ shname = "F32_F16_M";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->m;
  shname = "F16_F32_M";
@@ -4447,6 +4485,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->l;
  shname = "F32_L";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->l;
+ shname = "F32_F16_L";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->l;
  shname = "F16_F32_L";
@@ -4559,15 +4600,11 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  src1_ggml->data = y;
  tensor_ggml->data = d_chk;

- ctx->disable = true;
-
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
  ggml_build_forward_expand(cgraph, tensor_ggml);

  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);

- ctx->disable = false;
-
  ggml_free(ggml_ctx);

  double avg_err = 0.0;
@@ -5047,15 +5084,11 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
  src1_ggml->data = y;
  tensor_ggml->data = d_chk;

- ctx->disable = true;
-
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
  ggml_build_forward_expand(cgraph, tensor_ggml);

  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);

- ctx->disable = false;
-
  ggml_free(ggml_ctx);

  double avg_err = 0.0;
@@ -5132,12 +5165,12 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
  #endif
- if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+ if (extra == nullptr) {
  return;
  }

- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
  ggml_tensor * src0 = node->src[0];
  ggml_tensor * src1 = node->src[1];

@@ -5242,9 +5275,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
  }

  static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
- if (ctx->disable) {
- return;
- }
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
  #endif
@@ -5418,7 +5448,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
  }

  static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
- if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU || ggml_is_empty(node)) {
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+ if (ggml_is_empty(node) || extra == nullptr) {
  return;
  }

@@ -5432,8 +5464,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  const ggml_tensor * src1 = node->src[1];
  const ggml_tensor * src2 = node->src[2];

- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
  switch (node->op) {
  case GGML_OP_UNARY:
  switch (ggml_get_unary_op(node)) {
@@ -5545,11 +5575,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod

  break;
  case GGML_OP_SOFT_MAX:
- ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, src2, node);
+ ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);

  break;
  case GGML_OP_ROPE:
- ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);

  break;
  case GGML_OP_ARGSORT:
@@ -5578,7 +5608,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  last_node = true;
  #endif

- if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
+ if (last_node) {
  ggml_vk_ctx_end(ctx->compute_ctx);
  ctx->compute_ctx->exit_tensor = node;
  ctx->compute_ctx = nullptr;
@@ -5586,10 +5616,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  }

  static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
- if (ctx->disable) {
- return false;
- }
-
  ggml_tensor_extra_gpu * extra = nullptr;

  switch (tensor->op) {
@@ -5648,7 +5674,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
  }

  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
+ std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
  #endif

  #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5688,9 +5714,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_

  // Clean up after graph processing is done
  static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
- if (ctx->disable) {
- return;
- }
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
  #endif
@@ -5863,7 +5886,6 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
  extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
  }

- tensor->backend = GGML_BACKEND_TYPE_GPU;
  tensor->extra = extra;
  }

@@ -5871,8 +5893,6 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
  #endif
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -5886,8 +5906,6 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
  #endif
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6030,6 +6048,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
6030
6048
  #ifdef GGML_VULKAN_DEBUG
6031
6049
  std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
6032
6050
  #endif
6051
+ size += 32; // Behave like the CPU buffer type
6033
6052
  void * ptr = nullptr;
6034
6053
  try {
6035
6054
  ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
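
The pinned host buffer type now over-allocates by 32 bytes to mirror the CPU buffer type, which presumably pads so reads slightly past a tensor's logical end stay inside the allocation. A condensed sketch of the allocation path, with the error handling abbreviated (the fallback on failure is not shown in this hunk):

    // Condensed sketch of the padded pinned-host allocation (fallback elided).
    size += 32;   // behave like the CPU buffer type
    void * ptr = nullptr;
    try {
        ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
    } catch (...) {
        // allocation failed; a non-pinned fallback path would be taken here (not shown)
    }
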
@@ -6117,7 +6136,6 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
6117
6136
  #endif
6118
6137
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6119
6138
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6120
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
6121
6139
 
6122
6140
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6123
6141
 
@@ -6138,7 +6156,6 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
6138
6156
  #endif
6139
6157
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6140
6158
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6141
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
6142
6159
 
6143
6160
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6144
6161
 
@@ -6204,6 +6221,10 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
6204
6221
  ctx->transfer_ctx = nullptr;
6205
6222
  }
6206
6223
 
6224
+ static bool ggml_vk_is_empty(ggml_tensor * node) {
6225
+ return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
6226
+ }
6227
+
6207
6228
  GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
6208
6229
  #ifdef GGML_VULKAN_DEBUG
6209
6230
  std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
@@ -6218,7 +6239,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
6218
6239
  int last_node = cgraph->n_nodes - 1;
6219
6240
 
6220
6241
  // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
6221
- while (last_node > 0 && (cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU || ggml_is_empty(cgraph->nodes[last_node]))) {
6242
+ while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
6222
6243
  last_node -= 1;
6223
6244
  }
6224
6245
 
@@ -6232,7 +6253,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
6232
6253
  for (int i = 0; i < cgraph->n_nodes; i++) {
6233
6254
  ggml_tensor * node = cgraph->nodes[i];
6234
6255
 
6235
- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
6256
+ if (ggml_vk_is_empty(node)) {
6236
6257
  continue;
6237
6258
  }
6238
6259
 
@@ -6534,7 +6555,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
6534
6555
  for (int j = 0; j < level; j++) {
6535
6556
  std::cerr << " ";
6536
6557
  }
6537
- std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << " backend=" << tensor->backend << std::endl;
6558
+ std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;
6538
6559
 
6539
6560
  done.push_back(tensor);
6540
6561
 
@@ -6546,7 +6567,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
6546
6567
  }
6547
6568
 
6548
6569
  static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
6549
- if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
6570
+ if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16 && tensor->type != GGML_TYPE_I32) {
6550
6571
  return;
6551
6572
  }
6552
6573
  i0 = std::max(i0, 5);
@@ -6567,6 +6588,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
6567
6588
  val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
6568
6589
  } else if (tensor->type == GGML_TYPE_F16) {
6569
6590
  val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
6591
+ } else if (tensor->type == GGML_TYPE_I32) {
6592
+ val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
6570
6593
  } else {
6571
6594
  GGML_ASSERT(false);
6572
6595
  }
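
The debug printer now also understands I32 tensors (argsort output, for example, is integer indices). A condensed sketch of the per-element read it performs; the helper name read_elem is illustrative only, not part of the file:

    // Illustrative helper: same byte-stride addressing as the file, with the
    // new I32 branch. The nb[] strides are in bytes, as everywhere in ggml.
    static float read_elem(const ggml_tensor * t, const void * data,
                           int i0, int i1, int i2, int i3) {
        const char * p = (const char *) data
                       + i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
        if (t->type == GGML_TYPE_F32) return *(const float *) p;
        if (t->type == GGML_TYPE_F16) return ggml_fp16_to_fp32(*(const ggml_fp16_t *) p);
        if (t->type == GGML_TYPE_I32) return (float) *(const int32_t *) p; // new in this hunk
        GGML_ASSERT(false);
        return 0.0f;
    }
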
@@ -6582,7 +6605,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
6582
6605
  static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
6583
6606
  void * tensor_data = tensor->data;
6584
6607
 
6585
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6608
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6586
6609
  const size_t tensor_size = ggml_nbytes(tensor);
6587
6610
  tensor_data = malloc(tensor_size);
6588
6611
 
@@ -6593,12 +6616,12 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
6593
6616
  }
6594
6617
 
6595
6618
  std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
6596
- std::cerr << "tensor=" << tensor << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6619
+ std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6597
6620
  if (tensor->src[0] != nullptr) {
6598
- std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " backend=" << tensor->src[0]->backend << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
6621
+ std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
6599
6622
  }
6600
6623
  if (tensor->src[1] != nullptr) {
6601
- std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " backend=" << tensor->src[1]->backend << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
6624
+ std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
6602
6625
  }
6603
6626
  std::cerr << std::endl << "Result:" << std::endl;
6604
6627
  ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
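
With the per-tensor backend field gone, whether data has to be copied back from device memory is now decided by the buffer it lives in. The readback pattern used by the debug paths below reduces to, roughly:

    // Rough sketch of the debug readback pattern after the change.
    void * tensor_data = tensor->data;
    const bool on_device = ggml_backend_buffer_is_vk(tensor->buffer);
    if (on_device) {
        tensor_data = malloc(ggml_nbytes(tensor));
        // ... ggml_vk_buffer_read(...) copies the Vulkan buffer contents here ...
    }
    // ... print or compare tensor_data ...
    if (on_device) {
        free(tensor_data);
    }
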
@@ -6609,43 +6632,11 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
6609
6632
  std::vector<const ggml_tensor *> done;
6610
6633
  ggml_vk_print_graph_origin(tensor, done);
6611
6634
 
6612
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6635
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6613
6636
  free(tensor_data);
6614
6637
  }
6615
6638
  }
6616
6639
 
6617
- static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
6618
- return;
6619
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
6620
- if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
6621
- return;
6622
- }
6623
- for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
6624
- for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
6625
- for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
6626
- for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
6627
- float val = 0.0f;
6628
- if (tensor->type == GGML_TYPE_F32) {
6629
- val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
6630
- } else if (tensor->type == GGML_TYPE_F16) {
6631
- val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
6632
- }
6633
- if (std::isnan(val)) {
6634
- std::cerr << "ERROR: TENSOR CHECK " << name << ": Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " val=" << val << std::endl;
6635
- std::cerr << "tensor=" << tensor << " tensor->type=" << ggml_type_name(tensor->type) << " tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6636
- std::cerr << std::endl;
6637
- ggml_vk_print_tensor_area(tensor, tensor->data, i0, i1, i2, i3);
6638
- std::cerr << std::endl;
6639
- std::vector<const ggml_tensor *> done;
6640
- ggml_vk_print_graph_origin(tensor, done);
6641
- GGML_ASSERT(false);
6642
- }
6643
- }
6644
- }
6645
- }
6646
- }
6647
- }
6648
-
6649
6640
  void * comp_result;
6650
6641
  size_t comp_size;
6651
6642
  size_t comp_nb[GGML_MAX_DIMS];
@@ -6699,10 +6690,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6699
6690
 
6700
6691
  src0_buffer = malloc(src0_size);
6701
6692
  src0_clone->data = src0_buffer;
6702
- if (src0->backend == GGML_BACKEND_TYPE_CPU) {
6693
+ if (ggml_backend_buffer_is_host(src0->buffer)) {
6703
6694
  memcpy(src0_clone->data, src0->data, src0_size);
6704
6695
  memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
6705
- } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
6696
+ } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
6706
6697
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
6707
6698
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6708
6699
  uint64_t offset = extra->offset;
@@ -6733,8 +6724,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6733
6724
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6734
6725
  ggml_vk_print_tensor(ctx, src0, "src0");
6735
6726
  }
6736
-
6737
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone);
6738
6727
  }
6739
6728
  if (src1 != nullptr) {
6740
6729
  src1_clone = ggml_dup_tensor(ggml_ctx, src1);
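
Each source tensor is cloned into host memory for the CPU reference computation, and the branch now keys off the buffer type as well. For a contiguous source the pattern is roughly the following (src and clone stand for src0/src1/src2 and their clones):

    // Rough sketch, contiguous case only; the file also handles non-contiguous layouts.
    if (ggml_backend_buffer_is_host(src->buffer)) {
        memcpy(clone->data, src->data, src_size);
        memcpy(clone->nb,   src->nb,   sizeof(size_t) * GGML_MAX_DIMS);
    } else if (ggml_backend_buffer_is_vk(src->buffer)) {
        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, clone->data, src_size);
        memcpy(clone->nb, src->nb, sizeof(size_t) * GGML_MAX_DIMS);
    } else {
        GGML_ASSERT(false);
    }
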
@@ -6743,10 +6732,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6743
6732
 
6744
6733
  src1_buffer = malloc(src1_size);
6745
6734
  src1_clone->data = src1_buffer;
6746
- if (src1->backend == GGML_BACKEND_TYPE_CPU) {
6735
+ if (ggml_backend_buffer_is_host(src1->buffer)) {
6747
6736
  memcpy(src1_clone->data, src1->data, src1_size);
6748
6737
  memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
6749
- } else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
6738
+ } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
6750
6739
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
6751
6740
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6752
6741
  uint64_t offset = extra->offset;
@@ -6777,12 +6766,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6777
6766
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6778
6767
  ggml_vk_print_tensor(ctx, src1, "src1");
6779
6768
  std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
6780
- std::cerr << "src1_clone=" << tensor << " src1_clone->backend: " << src1_clone->backend << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
6769
+ std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
6781
6770
  if (src1->src[0] != nullptr) {
6782
- std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " backend=" << src1->src[0]->backend << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
6771
+ std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
6783
6772
  }
6784
6773
  if (src1->src[1] != nullptr) {
6785
- std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " backend=" << src1->src[1]->backend << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
6774
+ std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
6786
6775
  }
6787
6776
  std::cerr << std::endl << "Result:" << std::endl;
6788
6777
  ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
@@ -6793,8 +6782,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6793
6782
  std::vector<const ggml_tensor *> done;
6794
6783
  ggml_vk_print_graph_origin(src1_clone, done);
6795
6784
  }
6796
-
6797
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone);
6798
6785
  }
6799
6786
  if (src2 != nullptr) {
6800
6787
  src2_clone = ggml_dup_tensor(ggml_ctx, src2);
@@ -6803,18 +6790,18 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6803
6790
 
6804
6791
  src2_buffer = malloc(src2_size);
6805
6792
  src2_clone->data = src2_buffer;
6806
- if (src2->backend == GGML_BACKEND_TYPE_CPU) {
6793
+ if (ggml_backend_buffer_is_host(src2->buffer)) {
6807
6794
  memcpy(src2_clone->data, src2->data, src2_size);
6808
6795
  memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
6809
- } else if (src2->backend == GGML_BACKEND_TYPE_GPU) {
6796
+ } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
6810
6797
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
6811
- vk_buffer buf = extra->buffer_gpu.lock();
6798
+ vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6812
6799
  uint64_t offset = extra->offset;
6813
6800
  if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
6814
6801
  for (int i3 = 0; i3 < src2->ne[3]; i3++) {
6815
6802
  for (int i2 = 0; i2 < src2->ne[2]; i2++) {
6816
6803
  const int idx = i3*src2->ne[2] + i2;
6817
- ggml_vk_buffer_read(ctx, buf, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
6804
+ ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
6818
6805
  }
6819
6806
  }
6820
6807
 
@@ -6824,10 +6811,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6824
6811
  src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
6825
6812
  }
6826
6813
  } else {
6827
- if (offset + src2_size >= buf->size) {
6828
- src2_size = buf->size - offset;
6814
+ if (offset + src2_size >= buffer_gpu->size) {
6815
+ src2_size = buffer_gpu->size - offset;
6829
6816
  }
6830
- ggml_vk_buffer_read(ctx, buf, offset, src2_clone->data, src2_size);
6817
+ ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
6831
6818
  memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
6832
6819
  }
6833
6820
  } else {
@@ -6837,12 +6824,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6837
6824
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6838
6825
  ggml_vk_print_tensor(ctx, src2, "src2");
6839
6826
  std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
6840
- std::cerr << "src2_clone=" << tensor << " src2_clone->backend: " << src2_clone->backend << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
6827
+ std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
6841
6828
  if (src2->src[0] != nullptr) {
6842
- std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " backend=" << src2->src[0]->backend << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
6829
+ std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
6843
6830
  }
6844
6831
  if (src2->src[1] != nullptr) {
6845
- std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " backend=" << src2->src[1]->backend << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
6832
+ std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
6846
6833
  }
6847
6834
  std::cerr << std::endl << "Result:" << std::endl;
6848
6835
  ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
@@ -6853,8 +6840,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6853
6840
  std::vector<const ggml_tensor *> done;
6854
6841
  ggml_vk_print_graph_origin(src2_clone, done);
6855
6842
  }
6856
-
6857
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src2", src2_clone);
6858
6843
  }
6859
6844
 
6860
6845
  if (tensor->op == GGML_OP_MUL_MAT) {
@@ -6875,7 +6860,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6875
6860
  tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
6876
6861
  } else if (tensor->op == GGML_OP_SOFT_MAX) {
6877
6862
  if (src1 != nullptr) {
6878
- tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
6863
+ tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
6879
6864
  } else {
6880
6865
  tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
6881
6866
  }
@@ -6892,7 +6877,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6892
6877
  float attn_factor = ((float *) tensor->op_params)[8];
6893
6878
  float beta_fast = ((float *) tensor->op_params)[9];
6894
6879
  float beta_slow = ((float *) tensor->op_params)[10];
6895
- tensor_clone = ggml_rope_custom(ggml_ctx, src0_clone, src1_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
6880
+ tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
6896
6881
  } else if (tensor->op == GGML_OP_UNARY) {
6897
6882
  switch (ggml_get_unary_op(tensor)) {
6898
6883
  case GGML_UNARY_OP_SILU:
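
For the reference computation, soft max drops its third argument while RoPE switches from ggml_rope_custom to ggml_rope_ext, which accepts the cloned frequency-factor tensor (or nullptr). Condensed, with the op_params unpacking omitted and variable names as in the hunks above:

    // Condensed sketch of the CPU reference op construction for these two ops.
    if (tensor->op == GGML_OP_SOFT_MAX) {
        tensor_clone = (src1 != nullptr)
            ? ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone,
                                ((float *) tensor->op_params)[0],    // scale
                                ((float *) tensor->op_params)[1])    // max_bias
            : ggml_soft_max(ggml_ctx, src0_clone);
    } else if (tensor->op == GGML_OP_ROPE) {
        tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone,
                                     n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx,
                                     freq_base, freq_scale, ext_factor, attn_factor,
                                     beta_fast, beta_slow);
    }
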
@@ -6935,17 +6920,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6935
6920
  GGML_ASSERT(false);
6936
6921
  }
6937
6922
 
6938
- // Disable vulkan here to avoid the hooks in ggml.c
6939
- ctx->disable = true;
6940
-
6941
6923
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
6942
6924
  ggml_build_forward_expand(cgraph, tensor_clone);
6943
6925
 
6944
6926
  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
6945
6927
 
6946
- ctx->disable = false;
6947
-
6948
- ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
6949
6928
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6950
6929
  ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
6951
6930
  }
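
With the disable flag removed, the cloned op is simply run through the regular CPU path: a one-node graph is built and computed inside the scratch context, as in the hunk above.

    // The reference value comes from computing the cloned op on the CPU (8 threads).
    ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
    ggml_build_forward_expand(cgraph, tensor_clone);
    ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
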
@@ -6962,9 +6941,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6962
6941
  if (src1 != nullptr) {
6963
6942
  free(src1_buffer);
6964
6943
  }
6965
- if (src2 != nullptr) {
6966
- free(src2_buffer);
6967
- }
6968
6944
 
6969
6945
  ggml_free(ggml_ctx);
6970
6946
  }
@@ -6989,7 +6965,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
6989
6965
 
6990
6966
  void * tensor_data = tensor->data;
6991
6967
 
6992
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6968
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6993
6969
  size_t tensor_size = ggml_nbytes(tensor);
6994
6970
  tensor_data = malloc(tensor_size);
6995
6971
 
@@ -7024,8 +7000,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7024
7000
  } else if (tensor->type == GGML_TYPE_F16) {
7025
7001
  correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
7026
7002
  result = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
7003
+ } else if (tensor->type == GGML_TYPE_I32) {
7004
+ correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
7005
+ result = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
7027
7006
  } else {
7028
- std::cerr << "comp_size=" << comp_size << " but required is " << (i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]) << std::endl;
7007
+ std::cerr << "Results check not implemented for type " << ggml_type_name(tensor->type) << std::endl;
7029
7008
  }
7030
7009
  } else {
7031
7010
  std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
@@ -7034,12 +7013,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7034
7013
 
7035
7014
  if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
7036
7015
  std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
7037
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7016
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7038
7017
  if (src0 != nullptr) {
7039
- std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7018
+ std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7040
7019
  }
7041
7020
  if (src1 != nullptr) {
7042
- std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7021
+ std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7043
7022
  }
7044
7023
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7045
7024
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7075,12 +7054,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7075
7054
 
7076
7055
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
7077
7056
  std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
7078
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7057
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7079
7058
  if (src0 != nullptr) {
7080
- std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7059
+ std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7081
7060
  }
7082
7061
  if (src1 != nullptr) {
7083
- std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7062
+ std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7084
7063
  }
7085
7064
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7086
7065
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7099,12 +7078,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7099
7078
 
7100
7079
  if (avg_err > 0.05 || std::isnan(avg_err)) {
7101
7080
  std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
7102
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7081
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7103
7082
  if (src0 != nullptr) {
7104
- std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7083
+ std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7105
7084
  }
7106
7085
  if (src1 != nullptr) {
7107
- std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7086
+ std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7108
7087
  }
7109
7088
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7110
7089
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7116,14 +7095,14 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7116
7095
  ggml_vk_print_graph_origin(tensor, done);
7117
7096
  GGML_ASSERT(false);
7118
7097
  } else {
7119
- std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " backend=" << tensor->backend << " avg_err=" << avg_err << std::endl;
7098
+ std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
7120
7099
  }
7121
7100
 
7122
7101
  free(comp_result);
7123
7102
  comp_result = nullptr;
7124
7103
  comp_size = 0;
7125
7104
 
7126
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
7105
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
7127
7106
  free(tensor_data);
7128
7107
  }
7129
7108
  }