llama_cpp 0.15.2 → 0.15.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -114,6 +114,7 @@ struct vk_device {
  size_t idx;

  vk_matmul_pipeline pipeline_matmul_f32;
+ vk_matmul_pipeline pipeline_matmul_f32_f16;
  vk_matmul_pipeline pipeline_matmul_f16;
  vk_matmul_pipeline pipeline_matmul_f16_f32;
  vk_pipeline pipeline_matmul_split_k_reduce;
@@ -289,12 +290,12 @@ struct vk_op_rope_neox_push_constants {
  float corr_dims[4];
  float theta_scale;
  float inv_ndims;
+ uint32_t has_freq_facs;
  };

  struct vk_op_soft_max_push_constants {
  uint32_t KX;
  uint32_t KY;
- uint32_t KZ;
  float scale;
  float max_bias;
  float m0;
@@ -304,7 +305,8 @@ struct vk_op_soft_max_push_constants {

  struct vk_op_argsort_push_constants {
  uint32_t ncols;
- bool ascending;
+ uint32_t ncols_pad;
+ int32_t order;
  };

  // Allow pre-recording command buffers
@@ -375,13 +377,12 @@ struct ggml_backend_vk_context {
  vk_context * compute_ctx;
  vk_context * transfer_ctx;

- bool disable;
  bool initialized;

  size_t idx;
  };

- struct vk_instance {
+ struct vk_instance_t {
  vk::Instance instance;

  std::vector<size_t> device_indices;
@@ -423,7 +424,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
  typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);

  static bool vk_instance_initialized = false;
- static vk_instance vk_instance;
+ static vk_instance_t vk_instance;

  GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);

@@ -1013,6 +1014,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  uint32_t s_align = 32;

  ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+ ctx->device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
  ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
  ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
@@ -1048,6 +1050,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1230,6 +1239,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1501,14 +1517,14 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);

- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);

- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
  }
@@ -1859,7 +1875,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
  ctx->compute_ctx = nullptr;
  ctx->transfer_ctx = nullptr;

- ctx->disable = false;
  ctx->initialized = true;

  ctx->idx = idx;
@@ -1903,6 +1918,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_f32;
  }
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+ return ctx->device->pipeline_matmul_f32_f16;
+ }
  if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_f16_f32;
  }
@@ -2722,7 +2740,7 @@ static void ggml_vk_matmul(
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
  uint32_t expert_stride_b, uint32_t expert_stride_d, uint32_t idx, uint32_t nbi1, uint32_t n_as) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
+ std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
  #endif
  ggml_vk_sync_buffers(subctx);
  if (split_k == 1) {
@@ -2792,7 +2810,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_

  static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
+ std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
  std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
  #endif
  const int tensor_type_size = ggml_type_size(tensor->type);
@@ -2812,9 +2830,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context

  static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
+ std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
  #endif
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -2982,19 +3000,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
  ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21,
  0, 0, 0, 0, 1
  ); // NOLINT
-
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
- // copy dst to host
- float * d = (float *) ((char *) dst->data);
- ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
- }
  }

  static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
+ std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
  #endif
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -3147,12 +3159,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context

  static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
+ std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
  #endif
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
- GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -3217,25 +3228,17 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
  const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
  ggml_vk_sync_buffers(subctx);
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
- // copy dst to host
- float * d = (float *) dst->data;
- ggml_vk_sync_buffers(subctx);
- ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
- }
  }

  static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
+ std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
  #endif
  GGML_ASSERT(!ggml_is_transposed(src0));
  GGML_ASSERT(!ggml_is_transposed(src1));
  GGML_ASSERT(!ggml_is_permuted(src0));
- GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -3302,26 +3305,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
  const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
  ggml_vk_sync_buffers(subctx);
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
- // copy dst to host
- float * d = (float *) dst->data;
- ggml_vk_sync_buffers(subctx);
- ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
- }
- }
-
- static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
- const uint64_t ne10 = src1->ne[0];
-
- const uint64_t ne0 = dst->ne[0];
- const uint64_t ne1 = dst->ne[1];
-
- // TODO: find the optimal values for these
- return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
- (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
- dst->type == GGML_TYPE_F32 &&
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
  }

  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3711,8 +3694,6 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
  // TODO: support for transposed / permuted tensors
  GGML_ASSERT(nb0 == sizeof(float));
  GGML_ASSERT(nb00 == sizeof(float));
- GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
- GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -3834,7 +3815,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
  if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
  return ctx->device->pipeline_soft_max_f32;
  }
- if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && src2->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
  return ctx->device->pipeline_soft_max_f32_f16;
  }
  return nullptr;
@@ -3873,6 +3854,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
  default:
  return nullptr;
  }
+
+ GGML_UNUSED(src2);
  }

  static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
@@ -3902,14 +3885,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
  template<typename PC>
  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+ std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
  if (src1 != nullptr) {
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
  }
  if (src2 != nullptr) {
- std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", backend=" << src2->backend << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
+ std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
  }
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
  #endif
  GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
  GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
@@ -3919,6 +3902,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  const uint64_t ne02 = src0->ne[2];
  const uint64_t ne03 = src0->ne[3];
  const uint64_t ne0 = ne00 * ne01;
+
  const bool use_src1 = src1 != nullptr;
  const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
  const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
@@ -3926,11 +3910,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
  const uint64_t ne1 = ne10 * ne11;
  // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
- const uint64_t nb2 = dst->nb[2];
- const uint64_t nb3 = dst->nb[3];

  const bool use_src2 = src2 != nullptr;
- const uint64_t ne2 = use_src2 ? src2->ne[0] * src2->ne[1] : 0;
+ const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
+ const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
+ const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
+ const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
+ const uint64_t ne2 = ne20 * ne21;

  vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
  ggml_vk_func_t op_func;
@@ -3976,7 +3962,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  src1_uma = d_Y != nullptr;
  }
  if (use_src2) {
- ggml_vk_host_get(ctx, src1->data, d_Z, z_buf_offset);
+ ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset);
  src2_uma = d_Z != nullptr;
  }
  }
@@ -3989,7 +3975,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  vk_buffer d_D = extra->buffer_gpu.lock();

  // Workaround for tiny tensor inputs on ROPE
- if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
+ if (use_src1 && y_sz > d_D->size) {
  y_sz = VK_WHOLE_SIZE;
  }

@@ -4006,7 +3992,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  y_buf_offset = extra_src1->offset;
  GGML_ASSERT(d_Y != nullptr);
  }
-
  if (use_src2 && !src2_uma) {
  d_Z = extra_src2->buffer_gpu.lock();
  z_buf_offset = extra_src2->offset;
@@ -4016,6 +4001,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  if (op_supports_incontiguous) {
  x_sz = ggml_nbytes(src0);
  y_sz = use_src1 ? ggml_nbytes(src1) : 0;
+ z_sz = use_src2 ? ggml_nbytes(src2) : 0;
  d_sz = ggml_nbytes(dst);

  if (x_buf_offset + x_sz >= d_X->size) {
@@ -4024,6 +4010,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
  y_sz = VK_WHOLE_SIZE;
  }
+ if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
+ z_sz = VK_WHOLE_SIZE;
+ }
  if (d_buf_offset + d_sz >= d_D->size) {
  d_sz = VK_WHOLE_SIZE;
  }
@@ -4046,7 +4035,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
  break;
  case GGML_OP_GET_ROWS:
- elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+ elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+ break;
+ case GGML_OP_ARGSORT:
+ elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
  break;
  default:
  elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
@@ -4060,13 +4052,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  if (use_src1 && y_sz != VK_WHOLE_SIZE) {
  y_sz *= ne12 * ne13;
  }
+ if (use_src2 && z_sz != VK_WHOLE_SIZE) {
+ z_sz *= ne22 * ne23;
+ }
  if (d_sz != VK_WHOLE_SIZE) {
  d_sz *= ne02 * ne03;
  }
  }

  if (op == GGML_OP_SOFT_MAX) {
- // Empty src1 and src2 are possible on soft_max, but the shader needs buffers
+ // Empty src1 is possible in soft_max, but the shader needs a buffer
  vk_subbuffer subbuf_y;
  if (use_src1) {
  subbuf_y = { d_Y, y_buf_offset, y_sz };
@@ -4074,15 +4069,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  subbuf_y = { d_X, 0, d_X->size };
  }

- vk_subbuffer subbuf_z;
- if (use_src2) {
- subbuf_z = { d_Z, z_buf_offset, z_sz };
+ ggml_vk_sync_buffers(subctx);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ } else if (op == GGML_OP_ROPE) {
+ const int mode = ((int32_t *) dst->op_params)[2];
+ const bool is_neox = mode & 2;
+
+ if (is_neox) {
+ // Empty src2 is possible in rope, but the shader needs a buffer
+ vk_subbuffer subbuf_z;
+ if (use_src2) {
+ subbuf_z = { d_Z, z_buf_offset, z_sz };
+ } else {
+ subbuf_z = { d_X, 0, d_X->size };
+ }
+
+ ggml_vk_sync_buffers(subctx);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
  } else {
- subbuf_z = { d_X, 0, d_X->size };
+ ggml_vk_sync_buffers(subctx);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
  }
-
+ } else if (use_src2) {
  ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
  } else if (use_src1) {
  ggml_vk_sync_buffers(subctx);
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4090,22 +4100,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  ggml_vk_sync_buffers(subctx);
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
  }
- if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
- ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
- } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
- // copy dst to host
- float * d = (float *) dst->data;
- ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
- }
  } else {
  GGML_ASSERT(op != GGML_OP_SOFT_MAX);
+ GGML_ASSERT(op != GGML_OP_ARGSORT);
+ GGML_ASSERT(!use_src2);

  ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);

  switch (dst->op) {
  case GGML_OP_NORM:
  case GGML_OP_RMS_NORM:
- case GGML_OP_SOFT_MAX:
  elements = { (uint32_t)ne01, 1, 1 };
  break;
  case GGML_OP_DIAG_MASK_INF:
@@ -4135,10 +4139,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  ggml_vk_sync_buffers(subctx);
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
  }
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
- // copy dst to host
- ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
- }
  }
  }
  }
@@ -4269,7 +4269,7 @@ static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * su
  ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
  }

- static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  float * op_params = (float *)dst->op_params;

  float scale = op_params[0];
@@ -4285,20 +4285,16 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
  const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

- #pragma message("TODO: src2 is no longer used in soft_max - should be removed and ALiBi calculation should be updated")
- #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
-
- ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_SOFT_MAX, {
+ ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
  ncols,
  src1 != nullptr ? nrows_y : (uint32_t)0,
- src2 != nullptr ? (uint32_t)1 : (uint32_t)0,
  scale, max_bias,
  m0, m1,
  n_head_log2,
  });
  }

- static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
  // const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -4321,15 +4317,40 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
  if (is_neox) {
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
  const float inv_ndims = -1.0f / n_dims;
- ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
+ ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+ (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+ freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
+ src2 != nullptr,
+ });
  } else {
- ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
+ ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+ (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
+ freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
+ });
  }
  }

  static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
  int32_t * op_params = (int32_t *)dst->op_params;
- ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
+
+ uint32_t ncols = src0->ne[0];
+
+ uint32_t ncols_pad = 1;
+ while (ncols_pad < ncols) {
+ ncols_pad *= 2;
+ }
+
+ GGML_ASSERT(ncols_pad <= 1024);
+
+ std::cerr << "ncols=" << ncols << " ncols_pad=" << ncols_pad << " ascending=" << op_params[0] << std::endl;
+
+ std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
+
+ ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
+ ncols,
+ ncols_pad,
+ op_params[0],
+ });
  }

  #ifdef GGML_VULKAN_RUN_TESTS
@@ -4381,6 +4402,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->a_s;
  shname = "F32_ALIGNED_S";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->a_s;
+ shname = "F32_F16_ALIGNED_S";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->a_s;
  shname = "F16_F32_ALIGNED_S";
@@ -4394,6 +4418,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->a_m;
  shname = "F32_ALIGNED_M";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->a_m;
+ shname = "F32_F16_ALIGNED_M";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->a_m;
  shname = "F16_F32_ALIGNED_M";
@@ -4407,6 +4434,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->a_l;
  shname = "F32_ALIGNED_L";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->a_l;
+ shname = "F32_F16_ALIGNED_L";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->a_l;
  shname = "F16_F32_ALIGNED_L";
@@ -4427,6 +4457,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->s;
  shname = "F32_S";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->s;
+ shname = "F32_F16_S";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->s;
  shname = "F16_F32_S";
@@ -4438,6 +4471,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->m;
  shname = "F32_M";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->m;
+ shname = "F32_F16_M";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->m;
  shname = "F16_F32_M";
@@ -4449,6 +4485,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->l;
  shname = "F32_L";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->l;
+ shname = "F32_F16_L";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->l;
  shname = "F16_F32_L";
@@ -4561,15 +4600,11 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  src1_ggml->data = y;
  tensor_ggml->data = d_chk;

- ctx->disable = true;
-
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
  ggml_build_forward_expand(cgraph, tensor_ggml);

  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);

- ctx->disable = false;
-
  ggml_free(ggml_ctx);

  double avg_err = 0.0;
@@ -5049,15 +5084,11 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
  src1_ggml->data = y;
  tensor_ggml->data = d_chk;

- ctx->disable = true;
-
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
  ggml_build_forward_expand(cgraph, tensor_ggml);

  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);

- ctx->disable = false;
-
  ggml_free(ggml_ctx);

  double avg_err = 0.0;
@@ -5134,12 +5165,12 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
  #endif
- if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+ if (extra == nullptr) {
  return;
  }

- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
  ggml_tensor * src0 = node->src[0];
  ggml_tensor * src1 = node->src[1];

@@ -5244,9 +5275,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
  }

  static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
- if (ctx->disable) {
- return;
- }
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
  #endif
@@ -5420,7 +5448,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
  }

  static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
- if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU || ggml_is_empty(node)) {
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+ if (ggml_is_empty(node) || extra == nullptr) {
  return;
  }

@@ -5434,8 +5464,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  const ggml_tensor * src1 = node->src[1];
  const ggml_tensor * src2 = node->src[2];

- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
  switch (node->op) {
  case GGML_OP_UNARY:
  switch (ggml_get_unary_op(node)) {
@@ -5547,11 +5575,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod

  break;
  case GGML_OP_SOFT_MAX:
- ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, src2, node);
+ ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);

  break;
  case GGML_OP_ROPE:
- ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);

  break;
  case GGML_OP_ARGSORT:
@@ -5580,7 +5608,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  last_node = true;
  #endif

- if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
+ if (last_node) {
  ggml_vk_ctx_end(ctx->compute_ctx);
  ctx->compute_ctx->exit_tensor = node;
  ctx->compute_ctx = nullptr;
@@ -5588,10 +5616,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  }

  static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
- if (ctx->disable) {
- return false;
- }
-
  ggml_tensor_extra_gpu * extra = nullptr;

  switch (tensor->op) {
@@ -5650,7 +5674,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
  }

  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
+ std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
  #endif

  #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5690,9 +5714,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_

  // Clean up after graph processing is done
  static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
- if (ctx->disable) {
- return;
- }
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
  #endif
@@ -5865,7 +5886,6 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
  extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
  }

- tensor->backend = GGML_BACKEND_TYPE_GPU;
  tensor->extra = extra;
  }

@@ -5873,8 +5893,6 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
  #endif
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -5888,8 +5906,6 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
  #endif
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6032,6 +6048,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
  #endif
+ size += 32; // Behave like the CPU buffer type
  void * ptr = nullptr;
  try {
  ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
@@ -6119,7 +6136,6 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
  #endif
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

@@ -6140,7 +6156,6 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
  #endif
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

@@ -6206,6 +6221,10 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
  ctx->transfer_ctx = nullptr;
  }

+ static bool ggml_vk_is_empty(ggml_tensor * node) {
+ return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
+ }
+
  GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
@@ -6220,7 +6239,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
6220
6239
  int last_node = cgraph->n_nodes - 1;
6221
6240
 
6222
6241
  // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
6223
- while (last_node > 0 && (cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU || ggml_is_empty(cgraph->nodes[last_node]))) {
6242
+ while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
6224
6243
  last_node -= 1;
6225
6244
  }
6226
6245
 
@@ -6234,7 +6253,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
6234
6253
  for (int i = 0; i < cgraph->n_nodes; i++) {
6235
6254
  ggml_tensor * node = cgraph->nodes[i];
6236
6255
 
6237
- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
6256
+ if (ggml_vk_is_empty(node)) {
6238
6257
  continue;
6239
6258
  }
6240
6259
 
@@ -6536,7 +6555,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
6536
6555
  for (int j = 0; j < level; j++) {
6537
6556
  std::cerr << " ";
6538
6557
  }
6539
- std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << " backend=" << tensor->backend << std::endl;
6558
+ std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;
6540
6559
 
6541
6560
  done.push_back(tensor);
6542
6561
 
@@ -6548,7 +6567,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
6548
6567
  }
6549
6568
 
6550
6569
  static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
6551
- if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
6570
+ if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16 && tensor->type != GGML_TYPE_I32) {
6552
6571
  return;
6553
6572
  }
6554
6573
  i0 = std::max(i0, 5);
@@ -6569,6 +6588,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
6569
6588
  val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
6570
6589
  } else if (tensor->type == GGML_TYPE_F16) {
6571
6590
  val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
6591
+ } else if (tensor->type == GGML_TYPE_I32) {
6592
+ val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
6572
6593
  } else {
6573
6594
  GGML_ASSERT(false);
6574
6595
  }
@@ -6584,7 +6605,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
6584
6605
  static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
6585
6606
  void * tensor_data = tensor->data;
6586
6607
 
6587
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6608
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6588
6609
  const size_t tensor_size = ggml_nbytes(tensor);
6589
6610
  tensor_data = malloc(tensor_size);
6590
6611
 
@@ -6595,12 +6616,12 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
6595
6616
  }
6596
6617
 
6597
6618
  std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
6598
- std::cerr << "tensor=" << tensor << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6619
+ std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6599
6620
  if (tensor->src[0] != nullptr) {
6600
- std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " backend=" << tensor->src[0]->backend << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
6621
+ std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
6601
6622
  }
6602
6623
  if (tensor->src[1] != nullptr) {
6603
- std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " backend=" << tensor->src[1]->backend << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
6624
+ std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
6604
6625
  }
6605
6626
  std::cerr << std::endl << "Result:" << std::endl;
6606
6627
  ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
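
From these hunks onward, the debug helpers stop consulting the removed tensor->backend field and instead test whether a tensor's buffer belongs to the Vulkan backend via ggml_backend_buffer_is_vk. That helper's definition is not part of this diff; a minimal sketch of what such a check typically looks like, assuming it only compares the buffer type's get_name callback (the ggml_backend_vk_buffer_type_name symbol below is an assumption, not shown here):

    // assumption: Vulkan ownership is decided from the buffer type alone, never from the tensor
    GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
        return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name;
    }
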
@@ -6611,43 +6632,11 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
6611
6632
  std::vector<const ggml_tensor *> done;
6612
6633
  ggml_vk_print_graph_origin(tensor, done);
6613
6634
 
6614
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6635
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6615
6636
  free(tensor_data);
6616
6637
  }
6617
6638
  }
6618
6639
 
6619
- static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
6620
- return;
6621
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
6622
- if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
6623
- return;
6624
- }
6625
- for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
6626
- for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
6627
- for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
6628
- for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
6629
- float val = 0.0f;
6630
- if (tensor->type == GGML_TYPE_F32) {
6631
- val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
6632
- } else if (tensor->type == GGML_TYPE_F16) {
6633
- val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
6634
- }
6635
- if (std::isnan(val)) {
6636
- std::cerr << "ERROR: TENSOR CHECK " << name << ": Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " val=" << val << std::endl;
6637
- std::cerr << "tensor=" << tensor << " tensor->type=" << ggml_type_name(tensor->type) << " tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6638
- std::cerr << std::endl;
6639
- ggml_vk_print_tensor_area(tensor, tensor->data, i0, i1, i2, i3);
6640
- std::cerr << std::endl;
6641
- std::vector<const ggml_tensor *> done;
6642
- ggml_vk_print_graph_origin(tensor, done);
6643
- GGML_ASSERT(false);
6644
- }
6645
- }
6646
- }
6647
- }
6648
- }
6649
- }
6650
-
6651
6640
  void * comp_result;
6652
6641
  size_t comp_size;
6653
6642
  size_t comp_nb[GGML_MAX_DIMS];
@@ -6701,10 +6690,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6701
6690
 
6702
6691
  src0_buffer = malloc(src0_size);
6703
6692
  src0_clone->data = src0_buffer;
6704
- if (src0->backend == GGML_BACKEND_TYPE_CPU) {
6693
+ if (ggml_backend_buffer_is_host(src0->buffer)) {
6705
6694
  memcpy(src0_clone->data, src0->data, src0_size);
6706
6695
  memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
6707
- } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
6696
+ } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
6708
6697
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
6709
6698
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6710
6699
  uint64_t offset = extra->offset;
@@ -6735,8 +6724,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6735
6724
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6736
6725
  ggml_vk_print_tensor(ctx, src0, "src0");
6737
6726
  }
6738
-
6739
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone);
6740
6727
  }
6741
6728
  if (src1 != nullptr) {
6742
6729
  src1_clone = ggml_dup_tensor(ggml_ctx, src1);
@@ -6745,10 +6732,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6745
6732
 
6746
6733
  src1_buffer = malloc(src1_size);
6747
6734
  src1_clone->data = src1_buffer;
6748
- if (src1->backend == GGML_BACKEND_TYPE_CPU) {
6735
+ if (ggml_backend_buffer_is_host(src1->buffer)) {
6749
6736
  memcpy(src1_clone->data, src1->data, src1_size);
6750
6737
  memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
6751
- } else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
6738
+ } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
6752
6739
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
6753
6740
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6754
6741
  uint64_t offset = extra->offset;
@@ -6779,12 +6766,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6779
6766
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6780
6767
  ggml_vk_print_tensor(ctx, src1, "src1");
6781
6768
  std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
6782
- std::cerr << "src1_clone=" << tensor << " src1_clone->backend: " << src1_clone->backend << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
6769
+ std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
6783
6770
  if (src1->src[0] != nullptr) {
6784
- std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " backend=" << src1->src[0]->backend << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
6771
+ std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
6785
6772
  }
6786
6773
  if (src1->src[1] != nullptr) {
6787
- std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " backend=" << src1->src[1]->backend << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
6774
+ std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
6788
6775
  }
6789
6776
  std::cerr << std::endl << "Result:" << std::endl;
6790
6777
  ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
@@ -6795,8 +6782,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6795
6782
  std::vector<const ggml_tensor *> done;
6796
6783
  ggml_vk_print_graph_origin(src1_clone, done);
6797
6784
  }
6798
-
6799
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone);
6800
6785
  }
6801
6786
  if (src2 != nullptr) {
6802
6787
  src2_clone = ggml_dup_tensor(ggml_ctx, src2);
@@ -6805,18 +6790,18 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6805
6790
 
6806
6791
  src2_buffer = malloc(src2_size);
6807
6792
  src2_clone->data = src2_buffer;
6808
- if (src2->backend == GGML_BACKEND_TYPE_CPU) {
6793
+ if (ggml_backend_buffer_is_host(src2->buffer)) {
6809
6794
  memcpy(src2_clone->data, src2->data, src2_size);
6810
6795
  memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
6811
- } else if (src2->backend == GGML_BACKEND_TYPE_GPU) {
6796
+ } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
6812
6797
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
6813
- vk_buffer buf = extra->buffer_gpu.lock();
6798
+ vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6814
6799
  uint64_t offset = extra->offset;
6815
6800
  if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
6816
6801
  for (int i3 = 0; i3 < src2->ne[3]; i3++) {
6817
6802
  for (int i2 = 0; i2 < src2->ne[2]; i2++) {
6818
6803
  const int idx = i3*src2->ne[2] + i2;
6819
- ggml_vk_buffer_read(ctx, buf, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
6804
+ ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
6820
6805
  }
6821
6806
  }
6822
6807
 
@@ -6826,10 +6811,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6826
6811
  src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
6827
6812
  }
6828
6813
  } else {
6829
- if (offset + src2_size >= buf->size) {
6830
- src2_size = buf->size - offset;
6814
+ if (offset + src2_size >= buffer_gpu->size) {
6815
+ src2_size = buffer_gpu->size - offset;
6831
6816
  }
6832
- ggml_vk_buffer_read(ctx, buf, offset, src2_clone->data, src2_size);
6817
+ ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
6833
6818
  memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
6834
6819
  }
6835
6820
  } else {
@@ -6839,12 +6824,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6839
6824
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6840
6825
  ggml_vk_print_tensor(ctx, src2, "src2");
6841
6826
  std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
6842
- std::cerr << "src2_clone=" << tensor << " src2_clone->backend: " << src2_clone->backend << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
6827
+ std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
6843
6828
  if (src2->src[0] != nullptr) {
6844
- std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " backend=" << src2->src[0]->backend << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
6829
+ std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
6845
6830
  }
6846
6831
  if (src2->src[1] != nullptr) {
6847
- std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " backend=" << src2->src[1]->backend << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
6832
+ std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
6848
6833
  }
6849
6834
  std::cerr << std::endl << "Result:" << std::endl;
6850
6835
  ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
@@ -6855,8 +6840,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6855
6840
  std::vector<const ggml_tensor *> done;
6856
6841
  ggml_vk_print_graph_origin(src2_clone, done);
6857
6842
  }
6858
-
6859
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src2", src2_clone);
6860
6843
  }
6861
6844
 
6862
6845
  if (tensor->op == GGML_OP_MUL_MAT) {
@@ -6877,7 +6860,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6877
6860
  tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
6878
6861
  } else if (tensor->op == GGML_OP_SOFT_MAX) {
6879
6862
  if (src1 != nullptr) {
6880
- tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
6863
+ tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
6881
6864
  } else {
6882
6865
  tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
6883
6866
  }
@@ -6894,7 +6877,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6894
6877
  float attn_factor = ((float *) tensor->op_params)[8];
6895
6878
  float beta_fast = ((float *) tensor->op_params)[9];
6896
6879
  float beta_slow = ((float *) tensor->op_params)[10];
6897
- tensor_clone = ggml_rope_custom(ggml_ctx, src0_clone, src1_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
6880
+ tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
6898
6881
  } else if (tensor->op == GGML_OP_UNARY) {
6899
6882
  switch (ggml_get_unary_op(tensor)) {
6900
6883
  case GGML_UNARY_OP_SILU:
@@ -6937,17 +6920,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6937
6920
  GGML_ASSERT(false);
6938
6921
  }
6939
6922
 
6940
- // Disable vulkan here to avoid the hooks in ggml.c
6941
- ctx->disable = true;
6942
-
6943
6923
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
6944
6924
  ggml_build_forward_expand(cgraph, tensor_clone);
6945
6925
 
6946
6926
  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
6947
6927
 
6948
- ctx->disable = false;
6949
-
6950
- ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
6951
6928
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6952
6929
  ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
6953
6930
  }
@@ -6964,9 +6941,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6964
6941
  if (src1 != nullptr) {
6965
6942
  free(src1_buffer);
6966
6943
  }
6967
- if (src2 != nullptr) {
6968
- free(src2_buffer);
6969
- }
6970
6944
 
6971
6945
  ggml_free(ggml_ctx);
6972
6946
  }
@@ -6991,7 +6965,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
6991
6965
 
6992
6966
  void * tensor_data = tensor->data;
6993
6967
 
6994
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6968
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6995
6969
  size_t tensor_size = ggml_nbytes(tensor);
6996
6970
  tensor_data = malloc(tensor_size);
6997
6971
 
@@ -7026,8 +7000,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7026
7000
  } else if (tensor->type == GGML_TYPE_F16) {
7027
7001
  correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
7028
7002
  result = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
7003
+ } else if (tensor->type == GGML_TYPE_I32) {
7004
+ correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
7005
+ result = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
7029
7006
  } else {
7030
- std::cerr << "comp_size=" << comp_size << " but required is " << (i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]) << std::endl;
7007
+ std::cerr << "Results check not implemented for type " << ggml_type_name(tensor->type) << std::endl;
7031
7008
  }
7032
7009
  } else {
7033
7010
  std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
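
The comparison loop above gains an I32 branch alongside F32 and F16, presumably for ops such as argsort whose output is integer indices. All three branches read a single element through the tensor's byte strides; the same pattern, consolidated into a hypothetical helper purely for illustration (not code from this package):

    // hypothetical helper illustrating the per-element read used above; not part of the package
    static double read_element(const ggml_tensor * t, const void * data, int i0, int i1, int i2, int i3) {
        const char * p = (const char *) data + i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
        switch (t->type) {
            case GGML_TYPE_F32: return *(const float *) p;
            case GGML_TYPE_F16: return ggml_fp16_to_fp32(*(const ggml_fp16_t *) p);
            case GGML_TYPE_I32: return *(const int32_t *) p;   // newly handled in 0.15.3
            default:            GGML_ASSERT(false); return 0.0;
        }
    }
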
@@ -7036,12 +7013,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7036
7013
 
7037
7014
  if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
7038
7015
  std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
7039
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7016
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7040
7017
  if (src0 != nullptr) {
7041
- std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7018
+ std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7042
7019
  }
7043
7020
  if (src1 != nullptr) {
7044
- std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7021
+ std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7045
7022
  }
7046
7023
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7047
7024
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7077,12 +7054,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7077
7054
 
7078
7055
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
7079
7056
  std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
7080
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7057
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7081
7058
  if (src0 != nullptr) {
7082
- std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7059
+ std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7083
7060
  }
7084
7061
  if (src1 != nullptr) {
7085
- std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7062
+ std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7086
7063
  }
7087
7064
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7088
7065
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7101,12 +7078,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7101
7078
 
7102
7079
  if (avg_err > 0.05 || std::isnan(avg_err)) {
7103
7080
  std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
7104
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7081
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7105
7082
  if (src0 != nullptr) {
7106
- std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7083
+ std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7107
7084
  }
7108
7085
  if (src1 != nullptr) {
7109
- std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7086
+ std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7110
7087
  }
7111
7088
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7112
7089
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7118,14 +7095,14 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7118
7095
  ggml_vk_print_graph_origin(tensor, done);
7119
7096
  GGML_ASSERT(false);
7120
7097
  } else {
7121
- std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " backend=" << tensor->backend << " avg_err=" << avg_err << std::endl;
7098
+ std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
7122
7099
  }
7123
7100
 
7124
7101
  free(comp_result);
7125
7102
  comp_result = nullptr;
7126
7103
  comp_size = 0;
7127
7104
 
7128
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
7105
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
7129
7106
  free(tensor_data);
7130
7107
  }
7131
7108
  }