llama_cpp 0.15.2 → 0.15.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -114,6 +114,7 @@ struct vk_device {
     size_t idx;
 
     vk_matmul_pipeline pipeline_matmul_f32;
+    vk_matmul_pipeline pipeline_matmul_f32_f16;
     vk_matmul_pipeline pipeline_matmul_f16;
     vk_matmul_pipeline pipeline_matmul_f16_f32;
     vk_pipeline pipeline_matmul_split_k_reduce;
@@ -289,12 +290,12 @@ struct vk_op_rope_neox_push_constants {
     float corr_dims[4];
     float theta_scale;
     float inv_ndims;
+    uint32_t has_freq_facs;
 };
 
 struct vk_op_soft_max_push_constants {
     uint32_t KX;
     uint32_t KY;
-    uint32_t KZ;
     float scale;
     float max_bias;
     float m0;
@@ -304,7 +305,8 @@ struct vk_op_soft_max_push_constants {
 
 struct vk_op_argsort_push_constants {
     uint32_t ncols;
-    bool ascending;
+    uint32_t ncols_pad;
+    int32_t order;
 };
 
 // Allow pre-recording command buffers
@@ -375,13 +377,12 @@ struct ggml_backend_vk_context {
     vk_context * compute_ctx;
     vk_context * transfer_ctx;
 
-    bool disable;
     bool initialized;
 
     size_t idx;
 };
 
-struct vk_instance {
+struct vk_instance_t {
     vk::Instance instance;
 
     std::vector<size_t> device_indices;
@@ -423,7 +424,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 
 static bool vk_instance_initialized = false;
-static vk_instance vk_instance;
+static vk_instance_t vk_instance;
 
 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
 
@@ -1013,6 +1014,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     uint32_t s_align = 32;
 
     ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
@@ -1048,6 +1050,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1230,6 +1239,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1501,14 +1517,14 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
 }
@@ -1859,7 +1875,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     ctx->compute_ctx = nullptr;
     ctx->transfer_ctx = nullptr;
 
-    ctx->disable = false;
     ctx->initialized = true;
 
     ctx->idx = idx;
@@ -1903,6 +1918,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f32;
     }
+    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+        return ctx->device->pipeline_matmul_f32_f16;
+    }
     if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f16_f32;
     }
@@ -2722,7 +2740,7 @@ static void ggml_vk_matmul(
         uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
         uint32_t expert_stride_b, uint32_t expert_stride_d, uint32_t idx, uint32_t nbi1, uint32_t n_as) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
+    std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
 #endif
     ggml_vk_sync_buffers(subctx);
     if (split_k == 1) {
@@ -2792,7 +2810,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
 
 static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
+    std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
     std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
 #endif
     const int tensor_type_size = ggml_type_size(tensor->type);
@@ -2812,9 +2830,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
+    std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -2982,19 +3000,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21,
         0, 0, 0, 0, 1
     ); // NOLINT
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) ((char *) dst->data);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
-    }
 }
 
 static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
+    std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -3147,12 +3159,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
 
 static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
+    std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -3217,25 +3228,17 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) dst->data;
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
-    }
 }
 
 static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
+    std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
@@ -3302,26 +3305,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) dst->data;
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
-    }
-}
-
-static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
-    const uint64_t ne10 = src1->ne[0];
-
-    const uint64_t ne0 = dst->ne[0];
-    const uint64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-        (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
-        dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
 }
 
 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3711,8 +3694,6 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     // TODO: support for transposed / permuted tensors
     GGML_ASSERT(nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -3834,7 +3815,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_soft_max_f32;
         }
-        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && src2->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_soft_max_f32_f16;
         }
         return nullptr;
@@ -3873,6 +3854,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     default:
         return nullptr;
     }
+
+    GGML_UNUSED(src2);
 }
 
 static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
@@ -3902,14 +3885,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
 template<typename PC>
 static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     if (src1 != nullptr) {
-        std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+        std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     }
     if (src2 != nullptr) {
-        std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", backend=" << src2->backend << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
+        std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
     }
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
 #endif
     GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
     GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
@@ -3919,6 +3902,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     const uint64_t ne02 = src0->ne[2];
     const uint64_t ne03 = src0->ne[3];
     const uint64_t ne0 = ne00 * ne01;
+
     const bool use_src1 = src1 != nullptr;
     const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
     const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
@@ -3926,11 +3910,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
     const uint64_t ne1 = ne10 * ne11;
     // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
-    const uint64_t nb2 = dst->nb[2];
-    const uint64_t nb3 = dst->nb[3];
 
     const bool use_src2 = src2 != nullptr;
-    const uint64_t ne2 = use_src2 ? src2->ne[0] * src2->ne[1] : 0;
+    const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
+    const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
+    const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
+    const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
+    const uint64_t ne2 = ne20 * ne21;
 
     vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
     ggml_vk_func_t op_func;
@@ -3976,7 +3962,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             src1_uma = d_Y != nullptr;
         }
         if (use_src2) {
-            ggml_vk_host_get(ctx, src1->data, d_Z, z_buf_offset);
+            ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset);
             src2_uma = d_Z != nullptr;
         }
     }
@@ -3989,7 +3975,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     vk_buffer d_D = extra->buffer_gpu.lock();
 
     // Workaround for tiny tensor inputs on ROPE
-    if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
+    if (use_src1 && y_sz > d_D->size) {
         y_sz = VK_WHOLE_SIZE;
     }
 
@@ -4006,7 +3992,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         y_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Y != nullptr);
     }
-
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
         z_buf_offset = extra_src2->offset;
@@ -4016,6 +4001,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     if (op_supports_incontiguous) {
         x_sz = ggml_nbytes(src0);
         y_sz = use_src1 ? ggml_nbytes(src1) : 0;
+        z_sz = use_src2 ? ggml_nbytes(src2) : 0;
         d_sz = ggml_nbytes(dst);
 
         if (x_buf_offset + x_sz >= d_X->size) {
@@ -4024,6 +4010,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
             y_sz = VK_WHOLE_SIZE;
         }
+        if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
+            z_sz = VK_WHOLE_SIZE;
+        }
         if (d_buf_offset + d_sz >= d_D->size) {
             d_sz = VK_WHOLE_SIZE;
         }
@@ -4046,7 +4035,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
             break;
         case GGML_OP_GET_ROWS:
-            elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+            elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+            break;
+        case GGML_OP_ARGSORT:
+            elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
             break;
         default:
             elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
@@ -4060,13 +4052,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             if (use_src1 && y_sz != VK_WHOLE_SIZE) {
                 y_sz *= ne12 * ne13;
             }
+            if (use_src2 && z_sz != VK_WHOLE_SIZE) {
+                z_sz *= ne22 * ne23;
+            }
             if (d_sz != VK_WHOLE_SIZE) {
                 d_sz *= ne02 * ne03;
             }
         }
 
         if (op == GGML_OP_SOFT_MAX) {
-            // Empty src1 and src2 are possible on soft_max, but the shader needs buffers
+            // Empty src1 is possible in soft_max, but the shader needs a buffer
             vk_subbuffer subbuf_y;
             if (use_src1) {
                 subbuf_y = { d_Y, y_buf_offset, y_sz };
@@ -4074,15 +4069,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
                 subbuf_y = { d_X, 0, d_X->size };
             }
 
-            vk_subbuffer subbuf_z;
-            if (use_src2) {
-                subbuf_z = { d_Z, z_buf_offset, z_sz };
+            ggml_vk_sync_buffers(subctx);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        } else if (op == GGML_OP_ROPE) {
+            const int mode = ((int32_t *) dst->op_params)[2];
+            const bool is_neox = mode & 2;
+
+            if (is_neox) {
+                // Empty src2 is possible in rope, but the shader needs a buffer
+                vk_subbuffer subbuf_z;
+                if (use_src2) {
+                    subbuf_z = { d_Z, z_buf_offset, z_sz };
+                } else {
+                    subbuf_z = { d_X, 0, d_X->size };
+                }
+
+                ggml_vk_sync_buffers(subctx);
+                ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
             } else {
-                subbuf_z = { d_X, 0, d_X->size };
+                ggml_vk_sync_buffers(subctx);
+                ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
             }
-
+        } else if (use_src2) {
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else if (use_src1) {
             ggml_vk_sync_buffers(subctx);
             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4090,22 +4100,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             ggml_vk_sync_buffers(subctx);
             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         }
-        if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
-            ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
-        } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
-            // copy dst to host
-            float * d = (float *) dst->data;
-            ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
-        }
     } else {
         GGML_ASSERT(op != GGML_OP_SOFT_MAX);
+        GGML_ASSERT(op != GGML_OP_ARGSORT);
+        GGML_ASSERT(!use_src2);
 
         ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
 
         switch (dst->op) {
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SOFT_MAX:
             elements = { (uint32_t)ne01, 1, 1 };
             break;
         case GGML_OP_DIAG_MASK_INF:
@@ -4135,10 +4139,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
                     ggml_vk_sync_buffers(subctx);
                     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
                 }
-                if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-                    // copy dst to host
-                    ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
-                }
             }
         }
     }
@@ -4269,7 +4269,7 @@ static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * su
     ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
 }
 
-static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     float * op_params = (float *)dst->op_params;
 
     float scale = op_params[0];
@@ -4285,20 +4285,16 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
     const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
-#pragma message("TODO: src2 is no longer used in soft_max - should be removed and ALiBi calculation should be updated")
-#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
-
-    ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_SOFT_MAX, {
+    ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
         ncols,
         src1 != nullptr ? nrows_y : (uint32_t)0,
-        src2 != nullptr ? (uint32_t)1 : (uint32_t)0,
         scale, max_bias,
         m0, m1,
         n_head_log2,
     });
 }
 
-static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode = ((int32_t *) dst->op_params)[2];
     // const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -4321,15 +4317,40 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
     if (is_neox) {
         const float theta_scale = powf(freq_base, -2.0f/n_dims);
         const float inv_ndims = -1.0f / n_dims;
-        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
+        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+            (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
+            src2 != nullptr,
+        });
     } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
+        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+            (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
+            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
+        });
     }
 }
 
 static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     int32_t * op_params = (int32_t *)dst->op_params;
-    ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
+
+    uint32_t ncols = src0->ne[0];
+
+    uint32_t ncols_pad = 1;
+    while (ncols_pad < ncols) {
+        ncols_pad *= 2;
+    }
+
+    GGML_ASSERT(ncols_pad <= 1024);
+
+    std::cerr << "ncols=" << ncols << " ncols_pad=" << ncols_pad << " ascending=" << op_params[0] << std::endl;
+
+    std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
+
+    ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
+        ncols,
+        ncols_pad,
+        op_params[0],
+    });
 }
 
 #ifdef GGML_VULKAN_RUN_TESTS
@@ -4381,6 +4402,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->a_s;
         shname = "F32_ALIGNED_S";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->a_s;
+        shname = "F32_F16_ALIGNED_S";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->a_s;
         shname = "F16_F32_ALIGNED_S";
@@ -4394,6 +4418,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->a_m;
         shname = "F32_ALIGNED_M";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->a_m;
+        shname = "F32_F16_ALIGNED_M";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->a_m;
         shname = "F16_F32_ALIGNED_M";
@@ -4407,6 +4434,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->a_l;
         shname = "F32_ALIGNED_L";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->a_l;
+        shname = "F32_F16_ALIGNED_L";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->a_l;
         shname = "F16_F32_ALIGNED_L";
@@ -4427,6 +4457,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->s;
         shname = "F32_S";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->s;
+        shname = "F32_F16_S";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->s;
         shname = "F16_F32_S";
@@ -4438,6 +4471,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->m;
         shname = "F32_M";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->m;
+        shname = "F32_F16_M";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->m;
         shname = "F16_F32_M";
@@ -4449,6 +4485,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->l;
         shname = "F32_L";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->l;
+        shname = "F32_F16_L";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->l;
         shname = "F16_F32_L";
@@ -4561,15 +4600,11 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     src1_ggml->data = y;
     tensor_ggml->data = d_chk;
 
-    ctx->disable = true;
-
     ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
     ggml_build_forward_expand(cgraph, tensor_ggml);
 
     ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
 
-    ctx->disable = false;
-
     ggml_free(ggml_ctx);
 
     double avg_err = 0.0;
@@ -5049,15 +5084,11 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
     src1_ggml->data = y;
     tensor_ggml->data = d_chk;
 
-    ctx->disable = true;
-
     ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
     ggml_build_forward_expand(cgraph, tensor_ggml);
 
     ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
 
-    ctx->disable = false;
-
     ggml_free(ggml_ctx);
 
     double avg_err = 0.0;
@@ -5134,12 +5165,12 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
 #endif
-    if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+    if (extra == nullptr) {
         return;
     }
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
     ggml_tensor * src0 = node->src[0];
     ggml_tensor * src1 = node->src[1];
 
@@ -5244,9 +5275,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 }
 
 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
-    if (ctx->disable) {
-        return;
-    }
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
 #endif
@@ -5420,7 +5448,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 }
 
 static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-    if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU || ggml_is_empty(node)) {
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+    if (ggml_is_empty(node) || extra == nullptr) {
         return;
     }
 
@@ -5434,8 +5464,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     const ggml_tensor * src1 = node->src[1];
     const ggml_tensor * src2 = node->src[2];
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
     switch (node->op) {
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
@@ -5547,11 +5575,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     case GGML_OP_SOFT_MAX:
-        ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, src2, node);
+        ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);
 
         break;
     case GGML_OP_ROPE:
-        ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node);
+        ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);
 
         break;
     case GGML_OP_ARGSORT:
@@ -5580,7 +5608,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif
 
-    if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
+    if (last_node) {
         ggml_vk_ctx_end(ctx->compute_ctx);
         ctx->compute_ctx->exit_tensor = node;
         ctx->compute_ctx = nullptr;
@@ -5588,10 +5616,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 }
 
 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-    if (ctx->disable) {
-        return false;
-    }
-
     ggml_tensor_extra_gpu * extra = nullptr;
 
     switch (tensor->op) {
@@ -5650,7 +5674,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     }
 
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
+    std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
 #endif
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5690,9 +5714,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
 
 // Clean up after graph processing is done
 static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
-    if (ctx->disable) {
-        return;
-    }
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
 #endif
@@ -5865,7 +5886,6 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
     }
 
-    tensor->backend = GGML_BACKEND_TYPE_GPU;
     tensor->extra = extra;
 }
 
@@ -5873,8 +5893,6 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
5873
5893
  #ifdef GGML_VULKAN_DEBUG
5874
5894
  std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
5875
5895
  #endif
5876
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
5877
-
5878
5896
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5879
5897
 
5880
5898
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -5888,8 +5906,6 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
5888
5906
  #ifdef GGML_VULKAN_DEBUG
5889
5907
  std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
5890
5908
  #endif
5891
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
5892
-
5893
5909
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5894
5910
 
5895
5911
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -5996,6 +6012,8 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
5996
6012
  };
5997
6013
 
5998
6014
  GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
6015
+ ggml_vk_instance_init();
6016
+
5999
6017
  #ifdef GGML_VULKAN_DEBUG
6000
6018
  std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
6001
6019
  #endif
@@ -6032,6 +6050,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
6032
6050
  #ifdef GGML_VULKAN_DEBUG
6033
6051
  std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
6034
6052
  #endif
6053
+ size += 32; // Behave like the CPU buffer type
6035
6054
  void * ptr = nullptr;
6036
6055
  try {
6037
6056
  ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
@@ -6119,7 +6138,6 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
6119
6138
  #endif
6120
6139
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6121
6140
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6122
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
6123
6141
 
6124
6142
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6125
6143
 
@@ -6140,7 +6158,6 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
6140
6158
  #endif
6141
6159
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6142
6160
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6143
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
6144
6161
 
6145
6162
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6146
6163
 
@@ -6206,6 +6223,10 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
6206
6223
  ctx->transfer_ctx = nullptr;
6207
6224
  }
6208
6225
 
6226
+ static bool ggml_vk_is_empty(ggml_tensor * node) {
6227
+ return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
6228
+ }
6229
+
6209
6230
  GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
6210
6231
  #ifdef GGML_VULKAN_DEBUG
6211
6232
  std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
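The hunk above adds ggml_vk_is_empty(), and the hunks that follow switch the graph-compute loop to it, so "skippable" now means a zero-size tensor or a pure layout op rather than a node assigned to the CPU backend. Below is a minimal sketch of using the same predicate when walking a cgraph; it assumes cgraph fields are accessed directly, as in the surrounding file, and the helper names are invented for the example.

```cpp
// Illustrative only: skip nodes that produce no GPU work, mirroring ggml_vk_is_empty().
#include "ggml.h"

static bool is_noop_node(const struct ggml_tensor * node) {
    return ggml_is_empty(node)
        || node->op == GGML_OP_NONE
        || node->op == GGML_OP_RESHAPE
        || node->op == GGML_OP_TRANSPOSE
        || node->op == GGML_OP_VIEW
        || node->op == GGML_OP_PERMUTE;
}

// Count the nodes a backend would actually have to execute.
static int count_real_nodes(const struct ggml_cgraph * graph) {
    int n = 0;
    for (int i = 0; i < graph->n_nodes; i++) {
        if (!is_noop_node(graph->nodes[i])) {
            n++;
        }
    }
    return n;
}
```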
@@ -6220,7 +6241,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
6220
6241
  int last_node = cgraph->n_nodes - 1;
6221
6242
 
6222
6243
  // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
6223
- while (last_node > 0 && (cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU || ggml_is_empty(cgraph->nodes[last_node]))) {
6244
+ while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
6224
6245
  last_node -= 1;
6225
6246
  }
6226
6247
 
@@ -6234,7 +6255,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
6234
6255
  for (int i = 0; i < cgraph->n_nodes; i++) {
6235
6256
  ggml_tensor * node = cgraph->nodes[i];
6236
6257
 
6237
- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
6258
+ if (ggml_vk_is_empty(node)) {
6238
6259
  continue;
6239
6260
  }
6240
6261
 
@@ -6536,7 +6557,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
6536
6557
  for (int j = 0; j < level; j++) {
6537
6558
  std::cerr << " ";
6538
6559
  }
6539
- std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << " backend=" << tensor->backend << std::endl;
6560
+ std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;
6540
6561
 
6541
6562
  done.push_back(tensor);
6542
6563
 
@@ -6548,7 +6569,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
6548
6569
  }
6549
6570
 
6550
6571
  static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
6551
- if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
6572
+ if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16 && tensor->type != GGML_TYPE_I32) {
6552
6573
  return;
6553
6574
  }
6554
6575
  i0 = std::max(i0, 5);
@@ -6569,6 +6590,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
6569
6590
  val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
6570
6591
  } else if (tensor->type == GGML_TYPE_F16) {
6571
6592
  val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
6593
+ } else if (tensor->type == GGML_TYPE_I32) {
6594
+ val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
6572
6595
  } else {
6573
6596
  GGML_ASSERT(false);
6574
6597
  }
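The debug helper above gains a GGML_TYPE_I32 branch (a matching branch appears in the results check further down), which matters now that integer tensors such as argsort results or RoPE positions flow through the Vulkan debug path. The element addressing it uses is the usual ggml stride walk; here is a small sketch, where the function name is an assumption of the example rather than anything in the file.

```cpp
// Sketch of the strided element read used by the debug printers: element (i0,i1,i2,i3)
// lives at data + i3*nb[3] + i2*nb[2] + i1*nb[1] + i0*nb[0].
#include <cstdint>
#include "ggml.h"

static float read_element_as_f32(const struct ggml_tensor * t, const void * data,
                                 int i0, int i1, int i2, int i3) {
    const char * p = (const char *) data
        + i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
    switch (t->type) {
        case GGML_TYPE_F32: return *(const float *) p;
        case GGML_TYPE_F16: return ggml_fp16_to_fp32(*(const ggml_fp16_t *) p);
        case GGML_TYPE_I32: return (float) *(const int32_t *) p; // new in this debug path
        default:
            GGML_ASSERT(false && "type not handled in this sketch");
            return 0.0f;
    }
}
```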
@@ -6584,7 +6607,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
6584
6607
  static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
6585
6608
  void * tensor_data = tensor->data;
6586
6609
 
6587
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6610
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6588
6611
  const size_t tensor_size = ggml_nbytes(tensor);
6589
6612
  tensor_data = malloc(tensor_size);
6590
6613
 
@@ -6595,12 +6618,12 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
6595
6618
  }
6596
6619
 
6597
6620
  std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
6598
- std::cerr << "tensor=" << tensor << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6621
+ std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6599
6622
  if (tensor->src[0] != nullptr) {
6600
- std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " backend=" << tensor->src[0]->backend << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
6623
+ std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
6601
6624
  }
6602
6625
  if (tensor->src[1] != nullptr) {
6603
- std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " backend=" << tensor->src[1]->backend << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
6626
+ std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
6604
6627
  }
6605
6628
  std::cerr << std::endl << "Result:" << std::endl;
6606
6629
  ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
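Throughout this hunk (and the rest of the diff) per-tensor tensor->backend checks are replaced by queries on the buffer the tensor lives in: ggml_backend_buffer_is_vk() for device data and the public ggml_backend_buffer_is_host() for host data. A minimal sketch of that pattern follows, assuming only the public ggml-backend API; the helper name is invented for the example, and the "not host" branch stands in for the Vulkan-private check.

```cpp
// Illustrative only: pick the copy path from the buffer, not from a backend field.
#include <cstring>
#include "ggml.h"
#include "ggml-backend.h"

// Copy a tensor's contents into dst when the data is host-visible; report otherwise.
static bool copy_if_host(const struct ggml_tensor * t, void * dst) {
    if (t->buffer != nullptr && ggml_backend_buffer_is_host(t->buffer)) {
        memcpy(dst, t->data, ggml_nbytes(t));
        return true;
    }
    return false; // device-resident (e.g. Vulkan): caller must use a buffer read instead
}
```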
@@ -6611,43 +6634,11 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
6611
6634
  std::vector<const ggml_tensor *> done;
6612
6635
  ggml_vk_print_graph_origin(tensor, done);
6613
6636
 
6614
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6637
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6615
6638
  free(tensor_data);
6616
6639
  }
6617
6640
  }
6618
6641
 
6619
- static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
6620
- return;
6621
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
6622
- if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
6623
- return;
6624
- }
6625
- for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
6626
- for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
6627
- for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
6628
- for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
6629
- float val = 0.0f;
6630
- if (tensor->type == GGML_TYPE_F32) {
6631
- val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
6632
- } else if (tensor->type == GGML_TYPE_F16) {
6633
- val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
6634
- }
6635
- if (std::isnan(val)) {
6636
- std::cerr << "ERROR: TENSOR CHECK " << name << ": Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " val=" << val << std::endl;
6637
- std::cerr << "tensor=" << tensor << " tensor->type=" << ggml_type_name(tensor->type) << " tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6638
- std::cerr << std::endl;
6639
- ggml_vk_print_tensor_area(tensor, tensor->data, i0, i1, i2, i3);
6640
- std::cerr << std::endl;
6641
- std::vector<const ggml_tensor *> done;
6642
- ggml_vk_print_graph_origin(tensor, done);
6643
- GGML_ASSERT(false);
6644
- }
6645
- }
6646
- }
6647
- }
6648
- }
6649
- }
6650
-
6651
6642
  void * comp_result;
6652
6643
  size_t comp_size;
6653
6644
  size_t comp_nb[GGML_MAX_DIMS];
@@ -6701,10 +6692,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6701
6692
 
6702
6693
  src0_buffer = malloc(src0_size);
6703
6694
  src0_clone->data = src0_buffer;
6704
- if (src0->backend == GGML_BACKEND_TYPE_CPU) {
6695
+ if (ggml_backend_buffer_is_host(src0->buffer)) {
6705
6696
  memcpy(src0_clone->data, src0->data, src0_size);
6706
6697
  memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
6707
- } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
6698
+ } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
6708
6699
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
6709
6700
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6710
6701
  uint64_t offset = extra->offset;
@@ -6735,8 +6726,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6735
6726
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6736
6727
  ggml_vk_print_tensor(ctx, src0, "src0");
6737
6728
  }
6738
-
6739
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone);
6740
6729
  }
6741
6730
  if (src1 != nullptr) {
6742
6731
  src1_clone = ggml_dup_tensor(ggml_ctx, src1);
@@ -6745,10 +6734,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6745
6734
 
6746
6735
  src1_buffer = malloc(src1_size);
6747
6736
  src1_clone->data = src1_buffer;
6748
- if (src1->backend == GGML_BACKEND_TYPE_CPU) {
6737
+ if (ggml_backend_buffer_is_host(src1->buffer)) {
6749
6738
  memcpy(src1_clone->data, src1->data, src1_size);
6750
6739
  memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
6751
- } else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
6740
+ } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
6752
6741
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
6753
6742
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6754
6743
  uint64_t offset = extra->offset;
@@ -6779,12 +6768,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6779
6768
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6780
6769
  ggml_vk_print_tensor(ctx, src1, "src1");
6781
6770
  std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
6782
- std::cerr << "src1_clone=" << tensor << " src1_clone->backend: " << src1_clone->backend << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
6771
+ std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
6783
6772
  if (src1->src[0] != nullptr) {
6784
- std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " backend=" << src1->src[0]->backend << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
6773
+ std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
6785
6774
  }
6786
6775
  if (src1->src[1] != nullptr) {
6787
- std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " backend=" << src1->src[1]->backend << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
6776
+ std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
6788
6777
  }
6789
6778
  std::cerr << std::endl << "Result:" << std::endl;
6790
6779
  ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
@@ -6795,8 +6784,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6795
6784
  std::vector<const ggml_tensor *> done;
6796
6785
  ggml_vk_print_graph_origin(src1_clone, done);
6797
6786
  }
6798
-
6799
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone);
6800
6787
  }
6801
6788
  if (src2 != nullptr) {
6802
6789
  src2_clone = ggml_dup_tensor(ggml_ctx, src2);
@@ -6805,18 +6792,18 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6805
6792
 
6806
6793
  src2_buffer = malloc(src2_size);
6807
6794
  src2_clone->data = src2_buffer;
6808
- if (src2->backend == GGML_BACKEND_TYPE_CPU) {
6795
+ if (ggml_backend_buffer_is_host(src2->buffer)) {
6809
6796
  memcpy(src2_clone->data, src2->data, src2_size);
6810
6797
  memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
6811
- } else if (src2->backend == GGML_BACKEND_TYPE_GPU) {
6798
+ } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
6812
6799
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
6813
- vk_buffer buf = extra->buffer_gpu.lock();
6800
+ vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6814
6801
  uint64_t offset = extra->offset;
6815
6802
  if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
6816
6803
  for (int i3 = 0; i3 < src2->ne[3]; i3++) {
6817
6804
  for (int i2 = 0; i2 < src2->ne[2]; i2++) {
6818
6805
  const int idx = i3*src2->ne[2] + i2;
6819
- ggml_vk_buffer_read(ctx, buf, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
6806
+ ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
6820
6807
  }
6821
6808
  }
6822
6809
 
@@ -6826,10 +6813,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6826
6813
  src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
6827
6814
  }
6828
6815
  } else {
6829
- if (offset + src2_size >= buf->size) {
6830
- src2_size = buf->size - offset;
6816
+ if (offset + src2_size >= buffer_gpu->size) {
6817
+ src2_size = buffer_gpu->size - offset;
6831
6818
  }
6832
- ggml_vk_buffer_read(ctx, buf, offset, src2_clone->data, src2_size);
6819
+ ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
6833
6820
  memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
6834
6821
  }
6835
6822
  } else {
@@ -6839,12 +6826,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6839
6826
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6840
6827
  ggml_vk_print_tensor(ctx, src2, "src2");
6841
6828
  std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
6842
- std::cerr << "src2_clone=" << tensor << " src2_clone->backend: " << src2_clone->backend << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
6829
+ std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
6843
6830
  if (src2->src[0] != nullptr) {
6844
- std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " backend=" << src2->src[0]->backend << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
6831
+ std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
6845
6832
  }
6846
6833
  if (src2->src[1] != nullptr) {
6847
- std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " backend=" << src2->src[1]->backend << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
6834
+ std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
6848
6835
  }
6849
6836
  std::cerr << std::endl << "Result:" << std::endl;
6850
6837
  ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
@@ -6855,8 +6842,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6855
6842
  std::vector<const ggml_tensor *> done;
6856
6843
  ggml_vk_print_graph_origin(src2_clone, done);
6857
6844
  }
6858
-
6859
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src2", src2_clone);
6860
6845
  }
6861
6846
 
6862
6847
  if (tensor->op == GGML_OP_MUL_MAT) {
@@ -6877,7 +6862,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6877
6862
  tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
6878
6863
  } else if (tensor->op == GGML_OP_SOFT_MAX) {
6879
6864
  if (src1 != nullptr) {
6880
- tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
6865
+ tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
6881
6866
  } else {
6882
6867
  tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
6883
6868
  }
@@ -6894,7 +6879,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6894
6879
  float attn_factor = ((float *) tensor->op_params)[8];
6895
6880
  float beta_fast = ((float *) tensor->op_params)[9];
6896
6881
  float beta_slow = ((float *) tensor->op_params)[10];
6897
- tensor_clone = ggml_rope_custom(ggml_ctx, src0_clone, src1_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
6882
+ tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
6898
6883
  } else if (tensor->op == GGML_OP_UNARY) {
6899
6884
  switch (ggml_get_unary_op(tensor)) {
6900
6885
  case GGML_UNARY_OP_SILU:
@@ -6937,17 +6922,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6937
6922
  GGML_ASSERT(false);
6938
6923
  }
6939
6924
 
6940
- // Disable vulkan here to avoid the hooks in ggml.c
6941
- ctx->disable = true;
6942
-
6943
6925
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
6944
6926
  ggml_build_forward_expand(cgraph, tensor_clone);
6945
6927
 
6946
6928
  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
6947
6929
 
6948
- ctx->disable = false;
6949
-
6950
- ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
6951
6930
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6952
6931
  ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
6953
6932
  }
@@ -6964,9 +6943,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6964
6943
  if (src1 != nullptr) {
6965
6944
  free(src1_buffer);
6966
6945
  }
6967
- if (src2 != nullptr) {
6968
- free(src2_buffer);
6969
- }
6970
6946
 
6971
6947
  ggml_free(ggml_ctx);
6972
6948
  }
@@ -6991,7 +6967,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
6991
6967
 
6992
6968
  void * tensor_data = tensor->data;
6993
6969
 
6994
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6970
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6995
6971
  size_t tensor_size = ggml_nbytes(tensor);
6996
6972
  tensor_data = malloc(tensor_size);
6997
6973
 
@@ -7026,8 +7002,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7026
7002
  } else if (tensor->type == GGML_TYPE_F16) {
7027
7003
  correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
7028
7004
  result = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
7005
+ } else if (tensor->type == GGML_TYPE_I32) {
7006
+ correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
7007
+ result = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
7029
7008
  } else {
7030
- std::cerr << "comp_size=" << comp_size << " but required is " << (i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]) << std::endl;
7009
+ std::cerr << "Results check not implemented for type " << ggml_type_name(tensor->type) << std::endl;
7031
7010
  }
7032
7011
  } else {
7033
7012
  std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
@@ -7036,12 +7015,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7036
7015
 
7037
7016
  if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
7038
7017
  std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
7039
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7018
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7040
7019
  if (src0 != nullptr) {
7041
- std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7020
+ std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7042
7021
  }
7043
7022
  if (src1 != nullptr) {
7044
- std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7023
+ std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7045
7024
  }
7046
7025
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7047
7026
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7077,12 +7056,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7077
7056
 
7078
7057
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
7079
7058
  std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
7080
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7059
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7081
7060
  if (src0 != nullptr) {
7082
- std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7061
+ std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7083
7062
  }
7084
7063
  if (src1 != nullptr) {
7085
- std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7064
+ std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7086
7065
  }
7087
7066
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7088
7067
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7101,12 +7080,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7101
7080
 
7102
7081
  if (avg_err > 0.05 || std::isnan(avg_err)) {
7103
7082
  std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
7104
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7083
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7105
7084
  if (src0 != nullptr) {
7106
- std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7085
+ std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7107
7086
  }
7108
7087
  if (src1 != nullptr) {
7109
- std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7088
+ std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7110
7089
  }
7111
7090
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7112
7091
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7118,14 +7097,14 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7118
7097
  ggml_vk_print_graph_origin(tensor, done);
7119
7098
  GGML_ASSERT(false);
7120
7099
  } else {
7121
- std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " backend=" << tensor->backend << " avg_err=" << avg_err << std::endl;
7100
+ std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
7122
7101
  }
7123
7102
 
7124
7103
  free(comp_result);
7125
7104
  comp_result = nullptr;
7126
7105
  comp_size = 0;
7127
7106
 
7128
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
7107
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
7129
7108
  free(tensor_data);
7130
7109
  }
7131
7110
  }