llama_cpp 0.15.2 → 0.15.3

@@ -114,6 +114,7 @@ struct vk_device {
  size_t idx;

  vk_matmul_pipeline pipeline_matmul_f32;
+ vk_matmul_pipeline pipeline_matmul_f32_f16;
  vk_matmul_pipeline pipeline_matmul_f16;
  vk_matmul_pipeline pipeline_matmul_f16_f32;
  vk_pipeline pipeline_matmul_split_k_reduce;
@@ -289,12 +290,12 @@ struct vk_op_rope_neox_push_constants {
  float corr_dims[4];
  float theta_scale;
  float inv_ndims;
+ uint32_t has_freq_facs;
  };

  struct vk_op_soft_max_push_constants {
  uint32_t KX;
  uint32_t KY;
- uint32_t KZ;
  float scale;
  float max_bias;
  float m0;
@@ -304,7 +305,8 @@ struct vk_op_soft_max_push_constants {

  struct vk_op_argsort_push_constants {
  uint32_t ncols;
- bool ascending;
+ uint32_t ncols_pad;
+ int32_t order;
  };

  // Allow pre-recording command buffers
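For context on the two new fields: the argsort shader now receives the column count rounded up to the next power of two (the diff asserts it stays at or below 1024, matching the shader dispatch size) plus the sort order as a plain integer. A minimal sketch of that padding step, mirroring the loop added to ggml_vk_argsort further down in this diff:

// Sketch of the ncols_pad computation (mirrors the loop added to
// ggml_vk_argsort later in this diff); the power-of-two padding is what the
// sort shader expects, and the diff asserts ncols_pad <= 1024.
static uint32_t argsort_ncols_pad(uint32_t ncols) {
    uint32_t ncols_pad = 1;
    while (ncols_pad < ncols) {
        ncols_pad *= 2;
    }
    return ncols_pad;  // e.g. ncols = 1000 -> ncols_pad = 1024
}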
@@ -375,13 +377,12 @@ struct ggml_backend_vk_context {
  vk_context * compute_ctx;
  vk_context * transfer_ctx;

- bool disable;
  bool initialized;

  size_t idx;
  };

- struct vk_instance {
+ struct vk_instance_t {
  vk::Instance instance;

  std::vector<size_t> device_indices;
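An illustration of the C++ name-hiding issue the rename presumably sidesteps (hypothetical code, not from this diff): the global variable declared in the next hunk keeps the name vk_instance, and once a variable shares its struct's name, the bare name refers to the variable, so the old spelling forced elaborated type specifiers elsewhere.

// Hypothetical sketch of the name clash avoided by renaming the struct.
struct vk_instance { int dummy; };
static vk_instance vk_instance;     // legal, but "vk_instance" now names the variable
// vk_instance another;             // error: "vk_instance" refers to the variable here
struct vk_instance another;         // later uses would need the elaborated form

struct vk_instance_t { int dummy; };
static vk_instance_t instance;      // distinct type and variable names, no ambiguity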
@@ -423,7 +424,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
  typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);

  static bool vk_instance_initialized = false;
- static vk_instance vk_instance;
+ static vk_instance_t vk_instance;

  GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);

@@ -1013,6 +1014,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  uint32_t s_align = 32;

  ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+ ctx->device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
  ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
  ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
@@ -1048,6 +1050,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1048
1050
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1049
1051
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1050
1052
 
1053
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1054
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1055
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
1056
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
1057
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1058
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1059
+
1051
1060
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1052
1061
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1053
1062
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1230,6 +1239,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1230
1239
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1231
1240
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1232
1241
 
1242
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1243
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1244
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
1245
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
1246
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1247
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1248
+
1233
1249
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1234
1250
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1235
1251
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1501,14 +1517,14 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1501
1517
 
1502
1518
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
1503
1519
 
1504
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
1505
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
1520
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
1521
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
1506
1522
 
1507
1523
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1508
1524
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1509
1525
 
1510
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
1511
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
1526
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
1527
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
1512
1528
 
1513
1529
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
1514
1530
  }
@@ -1859,7 +1875,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
  ctx->compute_ctx = nullptr;
  ctx->transfer_ctx = nullptr;

- ctx->disable = false;
  ctx->initialized = true;

  ctx->idx = idx;
@@ -1903,6 +1918,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_f32;
  }
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+ return ctx->device->pipeline_matmul_f32_f16;
+ }
  if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_f16_f32;
  }
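A hypothetical call-site sketch (not part of this diff, exact signature assumed) of what the new branch enables: an F32 matrix multiplied by an F16 operand now resolves to its own pipeline instead of falling through.

// Hypothetical sketch: with the new F32 x F16 branch above, this type
// combination maps to the dedicated pipeline created in ggml_vk_load_shaders.
vk_matmul_pipeline pipeline = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F32, GGML_TYPE_F16);
// pipeline == ctx->device->pipeline_matmul_f32_f16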
@@ -2722,7 +2740,7 @@ static void ggml_vk_matmul(
2722
2740
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
2723
2741
  uint32_t expert_stride_b, uint32_t expert_stride_d, uint32_t idx, uint32_t nbi1, uint32_t n_as) {
2724
2742
  #ifdef GGML_VULKAN_DEBUG
2725
- std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
2743
+ std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
2726
2744
  #endif
2727
2745
  ggml_vk_sync_buffers(subctx);
2728
2746
  if (split_k == 1) {
@@ -2792,7 +2810,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
2792
2810
 
2793
2811
  static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
2794
2812
  #ifdef GGML_VULKAN_DEBUG
2795
- std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2813
+ std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2796
2814
  std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
2797
2815
  #endif
2798
2816
  const int tensor_type_size = ggml_type_size(tensor->type);
@@ -2812,9 +2830,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
2812
2830
 
2813
2831
  static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2814
2832
  #ifdef GGML_VULKAN_DEBUG
2815
- std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2816
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
2817
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
2833
+ std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2834
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
2835
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
2818
2836
  #endif
2819
2837
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
2820
2838
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -2982,19 +3000,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2982
3000
  ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21,
2983
3001
  0, 0, 0, 0, 1
2984
3002
  ); // NOLINT
2985
-
2986
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
2987
- // copy dst to host
2988
- float * d = (float *) ((char *) dst->data);
2989
- ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
2990
- }
2991
3003
  }
2992
3004
 
2993
3005
  static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2994
3006
  #ifdef GGML_VULKAN_DEBUG
2995
- std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2996
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
2997
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3007
+ std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3008
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3009
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
2998
3010
  #endif
2999
3011
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
3000
3012
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -3147,12 +3159,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3147
3159
 
3148
3160
  static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3149
3161
  #ifdef GGML_VULKAN_DEBUG
3150
- std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3151
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3152
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3162
+ std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3163
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3164
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3153
3165
  #endif
3154
3166
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
3155
- GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
3156
3167
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
3157
3168
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
3158
3169
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -3217,25 +3228,17 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
3217
3228
  const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
3218
3229
  ggml_vk_sync_buffers(subctx);
3219
3230
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
3220
-
3221
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
3222
- // copy dst to host
3223
- float * d = (float *) dst->data;
3224
- ggml_vk_sync_buffers(subctx);
3225
- ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
3226
- }
3227
3231
  }
3228
3232
 
3229
3233
  static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3230
3234
  #ifdef GGML_VULKAN_DEBUG
3231
- std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3232
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3233
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3235
+ std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3236
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3237
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3234
3238
  #endif
3235
3239
  GGML_ASSERT(!ggml_is_transposed(src0));
3236
3240
  GGML_ASSERT(!ggml_is_transposed(src1));
3237
3241
  GGML_ASSERT(!ggml_is_permuted(src0));
3238
- GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
3239
3242
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
3240
3243
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
3241
3244
 
@@ -3302,26 +3305,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
3302
3305
  const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
3303
3306
  ggml_vk_sync_buffers(subctx);
3304
3307
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
3305
-
3306
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
3307
- // copy dst to host
3308
- float * d = (float *) dst->data;
3309
- ggml_vk_sync_buffers(subctx);
3310
- ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
3311
- }
3312
- }
3313
-
3314
- static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
3315
- const uint64_t ne10 = src1->ne[0];
3316
-
3317
- const uint64_t ne0 = dst->ne[0];
3318
- const uint64_t ne1 = dst->ne[1];
3319
-
3320
- // TODO: find the optimal values for these
3321
- return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
3322
- (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
3323
- dst->type == GGML_TYPE_F32 &&
3324
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
3325
3308
  }
3326
3309
 
3327
3310
  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3711,8 +3694,6 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
  // TODO: support for transposed / permuted tensors
  GGML_ASSERT(nb0 == sizeof(float));
  GGML_ASSERT(nb00 == sizeof(float));
- GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
- GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -3834,7 +3815,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
  if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
  return ctx->device->pipeline_soft_max_f32;
  }
- if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && src2->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
  return ctx->device->pipeline_soft_max_f32_f16;
  }
  return nullptr;
@@ -3873,6 +3854,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
  default:
  return nullptr;
  }
+
+ GGML_UNUSED(src2);
  }

  static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
@@ -3902,14 +3885,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
3902
3885
  template<typename PC>
3903
3886
  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
3904
3887
  #ifdef GGML_VULKAN_DEBUG
3905
- std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3888
+ std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3906
3889
  if (src1 != nullptr) {
3907
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3890
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3908
3891
  }
3909
3892
  if (src2 != nullptr) {
3910
- std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", backend=" << src2->backend << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
3893
+ std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
3911
3894
  }
3912
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
3895
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
3913
3896
  #endif
3914
3897
  GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
3915
3898
  GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
@@ -3919,6 +3902,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  const uint64_t ne02 = src0->ne[2];
  const uint64_t ne03 = src0->ne[3];
  const uint64_t ne0 = ne00 * ne01;
+
  const bool use_src1 = src1 != nullptr;
  const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
  const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
@@ -3926,11 +3910,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
  const uint64_t ne1 = ne10 * ne11;
  // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
- const uint64_t nb2 = dst->nb[2];
- const uint64_t nb3 = dst->nb[3];

  const bool use_src2 = src2 != nullptr;
- const uint64_t ne2 = use_src2 ? src2->ne[0] * src2->ne[1] : 0;
+ const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
+ const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
+ const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
+ const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
+ const uint64_t ne2 = ne20 * ne21;

  vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
  ggml_vk_func_t op_func;
@@ -3976,7 +3962,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  src1_uma = d_Y != nullptr;
  }
  if (use_src2) {
- ggml_vk_host_get(ctx, src1->data, d_Z, z_buf_offset);
+ ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset);
  src2_uma = d_Z != nullptr;
  }
  }
@@ -3989,7 +3975,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  vk_buffer d_D = extra->buffer_gpu.lock();

  // Workaround for tiny tensor inputs on ROPE
- if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
+ if (use_src1 && y_sz > d_D->size) {
  y_sz = VK_WHOLE_SIZE;
  }

@@ -4006,7 +3992,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  y_buf_offset = extra_src1->offset;
  GGML_ASSERT(d_Y != nullptr);
  }
-
  if (use_src2 && !src2_uma) {
  d_Z = extra_src2->buffer_gpu.lock();
  z_buf_offset = extra_src2->offset;
@@ -4016,6 +4001,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  if (op_supports_incontiguous) {
  x_sz = ggml_nbytes(src0);
  y_sz = use_src1 ? ggml_nbytes(src1) : 0;
+ z_sz = use_src2 ? ggml_nbytes(src2) : 0;
  d_sz = ggml_nbytes(dst);

  if (x_buf_offset + x_sz >= d_X->size) {
@@ -4024,6 +4010,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
  y_sz = VK_WHOLE_SIZE;
  }
+ if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
+ z_sz = VK_WHOLE_SIZE;
+ }
  if (d_buf_offset + d_sz >= d_D->size) {
  d_sz = VK_WHOLE_SIZE;
  }
@@ -4046,7 +4035,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
  break;
  case GGML_OP_GET_ROWS:
- elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+ elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+ break;
+ case GGML_OP_ARGSORT:
+ elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
  break;
  default:
  elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
@@ -4060,13 +4052,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  if (use_src1 && y_sz != VK_WHOLE_SIZE) {
  y_sz *= ne12 * ne13;
  }
+ if (use_src2 && z_sz != VK_WHOLE_SIZE) {
+ z_sz *= ne22 * ne23;
+ }
  if (d_sz != VK_WHOLE_SIZE) {
  d_sz *= ne02 * ne03;
  }
  }

  if (op == GGML_OP_SOFT_MAX) {
- // Empty src1 and src2 are possible on soft_max, but the shader needs buffers
+ // Empty src1 is possible in soft_max, but the shader needs a buffer
  vk_subbuffer subbuf_y;
  if (use_src1) {
  subbuf_y = { d_Y, y_buf_offset, y_sz };
@@ -4074,15 +4069,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4074
4069
  subbuf_y = { d_X, 0, d_X->size };
4075
4070
  }
4076
4071
 
4077
- vk_subbuffer subbuf_z;
4078
- if (use_src2) {
4079
- subbuf_z = { d_Z, z_buf_offset, z_sz };
4072
+ ggml_vk_sync_buffers(subctx);
4073
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4074
+ } else if (op == GGML_OP_ROPE) {
4075
+ const int mode = ((int32_t *) dst->op_params)[2];
4076
+ const bool is_neox = mode & 2;
4077
+
4078
+ if (is_neox) {
4079
+ // Empty src2 is possible in rope, but the shader needs a buffer
4080
+ vk_subbuffer subbuf_z;
4081
+ if (use_src2) {
4082
+ subbuf_z = { d_Z, z_buf_offset, z_sz };
4083
+ } else {
4084
+ subbuf_z = { d_X, 0, d_X->size };
4085
+ }
4086
+
4087
+ ggml_vk_sync_buffers(subctx);
4088
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4080
4089
  } else {
4081
- subbuf_z = { d_X, 0, d_X->size };
4090
+ ggml_vk_sync_buffers(subctx);
4091
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4082
4092
  }
4083
-
4093
+ } else if (use_src2) {
4084
4094
  ggml_vk_sync_buffers(subctx);
4085
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4095
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4086
4096
  } else if (use_src1) {
4087
4097
  ggml_vk_sync_buffers(subctx);
4088
4098
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4090,22 +4100,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4090
4100
  ggml_vk_sync_buffers(subctx);
4091
4101
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4092
4102
  }
4093
- if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
4094
- ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
4095
- } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
4096
- // copy dst to host
4097
- float * d = (float *) dst->data;
4098
- ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
4099
- }
4100
4103
  } else {
4101
4104
  GGML_ASSERT(op != GGML_OP_SOFT_MAX);
4105
+ GGML_ASSERT(op != GGML_OP_ARGSORT);
4106
+ GGML_ASSERT(!use_src2);
4102
4107
 
4103
4108
  ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
4104
4109
 
4105
4110
  switch (dst->op) {
4106
4111
  case GGML_OP_NORM:
4107
4112
  case GGML_OP_RMS_NORM:
4108
- case GGML_OP_SOFT_MAX:
4109
4113
  elements = { (uint32_t)ne01, 1, 1 };
4110
4114
  break;
4111
4115
  case GGML_OP_DIAG_MASK_INF:
@@ -4135,10 +4139,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  ggml_vk_sync_buffers(subctx);
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
  }
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
- // copy dst to host
- ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
- }
  }
  }
  }
@@ -4269,7 +4269,7 @@ static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * su
  ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
  }

- static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  float * op_params = (float *)dst->op_params;

  float scale = op_params[0];
@@ -4285,20 +4285,16 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
  const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

- #pragma message("TODO: src2 is no longer used in soft_max - should be removed and ALiBi calculation should be updated")
- #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
-
- ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_SOFT_MAX, {
+ ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
  ncols,
  src1 != nullptr ? nrows_y : (uint32_t)0,
- src2 != nullptr ? (uint32_t)1 : (uint32_t)0,
  scale, max_bias,
  m0, m1,
  n_head_log2,
  });
  }

- static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
  // const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -4321,15 +4317,40 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
  if (is_neox) {
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
  const float inv_ndims = -1.0f / n_dims;
- ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
+ ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+ (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+ freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
+ src2 != nullptr,
+ });
  } else {
- ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
+ ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+ (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
+ freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
+ });
  }
  }

  static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
  int32_t * op_params = (int32_t *)dst->op_params;
- ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
+
+ uint32_t ncols = src0->ne[0];
+
+ uint32_t ncols_pad = 1;
+ while (ncols_pad < ncols) {
+ ncols_pad *= 2;
+ }
+
+ GGML_ASSERT(ncols_pad <= 1024);
+
+ std::cerr << "ncols=" << ncols << " ncols_pad=" << ncols_pad << " ascending=" << op_params[0] << std::endl;
+
+ std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
+
+ ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
+ ncols,
+ ncols_pad,
+ op_params[0],
+ });
  }

  #ifdef GGML_VULKAN_RUN_TESTS
@@ -4381,6 +4402,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->a_s;
  shname = "F32_ALIGNED_S";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->a_s;
+ shname = "F32_F16_ALIGNED_S";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->a_s;
  shname = "F16_F32_ALIGNED_S";
@@ -4394,6 +4418,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->a_m;
  shname = "F32_ALIGNED_M";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->a_m;
+ shname = "F32_F16_ALIGNED_M";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->a_m;
  shname = "F16_F32_ALIGNED_M";
@@ -4407,6 +4434,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->a_l;
  shname = "F32_ALIGNED_L";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->a_l;
+ shname = "F32_F16_ALIGNED_L";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->a_l;
  shname = "F16_F32_ALIGNED_L";
@@ -4427,6 +4457,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->s;
  shname = "F32_S";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->s;
+ shname = "F32_F16_S";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->s;
  shname = "F16_F32_S";
@@ -4438,6 +4471,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->m;
  shname = "F32_M";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->m;
+ shname = "F32_F16_M";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->m;
  shname = "F16_F32_M";
@@ -4449,6 +4485,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f32->l;
  shname = "F32_L";
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+ p = ctx->device->pipeline_matmul_f32_f16->l;
+ shname = "F32_F16_L";
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
  p = ctx->device->pipeline_matmul_f16_f32->l;
  shname = "F16_F32_L";
@@ -4561,15 +4600,11 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
  src1_ggml->data = y;
  tensor_ggml->data = d_chk;

- ctx->disable = true;
-
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
  ggml_build_forward_expand(cgraph, tensor_ggml);

  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);

- ctx->disable = false;
-
  ggml_free(ggml_ctx);

  double avg_err = 0.0;
@@ -5049,15 +5084,11 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
  src1_ggml->data = y;
  tensor_ggml->data = d_chk;

- ctx->disable = true;
-
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
  ggml_build_forward_expand(cgraph, tensor_ggml);

  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);

- ctx->disable = false;
-
  ggml_free(ggml_ctx);

  double avg_err = 0.0;
@@ -5134,12 +5165,12 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
  #endif
- if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+ if (extra == nullptr) {
  return;
  }

- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
  ggml_tensor * src0 = node->src[0];
  ggml_tensor * src1 = node->src[1];

@@ -5244,9 +5275,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
  }

  static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
- if (ctx->disable) {
- return;
- }
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
  #endif
@@ -5420,7 +5448,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
  }

  static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
- if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU || ggml_is_empty(node)) {
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+ if (ggml_is_empty(node) || extra == nullptr) {
  return;
  }

@@ -5434,8 +5464,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  const ggml_tensor * src1 = node->src[1];
  const ggml_tensor * src2 = node->src[2];

- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
  switch (node->op) {
  case GGML_OP_UNARY:
  switch (ggml_get_unary_op(node)) {
@@ -5547,11 +5575,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod

  break;
  case GGML_OP_SOFT_MAX:
- ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, src2, node);
+ ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);

  break;
  case GGML_OP_ROPE:
- ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);

  break;
  case GGML_OP_ARGSORT:
@@ -5580,7 +5608,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  last_node = true;
  #endif

- if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
+ if (last_node) {
  ggml_vk_ctx_end(ctx->compute_ctx);
  ctx->compute_ctx->exit_tensor = node;
  ctx->compute_ctx = nullptr;
@@ -5588,10 +5616,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  }

  static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
- if (ctx->disable) {
- return false;
- }
-
  ggml_tensor_extra_gpu * extra = nullptr;

  switch (tensor->op) {
@@ -5650,7 +5674,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
5650
5674
  }
5651
5675
 
5652
5676
  #ifdef GGML_VULKAN_DEBUG
5653
- std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
5677
+ std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
5654
5678
  #endif
5655
5679
 
5656
5680
  #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5690,9 +5714,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_

  // Clean up after graph processing is done
  static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
- if (ctx->disable) {
- return;
- }
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
  #endif
@@ -5865,7 +5886,6 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
  extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
  }

- tensor->backend = GGML_BACKEND_TYPE_GPU;
  tensor->extra = extra;
  }

@@ -5873,8 +5893,6 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
  #endif
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -5888,8 +5906,6 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
5888
5906
  #ifdef GGML_VULKAN_DEBUG
5889
5907
  std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
5890
5908
  #endif
5891
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
5892
-
5893
5909
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5894
5910
 
5895
5911
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6032,6 +6048,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
6032
6048
  #ifdef GGML_VULKAN_DEBUG
6033
6049
  std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
6034
6050
  #endif
6051
+ size += 32; // Behave like the CPU buffer type
6035
6052
  void * ptr = nullptr;
6036
6053
  try {
6037
6054
  ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
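The added size += 32 gives pinned host allocations the same headroom that ggml's CPU buffer type reserves, so buffers behave the same whichever allocator backs them. A sketch of the resulting allocation path; vk_host_alloc is an illustrative wrapper, only ggml_vk_host_malloc() is real:

    static void * vk_host_alloc(ggml_backend_vk_context * ctx, size_t size) {
        size += 32; // match the extra bytes the CPU buffer type adds on allocation
        return ggml_vk_host_malloc(ctx, size);
    }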
@@ -6119,7 +6136,6 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
6119
6136
  #endif
6120
6137
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6121
6138
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6122
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
6123
6139
 
6124
6140
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6125
6141
 
@@ -6140,7 +6156,6 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
6140
6156
  #endif
6141
6157
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6142
6158
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6143
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
6144
6159
 
6145
6160
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6146
6161
 
@@ -6206,6 +6221,10 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
6206
6221
  ctx->transfer_ctx = nullptr;
6207
6222
  }
6208
6223
 
6224
+ static bool ggml_vk_is_empty(ggml_tensor * node) {
6225
+ return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
6226
+ }
6227
+
6209
6228
  GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
6210
6229
  #ifdef GGML_VULKAN_DEBUG
6211
6230
  std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
@@ -6220,7 +6239,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
6220
6239
  int last_node = cgraph->n_nodes - 1;
6221
6240
 
6222
6241
  // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
6223
- while (last_node > 0 && (cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU || ggml_is_empty(cgraph->nodes[last_node]))) {
6242
+ while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
6224
6243
  last_node -= 1;
6225
6244
  }
6226
6245
 
@@ -6234,7 +6253,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
6234
6253
  for (int i = 0; i < cgraph->n_nodes; i++) {
6235
6254
  ggml_tensor * node = cgraph->nodes[i];
6236
6255
 
6237
- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
6256
+ if (ggml_vk_is_empty(node)) {
6238
6257
  continue;
6239
6258
  }
6240
6259
 
@@ -6536,7 +6555,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
6536
6555
  for (int j = 0; j < level; j++) {
6537
6556
  std::cerr << " ";
6538
6557
  }
6539
- std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << " backend=" << tensor->backend << std::endl;
6558
+ std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;
6540
6559
 
6541
6560
  done.push_back(tensor);
6542
6561
 
@@ -6548,7 +6567,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
6548
6567
  }
6549
6568
 
6550
6569
  static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
6551
- if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
6570
+ if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16 && tensor->type != GGML_TYPE_I32) {
6552
6571
  return;
6553
6572
  }
6554
6573
  i0 = std::max(i0, 5);
@@ -6569,6 +6588,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
6569
6588
  val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
6570
6589
  } else if (tensor->type == GGML_TYPE_F16) {
6571
6590
  val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
6591
+ } else if (tensor->type == GGML_TYPE_I32) {
6592
+ val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
6572
6593
  } else {
6573
6594
  GGML_ASSERT(false);
6574
6595
  }
@@ -6584,7 +6605,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
6584
6605
  static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
6585
6606
  void * tensor_data = tensor->data;
6586
6607
 
6587
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6608
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6588
6609
  const size_t tensor_size = ggml_nbytes(tensor);
6589
6610
  tensor_data = malloc(tensor_size);
6590
6611
 
@@ -6595,12 +6616,12 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
6595
6616
  }
6596
6617
 
6597
6618
  std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
6598
- std::cerr << "tensor=" << tensor << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6619
+ std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6599
6620
  if (tensor->src[0] != nullptr) {
6600
- std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " backend=" << tensor->src[0]->backend << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
6621
+ std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
6601
6622
  }
6602
6623
  if (tensor->src[1] != nullptr) {
6603
- std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " backend=" << tensor->src[1]->backend << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
6624
+ std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
6604
6625
  }
6605
6626
  std::cerr << std::endl << "Result:" << std::endl;
6606
6627
  ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
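Across the debug helpers the removed tensor->backend field is replaced by buffer inspection: ggml_backend_buffer_is_vk(), the backend's own check, decides whether the data has to be read back from the device before printing. A check in the same spirit built only on public ggml-backend API; tensor_is_device_resident is an illustrative name, not upstream code:

    static bool tensor_is_device_resident(const ggml_tensor * t) {
        return t->buffer != nullptr && !ggml_backend_buffer_is_host(t->buffer);
    }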
@@ -6611,43 +6632,11 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
6611
6632
  std::vector<const ggml_tensor *> done;
6612
6633
  ggml_vk_print_graph_origin(tensor, done);
6613
6634
 
6614
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6635
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6615
6636
  free(tensor_data);
6616
6637
  }
6617
6638
  }
6618
6639
 
6619
- static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
6620
- return;
6621
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
6622
- if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
6623
- return;
6624
- }
6625
- for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
6626
- for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
6627
- for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
6628
- for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
6629
- float val = 0.0f;
6630
- if (tensor->type == GGML_TYPE_F32) {
6631
- val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
6632
- } else if (tensor->type == GGML_TYPE_F16) {
6633
- val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
6634
- }
6635
- if (std::isnan(val)) {
6636
- std::cerr << "ERROR: TENSOR CHECK " << name << ": Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " val=" << val << std::endl;
6637
- std::cerr << "tensor=" << tensor << " tensor->type=" << ggml_type_name(tensor->type) << " tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6638
- std::cerr << std::endl;
6639
- ggml_vk_print_tensor_area(tensor, tensor->data, i0, i1, i2, i3);
6640
- std::cerr << std::endl;
6641
- std::vector<const ggml_tensor *> done;
6642
- ggml_vk_print_graph_origin(tensor, done);
6643
- GGML_ASSERT(false);
6644
- }
6645
- }
6646
- }
6647
- }
6648
- }
6649
- }
6650
-
6651
6640
  void * comp_result;
6652
6641
  size_t comp_size;
6653
6642
  size_t comp_nb[GGML_MAX_DIMS];
@@ -6701,10 +6690,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6701
6690
 
6702
6691
  src0_buffer = malloc(src0_size);
6703
6692
  src0_clone->data = src0_buffer;
6704
- if (src0->backend == GGML_BACKEND_TYPE_CPU) {
6693
+ if (ggml_backend_buffer_is_host(src0->buffer)) {
6705
6694
  memcpy(src0_clone->data, src0->data, src0_size);
6706
6695
  memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
6707
- } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
6696
+ } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
6708
6697
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
6709
6698
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6710
6699
  uint64_t offset = extra->offset;
@@ -6735,8 +6724,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6735
6724
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6736
6725
  ggml_vk_print_tensor(ctx, src0, "src0");
6737
6726
  }
6738
-
6739
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone);
6740
6727
  }
6741
6728
  if (src1 != nullptr) {
6742
6729
  src1_clone = ggml_dup_tensor(ggml_ctx, src1);
@@ -6745,10 +6732,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6745
6732
 
6746
6733
  src1_buffer = malloc(src1_size);
6747
6734
  src1_clone->data = src1_buffer;
6748
- if (src1->backend == GGML_BACKEND_TYPE_CPU) {
6735
+ if (ggml_backend_buffer_is_host(src1->buffer)) {
6749
6736
  memcpy(src1_clone->data, src1->data, src1_size);
6750
6737
  memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
6751
- } else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
6738
+ } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
6752
6739
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
6753
6740
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6754
6741
  uint64_t offset = extra->offset;
@@ -6779,12 +6766,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6779
6766
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6780
6767
  ggml_vk_print_tensor(ctx, src1, "src1");
6781
6768
  std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
6782
- std::cerr << "src1_clone=" << tensor << " src1_clone->backend: " << src1_clone->backend << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
6769
+ std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
6783
6770
  if (src1->src[0] != nullptr) {
6784
- std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " backend=" << src1->src[0]->backend << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
6771
+ std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
6785
6772
  }
6786
6773
  if (src1->src[1] != nullptr) {
6787
- std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " backend=" << src1->src[1]->backend << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
6774
+ std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
6788
6775
  }
6789
6776
  std::cerr << std::endl << "Result:" << std::endl;
6790
6777
  ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
@@ -6795,8 +6782,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6795
6782
  std::vector<const ggml_tensor *> done;
6796
6783
  ggml_vk_print_graph_origin(src1_clone, done);
6797
6784
  }
6798
-
6799
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone);
6800
6785
  }
6801
6786
  if (src2 != nullptr) {
6802
6787
  src2_clone = ggml_dup_tensor(ggml_ctx, src2);
@@ -6805,18 +6790,18 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6805
6790
 
6806
6791
  src2_buffer = malloc(src2_size);
6807
6792
  src2_clone->data = src2_buffer;
6808
- if (src2->backend == GGML_BACKEND_TYPE_CPU) {
6793
+ if (ggml_backend_buffer_is_host(src2->buffer)) {
6809
6794
  memcpy(src2_clone->data, src2->data, src2_size);
6810
6795
  memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
6811
- } else if (src2->backend == GGML_BACKEND_TYPE_GPU) {
6796
+ } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
6812
6797
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
6813
- vk_buffer buf = extra->buffer_gpu.lock();
6798
+ vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6814
6799
  uint64_t offset = extra->offset;
6815
6800
  if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
6816
6801
  for (int i3 = 0; i3 < src2->ne[3]; i3++) {
6817
6802
  for (int i2 = 0; i2 < src2->ne[2]; i2++) {
6818
6803
  const int idx = i3*src2->ne[2] + i2;
6819
- ggml_vk_buffer_read(ctx, buf, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
6804
+ ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
6820
6805
  }
6821
6806
  }
6822
6807
 
@@ -6826,10 +6811,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6826
6811
  src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
6827
6812
  }
6828
6813
  } else {
6829
- if (offset + src2_size >= buf->size) {
6830
- src2_size = buf->size - offset;
6814
+ if (offset + src2_size >= buffer_gpu->size) {
6815
+ src2_size = buffer_gpu->size - offset;
6831
6816
  }
6832
- ggml_vk_buffer_read(ctx, buf, offset, src2_clone->data, src2_size);
6817
+ ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
6833
6818
  memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
6834
6819
  }
6835
6820
  } else {
@@ -6839,12 +6824,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6839
6824
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6840
6825
  ggml_vk_print_tensor(ctx, src2, "src2");
6841
6826
  std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
6842
- std::cerr << "src2_clone=" << tensor << " src2_clone->backend: " << src2_clone->backend << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
6827
+ std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
6843
6828
  if (src2->src[0] != nullptr) {
6844
- std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " backend=" << src2->src[0]->backend << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
6829
+ std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
6845
6830
  }
6846
6831
  if (src2->src[1] != nullptr) {
6847
- std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " backend=" << src2->src[1]->backend << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
6832
+ std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
6848
6833
  }
6849
6834
  std::cerr << std::endl << "Result:" << std::endl;
6850
6835
  ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
@@ -6855,8 +6840,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6855
6840
  std::vector<const ggml_tensor *> done;
6856
6841
  ggml_vk_print_graph_origin(src2_clone, done);
6857
6842
  }
6858
-
6859
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src2", src2_clone);
6860
6843
  }
6861
6844
 
6862
6845
  if (tensor->op == GGML_OP_MUL_MAT) {
@@ -6877,7 +6860,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6877
6860
  tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
6878
6861
  } else if (tensor->op == GGML_OP_SOFT_MAX) {
6879
6862
  if (src1 != nullptr) {
6880
- tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
6863
+ tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
6881
6864
  } else {
6882
6865
  tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
6883
6866
  }
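On the CPU reference path, ggml_soft_max_ext() now takes only the optional mask plus the scale and max_bias values read from tensor->op_params; the extra src2 argument is gone. For example (the literals here are placeholders for the op-param values):

    ggml_tensor * ref = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone,
                                          /*scale=*/0.125f, /*max_bias=*/0.0f);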
@@ -6894,7 +6877,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6894
6877
  float attn_factor = ((float *) tensor->op_params)[8];
6895
6878
  float beta_fast = ((float *) tensor->op_params)[9];
6896
6879
  float beta_slow = ((float *) tensor->op_params)[10];
6897
- tensor_clone = ggml_rope_custom(ggml_ctx, src0_clone, src1_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
6880
+ tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
6898
6881
  } else if (tensor->op == GGML_OP_UNARY) {
6899
6882
  switch (ggml_get_unary_op(tensor)) {
6900
6883
  case GGML_UNARY_OP_SILU:
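ggml_rope_custom() is replaced by ggml_rope_ext(), whose extra tensor argument (src2_clone here) is the optional per-dimension frequency-factor tensor; passing a null pointer reproduces the old behaviour. The same call laid out with a comment on that argument:

    tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone,
                                 src2_clone,           // frequency factors, may be nullptr
                                 n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx,
                                 freq_base, freq_scale, ext_factor,
                                 attn_factor, beta_fast, beta_slow);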
@@ -6937,17 +6920,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6937
6920
  GGML_ASSERT(false);
6938
6921
  }
6939
6922
 
6940
- // Disable vulkan here to avoid the hooks in ggml.c
6941
- ctx->disable = true;
6942
-
6943
6923
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
6944
6924
  ggml_build_forward_expand(cgraph, tensor_clone);
6945
6925
 
6946
6926
  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
6947
6927
 
6948
- ctx->disable = false;
6949
-
6950
- ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
6951
6928
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6952
6929
  ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
6953
6930
  }
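With the removal of ctx->disable here (and of its checks in ggml_vk_compute_forward and ggml_vk_graph_cleanup in the earlier hunks), the results checker no longer has to switch the Vulkan backend off while producing a reference value; the reference graph is simply computed on the CPU in its own context, as the surviving lines show:

    ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
    ggml_build_forward_expand(cgraph, tensor_clone);
    ggml_graph_compute_with_ctx(ggml_ctx, cgraph, /*n_threads=*/8);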
@@ -6964,9 +6941,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6964
6941
  if (src1 != nullptr) {
6965
6942
  free(src1_buffer);
6966
6943
  }
6967
- if (src2 != nullptr) {
6968
- free(src2_buffer);
6969
- }
6970
6944
 
6971
6945
  ggml_free(ggml_ctx);
6972
6946
  }
@@ -6991,7 +6965,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
6991
6965
 
6992
6966
  void * tensor_data = tensor->data;
6993
6967
 
6994
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6968
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6995
6969
  size_t tensor_size = ggml_nbytes(tensor);
6996
6970
  tensor_data = malloc(tensor_size);
6997
6971
 
@@ -7026,8 +7000,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7026
7000
  } else if (tensor->type == GGML_TYPE_F16) {
7027
7001
  correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
7028
7002
  result = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
7003
+ } else if (tensor->type == GGML_TYPE_I32) {
7004
+ correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
7005
+ result = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
7029
7006
  } else {
7030
- std::cerr << "comp_size=" << comp_size << " but required is " << (i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]) << std::endl;
7007
+ std::cerr << "Results check not implemented for type " << ggml_type_name(tensor->type) << std::endl;
7031
7008
  }
7032
7009
  } else {
7033
7010
  std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
@@ -7036,12 +7013,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7036
7013
 
7037
7014
  if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
7038
7015
  std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
7039
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7016
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7040
7017
  if (src0 != nullptr) {
7041
- std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7018
+ std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7042
7019
  }
7043
7020
  if (src1 != nullptr) {
7044
- std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7021
+ std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7045
7022
  }
7046
7023
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7047
7024
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7077,12 +7054,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7077
7054
 
7078
7055
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
7079
7056
  std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
7080
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7057
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7081
7058
  if (src0 != nullptr) {
7082
- std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7059
+ std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7083
7060
  }
7084
7061
  if (src1 != nullptr) {
7085
- std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7062
+ std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7086
7063
  }
7087
7064
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7088
7065
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7101,12 +7078,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7101
7078
 
7102
7079
  if (avg_err > 0.05 || std::isnan(avg_err)) {
7103
7080
  std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
7104
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7081
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7105
7082
  if (src0 != nullptr) {
7106
- std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7083
+ std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7107
7084
  }
7108
7085
  if (src1 != nullptr) {
7109
- std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7086
+ std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7110
7087
  }
7111
7088
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7112
7089
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7118,14 +7095,14 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7118
7095
  ggml_vk_print_graph_origin(tensor, done);
7119
7096
  GGML_ASSERT(false);
7120
7097
  } else {
7121
- std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " backend=" << tensor->backend << " avg_err=" << avg_err << std::endl;
7098
+ std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
7122
7099
  }
7123
7100
 
7124
7101
  free(comp_result);
7125
7102
  comp_result = nullptr;
7126
7103
  comp_size = 0;
7127
7104
 
7128
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
7105
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
7129
7106
  free(tensor_data);
7130
7107
  }
7131
7108
  }