llama_cpp 0.12.7 → 0.13.0

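The largest functional change in this release is the rewritten SYCL `soft_max_f32` kernel, which now takes a `mask` and a `pos` tensor and applies an ALiBi bias when `max_bias > 0`. As a reading aid only, here is a minimal host-side sketch of the per-head slope computation that kernel performs; `alibi_slope` is a hypothetical helper written for this changelog, not part of the gem or of upstream llama.cpp.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical illustration: mirrors the m0/m1/base/exp arithmetic added to
    // soft_max_f32_sycl and soft_max_f32 in the diff below.
    static float alibi_slope(uint32_t head, uint32_t n_head, float max_bias) {
        const uint32_t n_head_log2 = 1u << (uint32_t) std::floor(std::log2((float) n_head));
        const float m0 = std::pow(2.0f, -max_bias          / n_head_log2);
        const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);
        const float base = head < n_head_log2 ? m0 : m1;
        const int   exp  = head < n_head_log2 ? (int) head + 1 : 2 * (int)(head - n_head_log2) + 1;
        return std::pow(base, (float) exp);
    }

    int main() {
        // Slopes decay geometrically across heads, e.g. 0.5, 0.25, 0.125, ... for 8 heads and max_bias = 8.
        for (uint32_t h = 0; h < 8; ++h) {
            std::printf("head %u -> slope %g\n", h, alibi_slope(h, 8, 8.0f));
        }
        return 0;
    }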
@@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){

  size_t total_elements = ggml_nelements(src);

- const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT;
+ const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
  float *src_data =NULL;
  if(src_on_device) {
  ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
  int ixj = col ^ j;
  if (ixj > col) {
  if ((col & k) == 0) {
- if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
  swap(dst_row[col], dst_row[ixj]);
  }
  } else {
- if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
  swap(dst_row[col], dst_row[ixj]);
  }
  }
@@ -8126,23 +8126,51 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
  dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
  }

- static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale,
- const sycl::nd_item<3> &item_ct1, float *buf) {
+
+ template <bool vals_smem, int ncols_template, int block_size_template>
+ static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+ const int nrows_y, const float scale, const float max_bias, const float m0,
+ const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
+ const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
+
  const int tid = item_ct1.get_local_id(2);
  const int rowx = item_ct1.get_group(2);
  const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension

- const int block_size = item_ct1.get_local_range(2);
+ const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template;

  const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
  const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;

+ float slope = 0.0f;
+
+ // ALiBi
+ if (max_bias > 0.0f) {
+ const uint32_t h = rowx/nrows_y; // head index
+
+ const float base = h < n_head_log2 ? m0 : m1;
+ const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+ slope = sycl::pow(base, float(exp));
+ }
+
+ float * vals = vals_smem ? buf + WARP_SIZE : dst + rowx*ncols;
  float max_val = -INFINITY;

- for (int col = tid; col < ncols; col += block_size) {
+ for (int col0 = 0; col0 < ncols; col0 += block_size) {
+ const int col = col0 + tid;
+
+ if (ncols_template == 0 && col >= ncols) {
+ break;
+ }
+
  const int ix = rowx*ncols + col;
  const int iy = rowy*ncols + col;
- max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f));
+
+ const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
+
+ vals[col] = val;
+ max_val = sycl::max(max_val, val);
  }

  // find the max value in the block
@@ -8151,30 +8179,12 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
  if (warp_id == 0) {
  buf[lane_id] = -INFINITY;
  }
- /*
- DPCT1118:12: SYCL group functions and algorithms must be encountered in
- converged control flow. You may need to adjust the code.
- */
- /*
- DPCT1065:60: Consider replacing sycl::nd_item::barrier() with
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
- better performance if there is no access to global memory.
- */
- item_ct1.barrier();
+ item_ct1.barrier(sycl::access::fence_space::local_space);

  if (lane_id == 0) {
  buf[warp_id] = max_val;
  }
- /*
- DPCT1118:13: SYCL group functions and algorithms must be encountered in
- converged control flow. You may need to adjust the code.
- */
- /*
- DPCT1065:61: Consider replacing sycl::nd_item::barrier() with
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
- better performance if there is no access to global memory.
- */
- item_ct1.barrier();
+ item_ct1.barrier(sycl::access::fence_space::local_space);

  max_val = buf[lane_id];
  max_val = warp_reduce_max(max_val, item_ct1);
@@ -8182,13 +8192,16 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in

  float tmp = 0.f;

- for (int col = tid; col < ncols; col += block_size) {
- const int ix = rowx*ncols + col;
- const int iy = rowy*ncols + col;
- const float val =
- sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val);
+ #pragma unroll
+ for (int col0 = 0; col0 < ncols; col0 += block_size) {
+ const int col = col0 + tid;
+ if (ncols_template == 0 && col >= ncols) {
+ break;
+ }
+
+ const float val = sycl::native::exp(vals[col] - max_val);
  tmp += val;
- dst[ix] = val;
+ vals[col] = val;
  }

  // find the sum of exps in the block
@@ -8197,40 +8210,29 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
  if (warp_id == 0) {
  buf[lane_id] = 0.f;
  }
- /*
- DPCT1118:14: SYCL group functions and algorithms must be encountered in
- converged control flow. You may need to adjust the code.
- */
- /*
- DPCT1065:62: Consider replacing sycl::nd_item::barrier() with
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
- better performance if there is no access to global memory.
- */
- item_ct1.barrier();
+ item_ct1.barrier(sycl::access::fence_space::local_space);

  if (lane_id == 0) {
  buf[warp_id] = tmp;
  }
- /*
- DPCT1118:15: SYCL group functions and algorithms must be encountered in
- converged control flow. You may need to adjust the code.
- */
- /*
- DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
- better performance if there is no access to global memory.
- */
- item_ct1.barrier();
+ item_ct1.barrier(sycl::access::fence_space::local_space);

  tmp = buf[lane_id];
  tmp = warp_reduce_sum(tmp, item_ct1);
  }

- const float inv_tmp = 1.f / tmp;
+ const float inv_sum = 1.f / tmp;

- for (int col = tid; col < ncols; col += block_size) {
- const int i = rowx*ncols + col;
- dst[i] *= inv_tmp;
+ #pragma unroll
+ for (int col0 = 0; col0 < ncols; col0 += block_size) {
+ const int col = col0 + tid;
+
+ if (ncols_template == 0 && col >= ncols) {
+ return;
+ }
+
+ const int idst = rowx*ncols + col;
+ dst[idst] = vals[col] * inv_sum;
  }
  }

@@ -10825,7 +10827,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,

  const sycl::range<3> block_dims(1, 1, ncols);
  const sycl::range<3> block_nums(1, nrows, 1);
- if (order == GGML_SORT_ASC) {
+ if (order == GGML_SORT_ORDER_ASC) {
  /*
  DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
  the limit. To get the device limit, query
@@ -10834,9 +10836,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
  stream->parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
  [=](sycl::nd_item<3> item_ct1) {
- k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);
+ k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
  });
- } else if (order == GGML_SORT_DESC) {
+ } else if (order == GGML_SORT_ORDER_DESC) {
  /*
  DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
  the limit. To get the device limit, query
@@ -10845,7 +10847,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
  stream->parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
  [=](sycl::nd_item<3> item_ct1) {
- k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);
+ k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
  });
  } else {
  GGML_ASSERT(false);
@@ -10867,37 +10869,98 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
  });
  }

- static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
- const int ncols_x, const int nrows_x,
- const int nrows_y, const float scale,
- dpct::queue_ptr stream) {
- int nth = WARP_SIZE;
- while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
- const sycl::range<3> block_dims(1, 1, nth);
- const sycl::range<3> block_nums(1, 1, nrows_x);
- /*
- DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the
- limit. To get the device limit, query info::device::max_work_group_size.
- Adjust the work-group size if needed.
- */
+ template <bool vals_smem, int ncols_template, int block_size_template>
+ static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+ const int nrows_y, const float scale, const float max_bias, const float m0,
+ const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
+ const size_t n_local_scratch, dpct::queue_ptr stream) {
  stream->submit([&](sycl::handler &cgh) {
- /*
- DPCT1101:96: 'SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
- replaced with a value. Modify the code to use the original expression,
- provided in comments, if it is correct.
- */
- sycl::local_accessor<float, 1> buf_acc_ct1(
- sycl::range<1>(32 /*SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
+ sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
  [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
- soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1,
- buf_acc_ct1.get_pointer());
+ soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
+ nrows_y, scale, max_bias, m0,
+ m1, n_head_log2, item_ct1,
+ local_buf_acc.get_pointer());
  });
  });
  }

+ static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
+ float * dst, const int ncols_x, const int nrows_x,
+ const int nrows_y, const float scale, const float max_bias,
+ dpct::queue_ptr stream) {
+ int nth = WARP_SIZE;
+ while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+ const sycl::range<3> block_dims(1, 1, nth);
+ const sycl::range<3> block_nums(1, 1, nrows_x);
+ const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
+ static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
+
+ const uint32_t n_head_kv = nrows_x/nrows_y;
+ const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+ const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
+ if (n_local_scratch*sizeof(float) < local_mem_size) {
+ switch (ncols_x) {
+ case 32:
+ soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ max_bias, m0, m1, n_head_log2, block_nums,
+ block_dims, n_local_scratch, stream);
+ break;
+ case 64:
+ soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ max_bias, m0, m1, n_head_log2, block_nums,
+ block_dims, n_local_scratch, stream);
+ break;
+ case 128:
+ soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ max_bias, m0, m1, n_head_log2, block_nums,
+ block_dims, n_local_scratch, stream);
+ break;
+ case 256:
+ soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ max_bias, m0, m1, n_head_log2, block_nums,
+ block_dims, n_local_scratch, stream);
+ break;
+ case 512:
+ soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ max_bias, m0, m1, n_head_log2, block_nums,
+ block_dims, n_local_scratch, stream);
+ break;
+ case 1024:
+ soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ max_bias, m0, m1, n_head_log2, block_nums,
+ block_dims, n_local_scratch, stream);
+ break;
+ case 2048:
+ soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ max_bias, m0, m1, n_head_log2, block_nums,
+ block_dims, n_local_scratch, stream);
+ break;
+ case 4096:
+ soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ max_bias, m0, m1, n_head_log2, block_nums,
+ block_dims, n_local_scratch, stream);
+ break;
+ default:
+ soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ max_bias, m0, m1, n_head_log2, block_nums,
+ block_dims, n_local_scratch, stream);
+ break;
+ }
+ } else {
+ soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ max_bias, m0, m1, n_head_log2, block_nums,
+ block_dims, WARP_SIZE, stream);
+ }
+ }
+
  template <typename T>
  static void im2col_sycl(const float *x, T *dst, int IW, int IH,
  int OW, int OH, int KW, int KH, int IC,
@@ -11407,12 +11470,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,

  dpct::memcpy_direction kind;
  char * src_ptr;
- if (src->backend == GGML_BACKEND_CPU) {
+ if (src->backend == GGML_BACKEND_TYPE_CPU) {
  kind = dpct::host_to_device;
  src_ptr = (char *) src->data;
- // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
- } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
- GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
+ // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
+ } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
+ GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
  kind = dpct::device_to_device;
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
  int id;
@@ -11846,7 +11909,7 @@ inline void ggml_sycl_op_mul_mat_q(

  // the main device has a larger memory buffer to hold the results from all GPUs
  // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;

  switch (src0->type) {
  case GGML_TYPE_Q4_0:
@@ -12119,7 +12182,7 @@ inline void ggml_sycl_op_mul_mat_sycl(

  // the main device has a larger memory buffer to hold the results from all GPUs
  // ldc == nrows of the matrix that cuBLAS writes into
- int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
+ int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;

  #ifdef GGML_SYCL_F16
  bool use_fp16 = true; // TODO(Yu) SYCL capability check
@@ -12435,14 +12498,35 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,

  const int64_t ne00 = src0->ne[0];
  const int64_t nrows_x = ggml_nrows(src0);
- const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
+ const int64_t nrows_y = src0->ne[1];

  float scale = 1.0f;
- memcpy(&scale, dst->op_params, sizeof(float));
+ float max_bias = 0.0f;

- soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
+ memcpy(&scale, dst->op_params + 0, sizeof(float));
+ memcpy(&max_bias, dst->op_params + 1, sizeof(float));

- (void) dst;
+ // positions tensor
+ float * src2_dd = nullptr;
+ sycl_pool_alloc<float> src2_f;
+
+ ggml_tensor * src2 = dst->src[2];
+ const bool use_src2 = src2 != nullptr;
+
+ if (use_src2) {
+ const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
+
+ if (src2_on_device) {
+ ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
+ src2_dd = (float *) src2_extra->data_device[g_main_device];
+ } else {
+ src2_dd = src2_f.alloc(ggml_nelements(src2));
+ SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
+ }
+ }
+
+ soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
+ nrows_x, nrows_y, scale, max_bias, main_stream);
  }

  inline void ggml_sycl_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
@@ -12501,16 +12585,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
  const bool use_src1 = src1 != nullptr;
  const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;

- GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
- GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+ GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);

  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
  ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

- const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
- const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
+ const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;

  // dd = data device
  float * src0_ddf = nullptr;
@@ -12565,7 +12649,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
  main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
  }

- if (dst->backend == GGML_BACKEND_CPU) {
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
  SYCL_CHECK(CHECK_TRY_ERROR(
  dpct::get_current_device().queues_wait_and_throw()));
  }
@@ -12640,8 +12724,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];

- GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
- GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
+ GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+ GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));

  GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);

@@ -12656,13 +12741,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
  ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

- const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+ const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
  const bool src1_is_contiguous = ggml_is_contiguous(src1);

  int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);

- const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+ const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
  GGML_ASSERT(!(split && ne02 > 1));
  GGML_ASSERT(!(split && ne03 > 1));
  GGML_ASSERT(!(split && ne02 < ne12));
@@ -12717,8 +12802,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,

  used_devices++;

- const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
+ const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;

  ggml_sycl_set_device(get_device_id_by_index(id));
  const dpct::queue_ptr stream = g_syclStreams[id][0];
@@ -12782,8 +12867,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
  continue;
  }

- const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
+ const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
  const int64_t row_diff = row_high[id] - row_low[id];

  ggml_sycl_set_device(get_device_id_by_index(id));
@@ -12809,12 +12894,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,

  // the main device memory buffer can be on VRAM scratch, with space for all partial results
  // in that case an offset on dst_ddf_i is needed
- if (dst->backend == GGML_BACKEND_GPU && id == g_main_device_index) {
+ if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) {
  dst_dd_i += row_low[id]; // offset is 0 if no tensor split
  }

  // copy src0, src1 to device if necessary
- if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
+ if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
  if (id != g_main_device_index) {
  if (convert_src1_to_q8_1) {
  char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;
@@ -12830,14 +12915,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
  src1_ncols * ne10 * sizeof(float))));
  }
  }
- } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
+ } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
  SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
  src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
  } else {
  GGML_ASSERT(false);
  }

- if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
+ if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
  quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
  /*
  DPCT1010:92: SYCL uses exceptions to report errors and does
@@ -12867,10 +12952,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
  if (!dst_on_device) {
  void * dst_off_device;
  dpct::memcpy_direction kind;
- if (dst->backend == GGML_BACKEND_CPU) {
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
  dst_off_device = dst->data;
  kind = dpct::device_to_host;
- } else if (dst->backend == GGML_BACKEND_GPU) {
+ } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
  dst_off_device = dst_extra->data_device[g_main_device_index];
  kind = dpct::device_to_device;
  } else {
@@ -12954,7 +13039,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
  }
  }

- if (dst->backend == GGML_BACKEND_CPU) {
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
  SYCL_CHECK(CHECK_TRY_ERROR(
  dpct::get_current_device().queues_wait_and_throw()));
@@ -13091,7 +13176,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
  const ggml_tensor *src1,
  ggml_tensor *dst) try {
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -13129,7 +13214,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
  GGML_ASSERT(!ggml_is_transposed(src0));
  GGML_ASSERT(!ggml_is_transposed(src1));
  GGML_ASSERT(!ggml_is_permuted(src0));
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -13185,31 +13270,23 @@ static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
  int64_t i03 = i13 / r3;
  int64_t i02 = i12 / r2;

- ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
- ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
- ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
  }

- static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
- const ggml_tensor *src1,
- ggml_tensor *dst) try {
+ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
+ const ggml_tensor *src1,
+ ggml_tensor *dst) try {
  GGML_ASSERT(!ggml_is_transposed(src0));
  GGML_ASSERT(!ggml_is_transposed(src1));

- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
- GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);

- GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
-
- GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
-
- GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
+ GGML_TENSOR_BINARY_OP_LOCALS

- const int64_t ne1 = ggml_nelements(src1);
- const int64_t ne = ggml_nelements(dst);
+ const int64_t ne_dst = ggml_nelements(dst);

  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
  dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
@@ -13228,11 +13305,16 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];

  // convert src1 to fp16
- const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
- GGML_ASSERT(to_fp16_sycl != nullptr);
-
- sycl_pool_alloc<sycl::half> src1_as_f16(ne1);
- to_fp16_sycl(src1_ddf, src1_as_f16.get(), ne1, main_stream);
+ sycl_pool_alloc<sycl::half> src1_f16_alloc;
+ if (src1->type != GGML_TYPE_F16) {
+ const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
+ const int64_t ne_src1 = ggml_nelements(src1);
+ src1_f16_alloc.alloc(ne_src1);
+ GGML_ASSERT(to_fp16_sycl != nullptr);
+ to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+ }
+ sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf
+ : src1_f16_alloc.get();

  sycl_pool_alloc<sycl::half> dst_f16;
  char * dst_t;
@@ -13253,20 +13335,12 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
  const void * alpha = &alpha_f16;
  const void * beta = &beta_f16;

- if (dst->op_params[0] == GGML_PREC_DEFAULT) {
- dst_t = (char *) dst_f16.alloc(ne);
-
- nbd2 /= sizeof(float) / sizeof(sycl::half);
- nbd3 /= sizeof(float) / sizeof(sycl::half);
- } else {
- dst_t = (char *) dst_ddf;
-
- cu_compute_type = dpct::library_data_t::real_float;
- cu_data_type = dpct::library_data_t::real_float;
+ // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
+ // once oneMKL open source supports half, half, float, float: datatypes
+ dst_t = (char *) dst_f16.alloc(ne_dst);

- alpha = &alpha_f32;
- beta = &beta_f32;
- }
+ nbd2 /= sizeof(float) / sizeof(sycl::half);
+ nbd3 /= sizeof(float) / sizeof(sycl::half);

  GGML_ASSERT(ne12 % ne02 == 0);
  GGML_ASSERT(ne13 % ne03 == 0);
@@ -13302,10 +13376,10 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
  *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
  oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
  (const char *)src0_as_f16, dpct::library_data_t::real_half,
- nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half),
- (const char *)src1_as_f16.get(), dpct::library_data_t::real_half,
- nb11 / sizeof(float), src1->nb[2] / sizeof(float), beta,
- (char *)dst_t, cu_data_type, ne01, dst->nb[2] / sizeof(float),
+ nb01 / nb00, nb02 / nb00,
+ (const char *)src1_f16, dpct::library_data_t::real_half,
+ nb11 / nb10, nb12 / nb10, beta,
+ (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
  ne12 * ne13, cu_compute_type)));
  } else {
  // use syclGemmBatchedEx
@@ -13325,44 +13399,35 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
  {sycl::aspect::fp16});

  main_stream->submit([&](sycl::handler &cgh) {
- const sycl::half *src1_as_f16_get_ct1 = src1_as_f16.get();
- const void **ptrs_src_get_ct3 = ptrs_src.get();
- void **ptrs_dst_get_ct4 = ptrs_dst.get();
-
+ const void **ptrs_src_get = ptrs_src.get();
+ void **ptrs_dst_get = ptrs_dst.get();
+ size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2;
+ size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2;
  cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
  [=](sycl::nd_item<3> item_ct1) {
  k_compute_batched_ptrs(
- src0_as_f16, src1_as_f16_get_ct1,
- dst_t, ptrs_src_get_ct3,
- ptrs_dst_get_ct4, ne12, ne13, ne23,
- nb02, nb03, nb12, nb13, nbd2, nbd3, r2,
- r3, item_ct1);
+ src0_as_f16, src1_f16,
+ dst_t, ptrs_src_get,
+ ptrs_dst_get, ne12, ne13, ne23,
+ nb02, nb03, nb12_scaled, nb13_scaled,
+ nbd2, nbd3, r2, r3, item_ct1);
  });
  });
  }
- /*
- DPCT1010:95: SYCL uses exceptions to report errors and does not use the
- error codes. The call was replaced with 0. You need to rewrite this
- code.
- */
- SYCL_CHECK(0);
-
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
  *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
  oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
  (const void **)(ptrs_src.get() + 0 * ne23),
- dpct::library_data_t::real_half, nb01 / sizeof(sycl::half),
+ dpct::library_data_t::real_half, nb01 / nb00,
  (const void **)(ptrs_src.get() + 1 * ne23),
- dpct::library_data_t::real_half, nb11 / sizeof(float), beta,
+ dpct::library_data_t::real_half, nb11 / nb10, beta,
  (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
  cu_compute_type)));
  }
  #endif

- if (dst->op_params[0] == GGML_PREC_DEFAULT) {
- const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
- to_fp32_sycl(dst_f16.get(), dst_ddf, ne, main_stream);
- }
+ const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+ to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -13372,11 +13437,11 @@ catch (sycl::exception const &exc) {

  static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  const bool all_on_device =
- (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
- (src1->backend == GGML_BACKEND_GPU) &&
- ( dst->backend == GGML_BACKEND_GPU);
+ (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
+ (src1->backend == GGML_BACKEND_TYPE_GPU) &&
+ ( dst->backend == GGML_BACKEND_TYPE_GPU);

- const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+ const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;

  int64_t min_compute_capability = INT_MAX;
  for (int64_t id = 0; id < g_device_count; ++id) {
@@ -13407,10 +13472,10 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  // KQV single-batch
  // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
  ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
- } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+ } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
  // KQ + KQV multi-batch
- // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_mat_batched_sycl\n");
- ggml_sycl_mul_mat_mat_batched_sycl(src0, src1, dst);
+ // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
+ ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
  } else if (src0->type == GGML_TYPE_F32) {
  // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
  ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
@@ -13505,7 +13570,7 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
  GGML_ASSERT(!ggml_is_transposed(src00));
  GGML_ASSERT(!ggml_is_transposed(src1));

- GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
+ GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);

  GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
@@ -13643,7 +13708,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,

  const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];

- if (ids->backend == GGML_BACKEND_GPU) {
+ if (ids->backend == GGML_BACKEND_TYPE_GPU) {
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
  SYCL_CHECK(CHECK_TRY_ERROR(
  stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
@@ -13661,20 +13726,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
  ggml_tensor src1_row = *src1;
  ggml_tensor dst_row = *dst;

- src1_row.backend = GGML_BACKEND_GPU;
- dst_row.backend = GGML_BACKEND_GPU;
+ src1_row.backend = GGML_BACKEND_TYPE_GPU;
+ dst_row.backend = GGML_BACKEND_TYPE_GPU;

  src1_row.extra = &src1_row_extra;
  dst_row.extra = &dst_row_extra;

- char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+ char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
  (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
- char * dst_original = dst->backend == GGML_BACKEND_CPU ?
+ char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
  (char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];

  if (src1->ne[1] == 1) {
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
- GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
+ GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);

  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
  //int32_t row_id;
@@ -13756,7 +13821,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
  }
  }

- if (dst->backend == GGML_BACKEND_CPU) {
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
  SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
  }
  }
@@ -13779,8 +13844,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
  const int64_t ne = ggml_nelements(src0);
  GGML_ASSERT(ne == ggml_nelements(src1));

- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);

  GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
  GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
@@ -13887,17 +13952,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
  memset(extra, 0, sizeof(*extra));

  for (int64_t id = 0; id < g_device_count; ++id) {
- if (backend == GGML_BACKEND_GPU && id != g_main_device_index) {
+ if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) {
  continue;
  }
  ggml_sycl_set_device(get_device_id_by_index(id));
  const dpct::queue_ptr stream = g_syclStreams[id][0];

  int64_t row_low, row_high;
- if (backend == GGML_BACKEND_GPU) {
+ if (backend == GGML_BACKEND_TYPE_GPU) {
  row_low = 0;
  row_high = nrows;
- } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+ } else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
  const int64_t rounding = get_row_rounding(tensor->type);

  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
@@ -13946,7 +14011,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {

  extra->data_device[id] = buf;

- if (backend == GGML_BACKEND_GPU_SPLIT) {
+ if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
  for (int64_t is = 0; is < MAX_STREAMS; ++is) {
  SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
  new sycl::event()));
@@ -13963,7 +14028,7 @@ catch (sycl::exception const &exc) {
  }

  void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
- if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+ if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) {
  return;
  }

@@ -14016,15 +14081,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
  return;
  }

- tensor->backend = GGML_BACKEND_GPU;
+ tensor->backend = GGML_BACKEND_TYPE_GPU;

- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) {
  const ggml_op src0_op = tensor->src[0]->op;
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
  ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
  }
  }
- if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+ if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) {
  ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
  }

@@ -14042,7 +14107,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
  const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];

- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
  size_t offset = 0;
@@ -14111,7 +14176,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,

  const bool inplace = tensor->view_src != nullptr;

- if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
+ if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
  size_t view_offset = 0;
@@ -14132,7 +14197,7 @@ catch (sycl::exception const &exc) {
  }

  void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
  GGML_ASSERT(ggml_is_contiguous(tensor));

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -14219,9 +14284,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
  if (!g_sycl_loaded) return false;

  ggml_sycl_func_t func;
- const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+ const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);

  if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
  return false;
@@ -14359,14 +14424,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
  return false;
  }

- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
  ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
  }

  if (params->ith != 0) {
  return true;
  }
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
  return true;
  }
  func(tensor->src[0], tensor->src[1], tensor);
@@ -14517,7 +14582,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,

  extra->data_device[ctx->device] = tensor->data;

- tensor->backend = GGML_BACKEND_GPU;
+ tensor->backend = GGML_BACKEND_TYPE_GPU;
  tensor->extra = extra;

  if (ggml_is_quantized(tensor->type)) {
@@ -14548,7 +14613,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
  ggml_tensor *tensor,
  const void *data, size_t offset,
  size_t size) try {
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;

@@ -14573,7 +14638,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
  const ggml_tensor *tensor,
  void *data, size_t offset,
  size_t size) try {
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;

@@ -14809,7 +14874,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

  SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
  (char *)tensor->data + offset, data, size)));
@@ -14827,7 +14892,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

  SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
  data, (const char *)tensor->data + offset, size)));
@@ -14880,7 +14945,7 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
  ggml_sycl_set_main_device(sycl_ctx->device);

  ggml_compute_params params = {};
- params.type = GGML_TASK_COMPUTE;
+ params.type = GGML_TASK_TYPE_COMPUTE;
  params.ith = 0;
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];
@@ -14888,13 +14953,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
  if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
  continue;

- assert(node->backend == GGML_BACKEND_GPU);
+ assert(node->backend == GGML_BACKEND_TYPE_GPU);
  assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
  assert(node->extra != nullptr);

  for (int j = 0; j < GGML_MAX_SRC; j++) {
  if (node->src[j] != nullptr) {
- assert(node->src[j]->backend == GGML_BACKEND_GPU);
+ assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU);
  assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
  assert(node->src[j]->extra != nullptr);
  }
@@ -15078,6 +15143,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
  /* .supports_op = */ ggml_backend_sycl_supports_op,
  };

+ static ggml_guid_t ggml_backend_sycl_guid() {
+ static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 };
+ return &guid;
+ }
+
  ggml_backend_t ggml_backend_sycl_init(int device) {
  ggml_init_sycl(); // TODO: remove from ggml.c

@@ -15095,6 +15165,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
  };

  ggml_backend_t sycl_backend = new ggml_backend {
+ /* .guid = */ ggml_backend_sycl_guid(),
  /* .interface = */ ggml_backend_sycl_interface,
  /* .context = */ ctx
  };
@@ -15103,7 +15174,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
  }

  bool ggml_backend_is_sycl(ggml_backend_t backend) {
- return backend->iface.get_name == ggml_backend_sycl_name;
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
  }

  static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {