llama_cpp 0.12.7 → 0.13.0

This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
@@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){
3338
3338
 
3339
3339
  size_t total_elements = ggml_nelements(src);
3340
3340
 
3341
- const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT;
3341
+ const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
3342
3342
  float *src_data =NULL;
3343
3343
  if(src_on_device) {
3344
3344
  ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
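
Note: this hunk, like many of the mechanical hunks further down, only tracks the upstream ggml rename of the backend, sort-order and task enumerators to their namespaced forms (GGML_BACKEND_GPU → GGML_BACKEND_TYPE_GPU, GGML_SORT_ASC → GGML_SORT_ORDER_ASC, GGML_TASK_COMPUTE → GGML_TASK_TYPE_COMPUTE, and so on); the logic is unchanged. A self-contained C++ sketch of the predicate above under the new names (the enum is declared locally and its values are for illustration only, this is not ggml's header):

    #include <cstdio>

    // Local stand-in for ggml's renamed backend enum (illustrative values only).
    enum ggml_backend_type {
        GGML_BACKEND_TYPE_CPU       = 0,
        GGML_BACKEND_TYPE_GPU       = 10,
        GGML_BACKEND_TYPE_GPU_SPLIT = 20,
    };

    // Same predicate as src_on_device in the hunk above, written against the new names.
    static bool tensor_on_device(ggml_backend_type backend) {
        return backend == GGML_BACKEND_TYPE_GPU || backend == GGML_BACKEND_TYPE_GPU_SPLIT;
    }

    int main() {
        std::printf("%d\n", tensor_on_device(GGML_BACKEND_TYPE_GPU)); // 1
        std::printf("%d\n", tensor_on_device(GGML_BACKEND_TYPE_CPU)); // 0
        return 0;
    }
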
@@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
8086
8086
  int ixj = col ^ j;
8087
8087
  if (ixj > col) {
8088
8088
  if ((col & k) == 0) {
8089
- if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
8089
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
8090
8090
  swap(dst_row[col], dst_row[ixj]);
8091
8091
  }
8092
8092
  } else {
8093
- if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
8093
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
8094
8094
  swap(dst_row[col], dst_row[ixj]);
8095
8095
  }
8096
8096
  }
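
Aside from the GGML_SORT_ASC → GGML_SORT_ORDER_ASC rename, the compare-exchange logic of the bitonic argsort is untouched. For reference, a serial CPU sketch of the same computation (a hypothetical helper, assuming ncols is a power of two as the kernel does; within one (k, j) step the (col, col ^ j) pairs are disjoint, so the serial loop produces the same result as the parallel kernel):

    #include <cstdio>
    #include <numeric>
    #include <utility>

    enum ggml_sort_order { GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC };

    // Serial reference of k_argsort_f32_i32: dst receives the indices of x in sorted order.
    static void argsort_f32_i32_ref(const float * x, int * dst, int ncols, ggml_sort_order order) {
        std::iota(dst, dst + ncols, 0);
        for (int k = 2; k <= ncols; k *= 2) {
            for (int j = k/2; j > 0; j /= 2) {
                for (int col = 0; col < ncols; ++col) {
                    const int ixj = col ^ j;
                    if (ixj <= col) continue;
                    // (col & k) selects the direction of this bitonic subsequence,
                    // `order` flips it globally -- same test as the kernel above.
                    if ((col & k) == 0) {
                        if (order == GGML_SORT_ORDER_ASC ? x[dst[col]] > x[dst[ixj]] : x[dst[col]] < x[dst[ixj]]) {
                            std::swap(dst[col], dst[ixj]);
                        }
                    } else {
                        if (order == GGML_SORT_ORDER_ASC ? x[dst[col]] < x[dst[ixj]] : x[dst[col]] > x[dst[ixj]]) {
                            std::swap(dst[col], dst[ixj]);
                        }
                    }
                }
            }
        }
    }

    int main() {
        const float x[4] = {0.3f, -1.0f, 2.5f, 0.0f};
        int idx[4];
        argsort_f32_i32_ref(x, idx, 4, GGML_SORT_ORDER_ASC);
        for (int i : idx) std::printf("%d ", i); // 1 3 0 2
        std::printf("\n");
        return 0;
    }
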
@@ -8126,23 +8126,51 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
8126
8126
  dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
8127
8127
  }
8128
8128
 
8129
- static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale,
8130
- const sycl::nd_item<3> &item_ct1, float *buf) {
8129
+
8130
+ template <bool vals_smem, int ncols_template, int block_size_template>
8131
+ static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
8132
+ const int nrows_y, const float scale, const float max_bias, const float m0,
8133
+ const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
8134
+ const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
8135
+
8131
8136
  const int tid = item_ct1.get_local_id(2);
8132
8137
  const int rowx = item_ct1.get_group(2);
8133
8138
  const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
8134
8139
 
8135
- const int block_size = item_ct1.get_local_range(2);
8140
+ const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template;
8136
8141
 
8137
8142
  const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
8138
8143
  const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
8139
8144
 
8145
+ float slope = 0.0f;
8146
+
8147
+ // ALiBi
8148
+ if (max_bias > 0.0f) {
8149
+ const uint32_t h = rowx/nrows_y; // head index
8150
+
8151
+ const float base = h < n_head_log2 ? m0 : m1;
8152
+ const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
8153
+
8154
+ slope = sycl::pow(base, float(exp));
8155
+ }
8156
+
8157
+ float * vals = vals_smem ? buf + WARP_SIZE : dst + rowx*ncols;
8140
8158
  float max_val = -INFINITY;
8141
8159
 
8142
- for (int col = tid; col < ncols; col += block_size) {
8160
+ for (int col0 = 0; col0 < ncols; col0 += block_size) {
8161
+ const int col = col0 + tid;
8162
+
8163
+ if (ncols_template == 0 && col >= ncols) {
8164
+ break;
8165
+ }
8166
+
8143
8167
  const int ix = rowx*ncols + col;
8144
8168
  const int iy = rowy*ncols + col;
8145
- max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f));
8169
+
8170
+ const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
8171
+
8172
+ vals[col] = val;
8173
+ max_val = sycl::max(max_val, val);
8146
8174
  }
8147
8175
 
8148
8176
  // find the max value in the block
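
The reworked kernel now folds an optional ALiBi positional bias into the soft-max input: when max_bias > 0, slope = base^exponent per head, with m0, m1 and n_head_log2 precomputed on the host (see soft_max_f32_sycl below) and the head index derived as rowx / nrows_y. A small CPU sketch of the per-head slope computation (a hypothetical helper mirroring those formulas):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Per-head ALiBi slopes: base is m0 for the first n_head_log2 heads, m1 after,
    // and the exponent grows as in the kernel above ("exp" there, "exponent" here).
    static std::vector<float> alibi_slopes(uint32_t n_head, float max_bias) {
        const uint32_t n_head_log2 = 1u << (uint32_t) std::floor(std::log2((float) n_head));
        const float m0 = std::pow(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);

        std::vector<float> slopes(n_head);
        for (uint32_t h = 0; h < n_head; ++h) {
            const float base     = h < n_head_log2 ? m0 : m1;
            const int   exponent = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
            slopes[h] = std::pow(base, (float) exponent); // matches sycl::pow(base, float(exp))
        }
        return slopes;
    }

    int main() {
        // For 8 heads and max_bias = 8 this reproduces the classic ALiBi slopes 1/2 .. 1/256.
        for (float s : alibi_slopes(8, 8.0f)) std::printf("%g ", s);
        std::printf("\n");
        return 0;
    }
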
@@ -8151,30 +8179,12 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
8151
8179
  if (warp_id == 0) {
8152
8180
  buf[lane_id] = -INFINITY;
8153
8181
  }
8154
- /*
8155
- DPCT1118:12: SYCL group functions and algorithms must be encountered in
8156
- converged control flow. You may need to adjust the code.
8157
- */
8158
- /*
8159
- DPCT1065:60: Consider replacing sycl::nd_item::barrier() with
8160
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
8161
- better performance if there is no access to global memory.
8162
- */
8163
- item_ct1.barrier();
8182
+ item_ct1.barrier(sycl::access::fence_space::local_space);
8164
8183
 
8165
8184
  if (lane_id == 0) {
8166
8185
  buf[warp_id] = max_val;
8167
8186
  }
8168
- /*
8169
- DPCT1118:13: SYCL group functions and algorithms must be encountered in
8170
- converged control flow. You may need to adjust the code.
8171
- */
8172
- /*
8173
- DPCT1065:61: Consider replacing sycl::nd_item::barrier() with
8174
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
8175
- better performance if there is no access to global memory.
8176
- */
8177
- item_ct1.barrier();
8187
+ item_ct1.barrier(sycl::access::fence_space::local_space);
8178
8188
 
8179
8189
  max_val = buf[lane_id];
8180
8190
  max_val = warp_reduce_max(max_val, item_ct1);
@@ -8182,13 +8192,16 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
8182
8192
 
8183
8193
  float tmp = 0.f;
8184
8194
 
8185
- for (int col = tid; col < ncols; col += block_size) {
8186
- const int ix = rowx*ncols + col;
8187
- const int iy = rowy*ncols + col;
8188
- const float val =
8189
- sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val);
8195
+ #pragma unroll
8196
+ for (int col0 = 0; col0 < ncols; col0 += block_size) {
8197
+ const int col = col0 + tid;
8198
+ if (ncols_template == 0 && col >= ncols) {
8199
+ break;
8200
+ }
8201
+
8202
+ const float val = sycl::native::exp(vals[col] - max_val);
8190
8203
  tmp += val;
8191
- dst[ix] = val;
8204
+ vals[col] = val;
8192
8205
  }
8193
8206
 
8194
8207
  // find the sum of exps in the block
@@ -8197,40 +8210,29 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
8197
8210
  if (warp_id == 0) {
8198
8211
  buf[lane_id] = 0.f;
8199
8212
  }
8200
- /*
8201
- DPCT1118:14: SYCL group functions and algorithms must be encountered in
8202
- converged control flow. You may need to adjust the code.
8203
- */
8204
- /*
8205
- DPCT1065:62: Consider replacing sycl::nd_item::barrier() with
8206
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
8207
- better performance if there is no access to global memory.
8208
- */
8209
- item_ct1.barrier();
8213
+ item_ct1.barrier(sycl::access::fence_space::local_space);
8210
8214
 
8211
8215
  if (lane_id == 0) {
8212
8216
  buf[warp_id] = tmp;
8213
8217
  }
8214
- /*
8215
- DPCT1118:15: SYCL group functions and algorithms must be encountered in
8216
- converged control flow. You may need to adjust the code.
8217
- */
8218
- /*
8219
- DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
8220
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
8221
- better performance if there is no access to global memory.
8222
- */
8223
- item_ct1.barrier();
8218
+ item_ct1.barrier(sycl::access::fence_space::local_space);
8224
8219
 
8225
8220
  tmp = buf[lane_id];
8226
8221
  tmp = warp_reduce_sum(tmp, item_ct1);
8227
8222
  }
8228
8223
 
8229
- const float inv_tmp = 1.f / tmp;
8224
+ const float inv_sum = 1.f / tmp;
8230
8225
 
8231
- for (int col = tid; col < ncols; col += block_size) {
8232
- const int i = rowx*ncols + col;
8233
- dst[i] *= inv_tmp;
8226
+ #pragma unroll
8227
+ for (int col0 = 0; col0 < ncols; col0 += block_size) {
8228
+ const int col = col0 + tid;
8229
+
8230
+ if (ncols_template == 0 && col >= ncols) {
8231
+ return;
8232
+ }
8233
+
8234
+ const int idst = rowx*ncols + col;
8235
+ dst[idst] = vals[col] * inv_sum;
8234
8236
  }
8235
8237
  }
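
Taken together, the rewritten kernel fuses the scale, the optional mask and the optional ALiBi bias into a single read of x, caches the intermediate values (in local memory when they fit, otherwise directly in dst), and then performs the usual max / exp-sum / normalize reduction. A minimal scalar sketch of what it computes per row, ignoring the parallel reduction details:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Scalar per-row reference: softmax(scale*x + mask + slope*pos), mask and pos optional.
    static void soft_max_row_ref(const float * x, const float * mask, const float * pos,
                                 float * dst, int ncols, float scale, float slope) {
        float max_val = -INFINITY;
        std::vector<float> vals(ncols);
        for (int i = 0; i < ncols; ++i) {
            vals[i] = x[i]*scale + (mask ? mask[i] : 0.0f) + (pos ? slope*pos[i] : 0.0f);
            max_val = std::max(max_val, vals[i]);
        }
        float sum = 0.0f;
        for (int i = 0; i < ncols; ++i) {
            vals[i] = std::exp(vals[i] - max_val);
            sum += vals[i];
        }
        const float inv_sum = 1.0f / sum;
        for (int i = 0; i < ncols; ++i) {
            dst[i] = vals[i] * inv_sum;
        }
    }

    int main() {
        const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float out[4];
        soft_max_row_ref(x, nullptr, nullptr, out, 4, 1.0f, 0.0f);
        for (float v : out) std::printf("%.4f ", v); // 0.0321 0.0871 0.2369 0.6439
        std::printf("\n");
        return 0;
    }
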
8236
8238
 
@@ -10825,7 +10827,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
10825
10827
 
10826
10828
  const sycl::range<3> block_dims(1, 1, ncols);
10827
10829
  const sycl::range<3> block_nums(1, nrows, 1);
10828
- if (order == GGML_SORT_ASC) {
10830
+ if (order == GGML_SORT_ORDER_ASC) {
10829
10831
  /*
10830
10832
  DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
10831
10833
  the limit. To get the device limit, query
@@ -10834,9 +10836,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
10834
10836
  stream->parallel_for(
10835
10837
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
10836
10838
  [=](sycl::nd_item<3> item_ct1) {
10837
- k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);
10839
+ k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
10838
10840
  });
10839
- } else if (order == GGML_SORT_DESC) {
10841
+ } else if (order == GGML_SORT_ORDER_DESC) {
10840
10842
  /*
10841
10843
  DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
10842
10844
  the limit. To get the device limit, query
@@ -10845,7 +10847,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
10845
10847
  stream->parallel_for(
10846
10848
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
10847
10849
  [=](sycl::nd_item<3> item_ct1) {
10848
- k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);
10850
+ k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
10849
10851
  });
10850
10852
  } else {
10851
10853
  GGML_ASSERT(false);
@@ -10867,37 +10869,98 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
10867
10869
  });
10868
10870
  }
10869
10871
 
10870
- static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
10871
- const int ncols_x, const int nrows_x,
10872
- const int nrows_y, const float scale,
10873
- dpct::queue_ptr stream) {
10874
- int nth = WARP_SIZE;
10875
- while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
10876
- const sycl::range<3> block_dims(1, 1, nth);
10877
- const sycl::range<3> block_nums(1, 1, nrows_x);
10878
- /*
10879
- DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the
10880
- limit. To get the device limit, query info::device::max_work_group_size.
10881
- Adjust the work-group size if needed.
10882
- */
10872
+ template <bool vals_smem, int ncols_template, int block_size_template>
10873
+ static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
10874
+ const int nrows_y, const float scale, const float max_bias, const float m0,
10875
+ const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
10876
+ const size_t n_local_scratch, dpct::queue_ptr stream) {
10883
10877
  stream->submit([&](sycl::handler &cgh) {
10884
- /*
10885
- DPCT1101:96: 'SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
10886
- replaced with a value. Modify the code to use the original expression,
10887
- provided in comments, if it is correct.
10888
- */
10889
- sycl::local_accessor<float, 1> buf_acc_ct1(
10890
- sycl::range<1>(32 /*SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
10878
+ sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
10891
10879
 
10892
10880
  cgh.parallel_for(
10893
10881
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
10894
10882
  [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
10895
- soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1,
10896
- buf_acc_ct1.get_pointer());
10883
+ soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
10884
+ nrows_y, scale, max_bias, m0,
10885
+ m1, n_head_log2, item_ct1,
10886
+ local_buf_acc.get_pointer());
10897
10887
  });
10898
10888
  });
10899
10889
  }
10900
10890
 
10891
+ static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
10892
+ float * dst, const int ncols_x, const int nrows_x,
10893
+ const int nrows_y, const float scale, const float max_bias,
10894
+ dpct::queue_ptr stream) {
10895
+ int nth = WARP_SIZE;
10896
+ while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
10897
+ const sycl::range<3> block_dims(1, 1, nth);
10898
+ const sycl::range<3> block_nums(1, 1, nrows_x);
10899
+ const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
10900
+ static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
10901
+
10902
+ const uint32_t n_head_kv = nrows_x/nrows_y;
10903
+ const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
10904
+
10905
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
10906
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
10907
+
10908
+ const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
10909
+ if (n_local_scratch*sizeof(float) < local_mem_size) {
10910
+ switch (ncols_x) {
10911
+ case 32:
10912
+ soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10913
+ max_bias, m0, m1, n_head_log2, block_nums,
10914
+ block_dims, n_local_scratch, stream);
10915
+ break;
10916
+ case 64:
10917
+ soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10918
+ max_bias, m0, m1, n_head_log2, block_nums,
10919
+ block_dims, n_local_scratch, stream);
10920
+ break;
10921
+ case 128:
10922
+ soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10923
+ max_bias, m0, m1, n_head_log2, block_nums,
10924
+ block_dims, n_local_scratch, stream);
10925
+ break;
10926
+ case 256:
10927
+ soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10928
+ max_bias, m0, m1, n_head_log2, block_nums,
10929
+ block_dims, n_local_scratch, stream);
10930
+ break;
10931
+ case 512:
10932
+ soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10933
+ max_bias, m0, m1, n_head_log2, block_nums,
10934
+ block_dims, n_local_scratch, stream);
10935
+ break;
10936
+ case 1024:
10937
+ soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10938
+ max_bias, m0, m1, n_head_log2, block_nums,
10939
+ block_dims, n_local_scratch, stream);
10940
+ break;
10941
+ case 2048:
10942
+ soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10943
+ max_bias, m0, m1, n_head_log2, block_nums,
10944
+ block_dims, n_local_scratch, stream);
10945
+ break;
10946
+ case 4096:
10947
+ soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10948
+ max_bias, m0, m1, n_head_log2, block_nums,
10949
+ block_dims, n_local_scratch, stream);
10950
+ break;
10951
+ default:
10952
+ soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10953
+ max_bias, m0, m1, n_head_log2, block_nums,
10954
+ block_dims, n_local_scratch, stream);
10955
+ break;
10956
+ }
10957
+ } else {
10958
+ soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10959
+ max_bias, m0, m1, n_head_log2, block_nums,
10960
+ block_dims, WARP_SIZE, stream);
10961
+ }
10962
+ }
10963
+
10901
10964
  template <typename T>
10902
10965
  static void im2col_sycl(const float *x, T *dst, int IW, int IH,
10903
10966
  int OW, int OH, int KW, int KH, int IC,
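
The new host wrapper specializes the kernel on the row width: power-of-two widths from 32 to 4096 get a dedicated instantiation (with the work-group capped at SYCL_SOFT_MAX_BLOCK_SIZE = 1024 for the 2048 and 4096 cases), any other width goes through the generic <true, 0, 0> form, and when the per-row scratch does not fit in local memory the <false, 0, 0> variant stages the intermediate values in dst instead. A host-side sketch of the fits-in-local-memory test (a hypothetical helper mirroring the WARP_SIZE and GGML_PAD macros used in this file):

    #include <cstddef>
    #include <cstdio>

    constexpr int warp_size = 32;

    static size_t pad_to(size_t x, size_t n) { // same rounding as GGML_PAD(x, n)
        return (x + n - 1) / n * n;
    }

    // Scratch is one float per padded column plus one warp's worth for the block
    // reductions; only when that fits in local memory are the vals_smem kernels used.
    static bool soft_max_uses_local_mem(int ncols_x, size_t local_mem_size) {
        const size_t n_local_scratch = pad_to(ncols_x, warp_size) + warp_size;
        return n_local_scratch * sizeof(float) < local_mem_size;
    }

    int main() {
        // With a 64 KiB local-memory budget a 4096-wide row fits (~16 KiB of scratch),
        // a 32768-wide row does not and falls back to staging in dst.
        std::printf("%d\n", soft_max_uses_local_mem( 4096, 64*1024)); // 1
        std::printf("%d\n", soft_max_uses_local_mem(32768, 64*1024)); // 0
        return 0;
    }
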
@@ -11407,12 +11470,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
11407
11470
 
11408
11471
  dpct::memcpy_direction kind;
11409
11472
  char * src_ptr;
11410
- if (src->backend == GGML_BACKEND_CPU) {
11473
+ if (src->backend == GGML_BACKEND_TYPE_CPU) {
11411
11474
  kind = dpct::host_to_device;
11412
11475
  src_ptr = (char *) src->data;
11413
- // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
11414
- } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
11415
- GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
11476
+ // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
11477
+ } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
11478
+ GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
11416
11479
  kind = dpct::device_to_device;
11417
11480
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
11418
11481
  int id;
@@ -11846,7 +11909,7 @@ inline void ggml_sycl_op_mul_mat_q(
11846
11909
 
11847
11910
  // the main device has a larger memory buffer to hold the results from all GPUs
11848
11911
  // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
11849
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
11912
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
11850
11913
 
11851
11914
  switch (src0->type) {
11852
11915
  case GGML_TYPE_Q4_0:
@@ -12119,7 +12182,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
12119
12182
 
12120
12183
  // the main device has a larger memory buffer to hold the results from all GPUs
12121
12184
  // ldc == nrows of the matrix that cuBLAS writes into
12122
- int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
12185
+ int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
12123
12186
 
12124
12187
  #ifdef GGML_SYCL_F16
12125
12188
  bool use_fp16 = true; // TODO(Yu) SYCL capability check
@@ -12435,14 +12498,35 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
12435
12498
 
12436
12499
  const int64_t ne00 = src0->ne[0];
12437
12500
  const int64_t nrows_x = ggml_nrows(src0);
12438
- const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
12501
+ const int64_t nrows_y = src0->ne[1];
12439
12502
 
12440
12503
  float scale = 1.0f;
12441
- memcpy(&scale, dst->op_params, sizeof(float));
12504
+ float max_bias = 0.0f;
12442
12505
 
12443
- soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
12506
+ memcpy(&scale, dst->op_params + 0, sizeof(float));
12507
+ memcpy(&max_bias, dst->op_params + 1, sizeof(float));
12444
12508
 
12445
- (void) dst;
12509
+ // positions tensor
12510
+ float * src2_dd = nullptr;
12511
+ sycl_pool_alloc<float> src2_f;
12512
+
12513
+ ggml_tensor * src2 = dst->src[2];
12514
+ const bool use_src2 = src2 != nullptr;
12515
+
12516
+ if (use_src2) {
12517
+ const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
12518
+
12519
+ if (src2_on_device) {
12520
+ ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
12521
+ src2_dd = (float *) src2_extra->data_device[g_main_device];
12522
+ } else {
12523
+ src2_dd = src2_f.alloc(ggml_nelements(src2));
12524
+ SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
12525
+ }
12526
+ }
12527
+
12528
+ soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
12529
+ nrows_x, nrows_y, scale, max_bias, main_stream);
12446
12530
  }
12447
12531
 
12448
12532
  inline void ggml_sycl_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
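
ggml_sycl_op_soft_max now reads two operator parameters instead of one: scale and max_bias sit in the first two 4-byte slots of dst->op_params (an int32_t array) and are recovered bit-exactly with memcpy, while nrows_y is taken from src0->ne[1] and the optional positions tensor src2 is staged to the device for the ALiBi path. A small sketch of the parameter packing assumed by the memcpy calls above (layout shown for illustration):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        int32_t op_params[2]; // stand-in for the first two slots of dst->op_params

        const float scale_in = 0.125f, max_bias_in = 8.0f;
        std::memcpy(op_params + 0, &scale_in,    sizeof(float));
        std::memcpy(op_params + 1, &max_bias_in, sizeof(float));

        // The backend recovers the floats the same way, with no int<->float conversion.
        float scale = 1.0f, max_bias = 0.0f;
        std::memcpy(&scale,    op_params + 0, sizeof(float));
        std::memcpy(&max_bias, op_params + 1, sizeof(float));

        std::printf("scale=%g max_bias=%g\n", scale, max_bias); // scale=0.125 max_bias=8
        return 0;
    }
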
@@ -12501,16 +12585,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
12501
12585
  const bool use_src1 = src1 != nullptr;
12502
12586
  const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
12503
12587
 
12504
- GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
12505
- GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
12588
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
12589
+ GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
12506
12590
 
12507
12591
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
12508
12592
  ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
12509
12593
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
12510
12594
 
12511
- const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
12512
- const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
12513
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
12595
+ const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
12596
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
12597
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
12514
12598
 
12515
12599
  // dd = data device
12516
12600
  float * src0_ddf = nullptr;
@@ -12565,7 +12649,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
12565
12649
  main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
12566
12650
  }
12567
12651
 
12568
- if (dst->backend == GGML_BACKEND_CPU) {
12652
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
12569
12653
  SYCL_CHECK(CHECK_TRY_ERROR(
12570
12654
  dpct::get_current_device().queues_wait_and_throw()));
12571
12655
  }
@@ -12640,8 +12724,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12640
12724
  const int nb2 = dst->nb[2];
12641
12725
  const int nb3 = dst->nb[3];
12642
12726
 
12643
- GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
12644
- GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
12727
+ GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
12728
+ GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
12729
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
12645
12730
 
12646
12731
  GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
12647
12732
 
@@ -12656,13 +12741,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12656
12741
  ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
12657
12742
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
12658
12743
 
12659
- const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
12744
+ const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
12660
12745
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
12661
12746
  const bool src1_is_contiguous = ggml_is_contiguous(src1);
12662
12747
 
12663
12748
  int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
12664
12749
 
12665
- const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
12750
+ const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
12666
12751
  GGML_ASSERT(!(split && ne02 > 1));
12667
12752
  GGML_ASSERT(!(split && ne03 > 1));
12668
12753
  GGML_ASSERT(!(split && ne02 < ne12));
@@ -12717,8 +12802,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12717
12802
 
12718
12803
  used_devices++;
12719
12804
 
12720
- const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
12721
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
12805
+ const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
12806
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
12722
12807
 
12723
12808
  ggml_sycl_set_device(get_device_id_by_index(id));
12724
12809
  const dpct::queue_ptr stream = g_syclStreams[id][0];
@@ -12782,8 +12867,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12782
12867
  continue;
12783
12868
  }
12784
12869
 
12785
- const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
12786
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
12870
+ const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
12871
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
12787
12872
  const int64_t row_diff = row_high[id] - row_low[id];
12788
12873
 
12789
12874
  ggml_sycl_set_device(get_device_id_by_index(id));
@@ -12809,12 +12894,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12809
12894
 
12810
12895
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
12811
12896
  // in that case an offset on dst_ddf_i is needed
12812
- if (dst->backend == GGML_BACKEND_GPU && id == g_main_device_index) {
12897
+ if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) {
12813
12898
  dst_dd_i += row_low[id]; // offset is 0 if no tensor split
12814
12899
  }
12815
12900
 
12816
12901
  // copy src0, src1 to device if necessary
12817
- if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
12902
+ if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
12818
12903
  if (id != g_main_device_index) {
12819
12904
  if (convert_src1_to_q8_1) {
12820
12905
  char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;
@@ -12830,14 +12915,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12830
12915
  src1_ncols * ne10 * sizeof(float))));
12831
12916
  }
12832
12917
  }
12833
- } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
12918
+ } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
12834
12919
  SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
12835
12920
  src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
12836
12921
  } else {
12837
12922
  GGML_ASSERT(false);
12838
12923
  }
12839
12924
 
12840
- if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
12925
+ if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
12841
12926
  quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
12842
12927
  /*
12843
12928
  DPCT1010:92: SYCL uses exceptions to report errors and does
@@ -12867,10 +12952,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12867
12952
  if (!dst_on_device) {
12868
12953
  void * dst_off_device;
12869
12954
  dpct::memcpy_direction kind;
12870
- if (dst->backend == GGML_BACKEND_CPU) {
12955
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
12871
12956
  dst_off_device = dst->data;
12872
12957
  kind = dpct::device_to_host;
12873
- } else if (dst->backend == GGML_BACKEND_GPU) {
12958
+ } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
12874
12959
  dst_off_device = dst_extra->data_device[g_main_device_index];
12875
12960
  kind = dpct::device_to_device;
12876
12961
  } else {
@@ -12954,7 +13039,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12954
13039
  }
12955
13040
  }
12956
13041
 
12957
- if (dst->backend == GGML_BACKEND_CPU) {
13042
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
12958
13043
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
12959
13044
  SYCL_CHECK(CHECK_TRY_ERROR(
12960
13045
  dpct::get_current_device().queues_wait_and_throw()));
@@ -13091,7 +13176,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
13091
13176
  const ggml_tensor *src1,
13092
13177
  ggml_tensor *dst) try {
13093
13178
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
13094
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
13179
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
13095
13180
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
13096
13181
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
13097
13182
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -13129,7 +13214,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
13129
13214
  GGML_ASSERT(!ggml_is_transposed(src0));
13130
13215
  GGML_ASSERT(!ggml_is_transposed(src1));
13131
13216
  GGML_ASSERT(!ggml_is_permuted(src0));
13132
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
13217
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
13133
13218
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13134
13219
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
13135
13220
 
@@ -13185,31 +13270,23 @@ static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
13185
13270
  int64_t i03 = i13 / r3;
13186
13271
  int64_t i02 = i12 / r2;
13187
13272
 
13188
- ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
13189
- ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
13190
- ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
13273
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
13274
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
13275
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
13191
13276
  }
13192
13277
 
13193
- static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
13194
- const ggml_tensor *src1,
13195
- ggml_tensor *dst) try {
13278
+ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
13279
+ const ggml_tensor *src1,
13280
+ ggml_tensor *dst) try {
13196
13281
  GGML_ASSERT(!ggml_is_transposed(src0));
13197
13282
  GGML_ASSERT(!ggml_is_transposed(src1));
13198
13283
 
13199
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
13284
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
13200
13285
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13201
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
13202
-
13203
- GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
13204
13286
 
13205
- GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
13206
-
13207
- GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
13208
-
13209
- GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
13287
+ GGML_TENSOR_BINARY_OP_LOCALS
13210
13288
 
13211
- const int64_t ne1 = ggml_nelements(src1);
13212
- const int64_t ne = ggml_nelements(dst);
13289
+ const int64_t ne_dst = ggml_nelements(dst);
13213
13290
 
13214
13291
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
13215
13292
  dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
@@ -13228,11 +13305,16 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
13228
13305
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
13229
13306
 
13230
13307
  // convert src1 to fp16
13231
- const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
13232
- GGML_ASSERT(to_fp16_sycl != nullptr);
13233
-
13234
- sycl_pool_alloc<sycl::half> src1_as_f16(ne1);
13235
- to_fp16_sycl(src1_ddf, src1_as_f16.get(), ne1, main_stream);
13308
+ sycl_pool_alloc<sycl::half> src1_f16_alloc;
13309
+ if (src1->type != GGML_TYPE_F16) {
13310
+ const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
13311
+ const int64_t ne_src1 = ggml_nelements(src1);
13312
+ src1_f16_alloc.alloc(ne_src1);
13313
+ GGML_ASSERT(to_fp16_sycl != nullptr);
13314
+ to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
13315
+ }
13316
+ sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf
13317
+ : src1_f16_alloc.get();
13236
13318
 
13237
13319
  sycl_pool_alloc<sycl::half> dst_f16;
13238
13320
  char * dst_t;
@@ -13253,20 +13335,12 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
13253
13335
  const void * alpha = &alpha_f16;
13254
13336
  const void * beta = &beta_f16;
13255
13337
 
13256
- if (dst->op_params[0] == GGML_PREC_DEFAULT) {
13257
- dst_t = (char *) dst_f16.alloc(ne);
13258
-
13259
- nbd2 /= sizeof(float) / sizeof(sycl::half);
13260
- nbd3 /= sizeof(float) / sizeof(sycl::half);
13261
- } else {
13262
- dst_t = (char *) dst_ddf;
13263
-
13264
- cu_compute_type = dpct::library_data_t::real_float;
13265
- cu_data_type = dpct::library_data_t::real_float;
13338
+ // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
13339
+ // once oneMKL open source supports half, half, float, float: datatypes
13340
+ dst_t = (char *) dst_f16.alloc(ne_dst);
13266
13341
 
13267
- alpha = &alpha_f32;
13268
- beta = &beta_f32;
13269
- }
13342
+ nbd2 /= sizeof(float) / sizeof(sycl::half);
13343
+ nbd3 /= sizeof(float) / sizeof(sycl::half);
13270
13344
 
13271
13345
  GGML_ASSERT(ne12 % ne02 == 0);
13272
13346
  GGML_ASSERT(ne13 % ne03 == 0);
@@ -13302,10 +13376,10 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
13302
13376
  *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
13303
13377
  oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
13304
13378
  (const char *)src0_as_f16, dpct::library_data_t::real_half,
13305
- nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half),
13306
- (const char *)src1_as_f16.get(), dpct::library_data_t::real_half,
13307
- nb11 / sizeof(float), src1->nb[2] / sizeof(float), beta,
13308
- (char *)dst_t, cu_data_type, ne01, dst->nb[2] / sizeof(float),
13379
+ nb01 / nb00, nb02 / nb00,
13380
+ (const char *)src1_f16, dpct::library_data_t::real_half,
13381
+ nb11 / nb10, nb12 / nb10, beta,
13382
+ (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
13309
13383
  ne12 * ne13, cu_compute_type)));
13310
13384
  } else {
13311
13385
  // use syclGemmBatchedEx
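
Note on the strided call above: the leading dimensions and batch strides are now the tensors' byte strides divided by the element stride (nb01 / nb00 and nb02 / nb00 for src0, nb11 / nb10 and nb12 / nb10 for src1, nb2 / nb0 for dst), i.e. counts of elements, instead of hard-coded sizeof(...) divisions. That keeps the call correct whether src1 was converted from F32 or arrives already as F16. A self-contained sketch of the convention (toy types, not ggml's):

    #include <cstddef>
    #include <cstdio>

    struct toy_tensor {
        size_t ne[4]; // elements per dimension
        size_t nb[4]; // byte stride per dimension (nb[0] == element size)
    };

    static toy_tensor make_contiguous(size_t ne0, size_t ne1, size_t ne2, size_t ne3, size_t elem_size) {
        toy_tensor t{{ne0, ne1, ne2, ne3}, {elem_size, 0, 0, 0}};
        for (int i = 1; i < 4; ++i) t.nb[i] = t.nb[i-1] * t.ne[i-1];
        return t;
    }

    int main() {
        const toy_tensor f16 = make_contiguous(128, 64, 8, 1, 2); // half precision
        const toy_tensor f32 = make_contiguous(128, 64, 8, 1, 4); // single precision

        // Leading dimension and per-matrix stride come out the same in elements:
        std::printf("ld     f16=%zu f32=%zu\n", f16.nb[1]/f16.nb[0], f32.nb[1]/f32.nb[0]); // 128 128
        std::printf("stride f16=%zu f32=%zu\n", f16.nb[2]/f16.nb[0], f32.nb[2]/f32.nb[0]); // 8192 8192
        return 0;
    }
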
@@ -13325,44 +13399,35 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
13325
13399
  {sycl::aspect::fp16});
13326
13400
 
13327
13401
  main_stream->submit([&](sycl::handler &cgh) {
13328
- const sycl::half *src1_as_f16_get_ct1 = src1_as_f16.get();
13329
- const void **ptrs_src_get_ct3 = ptrs_src.get();
13330
- void **ptrs_dst_get_ct4 = ptrs_dst.get();
13331
-
13402
+ const void **ptrs_src_get = ptrs_src.get();
13403
+ void **ptrs_dst_get = ptrs_dst.get();
13404
+ size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2;
13405
+ size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2;
13332
13406
  cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
13333
13407
  [=](sycl::nd_item<3> item_ct1) {
13334
13408
  k_compute_batched_ptrs(
13335
- src0_as_f16, src1_as_f16_get_ct1,
13336
- dst_t, ptrs_src_get_ct3,
13337
- ptrs_dst_get_ct4, ne12, ne13, ne23,
13338
- nb02, nb03, nb12, nb13, nbd2, nbd3, r2,
13339
- r3, item_ct1);
13409
+ src0_as_f16, src1_f16,
13410
+ dst_t, ptrs_src_get,
13411
+ ptrs_dst_get, ne12, ne13, ne23,
13412
+ nb02, nb03, nb12_scaled, nb13_scaled,
13413
+ nbd2, nbd3, r2, r3, item_ct1);
13340
13414
  });
13341
13415
  });
13342
13416
  }
13343
- /*
13344
- DPCT1010:95: SYCL uses exceptions to report errors and does not use the
13345
- error codes. The call was replaced with 0. You need to rewrite this
13346
- code.
13347
- */
13348
- SYCL_CHECK(0);
13349
-
13350
13417
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
13351
13418
  *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
13352
13419
  oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
13353
13420
  (const void **)(ptrs_src.get() + 0 * ne23),
13354
- dpct::library_data_t::real_half, nb01 / sizeof(sycl::half),
13421
+ dpct::library_data_t::real_half, nb01 / nb00,
13355
13422
  (const void **)(ptrs_src.get() + 1 * ne23),
13356
- dpct::library_data_t::real_half, nb11 / sizeof(float), beta,
13423
+ dpct::library_data_t::real_half, nb11 / nb10, beta,
13357
13424
  (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
13358
13425
  cu_compute_type)));
13359
13426
  }
13360
13427
  #endif
13361
13428
 
13362
- if (dst->op_params[0] == GGML_PREC_DEFAULT) {
13363
- const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
13364
- to_fp32_sycl(dst_f16.get(), dst_ddf, ne, main_stream);
13365
- }
13429
+ const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
13430
+ to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
13366
13431
  }
13367
13432
  catch (sycl::exception const &exc) {
13368
13433
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -13372,11 +13437,11 @@ catch (sycl::exception const &exc) {
13372
13437
 
13373
13438
  static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
13374
13439
  const bool all_on_device =
13375
- (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
13376
- (src1->backend == GGML_BACKEND_GPU) &&
13377
- ( dst->backend == GGML_BACKEND_GPU);
13440
+ (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
13441
+ (src1->backend == GGML_BACKEND_TYPE_GPU) &&
13442
+ ( dst->backend == GGML_BACKEND_TYPE_GPU);
13378
13443
 
13379
- const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
13444
+ const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
13380
13445
 
13381
13446
  int64_t min_compute_capability = INT_MAX;
13382
13447
  for (int64_t id = 0; id < g_device_count; ++id) {
@@ -13407,10 +13472,10 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
13407
13472
  // KQV single-batch
13408
13473
  // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
13409
13474
  ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
13410
- } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
13475
+ } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
13411
13476
  // KQ + KQV multi-batch
13412
- // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_mat_batched_sycl\n");
13413
- ggml_sycl_mul_mat_mat_batched_sycl(src0, src1, dst);
13477
+ // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
13478
+ ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
13414
13479
  } else if (src0->type == GGML_TYPE_F32) {
13415
13480
  // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
13416
13481
  ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
@@ -13505,7 +13570,7 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
13505
13570
  GGML_ASSERT(!ggml_is_transposed(src00));
13506
13571
  GGML_ASSERT(!ggml_is_transposed(src1));
13507
13572
 
13508
- GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
13573
+ GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
13509
13574
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
13510
13575
 
13511
13576
  GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
@@ -13643,7 +13708,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
13643
13708
 
13644
13709
  const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
13645
13710
 
13646
- if (ids->backend == GGML_BACKEND_GPU) {
13711
+ if (ids->backend == GGML_BACKEND_TYPE_GPU) {
13647
13712
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
13648
13713
  SYCL_CHECK(CHECK_TRY_ERROR(
13649
13714
  stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
@@ -13661,20 +13726,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
13661
13726
  ggml_tensor src1_row = *src1;
13662
13727
  ggml_tensor dst_row = *dst;
13663
13728
 
13664
- src1_row.backend = GGML_BACKEND_GPU;
13665
- dst_row.backend = GGML_BACKEND_GPU;
13729
+ src1_row.backend = GGML_BACKEND_TYPE_GPU;
13730
+ dst_row.backend = GGML_BACKEND_TYPE_GPU;
13666
13731
 
13667
13732
  src1_row.extra = &src1_row_extra;
13668
13733
  dst_row.extra = &dst_row_extra;
13669
13734
 
13670
- char * src1_original = src1->backend == GGML_BACKEND_CPU ?
13735
+ char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
13671
13736
  (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
13672
- char * dst_original = dst->backend == GGML_BACKEND_CPU ?
13737
+ char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
13673
13738
  (char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];
13674
13739
 
13675
13740
  if (src1->ne[1] == 1) {
13676
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
13677
- GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
13741
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
13742
+ GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
13678
13743
 
13679
13744
  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
13680
13745
  //int32_t row_id;
@@ -13756,7 +13821,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
13756
13821
  }
13757
13822
  }
13758
13823
 
13759
- if (dst->backend == GGML_BACKEND_CPU) {
13824
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
13760
13825
  SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
13761
13826
  }
13762
13827
  }
@@ -13779,8 +13844,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
13779
13844
  const int64_t ne = ggml_nelements(src0);
13780
13845
  GGML_ASSERT(ne == ggml_nelements(src1));
13781
13846
 
13782
- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
13783
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
13847
+ GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
13848
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
13784
13849
 
13785
13850
  GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
13786
13851
  GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
@@ -13887,17 +13952,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
13887
13952
  memset(extra, 0, sizeof(*extra));
13888
13953
 
13889
13954
  for (int64_t id = 0; id < g_device_count; ++id) {
13890
- if (backend == GGML_BACKEND_GPU && id != g_main_device_index) {
13955
+ if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) {
13891
13956
  continue;
13892
13957
  }
13893
13958
  ggml_sycl_set_device(get_device_id_by_index(id));
13894
13959
  const dpct::queue_ptr stream = g_syclStreams[id][0];
13895
13960
 
13896
13961
  int64_t row_low, row_high;
13897
- if (backend == GGML_BACKEND_GPU) {
13962
+ if (backend == GGML_BACKEND_TYPE_GPU) {
13898
13963
  row_low = 0;
13899
13964
  row_high = nrows;
13900
- } else if (backend == GGML_BACKEND_GPU_SPLIT) {
13965
+ } else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
13901
13966
  const int64_t rounding = get_row_rounding(tensor->type);
13902
13967
 
13903
13968
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
@@ -13946,7 +14011,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
13946
14011
 
13947
14012
  extra->data_device[id] = buf;
13948
14013
 
13949
- if (backend == GGML_BACKEND_GPU_SPLIT) {
14014
+ if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
13950
14015
  for (int64_t is = 0; is < MAX_STREAMS; ++is) {
13951
14016
  SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
13952
14017
  new sycl::event()));
@@ -13963,7 +14028,7 @@ catch (sycl::exception const &exc) {
13963
14028
  }
13964
14029
 
13965
14030
  void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
13966
- if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
14031
+ if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) {
13967
14032
  return;
13968
14033
  }
13969
14034
 
@@ -14016,15 +14081,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
14016
14081
  return;
14017
14082
  }
14018
14083
 
14019
- tensor->backend = GGML_BACKEND_GPU;
14084
+ tensor->backend = GGML_BACKEND_TYPE_GPU;
14020
14085
 
14021
- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
14086
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) {
14022
14087
  const ggml_op src0_op = tensor->src[0]->op;
14023
14088
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
14024
14089
  ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
14025
14090
  }
14026
14091
  }
14027
- if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
14092
+ if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) {
14028
14093
  ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
14029
14094
  }
14030
14095
 
@@ -14042,7 +14107,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
14042
14107
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
14043
14108
  const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
14044
14109
 
14045
- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
14110
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
14046
14111
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
14047
14112
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
14048
14113
  size_t offset = 0;
@@ -14111,7 +14176,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,
14111
14176
 
14112
14177
  const bool inplace = tensor->view_src != nullptr;
14113
14178
 
14114
- if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
14179
+ if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
14115
14180
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
14116
14181
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
14117
14182
  size_t view_offset = 0;
@@ -14132,7 +14197,7 @@ catch (sycl::exception const &exc) {
14132
14197
  }
14133
14198
 
14134
14199
  void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
14135
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14200
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14136
14201
  GGML_ASSERT(ggml_is_contiguous(tensor));
14137
14202
 
14138
14203
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -14219,9 +14284,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
14219
14284
  if (!g_sycl_loaded) return false;
14220
14285
 
14221
14286
  ggml_sycl_func_t func;
14222
- const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
14223
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
14224
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
14287
+ const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
14288
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
14289
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
14225
14290
 
14226
14291
  if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
14227
14292
  return false;
@@ -14359,14 +14424,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
14359
14424
  return false;
14360
14425
  }
14361
14426
 
14362
- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
14427
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
14363
14428
  ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
14364
14429
  }
14365
14430
 
14366
14431
  if (params->ith != 0) {
14367
14432
  return true;
14368
14433
  }
14369
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14434
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14370
14435
  return true;
14371
14436
  }
14372
14437
  func(tensor->src[0], tensor->src[1], tensor);
@@ -14517,7 +14582,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
14517
14582
 
14518
14583
  extra->data_device[ctx->device] = tensor->data;
14519
14584
 
14520
- tensor->backend = GGML_BACKEND_GPU;
14585
+ tensor->backend = GGML_BACKEND_TYPE_GPU;
14521
14586
  tensor->extra = extra;
14522
14587
 
14523
14588
  if (ggml_is_quantized(tensor->type)) {
@@ -14548,7 +14613,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14548
14613
  ggml_tensor *tensor,
14549
14614
  const void *data, size_t offset,
14550
14615
  size_t size) try {
14551
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14616
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14552
14617
 
14553
14618
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
14554
14619
 
@@ -14573,7 +14638,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
14573
14638
  const ggml_tensor *tensor,
14574
14639
  void *data, size_t offset,
14575
14640
  size_t size) try {
14576
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14641
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14577
14642
 
14578
14643
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
14579
14644
 
@@ -14809,7 +14874,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
14809
14874
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14810
14875
 
14811
14876
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
14812
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14877
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14813
14878
 
14814
14879
  SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
14815
14880
  (char *)tensor->data + offset, data, size)));
@@ -14827,7 +14892,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
14827
14892
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14828
14893
 
14829
14894
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
14830
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14895
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14831
14896
 
14832
14897
  SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
14833
14898
  data, (const char *)tensor->data + offset, size)));
@@ -14880,7 +14945,7 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
14880
14945
  ggml_sycl_set_main_device(sycl_ctx->device);
14881
14946
 
14882
14947
  ggml_compute_params params = {};
14883
- params.type = GGML_TASK_COMPUTE;
14948
+ params.type = GGML_TASK_TYPE_COMPUTE;
14884
14949
  params.ith = 0;
14885
14950
  for (int i = 0; i < cgraph->n_nodes; i++) {
14886
14951
  ggml_tensor * node = cgraph->nodes[i];
@@ -14888,13 +14953,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
14888
14953
  if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
14889
14954
  continue;
14890
14955
 
14891
- assert(node->backend == GGML_BACKEND_GPU);
14956
+ assert(node->backend == GGML_BACKEND_TYPE_GPU);
14892
14957
  assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
14893
14958
  assert(node->extra != nullptr);
14894
14959
 
14895
14960
  for (int j = 0; j < GGML_MAX_SRC; j++) {
14896
14961
  if (node->src[j] != nullptr) {
14897
- assert(node->src[j]->backend == GGML_BACKEND_GPU);
14962
+ assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU);
14898
14963
  assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
14899
14964
  assert(node->src[j]->extra != nullptr);
14900
14965
  }
@@ -15078,6 +15143,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
15078
15143
  /* .supports_op = */ ggml_backend_sycl_supports_op,
15079
15144
  };
15080
15145
 
15146
+ static ggml_guid_t ggml_backend_sycl_guid() {
15147
+ static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 };
15148
+ return &guid;
15149
+ }
15150
+
15081
15151
  ggml_backend_t ggml_backend_sycl_init(int device) {
15082
15152
  ggml_init_sycl(); // TODO: remove from ggml.c
15083
15153
 
@@ -15095,6 +15165,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
15095
15165
  };
15096
15166
 
15097
15167
  ggml_backend_t sycl_backend = new ggml_backend {
15168
+ /* .guid = */ ggml_backend_sycl_guid(),
15098
15169
  /* .interface = */ ggml_backend_sycl_interface,
15099
15170
  /* .context = */ ctx
15100
15171
  };
@@ -15103,7 +15174,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
15103
15174
  }
15104
15175
 
15105
15176
  bool ggml_backend_is_sycl(ggml_backend_t backend) {
15106
- return backend->iface.get_name == ggml_backend_sycl_name;
15177
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
15107
15178
  }
15108
15179
 
15109
15180
  static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
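
The backend now carries a GUID, and ggml_backend_is_sycl matches on it (and tolerates a NULL backend) instead of comparing the get_name function pointer. A minimal self-contained sketch of the same idea, using toy types that mirror ggml's ggml_guid / ggml_guid_matches rather than the real headers:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    typedef uint8_t  toy_guid[16];
    typedef toy_guid * toy_guid_t;

    static bool toy_guid_matches(toy_guid_t a, toy_guid_t b) {
        return std::memcmp(*a, *b, sizeof(toy_guid)) == 0;
    }

    // Same 16 bytes as ggml_backend_sycl_guid() in the hunk above.
    static toy_guid_t sycl_backend_guid() {
        static toy_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d,
                                 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 };
        return &guid;
    }

    struct toy_backend { toy_guid_t guid; };

    static bool backend_is_sycl(const toy_backend * backend) {
        return backend != nullptr && toy_guid_matches(backend->guid, sycl_backend_guid());
    }

    int main() {
        toy_backend b = { sycl_backend_guid() };
        std::printf("%d\n", backend_is_sycl(&b));      // 1
        std::printf("%d\n", backend_is_sycl(nullptr)); // 0
        return 0;
    }
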