llama_cpp 0.12.6 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){
3338
3338
 
3339
3339
  size_t total_elements = ggml_nelements(src);
3340
3340
 
3341
- const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT;
3341
+ const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
3342
3342
  float *src_data =NULL;
3343
3343
  if(src_on_device) {
3344
3344
  ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
8086
8086
  int ixj = col ^ j;
8087
8087
  if (ixj > col) {
8088
8088
  if ((col & k) == 0) {
8089
- if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
8089
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
8090
8090
  swap(dst_row[col], dst_row[ixj]);
8091
8091
  }
8092
8092
  } else {
8093
- if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
8093
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
8094
8094
  swap(dst_row[col], dst_row[ixj]);
8095
8095
  }
8096
8096
  }
@@ -8126,23 +8126,51 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
8126
8126
  dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
8127
8127
  }
8128
8128
 
8129
- static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale,
8130
- const sycl::nd_item<3> &item_ct1, float *buf) {
8129
+
8130
+ template <bool vals_smem, int ncols_template, int block_size_template>
8131
+ static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
8132
+ const int nrows_y, const float scale, const float max_bias, const float m0,
8133
+ const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
8134
+ const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
8135
+
8131
8136
  const int tid = item_ct1.get_local_id(2);
8132
8137
  const int rowx = item_ct1.get_group(2);
8133
8138
  const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
8134
8139
 
8135
- const int block_size = item_ct1.get_local_range(2);
8140
+ const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template;
8136
8141
 
8137
8142
  const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
8138
8143
  const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
8139
8144
 
8145
+ float slope = 0.0f;
8146
+
8147
+ // ALiBi
8148
+ if (max_bias > 0.0f) {
8149
+ const uint32_t h = rowx/nrows_y; // head index
8150
+
8151
+ const float base = h < n_head_log2 ? m0 : m1;
8152
+ const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
8153
+
8154
+ slope = sycl::pow(base, float(exp));
8155
+ }
8156
+
8157
+ float * vals = vals_smem ? buf + WARP_SIZE : dst + rowx*ncols;
8140
8158
  float max_val = -INFINITY;
8141
8159
 
8142
- for (int col = tid; col < ncols; col += block_size) {
8160
+ for (int col0 = 0; col0 < ncols; col0 += block_size) {
8161
+ const int col = col0 + tid;
8162
+
8163
+ if (ncols_template == 0 && col >= ncols) {
8164
+ break;
8165
+ }
8166
+
8143
8167
  const int ix = rowx*ncols + col;
8144
8168
  const int iy = rowy*ncols + col;
8145
- max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f));
8169
+
8170
+ const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
8171
+
8172
+ vals[col] = val;
8173
+ max_val = sycl::max(max_val, val);
8146
8174
  }
8147
8175
 
8148
8176
  // find the max value in the block
@@ -8151,30 +8179,12 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
8151
8179
  if (warp_id == 0) {
8152
8180
  buf[lane_id] = -INFINITY;
8153
8181
  }
8154
- /*
8155
- DPCT1118:12: SYCL group functions and algorithms must be encountered in
8156
- converged control flow. You may need to adjust the code.
8157
- */
8158
- /*
8159
- DPCT1065:60: Consider replacing sycl::nd_item::barrier() with
8160
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
8161
- better performance if there is no access to global memory.
8162
- */
8163
- item_ct1.barrier();
8182
+ item_ct1.barrier(sycl::access::fence_space::local_space);
8164
8183
 
8165
8184
  if (lane_id == 0) {
8166
8185
  buf[warp_id] = max_val;
8167
8186
  }
8168
- /*
8169
- DPCT1118:13: SYCL group functions and algorithms must be encountered in
8170
- converged control flow. You may need to adjust the code.
8171
- */
8172
- /*
8173
- DPCT1065:61: Consider replacing sycl::nd_item::barrier() with
8174
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
8175
- better performance if there is no access to global memory.
8176
- */
8177
- item_ct1.barrier();
8187
+ item_ct1.barrier(sycl::access::fence_space::local_space);
8178
8188
 
8179
8189
  max_val = buf[lane_id];
8180
8190
  max_val = warp_reduce_max(max_val, item_ct1);
@@ -8182,13 +8192,16 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
8182
8192
 
8183
8193
  float tmp = 0.f;
8184
8194
 
8185
- for (int col = tid; col < ncols; col += block_size) {
8186
- const int ix = rowx*ncols + col;
8187
- const int iy = rowy*ncols + col;
8188
- const float val =
8189
- sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val);
8195
+ #pragma unroll
8196
+ for (int col0 = 0; col0 < ncols; col0 += block_size) {
8197
+ const int col = col0 + tid;
8198
+ if (ncols_template == 0 && col >= ncols) {
8199
+ break;
8200
+ }
8201
+
8202
+ const float val = sycl::native::exp(vals[col] - max_val);
8190
8203
  tmp += val;
8191
- dst[ix] = val;
8204
+ vals[col] = val;
8192
8205
  }
8193
8206
 
8194
8207
  // find the sum of exps in the block
@@ -8197,40 +8210,29 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
8197
8210
  if (warp_id == 0) {
8198
8211
  buf[lane_id] = 0.f;
8199
8212
  }
8200
- /*
8201
- DPCT1118:14: SYCL group functions and algorithms must be encountered in
8202
- converged control flow. You may need to adjust the code.
8203
- */
8204
- /*
8205
- DPCT1065:62: Consider replacing sycl::nd_item::barrier() with
8206
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
8207
- better performance if there is no access to global memory.
8208
- */
8209
- item_ct1.barrier();
8213
+ item_ct1.barrier(sycl::access::fence_space::local_space);
8210
8214
 
8211
8215
  if (lane_id == 0) {
8212
8216
  buf[warp_id] = tmp;
8213
8217
  }
8214
- /*
8215
- DPCT1118:15: SYCL group functions and algorithms must be encountered in
8216
- converged control flow. You may need to adjust the code.
8217
- */
8218
- /*
8219
- DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
8220
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
8221
- better performance if there is no access to global memory.
8222
- */
8223
- item_ct1.barrier();
8218
+ item_ct1.barrier(sycl::access::fence_space::local_space);
8224
8219
 
8225
8220
  tmp = buf[lane_id];
8226
8221
  tmp = warp_reduce_sum(tmp, item_ct1);
8227
8222
  }
8228
8223
 
8229
- const float inv_tmp = 1.f / tmp;
8224
+ const float inv_sum = 1.f / tmp;
8230
8225
 
8231
- for (int col = tid; col < ncols; col += block_size) {
8232
- const int i = rowx*ncols + col;
8233
- dst[i] *= inv_tmp;
8226
+ #pragma unroll
8227
+ for (int col0 = 0; col0 < ncols; col0 += block_size) {
8228
+ const int col = col0 + tid;
8229
+
8230
+ if (ncols_template == 0 && col >= ncols) {
8231
+ return;
8232
+ }
8233
+
8234
+ const int idst = rowx*ncols + col;
8235
+ dst[idst] = vals[col] * inv_sum;
8234
8236
  }
8235
8237
  }
8236
8238
 
@@ -9188,174 +9190,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
9188
9190
  }
9189
9191
  }
9190
9192
 
9191
- static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
9192
- float *dst, const int ncols,
9193
- const int nrows,
9194
- dpct::queue_ptr stream) {
9195
- GGML_ASSERT(ncols % QK4_0 == 0);
9196
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9197
- const sycl::range<3> block_nums(1, 1, block_num_y);
9198
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9199
- stream->parallel_for(
9200
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9201
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9202
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
9203
- vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
9204
- item_ct1);
9205
- });
9206
- }
9207
-
9208
- static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
9209
- float *dst, const int ncols,
9210
- const int nrows,
9211
- dpct::queue_ptr stream) {
9212
- GGML_ASSERT(ncols % QK4_1 == 0);
9213
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9214
- const sycl::range<3> block_nums(1, 1, block_num_y);
9215
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9216
- stream->parallel_for(
9217
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9218
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9219
- mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
9220
- vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
9221
- item_ct1);
9222
- });
9223
- }
9224
-
9225
- static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
9226
- float *dst, const int ncols,
9227
- const int nrows,
9228
- dpct::queue_ptr stream) {
9229
- GGML_ASSERT(ncols % QK5_0 == 0);
9230
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9231
- const sycl::range<3> block_nums(1, 1, block_num_y);
9232
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9233
- stream->parallel_for(
9234
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9235
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9236
- mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
9237
- vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
9238
- item_ct1);
9239
- });
9240
- }
9241
-
9242
- static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
9243
- float *dst, const int ncols,
9244
- const int nrows,
9245
- dpct::queue_ptr stream) {
9246
- GGML_ASSERT(ncols % QK5_1 == 0);
9247
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9248
- const sycl::range<3> block_nums(1, 1, block_num_y);
9249
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9250
- stream->parallel_for(
9251
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9252
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9253
- mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
9254
- vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
9255
- item_ct1);
9256
- });
9257
- }
9258
-
9259
- static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
9260
- float *dst, const int ncols,
9261
- const int nrows,
9262
- dpct::queue_ptr stream) {
9263
- GGML_ASSERT(ncols % QK8_0 == 0);
9264
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9265
- const sycl::range<3> block_nums(1, 1, block_num_y);
9266
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9267
- stream->parallel_for(
9268
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9269
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9270
- mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
9271
- vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
9272
- item_ct1);
9273
- });
9274
- }
9275
-
9276
- static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
9277
- float *dst, const int ncols,
9278
- const int nrows,
9279
- dpct::queue_ptr stream) {
9280
- GGML_ASSERT(ncols % QK_K == 0);
9281
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9282
- const sycl::range<3> block_nums(1, 1, block_num_y);
9283
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9284
- stream->parallel_for(
9285
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9286
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9287
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
9288
- vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
9289
- item_ct1);
9290
- });
9291
- }
9292
-
9293
- static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
9294
- float *dst, const int ncols,
9295
- const int nrows,
9296
- dpct::queue_ptr stream) {
9297
- GGML_ASSERT(ncols % QK_K == 0);
9298
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9299
- const sycl::range<3> block_nums(1, 1, block_num_y);
9300
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9301
- stream->parallel_for(
9302
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9303
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9304
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
9305
- vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
9306
- item_ct1);
9307
- });
9308
- }
9309
-
9310
- static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
9311
- float *dst, const int ncols,
9312
- const int nrows,
9313
- dpct::queue_ptr stream) {
9314
- GGML_ASSERT(ncols % QK_K == 0);
9315
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9316
- const sycl::range<3> block_nums(1, 1, block_num_y);
9317
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9318
- stream->parallel_for(
9319
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9320
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9321
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
9322
- vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
9323
- item_ct1);
9324
- });
9325
- }
9326
-
9327
- static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
9328
- float *dst, const int ncols,
9329
- const int nrows,
9330
- dpct::queue_ptr stream) {
9331
- GGML_ASSERT(ncols % QK_K == 0);
9332
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9333
- const sycl::range<3> block_nums(1, 1, block_num_y);
9334
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9335
- stream->parallel_for(
9336
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9337
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9338
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
9339
- vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
9340
- item_ct1);
9341
- });
9342
- }
9343
-
9344
- static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
9345
- float *dst, const int ncols,
9346
- const int nrows,
9347
- dpct::queue_ptr stream) {
9348
- GGML_ASSERT(ncols % QK_K == 0);
9349
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9350
- const sycl::range<3> block_nums(1, 1, block_num_y);
9351
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9352
- stream->parallel_for(
9353
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9354
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9355
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
9356
- vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
9357
- item_ct1);
9358
- });
9193
+ template <int qk, int qi, typename block_q_t, int vdr,
9194
+ vec_dot_q_sycl_t vec_dot_q_sycl>
9195
+ static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
9196
+ float *dst, const int ncols,
9197
+ const int nrows,
9198
+ dpct::queue_ptr stream) {
9199
+ GGML_ASSERT(ncols % QK4_0 == 0);
9200
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9201
+ const sycl::range<3> block_nums(1, 1, block_num_y);
9202
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9203
+ stream->parallel_for(
9204
+ sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
9205
+ ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9206
+ mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
9207
+ vx, vy, dst, ncols, nrows, item_ct1);
9208
+ });
9359
9209
  }
9360
9210
 
9361
9211
  int get_device_index_by_id(int id){
@@ -10977,7 +10827,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
10977
10827
 
10978
10828
  const sycl::range<3> block_dims(1, 1, ncols);
10979
10829
  const sycl::range<3> block_nums(1, nrows, 1);
10980
- if (order == GGML_SORT_ASC) {
10830
+ if (order == GGML_SORT_ORDER_ASC) {
10981
10831
  /*
10982
10832
  DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
10983
10833
  the limit. To get the device limit, query
@@ -10986,9 +10836,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
10986
10836
  stream->parallel_for(
10987
10837
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
10988
10838
  [=](sycl::nd_item<3> item_ct1) {
10989
- k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);
10839
+ k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
10990
10840
  });
10991
- } else if (order == GGML_SORT_DESC) {
10841
+ } else if (order == GGML_SORT_ORDER_DESC) {
10992
10842
  /*
10993
10843
  DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
10994
10844
  the limit. To get the device limit, query
@@ -10997,7 +10847,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
10997
10847
  stream->parallel_for(
10998
10848
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
10999
10849
  [=](sycl::nd_item<3> item_ct1) {
11000
- k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);
10850
+ k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
11001
10851
  });
11002
10852
  } else {
11003
10853
  GGML_ASSERT(false);
@@ -11019,37 +10869,98 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
11019
10869
  });
11020
10870
  }
11021
10871
 
11022
- static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
11023
- const int ncols_x, const int nrows_x,
11024
- const int nrows_y, const float scale,
11025
- dpct::queue_ptr stream) {
11026
- int nth = WARP_SIZE;
11027
- while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
11028
- const sycl::range<3> block_dims(1, 1, nth);
11029
- const sycl::range<3> block_nums(1, 1, nrows_x);
11030
- /*
11031
- DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the
11032
- limit. To get the device limit, query info::device::max_work_group_size.
11033
- Adjust the work-group size if needed.
11034
- */
10872
+ template <bool vals_smem, int ncols_template, int block_size_template>
10873
+ static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
10874
+ const int nrows_y, const float scale, const float max_bias, const float m0,
10875
+ const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
10876
+ const size_t n_local_scratch, dpct::queue_ptr stream) {
11035
10877
  stream->submit([&](sycl::handler &cgh) {
11036
- /*
11037
- DPCT1101:96: 'SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
11038
- replaced with a value. Modify the code to use the original expression,
11039
- provided in comments, if it is correct.
11040
- */
11041
- sycl::local_accessor<float, 1> buf_acc_ct1(
11042
- sycl::range<1>(32 /*SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
10878
+ sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
11043
10879
 
11044
10880
  cgh.parallel_for(
11045
10881
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
11046
10882
  [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
11047
- soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1,
11048
- buf_acc_ct1.get_pointer());
10883
+ soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
10884
+ nrows_y, scale, max_bias, m0,
10885
+ m1, n_head_log2, item_ct1,
10886
+ local_buf_acc.get_pointer());
11049
10887
  });
11050
10888
  });
11051
10889
  }
11052
10890
 
10891
+ static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
10892
+ float * dst, const int ncols_x, const int nrows_x,
10893
+ const int nrows_y, const float scale, const float max_bias,
10894
+ dpct::queue_ptr stream) {
10895
+ int nth = WARP_SIZE;
10896
+ while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
10897
+ const sycl::range<3> block_dims(1, 1, nth);
10898
+ const sycl::range<3> block_nums(1, 1, nrows_x);
10899
+ const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
10900
+ static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
10901
+
10902
+ const uint32_t n_head_kv = nrows_x/nrows_y;
10903
+ const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
10904
+
10905
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
10906
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
10907
+
10908
+ const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
10909
+ if (n_local_scratch*sizeof(float) < local_mem_size) {
10910
+ switch (ncols_x) {
10911
+ case 32:
10912
+ soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10913
+ max_bias, m0, m1, n_head_log2, block_nums,
10914
+ block_dims, n_local_scratch, stream);
10915
+ break;
10916
+ case 64:
10917
+ soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10918
+ max_bias, m0, m1, n_head_log2, block_nums,
10919
+ block_dims, n_local_scratch, stream);
10920
+ break;
10921
+ case 128:
10922
+ soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10923
+ max_bias, m0, m1, n_head_log2, block_nums,
10924
+ block_dims, n_local_scratch, stream);
10925
+ break;
10926
+ case 256:
10927
+ soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10928
+ max_bias, m0, m1, n_head_log2, block_nums,
10929
+ block_dims, n_local_scratch, stream);
10930
+ break;
10931
+ case 512:
10932
+ soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10933
+ max_bias, m0, m1, n_head_log2, block_nums,
10934
+ block_dims, n_local_scratch, stream);
10935
+ break;
10936
+ case 1024:
10937
+ soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10938
+ max_bias, m0, m1, n_head_log2, block_nums,
10939
+ block_dims, n_local_scratch, stream);
10940
+ break;
10941
+ case 2048:
10942
+ soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10943
+ max_bias, m0, m1, n_head_log2, block_nums,
10944
+ block_dims, n_local_scratch, stream);
10945
+ break;
10946
+ case 4096:
10947
+ soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10948
+ max_bias, m0, m1, n_head_log2, block_nums,
10949
+ block_dims, n_local_scratch, stream);
10950
+ break;
10951
+ default:
10952
+ soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10953
+ max_bias, m0, m1, n_head_log2, block_nums,
10954
+ block_dims, n_local_scratch, stream);
10955
+ break;
10956
+ }
10957
+ } else {
10958
+ soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
10959
+ max_bias, m0, m1, n_head_log2, block_nums,
10960
+ block_dims, WARP_SIZE, stream);
10961
+ }
10962
+ }
10963
+
11053
10964
  template <typename T>
11054
10965
  static void im2col_sycl(const float *x, T *dst, int IW, int IH,
11055
10966
  int OW, int OH, int KW, int KH, int IC,
@@ -11559,12 +11470,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
11559
11470
 
11560
11471
  dpct::memcpy_direction kind;
11561
11472
  char * src_ptr;
11562
- if (src->backend == GGML_BACKEND_CPU) {
11473
+ if (src->backend == GGML_BACKEND_TYPE_CPU) {
11563
11474
  kind = dpct::host_to_device;
11564
11475
  src_ptr = (char *) src->data;
11565
- // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
11566
- } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
11567
- GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
11476
+ // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
11477
+ } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
11478
+ GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
11568
11479
  kind = dpct::device_to_device;
11569
11480
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
11570
11481
  int id;
@@ -11998,7 +11909,7 @@ inline void ggml_sycl_op_mul_mat_q(
11998
11909
 
11999
11910
  // the main device has a larger memory buffer to hold the results from all GPUs
12000
11911
  // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
12001
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
11912
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
12002
11913
 
12003
11914
  switch (src0->type) {
12004
11915
  case GGML_TYPE_Q4_0:
@@ -12095,37 +12006,63 @@ inline void ggml_sycl_op_mul_mat_vec_q(
12095
12006
  const int64_t ne00 = src0->ne[0];
12096
12007
  const int64_t row_diff = row_high - row_low;
12097
12008
 
12009
+ // TODO: support these quantization types
12010
+ GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
12011
+ src0->type == GGML_TYPE_IQ2_XS ||
12012
+ src0->type == GGML_TYPE_IQ3_XXS ||
12013
+ src0->type == GGML_TYPE_IQ1_S));
12014
+
12098
12015
  switch (src0->type) {
12099
12016
  case GGML_TYPE_Q4_0:
12100
- mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12101
- break;
12017
+ mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
12018
+ VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
12019
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12020
+ break;
12102
12021
  case GGML_TYPE_Q4_1:
12103
- mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12104
- break;
12022
+ mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
12023
+ VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
12024
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12025
+ break;
12105
12026
  case GGML_TYPE_Q5_0:
12106
- mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12107
- break;
12027
+ mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
12028
+ VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
12029
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12030
+ break;
12108
12031
  case GGML_TYPE_Q5_1:
12109
- mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12110
- break;
12032
+ mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
12033
+ VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
12034
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12035
+ break;
12111
12036
  case GGML_TYPE_Q8_0:
12112
- mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12113
- break;
12037
+ mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
12038
+ VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
12039
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12040
+ break;
12114
12041
  case GGML_TYPE_Q2_K:
12115
- mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12116
- break;
12042
+ mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
12043
+ VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
12044
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12045
+ break;
12117
12046
  case GGML_TYPE_Q3_K:
12118
- mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12119
- break;
12047
+ mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
12048
+ VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
12049
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12050
+ break;
12120
12051
  case GGML_TYPE_Q4_K:
12121
- mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12122
- break;
12052
+ mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
12053
+ VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
12054
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12055
+ break;
12123
12056
  case GGML_TYPE_Q5_K:
12124
- mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12125
- break;
12057
+ mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
12058
+ VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
12059
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12060
+ break;
12126
12061
  case GGML_TYPE_Q6_K:
12127
- mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12128
- break;
12062
+ mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
12063
+ VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
12064
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12065
+ break;
12129
12066
  default:
12130
12067
  GGML_ASSERT(false);
12131
12068
  break;
@@ -12145,7 +12082,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
12145
12082
  const int64_t src1_ncols, const int64_t src1_padded_row_size,
12146
12083
  const dpct::queue_ptr &stream) {
12147
12084
 
12148
- GGML_TENSOR_BINARY_OP_LOCALS
12085
+ GGML_TENSOR_BINARY_OP_LOCALS;
12149
12086
 
12150
12087
  const int64_t row_diff = row_high - row_low;
12151
12088
 
@@ -12245,7 +12182,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
12245
12182
 
12246
12183
  // the main device has a larger memory buffer to hold the results from all GPUs
12247
12184
  // ldc == nrows of the matrix that cuBLAS writes into
12248
- int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
12185
+ int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
12249
12186
 
12250
12187
  #ifdef GGML_SYCL_F16
12251
12188
  bool use_fp16 = true; // TODO(Yu) SYCL capability check
@@ -12561,14 +12498,35 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
12561
12498
 
12562
12499
  const int64_t ne00 = src0->ne[0];
12563
12500
  const int64_t nrows_x = ggml_nrows(src0);
12564
- const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
12501
+ const int64_t nrows_y = src0->ne[1];
12565
12502
 
12566
12503
  float scale = 1.0f;
12567
- memcpy(&scale, dst->op_params, sizeof(float));
12504
+ float max_bias = 0.0f;
12568
12505
 
12569
- soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
12506
+ memcpy(&scale, dst->op_params + 0, sizeof(float));
12507
+ memcpy(&max_bias, dst->op_params + 1, sizeof(float));
12570
12508
 
12571
- (void) dst;
12509
+ // positions tensor
12510
+ float * src2_dd = nullptr;
12511
+ sycl_pool_alloc<float> src2_f;
12512
+
12513
+ ggml_tensor * src2 = dst->src[2];
12514
+ const bool use_src2 = src2 != nullptr;
12515
+
12516
+ if (use_src2) {
12517
+ const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
12518
+
12519
+ if (src2_on_device) {
12520
+ ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
12521
+ src2_dd = (float *) src2_extra->data_device[g_main_device];
12522
+ } else {
12523
+ src2_dd = src2_f.alloc(ggml_nelements(src2));
12524
+ SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
12525
+ }
12526
+ }
12527
+
12528
+ soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
12529
+ nrows_x, nrows_y, scale, max_bias, main_stream);
12572
12530
  }
12573
12531
 
12574
12532
  inline void ggml_sycl_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
@@ -12627,16 +12585,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
12627
12585
  const bool use_src1 = src1 != nullptr;
12628
12586
  const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
12629
12587
 
12630
- GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
12631
- GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
12588
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
12589
+ GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
12632
12590
 
12633
12591
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
12634
12592
  ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
12635
12593
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
12636
12594
 
12637
- const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
12638
- const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
12639
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
12595
+ const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
12596
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
12597
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
12640
12598
 
12641
12599
  // dd = data device
12642
12600
  float * src0_ddf = nullptr;
@@ -12691,7 +12649,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
12691
12649
  main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
12692
12650
  }
12693
12651
 
12694
- if (dst->backend == GGML_BACKEND_CPU) {
12652
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
12695
12653
  SYCL_CHECK(CHECK_TRY_ERROR(
12696
12654
  dpct::get_current_device().queues_wait_and_throw()));
12697
12655
  }
@@ -12766,8 +12724,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12766
12724
  const int nb2 = dst->nb[2];
12767
12725
  const int nb3 = dst->nb[3];
12768
12726
 
12769
- GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
12770
- GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
12727
+ GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
12728
+ GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
12729
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
12771
12730
 
12772
12731
  GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
12773
12732
 
@@ -12782,13 +12741,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12782
12741
  ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
12783
12742
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
12784
12743
 
12785
- const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
12744
+ const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
12786
12745
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
12787
12746
  const bool src1_is_contiguous = ggml_is_contiguous(src1);
12788
12747
 
12789
12748
  int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
12790
12749
 
12791
- const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
12750
+ const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
12792
12751
  GGML_ASSERT(!(split && ne02 > 1));
12793
12752
  GGML_ASSERT(!(split && ne03 > 1));
12794
12753
  GGML_ASSERT(!(split && ne02 < ne12));
@@ -12843,8 +12802,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12843
12802
 
12844
12803
  used_devices++;
12845
12804
 
12846
- const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
12847
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
12805
+ const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
12806
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
12848
12807
 
12849
12808
  ggml_sycl_set_device(get_device_id_by_index(id));
12850
12809
  const dpct::queue_ptr stream = g_syclStreams[id][0];
@@ -12908,8 +12867,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12908
12867
  continue;
12909
12868
  }
12910
12869
 
12911
- const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
12912
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
12870
+ const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
12871
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
12913
12872
  const int64_t row_diff = row_high[id] - row_low[id];
12914
12873
 
12915
12874
  ggml_sycl_set_device(get_device_id_by_index(id));
@@ -12935,12 +12894,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12935
12894
 
12936
12895
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
12937
12896
  // in that case an offset on dst_ddf_i is needed
12938
- if (dst->backend == GGML_BACKEND_GPU && id == g_main_device_index) {
12897
+ if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) {
12939
12898
  dst_dd_i += row_low[id]; // offset is 0 if no tensor split
12940
12899
  }
12941
12900
 
12942
12901
  // copy src0, src1 to device if necessary
12943
- if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
12902
+ if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
12944
12903
  if (id != g_main_device_index) {
12945
12904
  if (convert_src1_to_q8_1) {
12946
12905
  char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;
@@ -12956,14 +12915,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12956
12915
  src1_ncols * ne10 * sizeof(float))));
12957
12916
  }
12958
12917
  }
12959
- } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
12918
+ } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
12960
12919
  SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
12961
12920
  src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
12962
12921
  } else {
12963
12922
  GGML_ASSERT(false);
12964
12923
  }
12965
12924
 
12966
- if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
12925
+ if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
12967
12926
  quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
12968
12927
  /*
12969
12928
  DPCT1010:92: SYCL uses exceptions to report errors and does
@@ -12993,10 +12952,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
12993
12952
  if (!dst_on_device) {
12994
12953
  void * dst_off_device;
12995
12954
  dpct::memcpy_direction kind;
12996
- if (dst->backend == GGML_BACKEND_CPU) {
12955
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
12997
12956
  dst_off_device = dst->data;
12998
12957
  kind = dpct::device_to_host;
12999
- } else if (dst->backend == GGML_BACKEND_GPU) {
12958
+ } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
13000
12959
  dst_off_device = dst_extra->data_device[g_main_device_index];
13001
12960
  kind = dpct::device_to_device;
13002
12961
  } else {
@@ -13080,7 +13039,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
13080
13039
  }
13081
13040
  }
13082
13041
 
13083
- if (dst->backend == GGML_BACKEND_CPU) {
13042
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
13084
13043
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
13085
13044
  SYCL_CHECK(CHECK_TRY_ERROR(
13086
13045
  dpct::get_current_device().queues_wait_and_throw()));
@@ -13217,7 +13176,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
13217
13176
  const ggml_tensor *src1,
13218
13177
  ggml_tensor *dst) try {
13219
13178
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
13220
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
13179
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
13221
13180
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
13222
13181
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
13223
13182
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -13255,7 +13214,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
13255
13214
  GGML_ASSERT(!ggml_is_transposed(src0));
13256
13215
  GGML_ASSERT(!ggml_is_transposed(src1));
13257
13216
  GGML_ASSERT(!ggml_is_permuted(src0));
13258
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
13217
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
13259
13218
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13260
13219
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
13261
13220
 
@@ -13311,31 +13270,23 @@ static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
13311
13270
  int64_t i03 = i13 / r3;
13312
13271
  int64_t i02 = i12 / r2;
13313
13272
 
13314
- ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
13315
- ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
13316
- ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
13273
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
13274
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
13275
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
13317
13276
  }
13318
13277
 
13319
- static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
13320
- const ggml_tensor *src1,
13321
- ggml_tensor *dst) try {
13278
+ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
13279
+ const ggml_tensor *src1,
13280
+ ggml_tensor *dst) try {
13322
13281
  GGML_ASSERT(!ggml_is_transposed(src0));
13323
13282
  GGML_ASSERT(!ggml_is_transposed(src1));
13324
13283
 
13325
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
13284
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
13326
13285
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
13327
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
13328
-
13329
- GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
13330
13286
 
13331
- GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
13332
-
13333
- GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
13334
-
13335
- GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
13287
+ GGML_TENSOR_BINARY_OP_LOCALS
13336
13288
 
13337
- const int64_t ne1 = ggml_nelements(src1);
13338
- const int64_t ne = ggml_nelements(dst);
13289
+ const int64_t ne_dst = ggml_nelements(dst);
13339
13290
 
13340
13291
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
13341
13292
  dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
@@ -13354,11 +13305,16 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
13354
13305
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
13355
13306
 
13356
13307
  // convert src1 to fp16
13357
- const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
13358
- GGML_ASSERT(to_fp16_sycl != nullptr);
13359
-
13360
- sycl_pool_alloc<sycl::half> src1_as_f16(ne1);
13361
- to_fp16_sycl(src1_ddf, src1_as_f16.get(), ne1, main_stream);
13308
+ sycl_pool_alloc<sycl::half> src1_f16_alloc;
13309
+ if (src1->type != GGML_TYPE_F16) {
13310
+ const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
13311
+ const int64_t ne_src1 = ggml_nelements(src1);
13312
+ src1_f16_alloc.alloc(ne_src1);
13313
+ GGML_ASSERT(to_fp16_sycl != nullptr);
13314
+ to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
13315
+ }
13316
+ sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf
13317
+ : src1_f16_alloc.get();
13362
13318
 
13363
13319
  sycl_pool_alloc<sycl::half> dst_f16;
13364
13320
  char * dst_t;
@@ -13379,20 +13335,12 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
13379
13335
  const void * alpha = &alpha_f16;
13380
13336
  const void * beta = &beta_f16;
13381
13337
 
13382
- if (dst->op_params[0] == GGML_PREC_DEFAULT) {
13383
- dst_t = (char *) dst_f16.alloc(ne);
13338
+ // TODO: Re-enable the (dst->op_params[0] != GGML_PREC_DEFAULT) pathway
13339
+ // once open-source oneMKL supports the (half, half, float, float) datatype combination
13340
+ dst_t = (char *) dst_f16.alloc(ne_dst);
13384
13341
 
13385
- nbd2 /= sizeof(float) / sizeof(sycl::half);
13386
- nbd3 /= sizeof(float) / sizeof(sycl::half);
13387
- } else {
13388
- dst_t = (char *) dst_ddf;
13389
-
13390
- cu_compute_type = dpct::library_data_t::real_float;
13391
- cu_data_type = dpct::library_data_t::real_float;
13392
-
13393
- alpha = &alpha_f32;
13394
- beta = &beta_f32;
13395
- }
13342
+ nbd2 /= sizeof(float) / sizeof(sycl::half);
13343
+ nbd3 /= sizeof(float) / sizeof(sycl::half);
13396
13344
 
13397
13345
  GGML_ASSERT(ne12 % ne02 == 0);
13398
13346
  GGML_ASSERT(ne13 % ne03 == 0);
@@ -13428,10 +13376,10 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
13428
13376
  *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
13429
13377
  oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
13430
13378
  (const char *)src0_as_f16, dpct::library_data_t::real_half,
13431
- nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half),
13432
- (const char *)src1_as_f16.get(), dpct::library_data_t::real_half,
13433
- nb11 / sizeof(float), src1->nb[2] / sizeof(float), beta,
13434
- (char *)dst_t, cu_data_type, ne01, dst->nb[2] / sizeof(float),
13379
+ nb01 / nb00, nb02 / nb00,
13380
+ (const char *)src1_f16, dpct::library_data_t::real_half,
13381
+ nb11 / nb10, nb12 / nb10, beta,
13382
+ (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
13435
13383
  ne12 * ne13, cu_compute_type)));
13436
13384
  } else {
13437
13385
  // use syclGemmBatchedEx
@@ -13451,44 +13399,35 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
13451
13399
  {sycl::aspect::fp16});
13452
13400
 
13453
13401
  main_stream->submit([&](sycl::handler &cgh) {
13454
- const sycl::half *src1_as_f16_get_ct1 = src1_as_f16.get();
13455
- const void **ptrs_src_get_ct3 = ptrs_src.get();
13456
- void **ptrs_dst_get_ct4 = ptrs_dst.get();
13457
-
13402
+ const void **ptrs_src_get = ptrs_src.get();
13403
+ void **ptrs_dst_get = ptrs_dst.get();
13404
+ size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2;
13405
+ size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2;
13458
13406
  cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
13459
13407
  [=](sycl::nd_item<3> item_ct1) {
13460
13408
  k_compute_batched_ptrs(
13461
- src0_as_f16, src1_as_f16_get_ct1,
13462
- dst_t, ptrs_src_get_ct3,
13463
- ptrs_dst_get_ct4, ne12, ne13, ne23,
13464
- nb02, nb03, nb12, nb13, nbd2, nbd3, r2,
13465
- r3, item_ct1);
13409
+ src0_as_f16, src1_f16,
13410
+ dst_t, ptrs_src_get,
13411
+ ptrs_dst_get, ne12, ne13, ne23,
13412
+ nb02, nb03, nb12_scaled, nb13_scaled,
13413
+ nbd2, nbd3, r2, r3, item_ct1);
13466
13414
  });
13467
13415
  });
13468
13416
  }
13469
- /*
13470
- DPCT1010:95: SYCL uses exceptions to report errors and does not use the
13471
- error codes. The call was replaced with 0. You need to rewrite this
13472
- code.
13473
- */
13474
- SYCL_CHECK(0);
13475
-
13476
13417
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
13477
13418
  *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
13478
13419
  oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
13479
13420
  (const void **)(ptrs_src.get() + 0 * ne23),
13480
- dpct::library_data_t::real_half, nb01 / sizeof(sycl::half),
13421
+ dpct::library_data_t::real_half, nb01 / nb00,
13481
13422
  (const void **)(ptrs_src.get() + 1 * ne23),
13482
- dpct::library_data_t::real_half, nb11 / sizeof(float), beta,
13423
+ dpct::library_data_t::real_half, nb11 / nb10, beta,
13483
13424
  (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
13484
13425
  cu_compute_type)));
13485
13426
  }
13486
13427
  #endif
13487
13428
 
13488
- if (dst->op_params[0] == GGML_PREC_DEFAULT) {
13489
- const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
13490
- to_fp32_sycl(dst_f16.get(), dst_ddf, ne, main_stream);
13491
- }
13429
+ const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
13430
+ to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
13492
13431
  }
13493
13432
  catch (sycl::exception const &exc) {
13494
13433
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -13498,11 +13437,11 @@ catch (sycl::exception const &exc) {
13498
13437
 
13499
13438
  static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
13500
13439
  const bool all_on_device =
13501
- (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
13502
- (src1->backend == GGML_BACKEND_GPU) &&
13503
- ( dst->backend == GGML_BACKEND_GPU);
13440
+ (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
13441
+ (src1->backend == GGML_BACKEND_TYPE_GPU) &&
13442
+ ( dst->backend == GGML_BACKEND_TYPE_GPU);
13504
13443
 
13505
- const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
13444
+ const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
13506
13445
 
13507
13446
  int64_t min_compute_capability = INT_MAX;
13508
13447
  for (int64_t id = 0; id < g_device_count; ++id) {
@@ -13533,10 +13472,10 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
13533
13472
  // KQV single-batch
13534
13473
  // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
13535
13474
  ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
13536
- } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
13475
+ } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
13537
13476
  // KQ + KQV multi-batch
13538
- // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_mat_batched_sycl\n");
13539
- ggml_sycl_mul_mat_mat_batched_sycl(src0, src1, dst);
13477
+ // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
13478
+ ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
13540
13479
  } else if (src0->type == GGML_TYPE_F32) {
13541
13480
  // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
13542
13481
  ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
@@ -13631,7 +13570,7 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
13631
13570
  GGML_ASSERT(!ggml_is_transposed(src00));
13632
13571
  GGML_ASSERT(!ggml_is_transposed(src1));
13633
13572
 
13634
- GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
13573
+ GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
13635
13574
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
13636
13575
 
13637
13576
  GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
@@ -13769,7 +13708,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
13769
13708
 
13770
13709
  const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
13771
13710
 
13772
- if (ids->backend == GGML_BACKEND_GPU) {
13711
+ if (ids->backend == GGML_BACKEND_TYPE_GPU) {
13773
13712
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
13774
13713
  SYCL_CHECK(CHECK_TRY_ERROR(
13775
13714
  stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
@@ -13787,20 +13726,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
13787
13726
  ggml_tensor src1_row = *src1;
13788
13727
  ggml_tensor dst_row = *dst;
13789
13728
 
13790
- src1_row.backend = GGML_BACKEND_GPU;
13791
- dst_row.backend = GGML_BACKEND_GPU;
13729
+ src1_row.backend = GGML_BACKEND_TYPE_GPU;
13730
+ dst_row.backend = GGML_BACKEND_TYPE_GPU;
13792
13731
 
13793
13732
  src1_row.extra = &src1_row_extra;
13794
13733
  dst_row.extra = &dst_row_extra;
13795
13734
 
13796
- char * src1_original = src1->backend == GGML_BACKEND_CPU ?
13735
+ char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
13797
13736
  (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
13798
- char * dst_original = dst->backend == GGML_BACKEND_CPU ?
13737
+ char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
13799
13738
  (char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];
13800
13739
 
13801
13740
  if (src1->ne[1] == 1) {
13802
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
13803
- GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
13741
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
13742
+ GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
13804
13743
 
13805
13744
  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
13806
13745
  //int32_t row_id;
@@ -13882,7 +13821,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
13882
13821
  }
13883
13822
  }
13884
13823
 
13885
- if (dst->backend == GGML_BACKEND_CPU) {
13824
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
13886
13825
  SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
13887
13826
  }
13888
13827
  }
@@ -13905,8 +13844,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
13905
13844
  const int64_t ne = ggml_nelements(src0);
13906
13845
  GGML_ASSERT(ne == ggml_nelements(src1));
13907
13846
 
13908
- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
13909
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
13847
+ GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
13848
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
13910
13849
 
13911
13850
  GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
13912
13851
  GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
@@ -14013,17 +13952,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
14013
13952
  memset(extra, 0, sizeof(*extra));
14014
13953
 
14015
13954
  for (int64_t id = 0; id < g_device_count; ++id) {
14016
- if (backend == GGML_BACKEND_GPU && id != g_main_device_index) {
13955
+ if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) {
14017
13956
  continue;
14018
13957
  }
14019
13958
  ggml_sycl_set_device(get_device_id_by_index(id));
14020
13959
  const dpct::queue_ptr stream = g_syclStreams[id][0];
14021
13960
 
14022
13961
  int64_t row_low, row_high;
14023
- if (backend == GGML_BACKEND_GPU) {
13962
+ if (backend == GGML_BACKEND_TYPE_GPU) {
14024
13963
  row_low = 0;
14025
13964
  row_high = nrows;
14026
- } else if (backend == GGML_BACKEND_GPU_SPLIT) {
13965
+ } else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
14027
13966
  const int64_t rounding = get_row_rounding(tensor->type);
14028
13967
 
14029
13968
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
@@ -14072,7 +14011,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
14072
14011
 
14073
14012
  extra->data_device[id] = buf;
14074
14013
 
14075
- if (backend == GGML_BACKEND_GPU_SPLIT) {
14014
+ if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
14076
14015
  for (int64_t is = 0; is < MAX_STREAMS; ++is) {
14077
14016
  SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
14078
14017
  new sycl::event()));
@@ -14089,7 +14028,7 @@ catch (sycl::exception const &exc) {
14089
14028
  }
14090
14029
 
14091
14030
  void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
14092
- if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
14031
+ if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) {
14093
14032
  return;
14094
14033
  }
14095
14034
 
@@ -14142,15 +14081,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
14142
14081
  return;
14143
14082
  }
14144
14083
 
14145
- tensor->backend = GGML_BACKEND_GPU;
14084
+ tensor->backend = GGML_BACKEND_TYPE_GPU;
14146
14085
 
14147
- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
14086
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) {
14148
14087
  const ggml_op src0_op = tensor->src[0]->op;
14149
14088
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
14150
14089
  ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
14151
14090
  }
14152
14091
  }
14153
- if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
14092
+ if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) {
14154
14093
  ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
14155
14094
  }
14156
14095
 
@@ -14168,7 +14107,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
14168
14107
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
14169
14108
  const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
14170
14109
 
14171
- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
14110
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
14172
14111
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
14173
14112
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
14174
14113
  size_t offset = 0;
@@ -14237,7 +14176,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,
14237
14176
 
14238
14177
  const bool inplace = tensor->view_src != nullptr;
14239
14178
 
14240
- if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
14179
+ if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
14241
14180
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
14242
14181
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
14243
14182
  size_t view_offset = 0;
@@ -14258,7 +14197,7 @@ catch (sycl::exception const &exc) {
14258
14197
  }
14259
14198
 
14260
14199
  void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
14261
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14200
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14262
14201
  GGML_ASSERT(ggml_is_contiguous(tensor));
14263
14202
 
14264
14203
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -14345,9 +14284,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
14345
14284
  if (!g_sycl_loaded) return false;
14346
14285
 
14347
14286
  ggml_sycl_func_t func;
14348
- const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
14349
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
14350
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
14287
+ const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
14288
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
14289
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
14351
14290
 
14352
14291
  if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
14353
14292
  return false;
@@ -14485,14 +14424,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
14485
14424
  return false;
14486
14425
  }
14487
14426
 
14488
- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
14427
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
14489
14428
  ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
14490
14429
  }
14491
14430
 
14492
14431
  if (params->ith != 0) {
14493
14432
  return true;
14494
14433
  }
14495
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14434
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14496
14435
  return true;
14497
14436
  }
14498
14437
  func(tensor->src[0], tensor->src[1], tensor);
@@ -14643,7 +14582,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
14643
14582
 
14644
14583
  extra->data_device[ctx->device] = tensor->data;
14645
14584
 
14646
- tensor->backend = GGML_BACKEND_GPU;
14585
+ tensor->backend = GGML_BACKEND_TYPE_GPU;
14647
14586
  tensor->extra = extra;
14648
14587
 
14649
14588
  if (ggml_is_quantized(tensor->type)) {
@@ -14674,7 +14613,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
14674
14613
  ggml_tensor *tensor,
14675
14614
  const void *data, size_t offset,
14676
14615
  size_t size) try {
14677
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14616
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14678
14617
 
14679
14618
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
14680
14619
 
@@ -14699,7 +14638,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
14699
14638
  const ggml_tensor *tensor,
14700
14639
  void *data, size_t offset,
14701
14640
  size_t size) try {
14702
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14641
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14703
14642
 
14704
14643
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
14705
14644
 
@@ -14768,7 +14707,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu
14768
14707
  static ggml_backend_buffer_t
14769
14708
  ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
14770
14709
  size_t size) try {
14771
- int device = (int) (intptr_t) buft->context;
14710
+ ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
14711
+ int device = (int) buft_ctx->device;
14772
14712
 
14773
14713
  ggml_sycl_set_device(device);
14774
14714
  int device_index = get_device_index_by_id(device);
@@ -14846,7 +14786,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
14846
14786
  for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
14847
14787
  ggml_backend_sycl_buffer_types[i] = {
14848
14788
  /* .iface = */ ggml_backend_sycl_buffer_type_interface,
14849
- /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
14789
+ /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
14850
14790
  };
14851
14791
  }
14852
14792
  ggml_backend_sycl_buffer_type_initialized = true;
@@ -14908,10 +14848,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
14908
14848
 
14909
14849
  // backend
14910
14850
 
14911
- struct ggml_backend_context_sycl {
14912
- int device;
14913
- };
14914
-
14915
14851
  static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
14916
14852
  return GGML_SYCL_NAME;
14917
14853
 
@@ -14919,14 +14855,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
14919
14855
  }
14920
14856
 
14921
14857
  static void ggml_backend_sycl_free(ggml_backend_t backend) {
14922
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14858
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14923
14859
 
14924
14860
  delete sycl_ctx;
14925
14861
  delete backend;
14926
14862
  }
14927
14863
 
14928
14864
  static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
14929
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14865
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14930
14866
 
14931
14867
  return ggml_backend_sycl_buffer_type(sycl_ctx->device);
14932
14868
  }
@@ -14935,10 +14871,10 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
14935
14871
  ggml_tensor *tensor,
14936
14872
  const void *data, size_t offset,
14937
14873
  size_t size) try {
14938
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14874
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14939
14875
 
14940
14876
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
14941
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14877
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14942
14878
 
14943
14879
  SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
14944
14880
  (char *)tensor->data + offset, data, size)));
@@ -14953,10 +14889,10 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
14953
14889
  const ggml_tensor *tensor,
14954
14890
  void *data, size_t offset,
14955
14891
  size_t size) try {
14956
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14892
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14957
14893
 
14958
14894
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
14959
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
14895
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
14960
14896
 
14961
14897
  SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
14962
14898
  data, (const char *)tensor->data + offset, size)));
@@ -14968,7 +14904,7 @@ catch (sycl::exception const &exc) {
14968
14904
  }
14969
14905
 
14970
14906
  static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
14971
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14907
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14972
14908
 
14973
14909
  SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
14974
14910
 
@@ -15004,12 +14940,12 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
15004
14940
  }
15005
14941
 
15006
14942
  static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
15007
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14943
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
15008
14944
 
15009
14945
  ggml_sycl_set_main_device(sycl_ctx->device);
15010
14946
 
15011
14947
  ggml_compute_params params = {};
15012
- params.type = GGML_TASK_COMPUTE;
14948
+ params.type = GGML_TASK_TYPE_COMPUTE;
15013
14949
  params.ith = 0;
15014
14950
  for (int i = 0; i < cgraph->n_nodes; i++) {
15015
14951
  ggml_tensor * node = cgraph->nodes[i];
@@ -15017,13 +14953,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
15017
14953
  if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
15018
14954
  continue;
15019
14955
 
15020
- assert(node->backend == GGML_BACKEND_GPU);
14956
+ assert(node->backend == GGML_BACKEND_TYPE_GPU);
15021
14957
  assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
15022
14958
  assert(node->extra != nullptr);
15023
14959
 
15024
14960
  for (int j = 0; j < GGML_MAX_SRC; j++) {
15025
14961
  if (node->src[j] != nullptr) {
15026
- assert(node->src[j]->backend == GGML_BACKEND_GPU);
14962
+ assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU);
15027
14963
  assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
15028
14964
  assert(node->src[j]->extra != nullptr);
15029
14965
  }
@@ -15093,6 +15029,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
15093
15029
  return false;
15094
15030
  }
15095
15031
 
15032
+ if (a->type == GGML_TYPE_IQ1_S) {
15033
+ return false;
15034
+ }
15035
+ if (a->type == GGML_TYPE_IQ3_XXS) {
15036
+ return false;
15037
+ }
15096
15038
  if (a->type == GGML_TYPE_IQ2_XXS) {
15097
15039
  return false;
15098
15040
  }
@@ -15201,6 +15143,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
15201
15143
  /* .supports_op = */ ggml_backend_sycl_supports_op,
15202
15144
  };
15203
15145
 
15146
+ static ggml_guid_t ggml_backend_sycl_guid() {
15147
+ static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 };
15148
+ return &guid;
15149
+ }
15150
+
15204
15151
  ggml_backend_t ggml_backend_sycl_init(int device) {
15205
15152
  ggml_init_sycl(); // TODO: remove from ggml.c
15206
15153
 
@@ -15212,11 +15159,13 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
15212
15159
  // not strictly necessary, but it may reduce the overhead of the first graph_compute
15213
15160
  ggml_sycl_set_main_device(device);
15214
15161
 
15215
- ggml_backend_context_sycl * ctx = new ggml_backend_context_sycl {
15216
- /* .device = */ device
15162
+ ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
15163
+ /* .device = */ device,
15164
+ /* .name = */ GGML_SYCL_NAME + std::to_string(device),
15217
15165
  };
15218
15166
 
15219
15167
  ggml_backend_t sycl_backend = new ggml_backend {
15168
+ /* .guid = */ ggml_backend_sycl_guid(),
15220
15169
  /* .interface = */ ggml_backend_sycl_interface,
15221
15170
  /* .context = */ ctx
15222
15171
  };
@@ -15225,7 +15174,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
15225
15174
  }
15226
15175
 
15227
15176
  bool ggml_backend_is_sycl(ggml_backend_t backend) {
15228
- return backend->iface.get_name == ggml_backend_sycl_name;
15177
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
15229
15178
  }
15230
15179
 
15231
15180
  static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {