llama_cpp 0.12.6 → 0.12.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -191,6 +191,21 @@ typedef struct {
191
191
  } block_iq3_xxs;
192
192
  static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
193
193
 
194
+ typedef struct { // storage block for the iq1_s type: one fp16 value followed by two byte arrays
195
+ ggml_fp16_t d; // presumably the per-block scale factor — TODO confirm against the iq1_s (de)quantization code
196
+ uint8_t qs[QK_K/8]; // QK_K/8 bytes of quantized data; exact bit packing is not visible here — TODO confirm
197
+ uint8_t scales[QK_K/16]; // QK_K/16 bytes; by its name sub-block scale data — TODO confirm
198
+ } block_iq1_s;
199
+ static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); // compile-time check: the struct is exactly the sum of its three fields, i.e. no padding
200
+
201
+ // Non-linear quants
202
+ #define QK4_NL 32 // sizes the qs array of block_iq4_nl below (QK4_NL/2 bytes)
203
+ typedef struct { // storage block for the iq4_nl type: one fp16 value followed by QK4_NL/2 bytes
204
+ ggml_fp16_t d; // presumably the per-block scale factor — TODO confirm against the iq4_nl (de)quantization code
205
+ uint8_t qs[QK4_NL/2]; // 16 bytes; presumably two 4-bit values per byte for QK4_NL values — TODO confirm
206
+ } block_iq4_nl;
207
+ static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); // compile-time check: the struct is exactly the sum of its two fields, i.e. no padding
208
+
194
209
  #ifdef __cplusplus
195
210
  extern "C" {
196
211
  #endif
@@ -210,6 +225,7 @@ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGM
210
225
  void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
211
226
  void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
212
227
  void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
228
+ void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
213
229
 
214
230
  void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
215
231
  void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
@@ -225,6 +241,7 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
225
241
  void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
226
242
  void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
227
243
  void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
244
+ void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
228
245
 
229
246
  // Dequantization
230
247
  void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
@@ -243,6 +260,8 @@ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRI
243
260
  void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
244
261
  void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
245
262
  void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
263
+ void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
264
+ void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
246
265
 
247
266
  // Dot product
248
267
  void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -259,6 +278,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
259
278
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
260
279
  void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
261
280
  void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
281
+ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
282
+ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
262
283
 
263
284
  //
264
285
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
@@ -266,6 +287,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
266
287
  size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
267
288
  size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
268
289
  size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
290
+ size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
291
+ size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
269
292
  size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
270
293
  size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
271
294
  size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
@@ -276,8 +299,8 @@ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row,
276
299
  size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
277
300
  size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
278
301
 
279
- void iq2xs_init_impl(int grid_size);
280
- void iq2xs_free_impl(int grid_size);
302
+ void iq2xs_init_impl(enum ggml_type type);
303
+ void iq2xs_free_impl(enum ggml_type type);
281
304
  void iq3xs_init_impl(int grid_size);
282
305
  void iq3xs_free_impl(int grid_size);
283
306
 
@@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
9188
9188
  }
9189
9189
  }
9190
9190
 
9191
- static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
9192
- float *dst, const int ncols,
9193
- const int nrows,
9194
- dpct::queue_ptr stream) {
9195
- GGML_ASSERT(ncols % QK4_0 == 0);
9196
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9197
- const sycl::range<3> block_nums(1, 1, block_num_y);
9198
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9199
- stream->parallel_for(
9200
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9201
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9202
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
9203
- vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
9204
- item_ct1);
9205
- });
9206
- }
9207
-
9208
- static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
9209
- float *dst, const int ncols,
9210
- const int nrows,
9211
- dpct::queue_ptr stream) {
9212
- GGML_ASSERT(ncols % QK4_1 == 0);
9213
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9214
- const sycl::range<3> block_nums(1, 1, block_num_y);
9215
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9216
- stream->parallel_for(
9217
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9218
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9219
- mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
9220
- vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
9221
- item_ct1);
9222
- });
9223
- }
9224
-
9225
- static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
9226
- float *dst, const int ncols,
9227
- const int nrows,
9228
- dpct::queue_ptr stream) {
9229
- GGML_ASSERT(ncols % QK5_0 == 0);
9230
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9231
- const sycl::range<3> block_nums(1, 1, block_num_y);
9232
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9233
- stream->parallel_for(
9234
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9235
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9236
- mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
9237
- vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
9238
- item_ct1);
9239
- });
9240
- }
9241
-
9242
- static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
9243
- float *dst, const int ncols,
9244
- const int nrows,
9245
- dpct::queue_ptr stream) {
9246
- GGML_ASSERT(ncols % QK5_1 == 0);
9247
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9248
- const sycl::range<3> block_nums(1, 1, block_num_y);
9249
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9250
- stream->parallel_for(
9251
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9252
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9253
- mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
9254
- vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
9255
- item_ct1);
9256
- });
9257
- }
9258
-
9259
- static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
9260
- float *dst, const int ncols,
9261
- const int nrows,
9262
- dpct::queue_ptr stream) {
9263
- GGML_ASSERT(ncols % QK8_0 == 0);
9264
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9265
- const sycl::range<3> block_nums(1, 1, block_num_y);
9266
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9267
- stream->parallel_for(
9268
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9269
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9270
- mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
9271
- vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
9272
- item_ct1);
9273
- });
9274
- }
9275
-
9276
- static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
9277
- float *dst, const int ncols,
9278
- const int nrows,
9279
- dpct::queue_ptr stream) {
9280
- GGML_ASSERT(ncols % QK_K == 0);
9281
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9282
- const sycl::range<3> block_nums(1, 1, block_num_y);
9283
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9284
- stream->parallel_for(
9285
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9286
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9287
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
9288
- vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
9289
- item_ct1);
9290
- });
9291
- }
9292
-
9293
- static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
9294
- float *dst, const int ncols,
9295
- const int nrows,
9296
- dpct::queue_ptr stream) {
9297
- GGML_ASSERT(ncols % QK_K == 0);
9298
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9299
- const sycl::range<3> block_nums(1, 1, block_num_y);
9300
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9301
- stream->parallel_for(
9302
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9303
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9304
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
9305
- vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
9306
- item_ct1);
9307
- });
9308
- }
9309
-
9310
- static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
9311
- float *dst, const int ncols,
9312
- const int nrows,
9313
- dpct::queue_ptr stream) {
9314
- GGML_ASSERT(ncols % QK_K == 0);
9315
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9316
- const sycl::range<3> block_nums(1, 1, block_num_y);
9317
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9318
- stream->parallel_for(
9319
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9320
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9321
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
9322
- vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
9323
- item_ct1);
9324
- });
9325
- }
9326
-
9327
- static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
9328
- float *dst, const int ncols,
9329
- const int nrows,
9330
- dpct::queue_ptr stream) {
9331
- GGML_ASSERT(ncols % QK_K == 0);
9332
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9333
- const sycl::range<3> block_nums(1, 1, block_num_y);
9334
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9335
- stream->parallel_for(
9336
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9337
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9338
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
9339
- vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
9340
- item_ct1);
9341
- });
9342
- }
9343
-
9344
- static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
9345
- float *dst, const int ncols,
9346
- const int nrows,
9347
- dpct::queue_ptr stream) {
9348
- GGML_ASSERT(ncols % QK_K == 0);
9349
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9350
- const sycl::range<3> block_nums(1, 1, block_num_y);
9351
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9352
- stream->parallel_for(
9353
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9354
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9355
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
9356
- vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
9357
- item_ct1);
9358
- });
9191
+ template <int qk, int qi, typename block_q_t, int vdr, // generic launcher: submits mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl> to `stream`
9192
+ vec_dot_q_sycl_t vec_dot_q_sycl>
9193
+ static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
9194
+ float *dst, const int ncols,
9195
+ const int nrows,
9196
+ dpct::queue_ptr stream) {
9197
+ GGML_ASSERT(ncols % qk == 0); // check against this instantiation's qk, not a fixed QK4_0, so callers passing QK_K get the QK_K check again
9198
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; // ceil(nrows / GGML_SYCL_MMV_Y) work-groups
9199
+ const sycl::range<3> block_nums(1, 1, block_num_y);
9200
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); // local range: GGML_SYCL_MMV_Y x WARP_SIZE work-items
9201
+ stream->parallel_for(
9202
+ sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
9203
+ ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9204
+ mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
9205
+ vx, vy, dst, ncols, nrows, item_ct1);
9206
+ });
9359
9207
  }
9360
9208
 
9361
9209
  int get_device_index_by_id(int id){
@@ -12095,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q(
12095
11943
  const int64_t ne00 = src0->ne[0];
12096
11944
  const int64_t row_diff = row_high - row_low;
12097
11945
 
11946
+ // TODO: support these quantization types
11947
+ GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
11948
+ src0->type == GGML_TYPE_IQ2_XS ||
11949
+ src0->type == GGML_TYPE_IQ3_XXS ||
11950
+ src0->type == GGML_TYPE_IQ1_S));
11951
+
12098
11952
  switch (src0->type) {
12099
11953
  case GGML_TYPE_Q4_0:
12100
- mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12101
- break;
11954
+ mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
11955
+ VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
11956
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11957
+ break;
12102
11958
  case GGML_TYPE_Q4_1:
12103
- mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12104
- break;
11959
+ mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
11960
+ VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
11961
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11962
+ break;
12105
11963
  case GGML_TYPE_Q5_0:
12106
- mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12107
- break;
11964
+ mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
11965
+ VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
11966
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11967
+ break;
12108
11968
  case GGML_TYPE_Q5_1:
12109
- mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12110
- break;
11969
+ mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
11970
+ VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
11971
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11972
+ break;
12111
11973
  case GGML_TYPE_Q8_0:
12112
- mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12113
- break;
11974
+ mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
11975
+ VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
11976
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11977
+ break;
12114
11978
  case GGML_TYPE_Q2_K:
12115
- mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12116
- break;
11979
+ mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
11980
+ VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
11981
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11982
+ break;
12117
11983
  case GGML_TYPE_Q3_K:
12118
- mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12119
- break;
11984
+ mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
11985
+ VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
11986
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11987
+ break;
12120
11988
  case GGML_TYPE_Q4_K:
12121
- mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12122
- break;
11989
+ mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
11990
+ VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
11991
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11992
+ break;
12123
11993
  case GGML_TYPE_Q5_K:
12124
- mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12125
- break;
11994
+ mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
11995
+ VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
11996
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11997
+ break;
12126
11998
  case GGML_TYPE_Q6_K:
12127
- mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12128
- break;
11999
+ mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
12000
+ VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
12001
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12002
+ break;
12129
12003
  default:
12130
12004
  GGML_ASSERT(false);
12131
12005
  break;
@@ -12145,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
12145
12019
  const int64_t src1_ncols, const int64_t src1_padded_row_size,
12146
12020
  const dpct::queue_ptr &stream) {
12147
12021
 
12148
- GGML_TENSOR_BINARY_OP_LOCALS
12022
+ GGML_TENSOR_BINARY_OP_LOCALS;
12149
12023
 
12150
12024
  const int64_t row_diff = row_high - row_low;
12151
12025
 
@@ -14768,7 +14642,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu
14768
14642
  static ggml_backend_buffer_t
14769
14643
  ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
14770
14644
  size_t size) try {
14771
- int device = (int) (intptr_t) buft->context;
14645
+ ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
14646
+ int device = (int) buft_ctx->device;
14772
14647
 
14773
14648
  ggml_sycl_set_device(device);
14774
14649
  int device_index = get_device_index_by_id(device);
@@ -14846,7 +14721,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
14846
14721
  for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
14847
14722
  ggml_backend_sycl_buffer_types[i] = {
14848
14723
  /* .iface = */ ggml_backend_sycl_buffer_type_interface,
14849
- /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
14724
+ /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
14850
14725
  };
14851
14726
  }
14852
14727
  ggml_backend_sycl_buffer_type_initialized = true;
@@ -14908,10 +14783,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
14908
14783
 
14909
14784
  // backend
14910
14785
 
14911
- struct ggml_backend_context_sycl {
14912
- int device;
14913
- };
14914
-
14915
14786
  static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
14916
14787
  return GGML_SYCL_NAME;
14917
14788
 
@@ -14919,14 +14790,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
14919
14790
  }
14920
14791
 
14921
14792
  static void ggml_backend_sycl_free(ggml_backend_t backend) {
14922
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14793
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14923
14794
 
14924
14795
  delete sycl_ctx;
14925
14796
  delete backend;
14926
14797
  }
14927
14798
 
14928
14799
  static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
14929
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14800
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14930
14801
 
14931
14802
  return ggml_backend_sycl_buffer_type(sycl_ctx->device);
14932
14803
  }
@@ -14935,7 +14806,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
14935
14806
  ggml_tensor *tensor,
14936
14807
  const void *data, size_t offset,
14937
14808
  size_t size) try {
14938
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14809
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14939
14810
 
14940
14811
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
14941
14812
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -14953,7 +14824,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
14953
14824
  const ggml_tensor *tensor,
14954
14825
  void *data, size_t offset,
14955
14826
  size_t size) try {
14956
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14827
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14957
14828
 
14958
14829
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
14959
14830
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -14968,7 +14839,7 @@ catch (sycl::exception const &exc) {
14968
14839
  }
14969
14840
 
14970
14841
  static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
14971
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14842
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14972
14843
 
14973
14844
  SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
14974
14845
 
@@ -15004,7 +14875,7 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
15004
14875
  }
15005
14876
 
15006
14877
  static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
15007
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14878
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
15008
14879
 
15009
14880
  ggml_sycl_set_main_device(sycl_ctx->device);
15010
14881
 
@@ -15093,6 +14964,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
15093
14964
  return false;
15094
14965
  }
15095
14966
 
14967
+ if (a->type == GGML_TYPE_IQ1_S) {
14968
+ return false;
14969
+ }
14970
+ if (a->type == GGML_TYPE_IQ3_XXS) {
14971
+ return false;
14972
+ }
15096
14973
  if (a->type == GGML_TYPE_IQ2_XXS) {
15097
14974
  return false;
15098
14975
  }
@@ -15212,8 +15089,9 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
15212
15089
  // not strictly necessary, but it may reduce the overhead of the first graph_compute
15213
15090
  ggml_sycl_set_main_device(device);
15214
15091
 
15215
- ggml_backend_context_sycl * ctx = new ggml_backend_context_sycl {
15216
- /* .device = */ device
15092
+ ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
15093
+ /* .device = */ device,
15094
+ /* .name = */ GGML_SYCL_NAME + std::to_string(device),
15217
15095
  };
15218
15096
 
15219
15097
  ggml_backend_t sycl_backend = new ggml_backend {