llama_cpp 0.12.6 → 0.12.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -191,6 +191,21 @@ typedef struct {
191
191
  } block_iq3_xxs;
192
192
  static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
193
193
 
194
+ typedef struct { // IQ1_S quantization block: one fp16 value followed by two byte arrays sized from QK_K
195
+ ggml_fp16_t d; // half-precision per-block value (presumably the block scale, as in the other block types -- TODO confirm)
196
+ uint8_t qs[QK_K/8]; // QK_K/8 bytes of packed quant data (exact bit layout is not visible in this hunk)
197
+ uint8_t scales[QK_K/16]; // QK_K/16 bytes (presumably one scale byte per 16 weights -- TODO confirm against the dequantizer)
198
+ } block_iq1_s;
199
+ static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); // size must equal the sum of the member sizes, i.e. no compiler-inserted padding
200
+
201
+ // Non-linear quants
202
+ #define QK4_NL 32 /* sizes block_iq4_nl::qs below: QK4_NL/2 = 16 bytes */
203
+ typedef struct { // IQ4_NL ("non-linear") quantization block: one fp16 value plus QK4_NL/2 bytes of quant data
204
+ ggml_fp16_t d; // half-precision per-block value (presumably the block scale -- TODO confirm)
205
+ uint8_t qs[QK4_NL/2]; // QK4_NL/2 bytes (presumably two 4-bit indices per byte, given the "iq4" name -- TODO confirm)
206
+ } block_iq4_nl;
207
+ static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); // size must equal the sum of the member sizes, i.e. no compiler-inserted padding
208
+
194
209
  #ifdef __cplusplus
195
210
  extern "C" {
196
211
  #endif
@@ -210,6 +225,7 @@ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGM
210
225
  void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
211
226
  void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
212
227
  void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
228
+ void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
213
229
 
214
230
  void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
215
231
  void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
@@ -225,6 +241,7 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
225
241
  void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
226
242
  void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
227
243
  void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
244
+ void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
228
245
 
229
246
  // Dequantization
230
247
  void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
@@ -243,6 +260,8 @@ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRI
243
260
  void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
244
261
  void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
245
262
  void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
263
+ void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
264
+ void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
246
265
 
247
266
  // Dot product
248
267
  void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -259,6 +278,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
259
278
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
260
279
  void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
261
280
  void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
281
+ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
282
+ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
262
283
 
263
284
  //
264
285
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
@@ -266,6 +287,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
266
287
  size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
267
288
  size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
268
289
  size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
290
+ size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
291
+ size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
269
292
  size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
270
293
  size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
271
294
  size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
@@ -276,8 +299,8 @@ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row,
276
299
  size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
277
300
  size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
278
301
 
279
- void iq2xs_init_impl(int grid_size);
280
- void iq2xs_free_impl(int grid_size);
302
+ void iq2xs_init_impl(enum ggml_type type);
303
+ void iq2xs_free_impl(enum ggml_type type);
281
304
  void iq3xs_init_impl(int grid_size);
282
305
  void iq3xs_free_impl(int grid_size);
283
306
 
@@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
9188
9188
  }
9189
9189
  }
9190
9190
 
9191
- static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
9192
- float *dst, const int ncols,
9193
- const int nrows,
9194
- dpct::queue_ptr stream) {
9195
- GGML_ASSERT(ncols % QK4_0 == 0);
9196
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9197
- const sycl::range<3> block_nums(1, 1, block_num_y);
9198
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9199
- stream->parallel_for(
9200
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9201
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9202
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
9203
- vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
9204
- item_ct1);
9205
- });
9206
- }
9207
-
9208
- static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
9209
- float *dst, const int ncols,
9210
- const int nrows,
9211
- dpct::queue_ptr stream) {
9212
- GGML_ASSERT(ncols % QK4_1 == 0);
9213
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9214
- const sycl::range<3> block_nums(1, 1, block_num_y);
9215
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9216
- stream->parallel_for(
9217
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9218
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9219
- mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
9220
- vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
9221
- item_ct1);
9222
- });
9223
- }
9224
-
9225
- static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
9226
- float *dst, const int ncols,
9227
- const int nrows,
9228
- dpct::queue_ptr stream) {
9229
- GGML_ASSERT(ncols % QK5_0 == 0);
9230
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9231
- const sycl::range<3> block_nums(1, 1, block_num_y);
9232
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9233
- stream->parallel_for(
9234
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9235
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9236
- mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
9237
- vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
9238
- item_ct1);
9239
- });
9240
- }
9241
-
9242
- static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
9243
- float *dst, const int ncols,
9244
- const int nrows,
9245
- dpct::queue_ptr stream) {
9246
- GGML_ASSERT(ncols % QK5_1 == 0);
9247
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9248
- const sycl::range<3> block_nums(1, 1, block_num_y);
9249
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9250
- stream->parallel_for(
9251
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9252
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9253
- mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
9254
- vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
9255
- item_ct1);
9256
- });
9257
- }
9258
-
9259
- static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
9260
- float *dst, const int ncols,
9261
- const int nrows,
9262
- dpct::queue_ptr stream) {
9263
- GGML_ASSERT(ncols % QK8_0 == 0);
9264
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9265
- const sycl::range<3> block_nums(1, 1, block_num_y);
9266
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9267
- stream->parallel_for(
9268
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9269
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9270
- mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
9271
- vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
9272
- item_ct1);
9273
- });
9274
- }
9275
-
9276
- static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
9277
- float *dst, const int ncols,
9278
- const int nrows,
9279
- dpct::queue_ptr stream) {
9280
- GGML_ASSERT(ncols % QK_K == 0);
9281
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9282
- const sycl::range<3> block_nums(1, 1, block_num_y);
9283
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9284
- stream->parallel_for(
9285
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9286
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9287
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
9288
- vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
9289
- item_ct1);
9290
- });
9291
- }
9292
-
9293
- static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
9294
- float *dst, const int ncols,
9295
- const int nrows,
9296
- dpct::queue_ptr stream) {
9297
- GGML_ASSERT(ncols % QK_K == 0);
9298
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9299
- const sycl::range<3> block_nums(1, 1, block_num_y);
9300
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9301
- stream->parallel_for(
9302
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9303
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9304
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
9305
- vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
9306
- item_ct1);
9307
- });
9308
- }
9309
-
9310
- static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
9311
- float *dst, const int ncols,
9312
- const int nrows,
9313
- dpct::queue_ptr stream) {
9314
- GGML_ASSERT(ncols % QK_K == 0);
9315
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9316
- const sycl::range<3> block_nums(1, 1, block_num_y);
9317
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9318
- stream->parallel_for(
9319
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9320
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9321
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
9322
- vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
9323
- item_ct1);
9324
- });
9325
- }
9326
-
9327
- static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
9328
- float *dst, const int ncols,
9329
- const int nrows,
9330
- dpct::queue_ptr stream) {
9331
- GGML_ASSERT(ncols % QK_K == 0);
9332
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9333
- const sycl::range<3> block_nums(1, 1, block_num_y);
9334
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9335
- stream->parallel_for(
9336
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9337
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9338
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
9339
- vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
9340
- item_ct1);
9341
- });
9342
- }
9343
-
9344
- static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
9345
- float *dst, const int ncols,
9346
- const int nrows,
9347
- dpct::queue_ptr stream) {
9348
- GGML_ASSERT(ncols % QK_K == 0);
9349
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
9350
- const sycl::range<3> block_nums(1, 1, block_num_y);
9351
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9352
- stream->parallel_for(
9353
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
9354
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9355
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
9356
- vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
9357
- item_ct1);
9358
- });
9191
+ // Enqueues the mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl> kernel on `stream`.
+ //
+ // Template parameters are forwarded verbatim to mul_mat_vec_q; `qk` is also
+ // used here to validate `ncols`.
+ // Launch shape: (1, 1, ceil(nrows / GGML_SYCL_MMV_Y)) work-groups of
+ // (1, GGML_SYCL_MMV_Y, WARP_SIZE) work-items, required sub-group size 32.
+ // vx, vy, dst, ncols and nrows are passed through to the kernel unchanged.
+ template <int qk, int qi, typename block_q_t, int vdr,
9192
+ vec_dot_q_sycl_t vec_dot_q_sycl>
9193
+ static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
9194
+ float *dst, const int ncols,
9195
+ const int nrows,
9196
+ dpct::queue_ptr stream) {
9197
+ // Validate against this instantiation's block size. The per-type launchers
+ // this template replaces asserted QK4_0 / QK5_0 / QK8_0 / QK_K respectively;
+ // a fixed QK4_0 would under-check the instantiations that pass QK_K.
+ GGML_ASSERT(ncols % qk == 0);
9198
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; // ceil(nrows / GGML_SYCL_MMV_Y)
9199
+ const sycl::range<3> block_nums(1, 1, block_num_y);
9200
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
9201
+ stream->parallel_for(
9202
+ sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
9203
+ ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
9204
+ mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
9205
+ vx, vy, dst, ncols, nrows, item_ct1);
9206
+ });
9359
9207
  }
9360
9208
 
9361
9209
  int get_device_index_by_id(int id){
@@ -12095,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q(
12095
11943
  const int64_t ne00 = src0->ne[0];
12096
11944
  const int64_t row_diff = row_high - row_low;
12097
11945
 
11946
+ // TODO: support these quantization types
11947
+ GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
11948
+ src0->type == GGML_TYPE_IQ2_XS ||
11949
+ src0->type == GGML_TYPE_IQ3_XXS ||
11950
+ src0->type == GGML_TYPE_IQ1_S));
11951
+
12098
11952
  switch (src0->type) {
12099
11953
  case GGML_TYPE_Q4_0:
12100
- mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12101
- break;
11954
+ mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
11955
+ VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
11956
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11957
+ break;
12102
11958
  case GGML_TYPE_Q4_1:
12103
- mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12104
- break;
11959
+ mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
11960
+ VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
11961
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11962
+ break;
12105
11963
  case GGML_TYPE_Q5_0:
12106
- mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12107
- break;
11964
+ mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
11965
+ VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
11966
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11967
+ break;
12108
11968
  case GGML_TYPE_Q5_1:
12109
- mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12110
- break;
11969
+ mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
11970
+ VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
11971
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11972
+ break;
12111
11973
  case GGML_TYPE_Q8_0:
12112
- mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12113
- break;
11974
+ mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
11975
+ VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
11976
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11977
+ break;
12114
11978
  case GGML_TYPE_Q2_K:
12115
- mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12116
- break;
11979
+ mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
11980
+ VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
11981
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11982
+ break;
12117
11983
  case GGML_TYPE_Q3_K:
12118
- mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12119
- break;
11984
+ mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
11985
+ VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
11986
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11987
+ break;
12120
11988
  case GGML_TYPE_Q4_K:
12121
- mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12122
- break;
11989
+ mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
11990
+ VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
11991
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11992
+ break;
12123
11993
  case GGML_TYPE_Q5_K:
12124
- mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12125
- break;
11994
+ mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
11995
+ VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
11996
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
11997
+ break;
12126
11998
  case GGML_TYPE_Q6_K:
12127
- mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12128
- break;
11999
+ mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
12000
+ VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
12001
+ src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
12002
+ break;
12129
12003
  default:
12130
12004
  GGML_ASSERT(false);
12131
12005
  break;
@@ -12145,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
12145
12019
  const int64_t src1_ncols, const int64_t src1_padded_row_size,
12146
12020
  const dpct::queue_ptr &stream) {
12147
12021
 
12148
- GGML_TENSOR_BINARY_OP_LOCALS
12022
+ GGML_TENSOR_BINARY_OP_LOCALS;
12149
12023
 
12150
12024
  const int64_t row_diff = row_high - row_low;
12151
12025
 
@@ -14768,7 +14642,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu
14768
14642
  static ggml_backend_buffer_t
14769
14643
  ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
14770
14644
  size_t size) try {
14771
- int device = (int) (intptr_t) buft->context;
14645
+ ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
14646
+ int device = (int) buft_ctx->device;
14772
14647
 
14773
14648
  ggml_sycl_set_device(device);
14774
14649
  int device_index = get_device_index_by_id(device);
@@ -14846,7 +14721,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
14846
14721
  for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
14847
14722
  ggml_backend_sycl_buffer_types[i] = {
14848
14723
  /* .iface = */ ggml_backend_sycl_buffer_type_interface,
14849
- /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
14724
+ /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
14850
14725
  };
14851
14726
  }
14852
14727
  ggml_backend_sycl_buffer_type_initialized = true;
@@ -14908,10 +14783,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
14908
14783
 
14909
14784
  // backend
14910
14785
 
14911
- struct ggml_backend_context_sycl {
14912
- int device;
14913
- };
14914
-
14915
14786
  static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
14916
14787
  return GGML_SYCL_NAME;
14917
14788
 
@@ -14919,14 +14790,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
14919
14790
  }
14920
14791
 
14921
14792
  // Destroys a SYCL backend: deletes the context object stored in
  // backend->context, then deletes the backend object itself.
  static void ggml_backend_sycl_free(ggml_backend_t backend) {
14922
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14793
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14923
14794
 
14924
14795
  delete sycl_ctx; // release the context first; it is reached through `backend`
14925
14796
  delete backend;
14926
14797
  }
14927
14798
 
14928
14799
  // Returns the buffer type for the device recorded in this backend's context,
  // i.e. ggml_backend_sycl_buffer_type(sycl_ctx->device).
  static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
14929
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14800
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14930
14801
 
14931
14802
  return ggml_backend_sycl_buffer_type(sycl_ctx->device);
14932
14803
  }
@@ -14935,7 +14806,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
14935
14806
  ggml_tensor *tensor,
14936
14807
  const void *data, size_t offset,
14937
14808
  size_t size) try {
14938
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14809
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14939
14810
 
14940
14811
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
14941
14812
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -14953,7 +14824,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
14953
14824
  const ggml_tensor *tensor,
14954
14825
  void *data, size_t offset,
14955
14826
  size_t size) try {
14956
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14827
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14957
14828
 
14958
14829
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
14959
14830
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -14968,7 +14839,7 @@ catch (sycl::exception const &exc) {
14968
14839
  }
14969
14840
 
14970
14841
  static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
14971
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14842
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
14972
14843
 
14973
14844
  SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
14974
14845
 
@@ -15004,7 +14875,7 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
15004
14875
  }
15005
14876
 
15006
14877
  static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
15007
- ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
14878
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
15008
14879
 
15009
14880
  ggml_sycl_set_main_device(sycl_ctx->device);
15010
14881
 
@@ -15093,6 +14964,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
15093
14964
  return false;
15094
14965
  }
15095
14966
 
14967
+ if (a->type == GGML_TYPE_IQ1_S) {
14968
+ return false;
14969
+ }
14970
+ if (a->type == GGML_TYPE_IQ3_XXS) {
14971
+ return false;
14972
+ }
15096
14973
  if (a->type == GGML_TYPE_IQ2_XXS) {
15097
14974
  return false;
15098
14975
  }
@@ -15212,8 +15089,9 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
15212
15089
  // not strictly necessary, but it may reduce the overhead of the first graph_compute
15213
15090
  ggml_sycl_set_main_device(device);
15214
15091
 
15215
- ggml_backend_context_sycl * ctx = new ggml_backend_context_sycl {
15216
- /* .device = */ device
15092
+ ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
15093
+ /* .device = */ device,
15094
+ /* .name = */ GGML_SYCL_NAME + std::to_string(device),
15217
15095
  };
15218
15096
 
15219
15097
  ggml_backend_t sycl_backend = new ggml_backend {