llama_cpp 0.12.5 → 0.12.7

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
@@ -191,6 +191,21 @@ typedef struct {
 } block_iq3_xxs;
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 
+typedef struct {
+    ggml_fp16_t d;
+    uint8_t qs[QK_K/8];
+    uint8_t scales[QK_K/16];
+} block_iq1_s;
+static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
+
+// Non-linear quants
+#define QK4_NL 32
+typedef struct {
+    ggml_fp16_t d;
+    uint8_t qs[QK4_NL/2];
+} block_iq4_nl;
+static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
+
 #ifdef __cplusplus
 extern "C" {
 #endif
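
Note: with the default QK_K of 256, block_iq1_s packs 256 weights into 2 + 32 + 16 = 50 bytes (1.5625 bits per weight), and block_iq4_nl packs 32 weights into 2 + 16 = 18 bytes (4.5 bits per weight). A standalone sketch that checks this arithmetic (hypothetical harness; it assumes QK_K == 256 and a 2-byte ggml_fp16_t, neither of which this header guarantees by itself):

    #include <cstdint>
    #include <cstdio>

    typedef uint16_t ggml_fp16_t;   // stand-in for ggml's 2-byte half type
    #define QK_K   256              // default super-block size (assumed)
    #define QK4_NL 32

    typedef struct {
        ggml_fp16_t d;                 // super-block scale
        uint8_t     qs[QK_K/8];        // packed grid indices
        uint8_t     scales[QK_K/16];   // per-sub-block scales
    } block_iq1_s;

    typedef struct {
        ggml_fp16_t d;                 // block scale
        uint8_t     qs[QK4_NL/2];      // 32 4-bit indices into a non-linear value table
    } block_iq4_nl;

    int main() {
        // 2 + 32 + 16 = 50 bytes per 256 weights; 2 + 16 = 18 bytes per 32 weights
        std::printf("iq1_s : %zu bytes -> %.4f bpw\n", sizeof(block_iq1_s), 8.0 * sizeof(block_iq1_s) / QK_K);
        std::printf("iq4_nl: %zu bytes -> %.2f bpw\n", sizeof(block_iq4_nl), 8.0 * sizeof(block_iq4_nl) / QK4_NL);
        return 0;
    }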
@@ -210,6 +225,7 @@ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGM
 void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
 void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
 void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
+void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
 
 void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
 void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
@@ -225,6 +241,7 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
 void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
 void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
 
 // Dequantization
 void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
@@ -243,22 +260,26 @@ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRI
 void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq1_s  (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 
 // Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-
-void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 //
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
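
Note: every dot-product kernel gains three stride parameters and a row count. The header does not document them, so the following reading is an assumption based on the parameter names: bs strides consecutive results in s, bx and by stride consecutive rows of vx and vy, and nrc is how many rows to reduce per call (enabling batched kernels); with nrc == 1 the strides go unused. A call-site migration sketch under that assumption:

    // 0.12.5-era call (hypothetical caller):
    //   ggml_vec_dot_q8_0_q8_0(n, &out, xrow, yrow);
    // 0.12.7 equivalent, computing a single dot product:
    float out;
    ggml_vec_dot_q8_0_q8_0(n, &out, 0 /*bs*/, xrow, 0 /*bx*/, yrow, 0 /*by*/, 1 /*nrc*/);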
@@ -266,6 +287,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML
 size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_iq1_s  (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q2_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
@@ -276,8 +299,8 @@ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row,
 size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 
-void iq2xs_init_impl(int grid_size);
-void iq2xs_free_impl(int grid_size);
+void iq2xs_init_impl(enum ggml_type type);
+void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
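Note: iq2xs_init_impl/iq2xs_free_impl now take the quantization type rather than a raw grid size, presumably because IQ1_S shares the same lookup-grid machinery and a size alone no longer identifies which tables to build (iq3xs keeps the old convention). An assumed call-site migration:

    // before (0.12.5): the caller passed the grid size directly, e.g.
    //   iq2xs_init_impl(256);
    // after (0.12.7): the caller names the quant type instead
    iq2xs_init_impl(GGML_TYPE_IQ2_XS);
    // ... quantize ...
    iq2xs_free_impl(GGML_TYPE_IQ2_XS);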
@@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
     }
 }
 
-static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK4_0 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
-                          vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK4_1 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
-                          vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK5_0 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
-                          vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK5_1 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
-                          vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK8_0 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
-                          vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
-                          vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
-                          vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
-                          vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
-                          vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
-                          vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
+template <int qk, int qi, typename block_q_t, int vdr,
+          vec_dot_q_sycl_t vec_dot_q_sycl>
+static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
+                                         float *dst, const int ncols,
+                                         const int nrows,
+                                         dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_0 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
+    ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
+                vx, vy, dst, ncols, nrows, item_ct1);
+        });
 }
 
 int get_device_index_by_id(int id){
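
Note: the ten deleted launchers were identical except for their kernel template arguments, so this hunk folds them into the single mul_mat_vec_q_sycl_submitter template; the launch geometry and sub-group size of 32 are unchanged. (The retained GGML_ASSERT still tests ncols against QK4_0 rather than the template's qk, a looser check than the per-type launchers applied for the K-quants.) Each former wrapper corresponds to one instantiation, as in the dispatch switch further down; for Q4_0:

    mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
                                 VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
        src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);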
@@ -11578,11 +11426,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
     }
     char * dst_ptr = (char *) dst;
 
-    const int64_t ne0 = src->ne[0];
-    const int64_t nb0 = src->nb[0];
-    const int64_t nb1 = src->nb[1];
-    const int64_t nb2 = src->nb[2];
-    const int64_t nb3 = src->nb[3];
+    GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
+    GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
 
     const enum ggml_type type = src->type;
     const int64_t ts = ggml_type_size(type);
     const int64_t bs = ggml_blck_size(type);
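
Note: this hunk and several below replace hand-written blocks of const int64_t locals with ggml's GGML_TENSOR_LOCALS family. A sketch of the shape of those macros (paraphrased, not a verbatim copy of ggml's definitions):

    #define GGML_UNUSED(x) (void)(x)

    #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
        const type prefix##0 = (pointer)->array[0]; GGML_UNUSED(prefix##0);
    #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
        GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
        const type prefix##1 = (pointer)->array[1]; GGML_UNUSED(prefix##1); \
        const type prefix##2 = (pointer)->array[2]; GGML_UNUSED(prefix##2);
    #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
        GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
        const type prefix##3 = (pointer)->array[3]; GGML_UNUSED(prefix##3);

    // So GGML_TENSOR_LOCALS(int64_t, nb, src, nb) stands in for the deleted
    //   const int64_t nb0 = src->nb[0]; ... const int64_t nb3 = src->nb[3];
    // with every local marked GGML_UNUSED so partially used sets do not warn.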
@@ -12098,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q(
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
+    // TODO: support these quantization types
+    GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
+                  src0->type == GGML_TYPE_IQ2_XS ||
+                  src0->type == GGML_TYPE_IQ3_XXS ||
+                  src0->type == GGML_TYPE_IQ1_S));
+
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
+                                         VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q4_1:
-            mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
+                                         VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q5_0:
-            mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
+                                         VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q5_1:
-            mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
+                                         VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q8_0:
-            mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
+                                         VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q2_K:
-            mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
+                                         VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q3_K:
-            mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
+                                         VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
+                                         VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q5_K:
-            mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
+                                         VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q6_K:
-            mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
+                                         VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         default:
             GGML_ASSERT(false);
             break;
@@ -12148,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
     const int64_t src1_ncols, const int64_t src1_padded_row_size,
     const dpct::queue_ptr &stream) {
 
-    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     const int64_t row_diff = row_high - row_low;
 
@@ -12426,9 +12297,7 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
+    GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
     const int64_t nrows = ggml_nrows(src0);
 
     //const int n_past = ((int32_t *) dst->op_params)[0];
@@ -12758,15 +12627,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
                                 ggml_sycl_op_mul_mat_t op,
                                 const bool convert_src1_to_q8_1) try {
 
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
 
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
 
     const int64_t nrows1 = ggml_nrows(src1);
 
     GGML_ASSERT(ne03 == ne13);
@@ -13337,23 +13200,13 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
-    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
 
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+    GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
 
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
 
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+    GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
 
     const int64_t ne1 = ggml_nelements(src1);
     const int64_t ne = ggml_nelements(dst);
@@ -13655,23 +13508,15 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
     GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
-    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src00->ne[1];
-    const int64_t ne02 = src00->ne[2];
-    const int64_t ne03 = src00->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
 
     //const int64_t nb01 = src00->nb[1];
-    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
+    GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb);
 
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
 
+    GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
     //const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
 
     const int64_t ne1 = ggml_nelements(src1);
     const int64_t ne = ggml_nelements(dst);
@@ -13940,25 +13785,7 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
 
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-
-    const int64_t nb00 = src0->nb[0];
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-    const int64_t nb03 = src0->nb[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-
-
-    const int64_t nb10 = src1->nb[0];
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2];
-    const int64_t nb13 = src1->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     SYCL_CHECK(ggml_sycl_set_device(g_main_device));
     dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
@@ -14815,7 +14642,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu
 static ggml_backend_buffer_t
 ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
                                            size_t size) try {
-    int device = (int) (intptr_t) buft->context;
+    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+    int device = (int) buft_ctx->device;
 
     ggml_sycl_set_device(device);
     int device_index = get_device_index_by_id(device);
@@ -14893,7 +14721,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
         ggml_backend_sycl_buffer_types[i] = {
             /* .iface   = */ ggml_backend_sycl_buffer_type_interface,
-            /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+            /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
         };
     }
     ggml_backend_sycl_buffer_type_initialized = true;
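
Note: the buffer type (and, further down, the backend) now carries a typed context struct instead of a device id smuggled through the void context pointer. The struct definitions sit outside the diffed hunks; inferred from the field accesses here (device, and a name built as GGML_SYCL_NAME + std::to_string(i)), their assumed shape is:

    // assumed definitions, inferred from usage in this diff:
    struct ggml_backend_sycl_buffer_type_context {
        int device;        // SYCL device the buffer type allocates on
        std::string name;  // e.g. "SYCL0", "SYCL1", ...
    };
    struct ggml_backend_sycl_context {  // replaces ggml_backend_context_sycl below
        int device;
        std::string name;
    };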
@@ -14955,10 +14783,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
 
 // backend
 
-struct ggml_backend_context_sycl {
-    int device;
-};
-
 static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
     return GGML_SYCL_NAME;
 
@@ -14966,14 +14790,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
 }
 
 static void ggml_backend_sycl_free(ggml_backend_t backend) {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
 
     delete sycl_ctx;
     delete backend;
 }
 
 static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
 
     return ggml_backend_sycl_buffer_type(sycl_ctx->device);
 }
@@ -14982,7 +14806,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
                                             ggml_tensor *tensor,
                                             const void *data, size_t offset,
                                             size_t size) try {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -15000,7 +14824,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
                                             const ggml_tensor *tensor,
                                             void *data, size_t offset,
                                             size_t size) try {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -15015,7 +14839,7 @@ catch (sycl::exception const &exc) {
 }
 
 static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
 
     SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
 
@@ -15051,7 +14875,7 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
 }
 
 static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
 
     ggml_sycl_set_main_device(sycl_ctx->device);
 
@@ -15140,6 +14964,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
         return false;
     }
 
+    if (a->type == GGML_TYPE_IQ1_S) {
+        return false;
+    }
+    if (a->type == GGML_TYPE_IQ3_XXS) {
+        return false;
+    }
     if (a->type == GGML_TYPE_IQ2_XXS) {
         return false;
     }
@@ -15259,8 +15089,9 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
     // not strictly necessary, but it may reduce the overhead of the first graph_compute
     ggml_sycl_set_main_device(device);
 
-    ggml_backend_context_sycl * ctx = new ggml_backend_context_sycl {
-        /* .device = */ device
+    ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
+        /* .device = */ device,
+        /* .name   = */ GGML_SYCL_NAME + std::to_string(device),
     };
 
     ggml_backend_t sycl_backend = new ggml_backend {