llama_cpp 0.14.0 → 0.14.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -202,24 +202,29 @@ namespace dpct
202
202
  // Version string has the following format:
203
203
  // a. OpenCL<space><major.minor><space><vendor-specific-information>
204
204
  // b. <major.minor>
205
+ // c. <AmdGcnArchName>, e.g. gfx1030
205
206
  std::string ver;
206
207
  ver = dev.get_info<sycl::info::device::version>();
207
208
  std::string::size_type i = 0;
208
- while (i < ver.size())
209
- {
210
- if (isdigit(ver[i]))
211
- break;
212
- i++;
209
+ while (i < ver.size()) {
210
+ if (isdigit(ver[i]))
211
+ break;
212
+ i++;
213
213
  }
214
214
  major = std::stoi(&(ver[i]));
215
- while (i < ver.size())
216
- {
217
- if (ver[i] == '.')
218
- break;
219
- i++;
215
+ while (i < ver.size()) {
216
+ if (ver[i] == '.')
217
+ break;
218
+ i++;
219
+ }
220
+ if (i < ver.size()) {
221
+ // Formats a. and b.: a '.' was found, so the minor version follows it.
222
+ i++;
223
+ minor = std::stoi(&(ver[i]));
224
+ } else {
225
+ // Format c.: the string contains no '.', so there is no minor version.
226
+ minor = 0;
220
227
  }
221
- i++;
222
- minor = std::stoi(&(ver[i]));
223
228
  }
224
229
 
225
230
  template <typename tag, typename T>
@@ -3144,6 +3149,9 @@ namespace dpct
3144
3149
 
3145
3150
  } // COPY from DPCT head files
3146
3151
 
3152
+ #define GGML_COMMON_DECL_SYCL
3153
+ #define GGML_COMMON_IMPL_SYCL
3154
+ #include "ggml-common.h"
3147
3155
 
3148
3156
  static int g_ggml_sycl_debug=0;
3149
3157
  #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
@@ -3310,66 +3318,6 @@ typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0,
3310
3318
  const float *src1_dd, float *dst_dd,
3311
3319
  const dpct::queue_ptr &main_stream);
3312
3320
 
3313
- // QK = number of values after dequantization
3314
- // QR = QK / number of values before dequantization
3315
- // QI = number of 32 bit integers before dequantization
3316
-
3317
- #define QK4_0 32
3318
- #define QR4_0 2
3319
- #define QI4_0 (QK4_0 / (4 * QR4_0))
3320
- typedef struct dpct_type_block_q4_0 {
3321
- sycl::half d; // delta
3322
- uint8_t qs[QK4_0 / 2]; // nibbles / quants
3323
- } block_q4_0;
3324
- static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
3325
-
3326
- #define QK4_1 32
3327
- #define QR4_1 2
3328
- #define QI4_1 (QK4_1 / (4 * QR4_1))
3329
- typedef struct dpct_type_block_q4_1 {
3330
- sycl::half2 dm; // dm.x = delta, dm.y = min
3331
- uint8_t qs[QK4_1 / 2]; // nibbles / quants
3332
- } block_q4_1;
3333
- static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
3334
-
3335
- #define QK5_0 32
3336
- #define QR5_0 2
3337
- #define QI5_0 (QK5_0 / (4 * QR5_0))
3338
- typedef struct dpct_type_block_q5_0 {
3339
- sycl::half d; // delta
3340
- uint8_t qh[4]; // 5-th bit of quants
3341
- uint8_t qs[QK5_0 / 2]; // nibbles / quants
3342
- } block_q5_0;
3343
- static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
3344
-
3345
- #define QK5_1 32
3346
- #define QR5_1 2
3347
- #define QI5_1 (QK5_1 / (4 * QR5_1))
3348
- typedef struct dpct_type_block_q5_1 {
3349
- sycl::half2 dm; // dm.x = delta, dm.y = min
3350
- uint8_t qh[4]; // 5-th bit of quants
3351
- uint8_t qs[QK5_1 / 2]; // nibbles / quants
3352
- } block_q5_1;
3353
- static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
3354
-
3355
- #define QK8_0 32
3356
- #define QR8_0 1
3357
- #define QI8_0 (QK8_0 / (4 * QR8_0))
3358
- typedef struct dpct_type_block_q8_0 {
3359
- sycl::half d; // delta
3360
- int8_t qs[QK8_0]; // quants
3361
- } block_q8_0;
3362
- static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
3363
-
3364
- #define QK8_1 32
3365
- #define QR8_1 1
3366
- #define QI8_1 (QK8_1 / (4 * QR8_1))
3367
- typedef struct dpct_type_block_q8_1 {
3368
- sycl::half2 ds; // ds.x = delta, ds.y = sum
3369
- int8_t qs[QK8_0]; // quants
3370
- } block_q8_1;
3371
- static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
3372
-
3373
3321
  typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
3374
3322
  typedef void (*allocate_tiles_sycl_t)(int **x_ql, sycl::half2 **x_dm,
3375
3323
  int **x_qh, int **x_sc);
@@ -3386,112 +3334,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
3386
3334
  const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms,
3387
3335
  const int &i, const int &j, const int &k);
3388
3336
 
3389
- //================================= k-quants
3390
-
3391
- #ifdef GGML_QKK_64
3392
- #define QK_K 64
3393
- #define K_SCALE_SIZE 4
3394
- #else
3395
- #define QK_K 256
3396
- #define K_SCALE_SIZE 12
3397
- #endif
3398
-
3399
- #define QR2_K 4
3400
- #define QI2_K (QK_K / (4*QR2_K))
3401
- typedef struct dpct_type_block_q2_K {
3402
- uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
3403
- uint8_t qs[QK_K/4]; // quants
3404
- sycl::half2 dm; // super-block scale for quantized scales/mins
3405
- } block_q2_K;
3406
- static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
3407
-
3408
- #define QR3_K 4
3409
- #define QI3_K (QK_K / (4*QR3_K))
3410
- typedef struct dpct_type_block_q3_K {
3411
- uint8_t hmask[QK_K/8]; // quants - high bit
3412
- uint8_t qs[QK_K/4]; // quants - low 2 bits
3413
- #ifdef GGML_QKK_64
3414
- uint8_t scales[2]; // scales, quantized with 8 bits
3415
- #else
3416
- uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
3417
- #endif
3418
- sycl::half d; // super-block scale
3419
- } block_q3_K;
3420
- //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
3421
-
3422
- #define QR4_K 2
3423
- #define QI4_K (QK_K / (4*QR4_K))
3424
- #ifdef GGML_QKK_64
3425
- typedef struct {
3426
- sycl::half dm[2]; // super-block scales/mins
3427
- uint8_t scales[2]; // 4-bit block scales/mins
3428
- uint8_t qs[QK_K/2]; // 4--bit quants
3429
- } block_q4_K;
3430
- static_assert(sizeof(block_q4_K) == sizeof(sycl::half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
3431
- #else
3432
- typedef struct dpct_type_block_q4_K {
3433
- sycl::half2 dm; // super-block scale for quantized scales/mins
3434
- uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
3435
- uint8_t qs[QK_K/2]; // 4--bit quants
3436
- } block_q4_K;
3437
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
3438
- #endif
3439
-
3440
- #define QR5_K 2
3441
- #define QI5_K (QK_K / (4*QR5_K))
3442
- #ifdef GGML_QKK_64
3443
- typedef struct {
3444
- sycl::half d; // super-block scale
3445
- int8_t scales[QK_K/16]; // block scales
3446
- uint8_t qh[QK_K/8]; // quants, high bit
3447
- uint8_t qs[QK_K/2]; // quants, low 4 bits
3448
- } block_q5_K;
3449
- static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
3450
- #else
3451
- typedef struct dpct_type_block_q5_K {
3452
- sycl::half2 dm; // super-block scale for quantized scales/mins
3453
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
3454
- uint8_t qh[QK_K/8]; // quants, high bit
3455
- uint8_t qs[QK_K/2]; // quants, low 4 bits
3456
- } block_q5_K;
3457
- static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
3458
- #endif
3459
-
3460
- #define QR6_K 2
3461
- #define QI6_K (QK_K / (4*QR6_K))
3462
- typedef struct dpct_type_block_q6_K {
3463
- uint8_t ql[QK_K/2]; // quants, lower 4 bits
3464
- uint8_t qh[QK_K/4]; // quants, upper 2 bits
3465
- int8_t scales[QK_K/16]; // scales
3466
- sycl::half d; // delta
3467
- } block_q6_K;
3468
- static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
3469
-
3470
- #define QR2_XXS 8
3471
- #define QI2_XXS (QK_K / (4*QR2_XXS))
3472
- typedef struct dpct_type_block_iq2_xxs {
3473
- sycl::half d;
3474
- uint16_t qs[QK_K/8];
3475
- } block_iq2_xxs;
3476
- static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
3477
-
3478
- #define QR2_XS 8
3479
- #define QI2_XS (QK_K / (4*QR2_XS))
3480
- typedef struct dpct_type_block_iq2_xs {
3481
- sycl::half d;
3482
- uint16_t qs[QK_K/8];
3483
- uint8_t scales[QK_K/32];
3484
- } block_iq2_xs;
3485
- static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
3486
-
3487
- #define QR3_XXS 8
3488
- #define QI3_XXS (QK_K / (4*QR3_XXS))
3489
- typedef struct dpct_type_block_iq3_xxs {
3490
- sycl::half d;
3491
- uint8_t qs[3*(QK_K/8)];
3492
- } block_iq3_xxs;
3493
- static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
3494
-
3495
3337
  #define WARP_SIZE 32
3496
3338
  #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
3497
3339
 
@@ -3609,7 +3451,7 @@ class sycl_gpu_mgr {
3609
3451
  dpct::device_info prop;
3610
3452
  dpct::get_device_info(prop, device);
3611
3453
  if (max_compute_units == prop.get_max_compute_units() &&
3612
- prop.get_major_version() == 1) {
3454
+ is_ext_oneapi_device(device)) {
3613
3455
  gpus.push_back(id);
3614
3456
  devices.push_back(device);
3615
3457
  work_group_size = prop.get_max_work_group_size();
@@ -3642,6 +3484,15 @@ class sycl_gpu_mgr {
3642
3484
  assert(false);
3643
3485
  return -1;
3644
3486
  }
3487
+
3488
+ bool is_ext_oneapi_device(const sycl::device &dev) {
3489
+ sycl::backend dev_backend = dev.get_backend();
3490
+ if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
3491
+ dev_backend == sycl::backend::ext_oneapi_cuda ||
3492
+ dev_backend == sycl::backend::ext_oneapi_hip)
3493
+ return true;
3494
+ return false;
3495
+ }
3645
3496
  };
3646
3497
 
3647
3498
  static sycl_gpu_mgr *g_sycl_gpu_mgr = NULL;
@@ -4745,388 +4596,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
4745
4596
  #endif
4746
4597
  }
4747
4598
 
4748
- static dpct::global_memory<const uint64_t, 1>
4749
- iq2xxs_grid(sycl::range<1>(256),
4750
- {
4751
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919,
4752
- 0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819,
4753
- 0x0808080808191908, 0x08080808082b0808, 0x08080808082b082b,
4754
- 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
4755
- 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08,
4756
- 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
4757
- 0x080808082b08082b, 0x080808082b082b2b, 0x080808082b2b082b,
4758
- 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
4759
- 0x0808081908191919, 0x0808081919080808, 0x080808192b081908,
4760
- 0x080808192b192b08, 0x0808082b08080808, 0x0808082b0808082b,
4761
- 0x0808082b082b082b, 0x0808082b2b08082b, 0x0808190808080819,
4762
- 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
4763
- 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b,
4764
- 0x0808190819082b08, 0x08081908192b0808, 0x080819082b080819,
4765
- 0x080819082b081908, 0x080819082b190808, 0x080819082b2b1908,
4766
- 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
4767
- 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19,
4768
- 0x080819192b080808, 0x080819192b190819, 0x0808192b08082b19,
4769
- 0x0808192b08190808, 0x0808192b19080808, 0x0808192b2b081908,
4770
- 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
4771
- 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08,
4772
- 0x08082b0819080819, 0x08082b0819081908, 0x08082b0819190808,
4773
- 0x08082b081919082b, 0x08082b082b082b08, 0x08082b1908081908,
4774
- 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
4775
- 0x0819080808080819, 0x0819080808081908, 0x0819080808190808,
4776
- 0x08190808082b0819, 0x0819080819080808, 0x08190808192b0808,
4777
- 0x081908082b081908, 0x081908082b190808, 0x081908082b191919,
4778
- 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
4779
- 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808,
4780
- 0x0819082b082b1908, 0x0819082b19081919, 0x0819190808080808,
4781
- 0x0819190808082b08, 0x08191908082b0808, 0x08191908082b1919,
4782
- 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
4783
- 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b,
4784
- 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
4785
- 0x08192b0819080808, 0x08192b082b080819, 0x08192b1908080808,
4786
- 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
4787
- 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b,
4788
- 0x082b080819081908, 0x082b0808192b0819, 0x082b08082b080808,
4789
- 0x082b08082b08082b, 0x082b0819082b2b19, 0x082b081919082b08,
4790
- 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
4791
- 0x082b190808081908, 0x082b190808190808, 0x082b190819080808,
4792
- 0x082b19081919192b, 0x082b191908080808, 0x082b191919080819,
4793
- 0x082b1919192b1908, 0x082b192b2b190808, 0x082b2b0808082b08,
4794
- 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
4795
- 0x1908080808080819, 0x1908080808081908, 0x1908080808190808,
4796
- 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
4797
- 0x1908080819080808, 0x1908080819082b08, 0x190808081919192b,
4798
- 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
4799
- 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808,
4800
- 0x19080819192b0819, 0x190808192b080808, 0x190808192b081919,
4801
- 0x1908082b08080819, 0x1908082b08190808, 0x1908082b19082b08,
4802
- 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
4803
- 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808,
4804
- 0x190819082b192b19, 0x190819190819082b, 0x19081919082b1908,
4805
- 0x1908192b08080808, 0x19082b0808080819, 0x19082b0808081908,
4806
- 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
4807
- 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819,
4808
- 0x19082b192b08082b, 0x19082b2b19081919, 0x19082b2b2b190808,
4809
- 0x1919080808080808, 0x1919080808082b08, 0x1919080808190819,
4810
- 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
4811
- 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b,
4812
- 0x191908192b2b1908, 0x1919082b2b190819, 0x191919082b190808,
4813
- 0x191919082b19082b, 0x1919191908082b2b, 0x1919192b08080819,
4814
- 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
4815
- 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808,
4816
- 0x19192b2b08082b08, 0x192b080808081908, 0x192b080808190808,
4817
- 0x192b080819080808, 0x192b0808192b2b08, 0x192b081908080808,
4818
- 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
4819
- 0x192b190808080808, 0x192b190808081919, 0x192b191908190808,
4820
- 0x192b19190819082b, 0x192b19192b081908, 0x192b2b081908082b,
4821
- 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808082b2b,
4822
- 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
4823
- 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819,
4824
- 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808,
4825
- 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808,
4826
- 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
4827
- 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808,
4828
- 0x2b082b080808082b, 0x2b082b1908081908, 0x2b082b2b08190819,
4829
- 0x2b19080808081908, 0x2b19080808190808, 0x2b190808082b1908,
4830
- 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
4831
- 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808,
4832
- 0x2b191908082b082b, 0x2b19190819081908, 0x2b19191919190819,
4833
- 0x2b192b082b080819, 0x2b192b19082b0808, 0x2b2b08080808082b,
4834
- 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
4835
- 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808,
4836
- 0x2b2b2b1908081908,
4837
- });
4838
-
4839
- static dpct::global_memory<const uint64_t, 1>
4840
- iq2xs_grid(sycl::range<1>(512),
4841
- {
4842
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919,
4843
- 0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819,
4844
- 0x0808080808191908, 0x080808080819192b, 0x0808080808192b19,
4845
- 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
4846
- 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908,
4847
- 0x080808081908192b, 0x0808080819082b19, 0x0808080819190808,
4848
- 0x080808081919082b, 0x0808080819191919, 0x0808080819192b08,
4849
- 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
4850
- 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08,
4851
- 0x080808082b190819, 0x080808082b191908, 0x080808082b192b19,
4852
- 0x080808082b2b0808, 0x0808081908080819, 0x0808081908081908,
4853
- 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
4854
- 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08,
4855
- 0x0808081908192b2b, 0x08080819082b0819, 0x08080819082b1908,
4856
- 0x0808081919080808, 0x080808191908082b, 0x0808081919081919,
4857
- 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
4858
- 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819,
4859
- 0x080808192b081908, 0x080808192b190808, 0x0808082b08080808,
4860
- 0x0808082b0808082b, 0x0808082b08081919, 0x0808082b08082b08,
4861
- 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
4862
- 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808,
4863
- 0x0808082b19191919, 0x0808082b2b080808, 0x0808082b2b082b2b,
4864
- 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
4865
- 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
4866
- 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819,
4867
- 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b,
4868
- 0x0808190819081919, 0x0808190819082b08, 0x0808190819190819,
4869
- 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
4870
- 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
4871
- 0x0808191908080808, 0x080819190808082b, 0x0808191908081919,
4872
- 0x0808191908082b08, 0x0808191908190819, 0x0808191908191908,
4873
- 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
4874
- 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808,
4875
- 0x0808192b08080819, 0x0808192b08081908, 0x0808192b08190808,
4876
- 0x0808192b082b192b, 0x0808192b19080808, 0x0808192b1908082b,
4877
- 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
4878
- 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b,
4879
- 0x08082b0808190819, 0x08082b0808191908, 0x08082b08082b0808,
4880
- 0x08082b08082b1919, 0x08082b0819080819, 0x08082b0819081908,
4881
- 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
4882
- 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819,
4883
- 0x08082b1908081908, 0x08082b1908190808, 0x08082b1919080808,
4884
- 0x08082b192b080819, 0x08082b192b082b19, 0x08082b2b08080808,
4885
- 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
4886
- 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908,
4887
- 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808,
4888
- 0x081908080819082b, 0x0819080808191919, 0x0819080808192b08,
4889
- 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
4890
- 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08,
4891
- 0x0819080819190819, 0x0819080819191908, 0x08190808192b0808,
4892
- 0x08190808192b2b2b, 0x081908082b080819, 0x081908082b081908,
4893
- 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
4894
- 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819,
4895
- 0x0819081908191908, 0x08190819082b0808, 0x0819081919080819,
4896
- 0x0819081919081908, 0x0819081919190808, 0x081908192b080808,
4897
- 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
4898
- 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808,
4899
- 0x0819082b19080808, 0x0819082b192b0808, 0x0819190808080808,
4900
- 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
4901
- 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
4902
- 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19,
4903
- 0x0819190819190808, 0x08191908192b1908, 0x081919082b080808,
4904
- 0x0819191908080819, 0x0819191908081908, 0x0819191908190808,
4905
- 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
4906
- 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908,
4907
- 0x08192b0808190808, 0x08192b080819082b, 0x08192b0819080808,
4908
- 0x08192b0819191908, 0x08192b082b08192b, 0x08192b1908080808,
4909
- 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
4910
- 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b,
4911
- 0x082b080808081919, 0x082b080808082b08, 0x082b080808082b2b,
4912
- 0x082b080808190819, 0x082b080808191908, 0x082b0808082b0808,
4913
- 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
4914
- 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819,
4915
- 0x082b081908081908, 0x082b081908190808, 0x082b081919080808,
4916
- 0x082b081919082b08, 0x082b0819192b1919, 0x082b082b08080808,
4917
- 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
4918
- 0x082b190808080819, 0x082b190808081908, 0x082b190808190808,
4919
- 0x082b1908082b2b19, 0x082b190819080808, 0x082b191908080808,
4920
- 0x082b191919080819, 0x082b19191919082b, 0x082b19192b192b19,
4921
- 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
4922
- 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b,
4923
- 0x082b2b08082b0808, 0x082b2b0819191919, 0x082b2b082b082b08,
4924
- 0x082b2b082b2b082b, 0x082b2b19192b2b08, 0x082b2b192b190808,
4925
- 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
4926
- 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
4927
- 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19,
4928
- 0x1908080808190808, 0x190808080819082b, 0x1908080808191919,
4929
- 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
4930
- 0x1908080819080808, 0x190808081908082b, 0x1908080819081919,
4931
- 0x1908080819082b08, 0x1908080819082b2b, 0x1908080819190819,
4932
- 0x1908080819191908, 0x19080808192b0808, 0x19080808192b1919,
4933
- 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
4934
- 0x1908081908080808, 0x190808190808082b, 0x1908081908081919,
4935
- 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
4936
- 0x19080819082b0808, 0x1908081919080819, 0x1908081919081908,
4937
- 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
4938
- 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908,
4939
- 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b082b2b19,
4940
- 0x1908082b19080808, 0x1908190808080808, 0x190819080808082b,
4941
- 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
4942
- 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808,
4943
- 0x1908190819080819, 0x1908190819081908, 0x1908190819190808,
4944
- 0x190819082b080808, 0x190819082b191908, 0x1908191908080819,
4945
- 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
4946
- 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808,
4947
- 0x1908192b08082b2b, 0x1908192b19081908, 0x1908192b19190808,
4948
- 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808190808,
4949
- 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
4950
- 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819,
4951
- 0x19082b1919081908, 0x19082b1919190808, 0x19082b19192b2b19,
4952
- 0x19082b2b08081908, 0x1919080808080808, 0x191908080808082b,
4953
- 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
4954
- 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08,
4955
- 0x1919080819080819, 0x1919080819081908, 0x1919080819190808,
4956
- 0x191908082b080808, 0x1919081908080819, 0x1919081908081908,
4957
- 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
4958
- 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908,
4959
- 0x1919082b2b2b2b2b, 0x1919190808080819, 0x1919190808081908,
4960
- 0x1919190808190808, 0x19191908082b0819, 0x1919190819080808,
4961
- 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
4962
- 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808,
4963
- 0x191919192b082b08, 0x1919192b082b0819, 0x1919192b192b2b08,
4964
- 0x1919192b2b2b0819, 0x19192b0808080808, 0x19192b0808191908,
4965
- 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
4966
- 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b,
4967
- 0x19192b2b2b081919, 0x192b080808080819, 0x192b080808081908,
4968
- 0x192b080808190808, 0x192b080819080808, 0x192b080819191908,
4969
- 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
4970
- 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b,
4971
- 0x192b082b2b19082b, 0x192b190808080808, 0x192b19080819192b,
4972
- 0x192b191908190808, 0x192b191919080808, 0x192b191919081919,
4973
- 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
4974
- 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908,
4975
- 0x192b2b2b192b082b, 0x2b08080808080808, 0x2b0808080808082b,
4976
- 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
4977
- 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
4978
- 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808,
4979
- 0x2b0808082b080808, 0x2b0808082b08082b, 0x2b0808082b2b2b08,
4980
- 0x2b0808082b2b2b2b, 0x2b08081908080819, 0x2b08081908081908,
4981
- 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
4982
- 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808,
4983
- 0x2b08082b082b0808, 0x2b08082b2b080808, 0x2b08082b2b08082b,
4984
- 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, 0x2b08190808080819,
4985
- 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
4986
- 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808,
4987
- 0x2b0819082b082b19, 0x2b08191908080808, 0x2b08191919081908,
4988
- 0x2b0819192b2b1919, 0x2b08192b08192b08, 0x2b08192b192b2b2b,
4989
- 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
4990
- 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b,
4991
- 0x2b082b082b2b2b08, 0x2b082b190808192b, 0x2b082b2b082b082b,
4992
- 0x2b082b2b2b080808, 0x2b082b2b2b082b08, 0x2b082b2b2b19192b,
4993
- 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
4994
- 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b,
4995
- 0x2b1908082b081908, 0x2b19081908080808, 0x2b190819082b082b,
4996
- 0x2b190819192b1908, 0x2b19082b1919192b, 0x2b19082b2b082b19,
4997
- 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
4998
- 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19,
4999
- 0x2b1919192b190808, 0x2b1919192b19082b, 0x2b19192b19080819,
5000
- 0x2b192b0819190819, 0x2b192b082b2b192b, 0x2b192b1919082b19,
5001
- 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
5002
- 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b,
5003
- 0x2b2b0808082b0808, 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808,
5004
- 0x2b2b081919190819, 0x2b2b081919192b19, 0x2b2b08192b2b192b,
5005
- 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
5006
- 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808,
5007
- 0x2b2b190819080808, 0x2b2b19082b191919, 0x2b2b192b192b1919,
5008
- 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, 0x2b2b2b08082b0808,
5009
- 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
5010
- 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908,
5011
- 0x2b2b2b192b08192b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b,
5012
- 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
5013
- });
5014
-
5015
- static dpct::global_memory<const uint32_t, 1> iq3xxs_grid(
5016
- sycl::range<1>(256),
5017
- {
5018
- 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e,
5019
- 0x04041404, 0x04041414, 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c,
5020
- 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, 0x040c140c, 0x040c142c,
5021
- 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
5022
- 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c,
5023
- 0x04141c1c, 0x04141c3e, 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c,
5024
- 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, 0x041c3e04, 0x04240c1c,
5025
- 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
5026
- 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04,
5027
- 0x043e0c24, 0x043e0c34, 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c,
5028
- 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, 0x0c041c04, 0x0c041c14,
5029
- 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
5030
- 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14,
5031
- 0x0c14140c, 0x0c141c04, 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404,
5032
- 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, 0x0c24042c, 0x0c242c04,
5033
- 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
5034
- 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404,
5035
- 0x14041414, 0x14041434, 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c,
5036
- 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, 0x140c1c04, 0x140c341c,
5037
- 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
5038
- 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c,
5039
- 0x141c0c04, 0x141c0c24, 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c,
5040
- 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, 0x143e040c, 0x143e041c,
5041
- 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
5042
- 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414,
5043
- 0x1c0c1404, 0x1c0c1c0c, 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c,
5044
- 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, 0x1c1c0c0c, 0x1c1c1c1c,
5045
- 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
5046
- 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404,
5047
- 0x24040424, 0x24040c3e, 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e,
5048
- 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, 0x24143404, 0x24143434,
5049
- 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
5050
- 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04,
5051
- 0x2c040c14, 0x2c04240c, 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434,
5052
- 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, 0x2c1c0414, 0x2c1c2c1c,
5053
- 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
5054
- 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434,
5055
- 0x34043424, 0x340c140c, 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04,
5056
- 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, 0x34341c1c, 0x343e041c,
5057
- 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
5058
- 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14,
5059
- 0x3e1c0404, 0x3e1c0c2c, 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c,
5060
- 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
5061
- });
5062
-
5063
- static dpct::global_memory<const uint8_t, 1> ksigns_iq2xs(
5064
- sycl::range<1>(128),
5065
- {
5066
- 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12,
5067
- 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153,
5068
- 154, 27, 156, 29, 30, 159, 160, 33, 34, 163, 36, 165, 166,
5069
- 39, 40, 169, 170, 43, 172, 45, 46, 175, 48, 177, 178, 51,
5070
- 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, 192,
5071
- 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77,
5072
- 78, 207, 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90,
5073
- 219, 92, 221, 222, 95, 96, 225, 226, 99, 228, 101, 102, 231,
5074
- 232, 105, 106, 235, 108, 237, 238, 111, 240, 113, 114, 243, 116,
5075
- 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
5076
- });
5077
-
5078
- static dpct::global_memory<const uint64_t, 1>
5079
- ksigns64(sycl::range<1>(128),
5080
- {
5081
- 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00,
5082
- 0x000000000000ffff, 0xff00000000ff0000, 0x0000000000ff00ff,
5083
- 0x0000000000ffff00, 0xff00000000ffffff, 0xff000000ff000000,
5084
- 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
5085
- 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00,
5086
- 0x00000000ffffffff, 0xff0000ff00000000, 0x000000ff000000ff,
5087
- 0x000000ff0000ff00, 0xff0000ff0000ffff, 0x000000ff00ff0000,
5088
- 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
5089
- 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00,
5090
- 0x000000ffff00ffff, 0xff0000ffffff0000, 0x000000ffffff00ff,
5091
- 0x000000ffffffff00, 0xff0000ffffffffff, 0xff00ff0000000000,
5092
- 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
5093
- 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00,
5094
- 0x0000ff0000ffffff, 0x0000ff00ff000000, 0xff00ff00ff0000ff,
5095
- 0xff00ff00ff00ff00, 0x0000ff00ff00ffff, 0xff00ff00ffff0000,
5096
- 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
5097
- 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00,
5098
- 0x0000ffff0000ffff, 0xff00ffff00ff0000, 0x0000ffff00ff00ff,
5099
- 0x0000ffff00ffff00, 0xff00ffff00ffffff, 0xff00ffffff000000,
5100
- 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
5101
- 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00,
5102
- 0x0000ffffffffffff, 0xffff000000000000, 0x00ff0000000000ff,
5103
- 0x00ff00000000ff00, 0xffff00000000ffff, 0x00ff000000ff0000,
5104
- 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
5105
- 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00,
5106
- 0x00ff0000ff00ffff, 0xffff0000ffff0000, 0x00ff0000ffff00ff,
5107
- 0x00ff0000ffffff00, 0xffff0000ffffffff, 0x00ff00ff00000000,
5108
- 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
5109
- 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
5110
- 0xffff00ff00ffffff, 0xffff00ffff000000, 0x00ff00ffff0000ff,
5111
- 0x00ff00ffff00ff00, 0xffff00ffff00ffff, 0x00ff00ffffff0000,
5112
- 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
5113
- 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00,
5114
- 0x00ffff000000ffff, 0xffffff0000ff0000, 0x00ffff0000ff00ff,
5115
- 0x00ffff0000ffff00, 0xffffff0000ffffff, 0xffffff00ff000000,
5116
- 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
5117
- 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00,
5118
- 0x00ffff00ffffffff, 0xffffffff00000000, 0x00ffffff000000ff,
5119
- 0x00ffffff0000ff00, 0xffffffff0000ffff, 0x00ffffff00ff0000,
5120
- 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
5121
- 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00,
5122
- 0x00ffffffff00ffff, 0xffffffffffff0000, 0x00ffffffffff00ff,
5123
- 0x00ffffffffffff00, 0xffffffffffffffff,
5124
- });
5125
- //#endif
5126
-
5127
- static dpct::global_memory<const uint8_t, 1>
5128
- kmask_iq2xs(sycl::range<1>(8), {1, 2, 4, 8, 16, 32, 64, 128});
5129
-
5130
4599
  template<typename dst_t>
5131
4600
  static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
5132
4601
  const sycl::nd_item<3> &item_ct1,
@@ -5213,6 +4682,65 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
5213
4682
 
5214
4683
  }
5215
4684
 
4685
+ template<typename dst_t>
4686
+ static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
4687
+ const sycl::nd_item<3> &item_ct1,
4688
+ const uint32_t *iq3s_grid,
4689
+ const uint8_t *ksigns_iq2xs,
4690
+ const uint8_t *kmask_iq2xs) {
4691
+
4692
+ const int i = item_ct1.get_group(2);
4693
+ const block_iq3_s * x = (const block_iq3_s *) vx;
4694
+
4695
+ const int tid = item_ct1.get_local_id(2);
4696
+ #if QK_K == 256
4697
+ const int il = tid/8; // 0...3
4698
+ const int ib = tid%8; // 0...7
4699
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
4700
+ const uint8_t * qs = x[i].qs + 8*ib;
4701
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + qs[2*il+0]);
4702
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + qs[2*il+1]);
4703
+ const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
4704
+ const uint8_t signs = x[i].signs[4*ib + il];
4705
+ for (int j = 0; j < 4; ++j) {
4706
+ y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
4707
+ y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
4708
+ }
4709
+ #else
4710
+ assert(false);
4711
+ #endif
4712
+
4713
+ }
4714
+
4715
+ template<typename dst_t>
4716
+ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
4717
+ const sycl::nd_item<3> &item_ct1,
4718
+ const uint32_t *iq1s_grid,
4719
+ const uint8_t *ksigns_iq2xs,
4720
+ const uint8_t *kmask_iq2xs) {
4721
+ const int i = item_ct1.get_group(2);
4722
+ const block_iq1_s * x = (const block_iq1_s *) vx;
4723
+
4724
+ const int tid = item_ct1.get_local_id(2);
4725
+ #if QK_K == 256
4726
+ const int il = tid/8; // 0...3
4727
+ const int ib = tid%8; // 0...7
4728
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
4729
+ const uint8_t * qs = x[i].qs + 8*ib;
4730
+ const uint8_t * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]);
4731
+ const uint8_t * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]);
4732
+ const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 0xf) + 1);
4733
+ const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7];
4734
+ for (int j = 0; j < 4; ++j) {
4735
+ y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
4736
+ y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
4737
+ }
4738
+ #else
4739
+ assert(false);
4740
+ #endif
4741
+
4742
+ }
4743
+
5216
4744
  /*
5217
4745
  DPCT1110:4: The total declared local variable size in device function
5218
4746
  dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
@@ -8059,6 +7587,75 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
8059
7587
  #endif
8060
7588
  }
8061
7589
 
7590
+ static __dpct_inline__ float
7591
+ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
7592
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs,
7593
+ const uint32_t *iq3s_grid, const uint64_t *ksigns64) {
7594
+ #if DPCT_COMPATIBILITY_TEMP >= \
7595
+ MIN_CC_DP4A // lowest compute capability for integer intrinsics
7596
+ #if QK_K == 256
7597
+ const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
7598
+
7599
+ const int ib32 = iqs;
7600
+ const uint8_t * qs = bq2->qs + 8*ib32;
7601
+ const int8_t * q8 = bq8_1[ib32].qs;
7602
+ int sumi = 0;
7603
+ for (int l = 0; l < 4; ++l) {
7604
+ const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
7605
+ const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
7606
+ uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
7607
+ ((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
7608
+ uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
7609
+ ((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
7610
+ const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
7611
+ grid1[0] ^ signs0, signs0, std::minus<>());
7612
+ const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
7613
+ grid2[0] ^ signs1, signs1, std::minus<>());
7614
+ sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
7615
+ sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
7616
+ q8 += 8;
7617
+ }
7618
+ const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * bq8_1[ib32].ds[0];
7619
+ return d * sumi;
7620
+ #else
7621
+ assert(false);
7622
+ return 0.f;
7623
+ #endif
7624
+ #else
7625
+ assert(false);
7626
+ return 0.f;
7627
+ #endif
7628
+ }
7629
+
7630
+ static __dpct_inline__ float
7631
+ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
7632
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs,
7633
+ const uint32_t *iq1s_grid, const uint64_t *ksigns64) {
7634
+ #if QK_K == 256
7635
+ const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
7636
+
7637
+ const int ib32 = iqs;
7638
+ const uint8_t * qs = bq1->qs + 4*ib32;
7639
+ const int8_t * q8 = bq8_1[ib32].qs;
7640
+ int sumi = 0;
7641
+ for (int l = 0; l < 4; ++l) {
7642
+ const uint32_t * grid = (const uint32_t *)(iq1s_grid + qs[l]);
7643
+ const uint32_t * signs = (const uint32_t *)(ksigns64 + (qs[l] >> 8));
7644
+ const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
7645
+ grid[0] ^ signs[0], signs[0], std::minus<>());
7646
+ const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
7647
+ grid[1] ^ signs[1], signs[1], std::minus<>());
7648
+ sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
7649
+ sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
7650
+ q8 += 8;
7651
+ }
7652
+ const float d = (float)bq1->d * bq8_1[ib32].ds[0] * 0.25f;
7653
+ return d * sumi;
7654
+ #else
7655
+ assert(false);
7656
+ return 0.f;
7657
+ #endif
7658
+ }
8062
7659
 
8063
7660
  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
8064
7661
  int mmq_y, int nwarps, load_tiles_sycl_t load_tiles, int vdr,
@@ -8824,6 +8421,98 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void
8824
8421
  }
8825
8422
  }
8826
8423
 
8424
+ template <int qk, int qi, typename block_q_t, int vdr>
8425
+ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8426
+ const sycl::nd_item<3> &item_ct1,
8427
+ const uint32_t *iq3s_grid_ptr, const uint64_t *ksigns64_ptr ) {
8428
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8429
+ item_ct1.get_local_id(1);
8430
+
8431
+ if (row >= nrows) {
8432
+ return;
8433
+ }
8434
+
8435
+ const int blocks_per_row = ncols / qk;
8436
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
8437
+
8438
+ // partial sum for each thread
8439
+ float tmp = 0.0f;
8440
+
8441
+ const block_q_t * x = (const block_q_t *) vx;
8442
+ const block_q8_1 * y = (const block_q8_1 *) vy;
8443
+
8444
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
8445
+ i += blocks_per_warp) {
8446
+ const int ibx = row*blocks_per_row + i; // x block index
8447
+
8448
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
8449
+
8450
+ const int iqs =
8451
+ vdr *
8452
+ (item_ct1.get_local_id(2) %
8453
+ (qi / vdr)); // x block quant index when casting the quants to int
8454
+
8455
+ tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid_ptr, ksigns64_ptr);
8456
+ }
8457
+
8458
+ // sum up partial sums and write back result
8459
+ #pragma unroll
8460
+ for (int mask = 16; mask > 0; mask >>= 1) {
8461
+ tmp +=
8462
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
8463
+ }
8464
+
8465
+ if (item_ct1.get_local_id(2) == 0) {
8466
+ dst[row] = tmp;
8467
+ }
8468
+ }
8469
+
8470
+ template <int qk, int qi, typename block_q_t, int vdr>
8471
+ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8472
+ const sycl::nd_item<3> &item_ct1,
8473
+ const uint32_t *iq1s_grid_ptr, const uint64_t *ksigns64_ptr ) {
8474
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8475
+ item_ct1.get_local_id(1);
8476
+
8477
+ if (row >= nrows) {
8478
+ return;
8479
+ }
8480
+
8481
+ const int blocks_per_row = ncols / qk;
8482
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
8483
+
8484
+ // partial sum for each thread
8485
+ float tmp = 0.0f;
8486
+
8487
+ const block_q_t * x = (const block_q_t *) vx;
8488
+ const block_q8_1 * y = (const block_q8_1 *) vy;
8489
+
8490
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
8491
+ i += blocks_per_warp) {
8492
+ const int ibx = row*blocks_per_row + i; // x block index
8493
+
8494
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
8495
+
8496
+ const int iqs =
8497
+ vdr *
8498
+ (item_ct1.get_local_id(2) %
8499
+ (qi / vdr)); // x block quant index when casting the quants to int
8500
+
8501
+ tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_ptr, ksigns64_ptr);
8502
+ }
8503
+
8504
+ // sum up partial sums and write back result
8505
+ #pragma unroll
8506
+ for (int mask = 16; mask > 0; mask >>= 1) {
8507
+ tmp +=
8508
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
8509
+ }
8510
+
8511
+ if (item_ct1.get_local_id(2) == 0) {
8512
+ dst[row] = tmp;
8513
+ }
8514
+ }
8515
+
8827
8516
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
8828
8517
  static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
8829
8518
  const sycl::nd_item<3> &item_ct1) {
@@ -10509,6 +10198,64 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
10509
10198
  }
10510
10199
  }
10511
10200
 
10201
+ template <typename dst_t>
10202
+ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
10203
+ dpct::queue_ptr stream) {
10204
+ const int nb = k / QK_K;
10205
+ {
10206
+ iq3s_grid.init(*stream);
10207
+ ksigns_iq2xs.init(*stream);
10208
+ kmask_iq2xs.init(*stream);
10209
+
10210
+ dpct::has_capability_or_fail(stream->get_device(),
10211
+ {sycl::aspect::fp16});
10212
+
10213
+ stream->submit([&](sycl::handler &cgh) {
10214
+ auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
10215
+ auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10216
+ auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
10217
+
10218
+ cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10219
+ sycl::range<3>(1, 1, 32),
10220
+ sycl::range<3>(1, 1, 32)),
10221
+ [=](sycl::nd_item<3> item_ct1) {
10222
+ dequantize_block_iq3_s(
10223
+ vx, y, item_ct1, iq3s_grid_ptr_ct1,
10224
+ ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
10225
+ });
10226
+ });
10227
+ }
10228
+ }
10229
+
10230
+ template <typename dst_t>
10231
+ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
10232
+ dpct::queue_ptr stream) {
10233
+ const int nb = k / QK_K;
10234
+ {
10235
+ iq1s_grid_gpu.init(*stream);
10236
+ ksigns_iq2xs.init(*stream);
10237
+ kmask_iq2xs.init(*stream);
10238
+
10239
+ dpct::has_capability_or_fail(stream->get_device(),
10240
+ {sycl::aspect::fp16});
10241
+
10242
+ stream->submit([&](sycl::handler &cgh) {
10243
+ auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
10244
+ auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10245
+ auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
10246
+
10247
+ cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10248
+ sycl::range<3>(1, 1, 32),
10249
+ sycl::range<3>(1, 1, 32)),
10250
+ [=](sycl::nd_item<3> item_ct1) {
10251
+ dequantize_block_iq1_s(
10252
+ vx, y, item_ct1, iq1s_grid_ptr_ct1,
10253
+ ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
10254
+ });
10255
+ });
10256
+ }
10257
+ }
10258
+
10512
10259
  template <typename src_t, typename dst_t>
10513
10260
  static void convert_unary_sycl(const void *__restrict__ vx,
10514
10261
  dst_t *__restrict__ y, const int k,
@@ -10559,6 +10306,10 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try {
10559
10306
  return dequantize_row_iq2_xs_sycl;
10560
10307
  case GGML_TYPE_IQ3_XXS:
10561
10308
  return dequantize_row_iq3_xxs_sycl;
10309
+ case GGML_TYPE_IQ3_S:
10310
+ return dequantize_row_iq3_s_sycl;
10311
+ case GGML_TYPE_IQ1_S:
10312
+ return dequantize_row_iq1_s_sycl;
10562
10313
  case GGML_TYPE_F32:
10563
10314
  return convert_unary_sycl<float>;
10564
10315
  default:
@@ -10599,6 +10350,10 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
10599
10350
  return dequantize_row_iq2_xs_sycl;
10600
10351
  case GGML_TYPE_IQ3_XXS:
10601
10352
  return dequantize_row_iq3_xxs_sycl;
10353
+ case GGML_TYPE_IQ3_S:
10354
+ return dequantize_row_iq3_s_sycl;
10355
+ case GGML_TYPE_IQ1_S:
10356
+ return dequantize_row_iq1_s_sycl;
10602
10357
  case GGML_TYPE_F16:
10603
10358
  return convert_unary_sycl<sycl::half>;
10604
10359
  default:
@@ -11188,6 +10943,61 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
11188
10943
  }
11189
10944
  }
11190
10945
 
10946
+ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
10947
+ float *dst, const int ncols,
10948
+ const int nrows,
10949
+ dpct::queue_ptr stream) {
10950
+ GGML_ASSERT(ncols % QK_K == 0);
10951
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
10952
+ const sycl::range<3> block_nums(1, 1, block_num_y);
10953
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10954
+ {
10955
+ iq3s_grid.init(*stream);
10956
+ ksigns64.init(*stream);
10957
+
10958
+ stream->submit([&](sycl::handler &cgh) {
10959
+ auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
10960
+ auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10961
+
10962
+ cgh.parallel_for(
10963
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
10964
+ [=](sycl::nd_item<3> item_ct1)
10965
+ [[intel::reqd_sub_group_size(32)]] {
10966
+ mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
10967
+ vx, vy, dst, ncols, nrows, item_ct1,
10968
+ iq3s_grid_ptr_ct1, ksigns64_ptr_ct1);
10969
+ });
10970
+ });
10971
+ }
10972
+ }
10973
+
10974
+ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
10975
+ float *dst, const int ncols,
10976
+ const int nrows,
10977
+ dpct::queue_ptr stream) {
10978
+ GGML_ASSERT(ncols % QK_K == 0);
10979
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
10980
+ const sycl::range<3> block_nums(1, 1, block_num_y);
10981
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10982
+ {
10983
+ iq1s_grid_gpu.init(*stream);
10984
+ ksigns64.init(*stream);
10985
+
10986
+ stream->submit([&](sycl::handler &cgh) {
10987
+ auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
10988
+ auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10989
+
10990
+ cgh.parallel_for(
10991
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
10992
+ [=](sycl::nd_item<3> item_ct1)
10993
+ [[intel::reqd_sub_group_size(32)]] {
10994
+ mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
10995
+ vx, vy, dst, ncols, nrows, item_ct1,
10996
+ iq1s_grid_ptr_ct1, ksigns64_ptr_ct1);
10997
+ });
10998
+ });
10999
+ }
11000
+ }
11191
11001
 
11192
11002
  static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
11193
11003
  float *dst, const int ncols_x,
@@ -13936,8 +13746,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
13936
13746
  case GGML_TYPE_Q5_K:
13937
13747
  case GGML_TYPE_IQ2_XXS:
13938
13748
  case GGML_TYPE_IQ2_XS:
13749
+ case GGML_TYPE_IQ1_S:
13939
13750
  case GGML_TYPE_IQ3_XXS:
13940
13751
  return max_compute_capability >= VER_GEN9 ? 128 : 64;
13752
+ case GGML_TYPE_IQ3_S:
13753
+ return max_compute_capability >= VER_GEN9 ? 128 : 64;
13941
13754
  case GGML_TYPE_Q6_K:
13942
13755
  return 64;
13943
13756
  default:
@@ -13998,6 +13811,12 @@ inline void ggml_sycl_op_mul_mat_vec_q(
13998
13811
  case GGML_TYPE_IQ3_XXS:
13999
13812
  mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
14000
13813
  break;
13814
+ case GGML_TYPE_IQ3_S:
13815
+ mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
13816
+ break;
13817
+ case GGML_TYPE_IQ1_S:
13818
+ mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
13819
+ break;
14001
13820
  default:
14002
13821
  GGML_ASSERT(false);
14003
13822
  break;
@@ -17343,9 +17162,8 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
17343
17162
  return false;
17344
17163
  }
17345
17164
  ggml_type a_type = a->type;
17346
- if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
17347
- a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
17348
- a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
17165
+ if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S ||
17166
+ a_type == GGML_TYPE_IQ4_XS) {
17349
17167
  return false;
17350
17168
  }
17351
17169
  return true;
@@ -17440,13 +17258,18 @@ static ggml_backend_i ggml_backend_sycl_interface = {
17440
17258
  /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
17441
17259
  /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
17442
17260
  /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
17443
- /* .cpy_tensor_async = */ ggml_backend_sycl_cpy_tensor_async,
17261
+ /* .cpy_tensor_async = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface
17444
17262
  /* .synchronize = */ ggml_backend_sycl_synchronize,
17445
17263
  /* .graph_plan_create = */ NULL,
17446
17264
  /* .graph_plan_free = */ NULL,
17447
17265
  /* .graph_plan_compute = */ NULL,
17448
17266
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
17449
17267
  /* .supports_op = */ ggml_backend_sycl_supports_op,
17268
+ /* .event_new = */ NULL,
17269
+ /* .event_free = */ NULL,
17270
+ /* .event_record = */ NULL,
17271
+ /* .event_wait = */ NULL,
17272
+ /* .event_synchronize = */ NULL,
17450
17273
  };
17451
17274
 
17452
17275
  static ggml_guid_t ggml_backend_sycl_guid() {