llama_cpp 0.14.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -202,24 +202,29 @@ namespace dpct
202
202
  // Version string has the following format:
203
203
  // a. OpenCL<space><major.minor><space><vendor-specific-information>
204
204
  // b. <major.minor>
205
+ // c. <AmdGcnArchName>, e.g. gfx1030
205
206
  std::string ver;
206
207
  ver = dev.get_info<sycl::info::device::version>();
207
208
  std::string::size_type i = 0;
208
- while (i < ver.size())
209
- {
210
- if (isdigit(ver[i]))
211
- break;
212
- i++;
209
+ while (i < ver.size()) {
210
+ if (isdigit(ver[i]))
211
+ break;
212
+ i++;
213
213
  }
214
214
  major = std::stoi(&(ver[i]));
215
- while (i < ver.size())
216
- {
217
- if (ver[i] == '.')
218
- break;
219
- i++;
215
+ while (i < ver.size()) {
216
+ if (ver[i] == '.')
217
+ break;
218
+ i++;
219
+ }
220
+ if (i < ver.size()) {
221
+ // a. and b.
222
+ i++;
223
+ minor = std::stoi(&(ver[i]));
224
+ } else {
225
+ // c.
226
+ minor = 0;
220
227
  }
221
- i++;
222
- minor = std::stoi(&(ver[i]));
223
228
  }
224
229
 
225
230
  template <typename tag, typename T>
@@ -3144,6 +3149,9 @@ namespace dpct
3144
3149
 
3145
3150
  } // COPY from DPCT head files
3146
3151
 
3152
+ #define GGML_COMMON_DECL_SYCL
3153
+ #define GGML_COMMON_IMPL_SYCL
3154
+ #include "ggml-common.h"
3147
3155
 
3148
3156
  static int g_ggml_sycl_debug=0;
3149
3157
  #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
@@ -3310,66 +3318,6 @@ typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0,
3310
3318
  const float *src1_dd, float *dst_dd,
3311
3319
  const dpct::queue_ptr &main_stream);
3312
3320
 
3313
- // QK = number of values after dequantization
3314
- // QR = QK / number of values before dequantization
3315
- // QI = number of 32 bit integers before dequantization
3316
-
3317
- #define QK4_0 32
3318
- #define QR4_0 2
3319
- #define QI4_0 (QK4_0 / (4 * QR4_0))
3320
- typedef struct dpct_type_block_q4_0 {
3321
- sycl::half d; // delta
3322
- uint8_t qs[QK4_0 / 2]; // nibbles / quants
3323
- } block_q4_0;
3324
- static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
3325
-
3326
- #define QK4_1 32
3327
- #define QR4_1 2
3328
- #define QI4_1 (QK4_1 / (4 * QR4_1))
3329
- typedef struct dpct_type_block_q4_1 {
3330
- sycl::half2 dm; // dm.x = delta, dm.y = min
3331
- uint8_t qs[QK4_1 / 2]; // nibbles / quants
3332
- } block_q4_1;
3333
- static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
3334
-
3335
- #define QK5_0 32
3336
- #define QR5_0 2
3337
- #define QI5_0 (QK5_0 / (4 * QR5_0))
3338
- typedef struct dpct_type_block_q5_0 {
3339
- sycl::half d; // delta
3340
- uint8_t qh[4]; // 5-th bit of quants
3341
- uint8_t qs[QK5_0 / 2]; // nibbles / quants
3342
- } block_q5_0;
3343
- static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
3344
-
3345
- #define QK5_1 32
3346
- #define QR5_1 2
3347
- #define QI5_1 (QK5_1 / (4 * QR5_1))
3348
- typedef struct dpct_type_block_q5_1 {
3349
- sycl::half2 dm; // dm.x = delta, dm.y = min
3350
- uint8_t qh[4]; // 5-th bit of quants
3351
- uint8_t qs[QK5_1 / 2]; // nibbles / quants
3352
- } block_q5_1;
3353
- static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
3354
-
3355
- #define QK8_0 32
3356
- #define QR8_0 1
3357
- #define QI8_0 (QK8_0 / (4 * QR8_0))
3358
- typedef struct dpct_type_block_q8_0 {
3359
- sycl::half d; // delta
3360
- int8_t qs[QK8_0]; // quants
3361
- } block_q8_0;
3362
- static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
3363
-
3364
- #define QK8_1 32
3365
- #define QR8_1 1
3366
- #define QI8_1 (QK8_1 / (4 * QR8_1))
3367
- typedef struct dpct_type_block_q8_1 {
3368
- sycl::half2 ds; // ds.x = delta, ds.y = sum
3369
- int8_t qs[QK8_0]; // quants
3370
- } block_q8_1;
3371
- static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
3372
-
3373
3321
  typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
3374
3322
  typedef void (*allocate_tiles_sycl_t)(int **x_ql, sycl::half2 **x_dm,
3375
3323
  int **x_qh, int **x_sc);
@@ -3386,112 +3334,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
3386
3334
  const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms,
3387
3335
  const int &i, const int &j, const int &k);
3388
3336
 
3389
- //================================= k-quants
3390
-
3391
- #ifdef GGML_QKK_64
3392
- #define QK_K 64
3393
- #define K_SCALE_SIZE 4
3394
- #else
3395
- #define QK_K 256
3396
- #define K_SCALE_SIZE 12
3397
- #endif
3398
-
3399
- #define QR2_K 4
3400
- #define QI2_K (QK_K / (4*QR2_K))
3401
- typedef struct dpct_type_block_q2_K {
3402
- uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
3403
- uint8_t qs[QK_K/4]; // quants
3404
- sycl::half2 dm; // super-block scale for quantized scales/mins
3405
- } block_q2_K;
3406
- static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
3407
-
3408
- #define QR3_K 4
3409
- #define QI3_K (QK_K / (4*QR3_K))
3410
- typedef struct dpct_type_block_q3_K {
3411
- uint8_t hmask[QK_K/8]; // quants - high bit
3412
- uint8_t qs[QK_K/4]; // quants - low 2 bits
3413
- #ifdef GGML_QKK_64
3414
- uint8_t scales[2]; // scales, quantized with 8 bits
3415
- #else
3416
- uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
3417
- #endif
3418
- sycl::half d; // super-block scale
3419
- } block_q3_K;
3420
- //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
3421
-
3422
- #define QR4_K 2
3423
- #define QI4_K (QK_K / (4*QR4_K))
3424
- #ifdef GGML_QKK_64
3425
- typedef struct {
3426
- sycl::half dm[2]; // super-block scales/mins
3427
- uint8_t scales[2]; // 4-bit block scales/mins
3428
- uint8_t qs[QK_K/2]; // 4--bit quants
3429
- } block_q4_K;
3430
- static_assert(sizeof(block_q4_K) == sizeof(sycl::half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
3431
- #else
3432
- typedef struct dpct_type_block_q4_K {
3433
- sycl::half2 dm; // super-block scale for quantized scales/mins
3434
- uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
3435
- uint8_t qs[QK_K/2]; // 4--bit quants
3436
- } block_q4_K;
3437
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
3438
- #endif
3439
-
3440
- #define QR5_K 2
3441
- #define QI5_K (QK_K / (4*QR5_K))
3442
- #ifdef GGML_QKK_64
3443
- typedef struct {
3444
- sycl::half d; // super-block scale
3445
- int8_t scales[QK_K/16]; // block scales
3446
- uint8_t qh[QK_K/8]; // quants, high bit
3447
- uint8_t qs[QK_K/2]; // quants, low 4 bits
3448
- } block_q5_K;
3449
- static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
3450
- #else
3451
- typedef struct dpct_type_block_q5_K {
3452
- sycl::half2 dm; // super-block scale for quantized scales/mins
3453
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
3454
- uint8_t qh[QK_K/8]; // quants, high bit
3455
- uint8_t qs[QK_K/2]; // quants, low 4 bits
3456
- } block_q5_K;
3457
- static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
3458
- #endif
3459
-
3460
- #define QR6_K 2
3461
- #define QI6_K (QK_K / (4*QR6_K))
3462
- typedef struct dpct_type_block_q6_K {
3463
- uint8_t ql[QK_K/2]; // quants, lower 4 bits
3464
- uint8_t qh[QK_K/4]; // quants, upper 2 bits
3465
- int8_t scales[QK_K/16]; // scales
3466
- sycl::half d; // delta
3467
- } block_q6_K;
3468
- static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
3469
-
3470
- #define QR2_XXS 8
3471
- #define QI2_XXS (QK_K / (4*QR2_XXS))
3472
- typedef struct dpct_type_block_iq2_xxs {
3473
- sycl::half d;
3474
- uint16_t qs[QK_K/8];
3475
- } block_iq2_xxs;
3476
- static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
3477
-
3478
- #define QR2_XS 8
3479
- #define QI2_XS (QK_K / (4*QR2_XS))
3480
- typedef struct dpct_type_block_iq2_xs {
3481
- sycl::half d;
3482
- uint16_t qs[QK_K/8];
3483
- uint8_t scales[QK_K/32];
3484
- } block_iq2_xs;
3485
- static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
3486
-
3487
- #define QR3_XXS 8
3488
- #define QI3_XXS (QK_K / (4*QR3_XXS))
3489
- typedef struct dpct_type_block_iq3_xxs {
3490
- sycl::half d;
3491
- uint8_t qs[3*(QK_K/8)];
3492
- } block_iq3_xxs;
3493
- static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
3494
-
3495
3337
  #define WARP_SIZE 32
3496
3338
  #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
3497
3339
 
@@ -3609,7 +3451,7 @@ class sycl_gpu_mgr {
3609
3451
  dpct::device_info prop;
3610
3452
  dpct::get_device_info(prop, device);
3611
3453
  if (max_compute_units == prop.get_max_compute_units() &&
3612
- prop.get_major_version() == 1) {
3454
+ is_ext_oneapi_device(device)) {
3613
3455
  gpus.push_back(id);
3614
3456
  devices.push_back(device);
3615
3457
  work_group_size = prop.get_max_work_group_size();
@@ -3642,6 +3484,15 @@ class sycl_gpu_mgr {
3642
3484
  assert(false);
3643
3485
  return -1;
3644
3486
  }
3487
+
3488
+ bool is_ext_oneapi_device(const sycl::device &dev) { // Returns true when dev's backend is one of the ext_oneapi_* backends tested below, false otherwise.
3489
+ sycl::backend dev_backend = dev.get_backend(); // backend through which this device is exposed
3490
+ if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
3491
+ dev_backend == sycl::backend::ext_oneapi_cuda ||
3492
+ dev_backend == sycl::backend::ext_oneapi_hip)
3493
+ return true; // Level Zero, CUDA, or HIP backend
3494
+ return false; // every other backend is rejected
3495
+ }
3645
3496
  };
3646
3497
 
3647
3498
  static sycl_gpu_mgr *g_sycl_gpu_mgr = NULL;
@@ -4745,388 +4596,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
4745
4596
  #endif
4746
4597
  }
4747
4598
 
4748
- static dpct::global_memory<const uint64_t, 1>
4749
- iq2xxs_grid(sycl::range<1>(256),
4750
- {
4751
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919,
4752
- 0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819,
4753
- 0x0808080808191908, 0x08080808082b0808, 0x08080808082b082b,
4754
- 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
4755
- 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08,
4756
- 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
4757
- 0x080808082b08082b, 0x080808082b082b2b, 0x080808082b2b082b,
4758
- 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
4759
- 0x0808081908191919, 0x0808081919080808, 0x080808192b081908,
4760
- 0x080808192b192b08, 0x0808082b08080808, 0x0808082b0808082b,
4761
- 0x0808082b082b082b, 0x0808082b2b08082b, 0x0808190808080819,
4762
- 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
4763
- 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b,
4764
- 0x0808190819082b08, 0x08081908192b0808, 0x080819082b080819,
4765
- 0x080819082b081908, 0x080819082b190808, 0x080819082b2b1908,
4766
- 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
4767
- 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19,
4768
- 0x080819192b080808, 0x080819192b190819, 0x0808192b08082b19,
4769
- 0x0808192b08190808, 0x0808192b19080808, 0x0808192b2b081908,
4770
- 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
4771
- 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08,
4772
- 0x08082b0819080819, 0x08082b0819081908, 0x08082b0819190808,
4773
- 0x08082b081919082b, 0x08082b082b082b08, 0x08082b1908081908,
4774
- 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
4775
- 0x0819080808080819, 0x0819080808081908, 0x0819080808190808,
4776
- 0x08190808082b0819, 0x0819080819080808, 0x08190808192b0808,
4777
- 0x081908082b081908, 0x081908082b190808, 0x081908082b191919,
4778
- 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
4779
- 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808,
4780
- 0x0819082b082b1908, 0x0819082b19081919, 0x0819190808080808,
4781
- 0x0819190808082b08, 0x08191908082b0808, 0x08191908082b1919,
4782
- 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
4783
- 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b,
4784
- 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
4785
- 0x08192b0819080808, 0x08192b082b080819, 0x08192b1908080808,
4786
- 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
4787
- 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b,
4788
- 0x082b080819081908, 0x082b0808192b0819, 0x082b08082b080808,
4789
- 0x082b08082b08082b, 0x082b0819082b2b19, 0x082b081919082b08,
4790
- 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
4791
- 0x082b190808081908, 0x082b190808190808, 0x082b190819080808,
4792
- 0x082b19081919192b, 0x082b191908080808, 0x082b191919080819,
4793
- 0x082b1919192b1908, 0x082b192b2b190808, 0x082b2b0808082b08,
4794
- 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
4795
- 0x1908080808080819, 0x1908080808081908, 0x1908080808190808,
4796
- 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
4797
- 0x1908080819080808, 0x1908080819082b08, 0x190808081919192b,
4798
- 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
4799
- 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808,
4800
- 0x19080819192b0819, 0x190808192b080808, 0x190808192b081919,
4801
- 0x1908082b08080819, 0x1908082b08190808, 0x1908082b19082b08,
4802
- 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
4803
- 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808,
4804
- 0x190819082b192b19, 0x190819190819082b, 0x19081919082b1908,
4805
- 0x1908192b08080808, 0x19082b0808080819, 0x19082b0808081908,
4806
- 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
4807
- 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819,
4808
- 0x19082b192b08082b, 0x19082b2b19081919, 0x19082b2b2b190808,
4809
- 0x1919080808080808, 0x1919080808082b08, 0x1919080808190819,
4810
- 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
4811
- 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b,
4812
- 0x191908192b2b1908, 0x1919082b2b190819, 0x191919082b190808,
4813
- 0x191919082b19082b, 0x1919191908082b2b, 0x1919192b08080819,
4814
- 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
4815
- 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808,
4816
- 0x19192b2b08082b08, 0x192b080808081908, 0x192b080808190808,
4817
- 0x192b080819080808, 0x192b0808192b2b08, 0x192b081908080808,
4818
- 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
4819
- 0x192b190808080808, 0x192b190808081919, 0x192b191908190808,
4820
- 0x192b19190819082b, 0x192b19192b081908, 0x192b2b081908082b,
4821
- 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808082b2b,
4822
- 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
4823
- 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819,
4824
- 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808,
4825
- 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808,
4826
- 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
4827
- 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808,
4828
- 0x2b082b080808082b, 0x2b082b1908081908, 0x2b082b2b08190819,
4829
- 0x2b19080808081908, 0x2b19080808190808, 0x2b190808082b1908,
4830
- 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
4831
- 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808,
4832
- 0x2b191908082b082b, 0x2b19190819081908, 0x2b19191919190819,
4833
- 0x2b192b082b080819, 0x2b192b19082b0808, 0x2b2b08080808082b,
4834
- 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
4835
- 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808,
4836
- 0x2b2b2b1908081908,
4837
- });
4838
-
4839
- static dpct::global_memory<const uint64_t, 1>
4840
- iq2xs_grid(sycl::range<1>(512),
4841
- {
4842
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919,
4843
- 0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819,
4844
- 0x0808080808191908, 0x080808080819192b, 0x0808080808192b19,
4845
- 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
4846
- 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908,
4847
- 0x080808081908192b, 0x0808080819082b19, 0x0808080819190808,
4848
- 0x080808081919082b, 0x0808080819191919, 0x0808080819192b08,
4849
- 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
4850
- 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08,
4851
- 0x080808082b190819, 0x080808082b191908, 0x080808082b192b19,
4852
- 0x080808082b2b0808, 0x0808081908080819, 0x0808081908081908,
4853
- 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
4854
- 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08,
4855
- 0x0808081908192b2b, 0x08080819082b0819, 0x08080819082b1908,
4856
- 0x0808081919080808, 0x080808191908082b, 0x0808081919081919,
4857
- 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
4858
- 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819,
4859
- 0x080808192b081908, 0x080808192b190808, 0x0808082b08080808,
4860
- 0x0808082b0808082b, 0x0808082b08081919, 0x0808082b08082b08,
4861
- 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
4862
- 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808,
4863
- 0x0808082b19191919, 0x0808082b2b080808, 0x0808082b2b082b2b,
4864
- 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
4865
- 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
4866
- 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819,
4867
- 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b,
4868
- 0x0808190819081919, 0x0808190819082b08, 0x0808190819190819,
4869
- 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
4870
- 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
4871
- 0x0808191908080808, 0x080819190808082b, 0x0808191908081919,
4872
- 0x0808191908082b08, 0x0808191908190819, 0x0808191908191908,
4873
- 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
4874
- 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808,
4875
- 0x0808192b08080819, 0x0808192b08081908, 0x0808192b08190808,
4876
- 0x0808192b082b192b, 0x0808192b19080808, 0x0808192b1908082b,
4877
- 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
4878
- 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b,
4879
- 0x08082b0808190819, 0x08082b0808191908, 0x08082b08082b0808,
4880
- 0x08082b08082b1919, 0x08082b0819080819, 0x08082b0819081908,
4881
- 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
4882
- 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819,
4883
- 0x08082b1908081908, 0x08082b1908190808, 0x08082b1919080808,
4884
- 0x08082b192b080819, 0x08082b192b082b19, 0x08082b2b08080808,
4885
- 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
4886
- 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908,
4887
- 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808,
4888
- 0x081908080819082b, 0x0819080808191919, 0x0819080808192b08,
4889
- 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
4890
- 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08,
4891
- 0x0819080819190819, 0x0819080819191908, 0x08190808192b0808,
4892
- 0x08190808192b2b2b, 0x081908082b080819, 0x081908082b081908,
4893
- 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
4894
- 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819,
4895
- 0x0819081908191908, 0x08190819082b0808, 0x0819081919080819,
4896
- 0x0819081919081908, 0x0819081919190808, 0x081908192b080808,
4897
- 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
4898
- 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808,
4899
- 0x0819082b19080808, 0x0819082b192b0808, 0x0819190808080808,
4900
- 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
4901
- 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
4902
- 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19,
4903
- 0x0819190819190808, 0x08191908192b1908, 0x081919082b080808,
4904
- 0x0819191908080819, 0x0819191908081908, 0x0819191908190808,
4905
- 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
4906
- 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908,
4907
- 0x08192b0808190808, 0x08192b080819082b, 0x08192b0819080808,
4908
- 0x08192b0819191908, 0x08192b082b08192b, 0x08192b1908080808,
4909
- 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
4910
- 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b,
4911
- 0x082b080808081919, 0x082b080808082b08, 0x082b080808082b2b,
4912
- 0x082b080808190819, 0x082b080808191908, 0x082b0808082b0808,
4913
- 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
4914
- 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819,
4915
- 0x082b081908081908, 0x082b081908190808, 0x082b081919080808,
4916
- 0x082b081919082b08, 0x082b0819192b1919, 0x082b082b08080808,
4917
- 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
4918
- 0x082b190808080819, 0x082b190808081908, 0x082b190808190808,
4919
- 0x082b1908082b2b19, 0x082b190819080808, 0x082b191908080808,
4920
- 0x082b191919080819, 0x082b19191919082b, 0x082b19192b192b19,
4921
- 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
4922
- 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b,
4923
- 0x082b2b08082b0808, 0x082b2b0819191919, 0x082b2b082b082b08,
4924
- 0x082b2b082b2b082b, 0x082b2b19192b2b08, 0x082b2b192b190808,
4925
- 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
4926
- 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
4927
- 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19,
4928
- 0x1908080808190808, 0x190808080819082b, 0x1908080808191919,
4929
- 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
4930
- 0x1908080819080808, 0x190808081908082b, 0x1908080819081919,
4931
- 0x1908080819082b08, 0x1908080819082b2b, 0x1908080819190819,
4932
- 0x1908080819191908, 0x19080808192b0808, 0x19080808192b1919,
4933
- 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
4934
- 0x1908081908080808, 0x190808190808082b, 0x1908081908081919,
4935
- 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
4936
- 0x19080819082b0808, 0x1908081919080819, 0x1908081919081908,
4937
- 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
4938
- 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908,
4939
- 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b082b2b19,
4940
- 0x1908082b19080808, 0x1908190808080808, 0x190819080808082b,
4941
- 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
4942
- 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808,
4943
- 0x1908190819080819, 0x1908190819081908, 0x1908190819190808,
4944
- 0x190819082b080808, 0x190819082b191908, 0x1908191908080819,
4945
- 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
4946
- 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808,
4947
- 0x1908192b08082b2b, 0x1908192b19081908, 0x1908192b19190808,
4948
- 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808190808,
4949
- 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
4950
- 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819,
4951
- 0x19082b1919081908, 0x19082b1919190808, 0x19082b19192b2b19,
4952
- 0x19082b2b08081908, 0x1919080808080808, 0x191908080808082b,
4953
- 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
4954
- 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08,
4955
- 0x1919080819080819, 0x1919080819081908, 0x1919080819190808,
4956
- 0x191908082b080808, 0x1919081908080819, 0x1919081908081908,
4957
- 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
4958
- 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908,
4959
- 0x1919082b2b2b2b2b, 0x1919190808080819, 0x1919190808081908,
4960
- 0x1919190808190808, 0x19191908082b0819, 0x1919190819080808,
4961
- 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
4962
- 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808,
4963
- 0x191919192b082b08, 0x1919192b082b0819, 0x1919192b192b2b08,
4964
- 0x1919192b2b2b0819, 0x19192b0808080808, 0x19192b0808191908,
4965
- 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
4966
- 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b,
4967
- 0x19192b2b2b081919, 0x192b080808080819, 0x192b080808081908,
4968
- 0x192b080808190808, 0x192b080819080808, 0x192b080819191908,
4969
- 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
4970
- 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b,
4971
- 0x192b082b2b19082b, 0x192b190808080808, 0x192b19080819192b,
4972
- 0x192b191908190808, 0x192b191919080808, 0x192b191919081919,
4973
- 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
4974
- 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908,
4975
- 0x192b2b2b192b082b, 0x2b08080808080808, 0x2b0808080808082b,
4976
- 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
4977
- 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
4978
- 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808,
4979
- 0x2b0808082b080808, 0x2b0808082b08082b, 0x2b0808082b2b2b08,
4980
- 0x2b0808082b2b2b2b, 0x2b08081908080819, 0x2b08081908081908,
4981
- 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
4982
- 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808,
4983
- 0x2b08082b082b0808, 0x2b08082b2b080808, 0x2b08082b2b08082b,
4984
- 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, 0x2b08190808080819,
4985
- 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
4986
- 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808,
4987
- 0x2b0819082b082b19, 0x2b08191908080808, 0x2b08191919081908,
4988
- 0x2b0819192b2b1919, 0x2b08192b08192b08, 0x2b08192b192b2b2b,
4989
- 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
4990
- 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b,
4991
- 0x2b082b082b2b2b08, 0x2b082b190808192b, 0x2b082b2b082b082b,
4992
- 0x2b082b2b2b080808, 0x2b082b2b2b082b08, 0x2b082b2b2b19192b,
4993
- 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
4994
- 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b,
4995
- 0x2b1908082b081908, 0x2b19081908080808, 0x2b190819082b082b,
4996
- 0x2b190819192b1908, 0x2b19082b1919192b, 0x2b19082b2b082b19,
4997
- 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
4998
- 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19,
4999
- 0x2b1919192b190808, 0x2b1919192b19082b, 0x2b19192b19080819,
5000
- 0x2b192b0819190819, 0x2b192b082b2b192b, 0x2b192b1919082b19,
5001
- 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
5002
- 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b,
5003
- 0x2b2b0808082b0808, 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808,
5004
- 0x2b2b081919190819, 0x2b2b081919192b19, 0x2b2b08192b2b192b,
5005
- 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
5006
- 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808,
5007
- 0x2b2b190819080808, 0x2b2b19082b191919, 0x2b2b192b192b1919,
5008
- 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, 0x2b2b2b08082b0808,
5009
- 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
5010
- 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908,
5011
- 0x2b2b2b192b08192b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b,
5012
- 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
5013
- });
5014
-
5015
- static dpct::global_memory<const uint32_t, 1> iq3xxs_grid(
5016
- sycl::range<1>(256),
5017
- {
5018
- 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e,
5019
- 0x04041404, 0x04041414, 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c,
5020
- 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, 0x040c140c, 0x040c142c,
5021
- 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
5022
- 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c,
5023
- 0x04141c1c, 0x04141c3e, 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c,
5024
- 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, 0x041c3e04, 0x04240c1c,
5025
- 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
5026
- 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04,
5027
- 0x043e0c24, 0x043e0c34, 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c,
5028
- 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, 0x0c041c04, 0x0c041c14,
5029
- 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
5030
- 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14,
5031
- 0x0c14140c, 0x0c141c04, 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404,
5032
- 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, 0x0c24042c, 0x0c242c04,
5033
- 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
5034
- 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404,
5035
- 0x14041414, 0x14041434, 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c,
5036
- 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, 0x140c1c04, 0x140c341c,
5037
- 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
5038
- 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c,
5039
- 0x141c0c04, 0x141c0c24, 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c,
5040
- 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, 0x143e040c, 0x143e041c,
5041
- 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
5042
- 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414,
5043
- 0x1c0c1404, 0x1c0c1c0c, 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c,
5044
- 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, 0x1c1c0c0c, 0x1c1c1c1c,
5045
- 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
5046
- 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404,
5047
- 0x24040424, 0x24040c3e, 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e,
5048
- 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, 0x24143404, 0x24143434,
5049
- 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
5050
- 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04,
5051
- 0x2c040c14, 0x2c04240c, 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434,
5052
- 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, 0x2c1c0414, 0x2c1c2c1c,
5053
- 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
5054
- 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434,
5055
- 0x34043424, 0x340c140c, 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04,
5056
- 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, 0x34341c1c, 0x343e041c,
5057
- 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
5058
- 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14,
5059
- 0x3e1c0404, 0x3e1c0c2c, 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c,
5060
- 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
5061
- });
5062
-
5063
- static dpct::global_memory<const uint8_t, 1> ksigns_iq2xs(
5064
- sycl::range<1>(128),
5065
- {
5066
- 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12,
5067
- 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153,
5068
- 154, 27, 156, 29, 30, 159, 160, 33, 34, 163, 36, 165, 166,
5069
- 39, 40, 169, 170, 43, 172, 45, 46, 175, 48, 177, 178, 51,
5070
- 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, 192,
5071
- 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77,
5072
- 78, 207, 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90,
5073
- 219, 92, 221, 222, 95, 96, 225, 226, 99, 228, 101, 102, 231,
5074
- 232, 105, 106, 235, 108, 237, 238, 111, 240, 113, 114, 243, 116,
5075
- 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
5076
- });
5077
-
5078
- static dpct::global_memory<const uint64_t, 1>
5079
- ksigns64(sycl::range<1>(128),
5080
- {
5081
- 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00,
5082
- 0x000000000000ffff, 0xff00000000ff0000, 0x0000000000ff00ff,
5083
- 0x0000000000ffff00, 0xff00000000ffffff, 0xff000000ff000000,
5084
- 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
5085
- 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00,
5086
- 0x00000000ffffffff, 0xff0000ff00000000, 0x000000ff000000ff,
5087
- 0x000000ff0000ff00, 0xff0000ff0000ffff, 0x000000ff00ff0000,
5088
- 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
5089
- 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00,
5090
- 0x000000ffff00ffff, 0xff0000ffffff0000, 0x000000ffffff00ff,
5091
- 0x000000ffffffff00, 0xff0000ffffffffff, 0xff00ff0000000000,
5092
- 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
5093
- 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00,
5094
- 0x0000ff0000ffffff, 0x0000ff00ff000000, 0xff00ff00ff0000ff,
5095
- 0xff00ff00ff00ff00, 0x0000ff00ff00ffff, 0xff00ff00ffff0000,
5096
- 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
5097
- 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00,
5098
- 0x0000ffff0000ffff, 0xff00ffff00ff0000, 0x0000ffff00ff00ff,
5099
- 0x0000ffff00ffff00, 0xff00ffff00ffffff, 0xff00ffffff000000,
5100
- 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
5101
- 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00,
5102
- 0x0000ffffffffffff, 0xffff000000000000, 0x00ff0000000000ff,
5103
- 0x00ff00000000ff00, 0xffff00000000ffff, 0x00ff000000ff0000,
5104
- 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
5105
- 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00,
5106
- 0x00ff0000ff00ffff, 0xffff0000ffff0000, 0x00ff0000ffff00ff,
5107
- 0x00ff0000ffffff00, 0xffff0000ffffffff, 0x00ff00ff00000000,
5108
- 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
5109
- 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
5110
- 0xffff00ff00ffffff, 0xffff00ffff000000, 0x00ff00ffff0000ff,
5111
- 0x00ff00ffff00ff00, 0xffff00ffff00ffff, 0x00ff00ffffff0000,
5112
- 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
5113
- 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00,
5114
- 0x00ffff000000ffff, 0xffffff0000ff0000, 0x00ffff0000ff00ff,
5115
- 0x00ffff0000ffff00, 0xffffff0000ffffff, 0xffffff00ff000000,
5116
- 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
5117
- 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00,
5118
- 0x00ffff00ffffffff, 0xffffffff00000000, 0x00ffffff000000ff,
5119
- 0x00ffffff0000ff00, 0xffffffff0000ffff, 0x00ffffff00ff0000,
5120
- 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
5121
- 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00,
5122
- 0x00ffffffff00ffff, 0xffffffffffff0000, 0x00ffffffffff00ff,
5123
- 0x00ffffffffffff00, 0xffffffffffffffff,
5124
- });
5125
- //#endif
5126
-
5127
- static dpct::global_memory<const uint8_t, 1>
5128
- kmask_iq2xs(sycl::range<1>(8), {1, 2, 4, 8, 16, 32, 64, 128});
5129
-
5130
4599
  template<typename dst_t>
5131
4600
  static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
5132
4601
  const sycl::nd_item<3> &item_ct1,
@@ -5213,6 +4682,65 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
5213
4682
 
5214
4683
  }
5215
4684
 
4685
+ template<typename dst_t>
4686
+ static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
4687
+ const sycl::nd_item<3> &item_ct1,
4688
+ const uint32_t *iq3s_grid,
4689
+ const uint8_t *ksigns_iq2xs,
4690
+ const uint8_t *kmask_iq2xs) {
4691
+
4692
+ const int i = item_ct1.get_group(2);
4693
+ const block_iq3_s * x = (const block_iq3_s *) vx;
4694
+
4695
+ const int tid = item_ct1.get_local_id(2);
4696
+ #if QK_K == 256
4697
+ const int il = tid/8; // 0...3
4698
+ const int ib = tid%8; // 0...7
4699
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
4700
+ const uint8_t * qs = x[i].qs + 8*ib;
4701
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + qs[2*il+0]);
4702
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + qs[2*il+1]);
4703
+ const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
4704
+ const uint8_t signs = x[i].signs[4*ib + il];
4705
+ for (int j = 0; j < 4; ++j) {
4706
+ y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
4707
+ y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
4708
+ }
4709
+ #else
4710
+ assert(false);
4711
+ #endif
4712
+
4713
+ }
4714
+
4715
+ template<typename dst_t>
4716
+ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
4717
+ const sycl::nd_item<3> &item_ct1,
4718
+ const uint32_t *iq1s_grid,
4719
+ const uint8_t *ksigns_iq2xs,
4720
+ const uint8_t *kmask_iq2xs) {
4721
+ const int i = item_ct1.get_group(2);
4722
+ const block_iq1_s * x = (const block_iq1_s *) vx;
4723
+
4724
+ const int tid = item_ct1.get_local_id(2);
4725
+ #if QK_K == 256
4726
+ const int il = tid/8; // 0...3
4727
+ const int ib = tid%8; // 0...7
4728
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
4729
+ const uint8_t * qs = x[i].qs + 8*ib;
4730
+ const uint8_t * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]);
4731
+ const uint8_t * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]);
4732
+ const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 0xf) + 1);
4733
+ const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7];
4734
+ for (int j = 0; j < 4; ++j) {
4735
+ y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
4736
+ y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
4737
+ }
4738
+ #else
4739
+ assert(false);
4740
+ #endif
4741
+
4742
+ }
4743
+
5216
4744
  /*
5217
4745
  DPCT1110:4: The total declared local variable size in device function
5218
4746
  dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
@@ -8059,6 +7587,75 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
8059
7587
  #endif
8060
7588
  }
8061
7589
 
7590
+ static __dpct_inline__ float
7591
+ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
7592
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs,
7593
+ const uint32_t *iq3s_grid, const uint64_t *ksigns64) {
7594
+ #if DPCT_COMPATIBILITY_TEMP >= \
7595
+ MIN_CC_DP4A // lowest compute capability for integer intrinsics
7596
+ #if QK_K == 256
7597
+ const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
7598
+
7599
+ const int ib32 = iqs;
7600
+ const uint8_t * qs = bq2->qs + 8*ib32;
7601
+ const int8_t * q8 = bq8_1[ib32].qs;
7602
+ int sumi = 0;
7603
+ for (int l = 0; l < 4; ++l) {
7604
+ const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
7605
+ const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
7606
+ uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
7607
+ ((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
7608
+ uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
7609
+ ((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
7610
+ const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
7611
+ grid1[0] ^ signs0, signs0, std::minus<>());
7612
+ const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
7613
+ grid2[0] ^ signs1, signs1, std::minus<>());
7614
+ sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
7615
+ sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
7616
+ q8 += 8;
7617
+ }
7618
+ const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * bq8_1[ib32].ds[0];
7619
+ return d * sumi;
7620
+ #else
7621
+ assert(false);
7622
+ return 0.f;
7623
+ #endif
7624
+ #else
7625
+ assert(false);
7626
+ return 0.f;
7627
+ #endif
7628
+ }
7629
+
7630
+ static __dpct_inline__ float
7631
+ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
7632
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs,
7633
+ const uint32_t *iq1s_grid, const uint64_t *ksigns64) {
7634
+ #if QK_K == 256
7635
+ const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
7636
+
7637
+ const int ib32 = iqs;
7638
+ const uint8_t * qs = bq1->qs + 4*ib32;
7639
+ const int8_t * q8 = bq8_1[ib32].qs;
7640
+ int sumi = 0;
7641
+ for (int l = 0; l < 4; ++l) {
7642
+ const uint32_t * grid = (const uint32_t *)(iq1s_grid + qs[l]);
7643
+ const uint32_t * signs = (const uint32_t *)(ksigns64 + (qs[l] >> 8));
7644
+ const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
7645
+ grid[0] ^ signs[0], signs[0], std::minus<>());
7646
+ const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
7647
+ grid[1] ^ signs[1], signs[1], std::minus<>());
7648
+ sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
7649
+ sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
7650
+ q8 += 8;
7651
+ }
7652
+ const float d = (float)bq1->d * bq8_1[ib32].ds[0] * 0.25f;
7653
+ return d * sumi;
7654
+ #else
7655
+ assert(false);
7656
+ return 0.f;
7657
+ #endif
7658
+ }
8062
7659
 
8063
7660
  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
8064
7661
  int mmq_y, int nwarps, load_tiles_sycl_t load_tiles, int vdr,
@@ -8824,6 +8421,98 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void
8824
8421
  }
8825
8422
  }
8826
8423
 
8424
+ template <int qk, int qi, typename block_q_t, int vdr>
8425
+ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8426
+ const sycl::nd_item<3> &item_ct1,
8427
+ const uint32_t *iq3s_grid_ptr, const uint64_t *ksigns64_ptr ) {
8428
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8429
+ item_ct1.get_local_id(1);
8430
+
8431
+ if (row >= nrows) {
8432
+ return;
8433
+ }
8434
+
8435
+ const int blocks_per_row = ncols / qk;
8436
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
8437
+
8438
+ // partial sum for each thread
8439
+ float tmp = 0.0f;
8440
+
8441
+ const block_q_t * x = (const block_q_t *) vx;
8442
+ const block_q8_1 * y = (const block_q8_1 *) vy;
8443
+
8444
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
8445
+ i += blocks_per_warp) {
8446
+ const int ibx = row*blocks_per_row + i; // x block index
8447
+
8448
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
8449
+
8450
+ const int iqs =
8451
+ vdr *
8452
+ (item_ct1.get_local_id(2) %
8453
+ (qi / vdr)); // x block quant index when casting the quants to int
8454
+
8455
+ tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid_ptr, ksigns64_ptr);
8456
+ }
8457
+
8458
+ // sum up partial sums and write back result
8459
+ #pragma unroll
8460
+ for (int mask = 16; mask > 0; mask >>= 1) {
8461
+ tmp +=
8462
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
8463
+ }
8464
+
8465
+ if (item_ct1.get_local_id(2) == 0) {
8466
+ dst[row] = tmp;
8467
+ }
8468
+ }
8469
+
8470
+ template <int qk, int qi, typename block_q_t, int vdr>
8471
+ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8472
+ const sycl::nd_item<3> &item_ct1,
8473
+ const uint32_t *iq1s_grid_ptr, const uint64_t *ksigns64_ptr ) {
8474
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8475
+ item_ct1.get_local_id(1);
8476
+
8477
+ if (row >= nrows) {
8478
+ return;
8479
+ }
8480
+
8481
+ const int blocks_per_row = ncols / qk;
8482
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
8483
+
8484
+ // partial sum for each thread
8485
+ float tmp = 0.0f;
8486
+
8487
+ const block_q_t * x = (const block_q_t *) vx;
8488
+ const block_q8_1 * y = (const block_q8_1 *) vy;
8489
+
8490
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
8491
+ i += blocks_per_warp) {
8492
+ const int ibx = row*blocks_per_row + i; // x block index
8493
+
8494
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
8495
+
8496
+ const int iqs =
8497
+ vdr *
8498
+ (item_ct1.get_local_id(2) %
8499
+ (qi / vdr)); // x block quant index when casting the quants to int
8500
+
8501
+ tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_ptr, ksigns64_ptr);
8502
+ }
8503
+
8504
+ // sum up partial sums and write back result
8505
+ #pragma unroll
8506
+ for (int mask = 16; mask > 0; mask >>= 1) {
8507
+ tmp +=
8508
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
8509
+ }
8510
+
8511
+ if (item_ct1.get_local_id(2) == 0) {
8512
+ dst[row] = tmp;
8513
+ }
8514
+ }
8515
+
8827
8516
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
8828
8517
  static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
8829
8518
  const sycl::nd_item<3> &item_ct1) {
@@ -10509,6 +10198,64 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
10509
10198
  }
10510
10199
  }
10511
10200
 
10201
+ template <typename dst_t>
10202
+ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
10203
+ dpct::queue_ptr stream) {
10204
+ const int nb = k / QK_K;
10205
+ {
10206
+ iq3s_grid.init(*stream);
10207
+ ksigns_iq2xs.init(*stream);
10208
+ kmask_iq2xs.init(*stream);
10209
+
10210
+ dpct::has_capability_or_fail(stream->get_device(),
10211
+ {sycl::aspect::fp16});
10212
+
10213
+ stream->submit([&](sycl::handler &cgh) {
10214
+ auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
10215
+ auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10216
+ auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
10217
+
10218
+ cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10219
+ sycl::range<3>(1, 1, 32),
10220
+ sycl::range<3>(1, 1, 32)),
10221
+ [=](sycl::nd_item<3> item_ct1) {
10222
+ dequantize_block_iq3_s(
10223
+ vx, y, item_ct1, iq3s_grid_ptr_ct1,
10224
+ ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
10225
+ });
10226
+ });
10227
+ }
10228
+ }
10229
+
10230
+ template <typename dst_t>
10231
+ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
10232
+ dpct::queue_ptr stream) {
10233
+ const int nb = k / QK_K;
10234
+ {
10235
+ iq1s_grid_gpu.init(*stream);
10236
+ ksigns_iq2xs.init(*stream);
10237
+ kmask_iq2xs.init(*stream);
10238
+
10239
+ dpct::has_capability_or_fail(stream->get_device(),
10240
+ {sycl::aspect::fp16});
10241
+
10242
+ stream->submit([&](sycl::handler &cgh) {
10243
+ auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
10244
+ auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10245
+ auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
10246
+
10247
+ cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10248
+ sycl::range<3>(1, 1, 32),
10249
+ sycl::range<3>(1, 1, 32)),
10250
+ [=](sycl::nd_item<3> item_ct1) {
10251
+ dequantize_block_iq1_s(
10252
+ vx, y, item_ct1, iq1s_grid_ptr_ct1,
10253
+ ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
10254
+ });
10255
+ });
10256
+ }
10257
+ }
10258
+
10512
10259
  template <typename src_t, typename dst_t>
10513
10260
  static void convert_unary_sycl(const void *__restrict__ vx,
10514
10261
  dst_t *__restrict__ y, const int k,
@@ -10559,6 +10306,10 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try {
10559
10306
  return dequantize_row_iq2_xs_sycl;
10560
10307
  case GGML_TYPE_IQ3_XXS:
10561
10308
  return dequantize_row_iq3_xxs_sycl;
10309
+ case GGML_TYPE_IQ3_S:
10310
+ return dequantize_row_iq3_s_sycl;
10311
+ case GGML_TYPE_IQ1_S:
10312
+ return dequantize_row_iq1_s_sycl;
10562
10313
  case GGML_TYPE_F32:
10563
10314
  return convert_unary_sycl<float>;
10564
10315
  default:
@@ -10599,6 +10350,10 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
10599
10350
  return dequantize_row_iq2_xs_sycl;
10600
10351
  case GGML_TYPE_IQ3_XXS:
10601
10352
  return dequantize_row_iq3_xxs_sycl;
10353
+ case GGML_TYPE_IQ3_S:
10354
+ return dequantize_row_iq3_s_sycl;
10355
+ case GGML_TYPE_IQ1_S:
10356
+ return dequantize_row_iq1_s_sycl;
10602
10357
  case GGML_TYPE_F16:
10603
10358
  return convert_unary_sycl<sycl::half>;
10604
10359
  default:
@@ -11188,6 +10943,61 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
11188
10943
  }
11189
10944
  }
11190
10945
 
10946
+ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
10947
+ float *dst, const int ncols,
10948
+ const int nrows,
10949
+ dpct::queue_ptr stream) {
10950
+ GGML_ASSERT(ncols % QK_K == 0);
10951
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
10952
+ const sycl::range<3> block_nums(1, 1, block_num_y);
10953
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10954
+ {
10955
+ iq3s_grid.init(*stream);
10956
+ ksigns64.init(*stream);
10957
+
10958
+ stream->submit([&](sycl::handler &cgh) {
10959
+ auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
10960
+ auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10961
+
10962
+ cgh.parallel_for(
10963
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
10964
+ [=](sycl::nd_item<3> item_ct1)
10965
+ [[intel::reqd_sub_group_size(32)]] {
10966
+ mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
10967
+ vx, vy, dst, ncols, nrows, item_ct1,
10968
+ iq3s_grid_ptr_ct1, ksigns64_ptr_ct1);
10969
+ });
10970
+ });
10971
+ }
10972
+ }
10973
+
10974
+ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
10975
+ float *dst, const int ncols,
10976
+ const int nrows,
10977
+ dpct::queue_ptr stream) {
10978
+ GGML_ASSERT(ncols % QK_K == 0);
10979
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
10980
+ const sycl::range<3> block_nums(1, 1, block_num_y);
10981
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10982
+ {
10983
+ iq1s_grid_gpu.init(*stream);
10984
+ ksigns64.init(*stream);
10985
+
10986
+ stream->submit([&](sycl::handler &cgh) {
10987
+ auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
10988
+ auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10989
+
10990
+ cgh.parallel_for(
10991
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
10992
+ [=](sycl::nd_item<3> item_ct1)
10993
+ [[intel::reqd_sub_group_size(32)]] {
10994
+ mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
10995
+ vx, vy, dst, ncols, nrows, item_ct1,
10996
+ iq1s_grid_ptr_ct1, ksigns64_ptr_ct1);
10997
+ });
10998
+ });
10999
+ }
11000
+ }
11191
11001
 
11192
11002
  static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
11193
11003
  float *dst, const int ncols_x,
@@ -13936,8 +13746,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
13936
13746
  case GGML_TYPE_Q5_K:
13937
13747
  case GGML_TYPE_IQ2_XXS:
13938
13748
  case GGML_TYPE_IQ2_XS:
13749
+ case GGML_TYPE_IQ1_S:
13939
13750
  case GGML_TYPE_IQ3_XXS:
13940
13751
  return max_compute_capability >= VER_GEN9 ? 128 : 64;
13752
+ case GGML_TYPE_IQ3_S:
13753
+ return max_compute_capability >= VER_GEN9 ? 128 : 64;
13941
13754
  case GGML_TYPE_Q6_K:
13942
13755
  return 64;
13943
13756
  default:
@@ -13998,6 +13811,12 @@ inline void ggml_sycl_op_mul_mat_vec_q(
13998
13811
  case GGML_TYPE_IQ3_XXS:
13999
13812
  mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
14000
13813
  break;
13814
+ case GGML_TYPE_IQ3_S:
13815
+ mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
13816
+ break;
13817
+ case GGML_TYPE_IQ1_S:
13818
+ mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
13819
+ break;
14001
13820
  default:
14002
13821
  GGML_ASSERT(false);
14003
13822
  break;
@@ -17343,9 +17162,8 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
17343
17162
  return false;
17344
17163
  }
17345
17164
  ggml_type a_type = a->type;
17346
- if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
17347
- a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
17348
- a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
17165
+ if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S ||
17166
+ a_type == GGML_TYPE_IQ4_XS) {
17349
17167
  return false;
17350
17168
  }
17351
17169
  return true;
@@ -17440,13 +17258,18 @@ static ggml_backend_i ggml_backend_sycl_interface = {
17440
17258
  /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
17441
17259
  /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
17442
17260
  /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
17443
- /* .cpy_tensor_async = */ ggml_backend_sycl_cpy_tensor_async,
17261
+ /* .cpy_tensor_async = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface
17444
17262
  /* .synchronize = */ ggml_backend_sycl_synchronize,
17445
17263
  /* .graph_plan_create = */ NULL,
17446
17264
  /* .graph_plan_free = */ NULL,
17447
17265
  /* .graph_plan_compute = */ NULL,
17448
17266
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
17449
17267
  /* .supports_op = */ ggml_backend_sycl_supports_op,
17268
+ /* .event_new = */ NULL,
17269
+ /* .event_free = */ NULL,
17270
+ /* .event_record = */ NULL,
17271
+ /* .event_wait = */ NULL,
17272
+ /* .event_synchronize = */ NULL,
17450
17273
  };
17451
17274
 
17452
17275
  static ggml_guid_t ggml_backend_sycl_guid() {