llama_cpp 0.14.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +71 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +9 -0
- data/vendor/tmp/llama.cpp/Makefile +28 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +358 -135
- data/vendor/tmp/llama.cpp/ggml-backend.h +41 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +187 -1033
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +42 -20
- data/vendor/tmp/llama.cpp/ggml-metal.metal +44 -910
- data/vendor/tmp/llama.cpp/ggml-quants.c +457 -1074
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +388 -565
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +6 -39
- data/vendor/tmp/llama.cpp/ggml.c +509 -343
- data/vendor/tmp/llama.cpp/ggml.h +61 -47
- data/vendor/tmp/llama.cpp/llama.cpp +1446 -687
- data/vendor/tmp/llama.cpp/llama.h +25 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
@@ -202,24 +202,29 @@ namespace dpct
|
|
202
202
|
// Version string has the following format:
|
203
203
|
// a. OpenCL<space><major.minor><space><vendor-specific-information>
|
204
204
|
// b. <major.minor>
|
205
|
+
// c. <AmdGcnArchName> e.g gfx1030
|
205
206
|
std::string ver;
|
206
207
|
ver = dev.get_info<sycl::info::device::version>();
|
207
208
|
std::string::size_type i = 0;
|
208
|
-
while (i < ver.size())
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
i++;
|
209
|
+
while (i < ver.size()) {
|
210
|
+
if (isdigit(ver[i]))
|
211
|
+
break;
|
212
|
+
i++;
|
213
213
|
}
|
214
214
|
major = std::stoi(&(ver[i]));
|
215
|
-
while (i < ver.size())
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
215
|
+
while (i < ver.size()) {
|
216
|
+
if (ver[i] == '.')
|
217
|
+
break;
|
218
|
+
i++;
|
219
|
+
}
|
220
|
+
if (i < ver.size()) {
|
221
|
+
// a. and b.
|
222
|
+
i++;
|
223
|
+
minor = std::stoi(&(ver[i]));
|
224
|
+
} else {
|
225
|
+
// c.
|
226
|
+
minor = 0;
|
220
227
|
}
|
221
|
-
i++;
|
222
|
-
minor = std::stoi(&(ver[i]));
|
223
228
|
}
|
224
229
|
|
225
230
|
template <typename tag, typename T>
|
@@ -3144,6 +3149,9 @@ namespace dpct
|
|
3144
3149
|
|
3145
3150
|
} // COPY from DPCT head files
|
3146
3151
|
|
3152
|
+
#define GGML_COMMON_DECL_SYCL
|
3153
|
+
#define GGML_COMMON_IMPL_SYCL
|
3154
|
+
#include "ggml-common.h"
|
3147
3155
|
|
3148
3156
|
static int g_ggml_sycl_debug=0;
|
3149
3157
|
#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
|
@@ -3310,66 +3318,6 @@ typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0,
|
|
3310
3318
|
const float *src1_dd, float *dst_dd,
|
3311
3319
|
const dpct::queue_ptr &main_stream);
|
3312
3320
|
|
3313
|
-
// QK = number of values after dequantization
|
3314
|
-
// QR = QK / number of values before dequantization
|
3315
|
-
// QI = number of 32 bit integers before dequantization
|
3316
|
-
|
3317
|
-
#define QK4_0 32
|
3318
|
-
#define QR4_0 2
|
3319
|
-
#define QI4_0 (QK4_0 / (4 * QR4_0))
|
3320
|
-
typedef struct dpct_type_block_q4_0 {
|
3321
|
-
sycl::half d; // delta
|
3322
|
-
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
3323
|
-
} block_q4_0;
|
3324
|
-
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
3325
|
-
|
3326
|
-
#define QK4_1 32
|
3327
|
-
#define QR4_1 2
|
3328
|
-
#define QI4_1 (QK4_1 / (4 * QR4_1))
|
3329
|
-
typedef struct dpct_type_block_q4_1 {
|
3330
|
-
sycl::half2 dm; // dm.x = delta, dm.y = min
|
3331
|
-
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
3332
|
-
} block_q4_1;
|
3333
|
-
static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
3334
|
-
|
3335
|
-
#define QK5_0 32
|
3336
|
-
#define QR5_0 2
|
3337
|
-
#define QI5_0 (QK5_0 / (4 * QR5_0))
|
3338
|
-
typedef struct dpct_type_block_q5_0 {
|
3339
|
-
sycl::half d; // delta
|
3340
|
-
uint8_t qh[4]; // 5-th bit of quants
|
3341
|
-
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
3342
|
-
} block_q5_0;
|
3343
|
-
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
|
3344
|
-
|
3345
|
-
#define QK5_1 32
|
3346
|
-
#define QR5_1 2
|
3347
|
-
#define QI5_1 (QK5_1 / (4 * QR5_1))
|
3348
|
-
typedef struct dpct_type_block_q5_1 {
|
3349
|
-
sycl::half2 dm; // dm.x = delta, dm.y = min
|
3350
|
-
uint8_t qh[4]; // 5-th bit of quants
|
3351
|
-
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
3352
|
-
} block_q5_1;
|
3353
|
-
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
|
3354
|
-
|
3355
|
-
#define QK8_0 32
|
3356
|
-
#define QR8_0 1
|
3357
|
-
#define QI8_0 (QK8_0 / (4 * QR8_0))
|
3358
|
-
typedef struct dpct_type_block_q8_0 {
|
3359
|
-
sycl::half d; // delta
|
3360
|
-
int8_t qs[QK8_0]; // quants
|
3361
|
-
} block_q8_0;
|
3362
|
-
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
3363
|
-
|
3364
|
-
#define QK8_1 32
|
3365
|
-
#define QR8_1 1
|
3366
|
-
#define QI8_1 (QK8_1 / (4 * QR8_1))
|
3367
|
-
typedef struct dpct_type_block_q8_1 {
|
3368
|
-
sycl::half2 ds; // ds.x = delta, ds.y = sum
|
3369
|
-
int8_t qs[QK8_0]; // quants
|
3370
|
-
} block_q8_1;
|
3371
|
-
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
|
3372
|
-
|
3373
3321
|
typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
|
3374
3322
|
typedef void (*allocate_tiles_sycl_t)(int **x_ql, sycl::half2 **x_dm,
|
3375
3323
|
int **x_qh, int **x_sc);
|
@@ -3386,112 +3334,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
|
|
3386
3334
|
const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms,
|
3387
3335
|
const int &i, const int &j, const int &k);
|
3388
3336
|
|
3389
|
-
//================================= k-quants
|
3390
|
-
|
3391
|
-
#ifdef GGML_QKK_64
|
3392
|
-
#define QK_K 64
|
3393
|
-
#define K_SCALE_SIZE 4
|
3394
|
-
#else
|
3395
|
-
#define QK_K 256
|
3396
|
-
#define K_SCALE_SIZE 12
|
3397
|
-
#endif
|
3398
|
-
|
3399
|
-
#define QR2_K 4
|
3400
|
-
#define QI2_K (QK_K / (4*QR2_K))
|
3401
|
-
typedef struct dpct_type_block_q2_K {
|
3402
|
-
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
3403
|
-
uint8_t qs[QK_K/4]; // quants
|
3404
|
-
sycl::half2 dm; // super-block scale for quantized scales/mins
|
3405
|
-
} block_q2_K;
|
3406
|
-
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
3407
|
-
|
3408
|
-
#define QR3_K 4
|
3409
|
-
#define QI3_K (QK_K / (4*QR3_K))
|
3410
|
-
typedef struct dpct_type_block_q3_K {
|
3411
|
-
uint8_t hmask[QK_K/8]; // quants - high bit
|
3412
|
-
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
3413
|
-
#ifdef GGML_QKK_64
|
3414
|
-
uint8_t scales[2]; // scales, quantized with 8 bits
|
3415
|
-
#else
|
3416
|
-
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
|
3417
|
-
#endif
|
3418
|
-
sycl::half d; // super-block scale
|
3419
|
-
} block_q3_K;
|
3420
|
-
//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
|
3421
|
-
|
3422
|
-
#define QR4_K 2
|
3423
|
-
#define QI4_K (QK_K / (4*QR4_K))
|
3424
|
-
#ifdef GGML_QKK_64
|
3425
|
-
typedef struct {
|
3426
|
-
sycl::half dm[2]; // super-block scales/mins
|
3427
|
-
uint8_t scales[2]; // 4-bit block scales/mins
|
3428
|
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
3429
|
-
} block_q4_K;
|
3430
|
-
static_assert(sizeof(block_q4_K) == sizeof(sycl::half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
3431
|
-
#else
|
3432
|
-
typedef struct dpct_type_block_q4_K {
|
3433
|
-
sycl::half2 dm; // super-block scale for quantized scales/mins
|
3434
|
-
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
3435
|
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
3436
|
-
} block_q4_K;
|
3437
|
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
3438
|
-
#endif
|
3439
|
-
|
3440
|
-
#define QR5_K 2
|
3441
|
-
#define QI5_K (QK_K / (4*QR5_K))
|
3442
|
-
#ifdef GGML_QKK_64
|
3443
|
-
typedef struct {
|
3444
|
-
sycl::half d; // super-block scale
|
3445
|
-
int8_t scales[QK_K/16]; // block scales
|
3446
|
-
uint8_t qh[QK_K/8]; // quants, high bit
|
3447
|
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
3448
|
-
} block_q5_K;
|
3449
|
-
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
3450
|
-
#else
|
3451
|
-
typedef struct dpct_type_block_q5_K {
|
3452
|
-
sycl::half2 dm; // super-block scale for quantized scales/mins
|
3453
|
-
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
3454
|
-
uint8_t qh[QK_K/8]; // quants, high bit
|
3455
|
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
3456
|
-
} block_q5_K;
|
3457
|
-
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
3458
|
-
#endif
|
3459
|
-
|
3460
|
-
#define QR6_K 2
|
3461
|
-
#define QI6_K (QK_K / (4*QR6_K))
|
3462
|
-
typedef struct dpct_type_block_q6_K {
|
3463
|
-
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
3464
|
-
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
3465
|
-
int8_t scales[QK_K/16]; // scales
|
3466
|
-
sycl::half d; // delta
|
3467
|
-
} block_q6_K;
|
3468
|
-
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
|
3469
|
-
|
3470
|
-
#define QR2_XXS 8
|
3471
|
-
#define QI2_XXS (QK_K / (4*QR2_XXS))
|
3472
|
-
typedef struct dpct_type_block_iq2_xxs {
|
3473
|
-
sycl::half d;
|
3474
|
-
uint16_t qs[QK_K/8];
|
3475
|
-
} block_iq2_xxs;
|
3476
|
-
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
|
3477
|
-
|
3478
|
-
#define QR2_XS 8
|
3479
|
-
#define QI2_XS (QK_K / (4*QR2_XS))
|
3480
|
-
typedef struct dpct_type_block_iq2_xs {
|
3481
|
-
sycl::half d;
|
3482
|
-
uint16_t qs[QK_K/8];
|
3483
|
-
uint8_t scales[QK_K/32];
|
3484
|
-
} block_iq2_xs;
|
3485
|
-
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
3486
|
-
|
3487
|
-
#define QR3_XXS 8
|
3488
|
-
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
3489
|
-
typedef struct dpct_type_block_iq3_xxs {
|
3490
|
-
sycl::half d;
|
3491
|
-
uint8_t qs[3*(QK_K/8)];
|
3492
|
-
} block_iq3_xxs;
|
3493
|
-
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
3494
|
-
|
3495
3337
|
#define WARP_SIZE 32
|
3496
3338
|
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
3497
3339
|
|
@@ -3609,7 +3451,7 @@ class sycl_gpu_mgr {
|
|
3609
3451
|
dpct::device_info prop;
|
3610
3452
|
dpct::get_device_info(prop, device);
|
3611
3453
|
if (max_compute_units == prop.get_max_compute_units() &&
|
3612
|
-
|
3454
|
+
is_ext_oneapi_device(device)) {
|
3613
3455
|
gpus.push_back(id);
|
3614
3456
|
devices.push_back(device);
|
3615
3457
|
work_group_size = prop.get_max_work_group_size();
|
@@ -3642,6 +3484,15 @@ class sycl_gpu_mgr {
|
|
3642
3484
|
assert(false);
|
3643
3485
|
return -1;
|
3644
3486
|
}
|
3487
|
+
|
3488
|
+
bool is_ext_oneapi_device(const sycl::device &dev) {
|
3489
|
+
sycl::backend dev_backend = dev.get_backend();
|
3490
|
+
if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
|
3491
|
+
dev_backend == sycl::backend::ext_oneapi_cuda ||
|
3492
|
+
dev_backend == sycl::backend::ext_oneapi_hip)
|
3493
|
+
return true;
|
3494
|
+
return false;
|
3495
|
+
}
|
3645
3496
|
};
|
3646
3497
|
|
3647
3498
|
static sycl_gpu_mgr *g_sycl_gpu_mgr = NULL;
|
@@ -4745,388 +4596,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
4745
4596
|
#endif
|
4746
4597
|
}
|
4747
4598
|
|
4748
|
-
static dpct::global_memory<const uint64_t, 1>
|
4749
|
-
iq2xxs_grid(sycl::range<1>(256),
|
4750
|
-
{
|
4751
|
-
0x0808080808080808, 0x080808080808082b, 0x0808080808081919,
|
4752
|
-
0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819,
|
4753
|
-
0x0808080808191908, 0x08080808082b0808, 0x08080808082b082b,
|
4754
|
-
0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
|
4755
|
-
0x0808080819081908, 0x0808080819190808, 0x0808080819192b08,
|
4756
|
-
0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
|
4757
|
-
0x080808082b08082b, 0x080808082b082b2b, 0x080808082b2b082b,
|
4758
|
-
0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
|
4759
|
-
0x0808081908191919, 0x0808081919080808, 0x080808192b081908,
|
4760
|
-
0x080808192b192b08, 0x0808082b08080808, 0x0808082b0808082b,
|
4761
|
-
0x0808082b082b082b, 0x0808082b2b08082b, 0x0808190808080819,
|
4762
|
-
0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
|
4763
|
-
0x08081908082b1908, 0x0808190819080808, 0x080819081908082b,
|
4764
|
-
0x0808190819082b08, 0x08081908192b0808, 0x080819082b080819,
|
4765
|
-
0x080819082b081908, 0x080819082b190808, 0x080819082b2b1908,
|
4766
|
-
0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
|
4767
|
-
0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19,
|
4768
|
-
0x080819192b080808, 0x080819192b190819, 0x0808192b08082b19,
|
4769
|
-
0x0808192b08190808, 0x0808192b19080808, 0x0808192b2b081908,
|
4770
|
-
0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
|
4771
|
-
0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08,
|
4772
|
-
0x08082b0819080819, 0x08082b0819081908, 0x08082b0819190808,
|
4773
|
-
0x08082b081919082b, 0x08082b082b082b08, 0x08082b1908081908,
|
4774
|
-
0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
|
4775
|
-
0x0819080808080819, 0x0819080808081908, 0x0819080808190808,
|
4776
|
-
0x08190808082b0819, 0x0819080819080808, 0x08190808192b0808,
|
4777
|
-
0x081908082b081908, 0x081908082b190808, 0x081908082b191919,
|
4778
|
-
0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
|
4779
|
-
0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808,
|
4780
|
-
0x0819082b082b1908, 0x0819082b19081919, 0x0819190808080808,
|
4781
|
-
0x0819190808082b08, 0x08191908082b0808, 0x08191908082b1919,
|
4782
|
-
0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
|
4783
|
-
0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b,
|
4784
|
-
0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
|
4785
|
-
0x08192b0819080808, 0x08192b082b080819, 0x08192b1908080808,
|
4786
|
-
0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
|
4787
|
-
0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b,
|
4788
|
-
0x082b080819081908, 0x082b0808192b0819, 0x082b08082b080808,
|
4789
|
-
0x082b08082b08082b, 0x082b0819082b2b19, 0x082b081919082b08,
|
4790
|
-
0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
|
4791
|
-
0x082b190808081908, 0x082b190808190808, 0x082b190819080808,
|
4792
|
-
0x082b19081919192b, 0x082b191908080808, 0x082b191919080819,
|
4793
|
-
0x082b1919192b1908, 0x082b192b2b190808, 0x082b2b0808082b08,
|
4794
|
-
0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
|
4795
|
-
0x1908080808080819, 0x1908080808081908, 0x1908080808190808,
|
4796
|
-
0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
|
4797
|
-
0x1908080819080808, 0x1908080819082b08, 0x190808081919192b,
|
4798
|
-
0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
|
4799
|
-
0x190808082b190808, 0x1908081908080808, 0x19080819082b0808,
|
4800
|
-
0x19080819192b0819, 0x190808192b080808, 0x190808192b081919,
|
4801
|
-
0x1908082b08080819, 0x1908082b08190808, 0x1908082b19082b08,
|
4802
|
-
0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
|
4803
|
-
0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808,
|
4804
|
-
0x190819082b192b19, 0x190819190819082b, 0x19081919082b1908,
|
4805
|
-
0x1908192b08080808, 0x19082b0808080819, 0x19082b0808081908,
|
4806
|
-
0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
|
4807
|
-
0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819,
|
4808
|
-
0x19082b192b08082b, 0x19082b2b19081919, 0x19082b2b2b190808,
|
4809
|
-
0x1919080808080808, 0x1919080808082b08, 0x1919080808190819,
|
4810
|
-
0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
|
4811
|
-
0x191908082b082b08, 0x1919081908081908, 0x191908191908082b,
|
4812
|
-
0x191908192b2b1908, 0x1919082b2b190819, 0x191919082b190808,
|
4813
|
-
0x191919082b19082b, 0x1919191908082b2b, 0x1919192b08080819,
|
4814
|
-
0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
|
4815
|
-
0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808,
|
4816
|
-
0x19192b2b08082b08, 0x192b080808081908, 0x192b080808190808,
|
4817
|
-
0x192b080819080808, 0x192b0808192b2b08, 0x192b081908080808,
|
4818
|
-
0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
|
4819
|
-
0x192b190808080808, 0x192b190808081919, 0x192b191908190808,
|
4820
|
-
0x192b19190819082b, 0x192b19192b081908, 0x192b2b081908082b,
|
4821
|
-
0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808082b2b,
|
4822
|
-
0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
|
4823
|
-
0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819,
|
4824
|
-
0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808,
|
4825
|
-
0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808,
|
4826
|
-
0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
|
4827
|
-
0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808,
|
4828
|
-
0x2b082b080808082b, 0x2b082b1908081908, 0x2b082b2b08190819,
|
4829
|
-
0x2b19080808081908, 0x2b19080808190808, 0x2b190808082b1908,
|
4830
|
-
0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
|
4831
|
-
0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808,
|
4832
|
-
0x2b191908082b082b, 0x2b19190819081908, 0x2b19191919190819,
|
4833
|
-
0x2b192b082b080819, 0x2b192b19082b0808, 0x2b2b08080808082b,
|
4834
|
-
0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
|
4835
|
-
0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808,
|
4836
|
-
0x2b2b2b1908081908,
|
4837
|
-
});
|
4838
|
-
|
4839
|
-
static dpct::global_memory<const uint64_t, 1>
|
4840
|
-
iq2xs_grid(sycl::range<1>(512),
|
4841
|
-
{
|
4842
|
-
0x0808080808080808, 0x080808080808082b, 0x0808080808081919,
|
4843
|
-
0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819,
|
4844
|
-
0x0808080808191908, 0x080808080819192b, 0x0808080808192b19,
|
4845
|
-
0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
4846
|
-
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908,
|
4847
|
-
0x080808081908192b, 0x0808080819082b19, 0x0808080819190808,
|
4848
|
-
0x080808081919082b, 0x0808080819191919, 0x0808080819192b08,
|
4849
|
-
0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
|
4850
|
-
0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08,
|
4851
|
-
0x080808082b190819, 0x080808082b191908, 0x080808082b192b19,
|
4852
|
-
0x080808082b2b0808, 0x0808081908080819, 0x0808081908081908,
|
4853
|
-
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
|
4854
|
-
0x080808190819082b, 0x0808081908191919, 0x0808081908192b08,
|
4855
|
-
0x0808081908192b2b, 0x08080819082b0819, 0x08080819082b1908,
|
4856
|
-
0x0808081919080808, 0x080808191908082b, 0x0808081919081919,
|
4857
|
-
0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
|
4858
|
-
0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819,
|
4859
|
-
0x080808192b081908, 0x080808192b190808, 0x0808082b08080808,
|
4860
|
-
0x0808082b0808082b, 0x0808082b08081919, 0x0808082b08082b08,
|
4861
|
-
0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
|
4862
|
-
0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808,
|
4863
|
-
0x0808082b19191919, 0x0808082b2b080808, 0x0808082b2b082b2b,
|
4864
|
-
0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
4865
|
-
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
|
4866
|
-
0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819,
|
4867
|
-
0x08081908082b1908, 0x0808190819080808, 0x080819081908082b,
|
4868
|
-
0x0808190819081919, 0x0808190819082b08, 0x0808190819190819,
|
4869
|
-
0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
|
4870
|
-
0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
|
4871
|
-
0x0808191908080808, 0x080819190808082b, 0x0808191908081919,
|
4872
|
-
0x0808191908082b08, 0x0808191908190819, 0x0808191908191908,
|
4873
|
-
0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
|
4874
|
-
0x0808191919190808, 0x08081919192b0819, 0x080819192b080808,
|
4875
|
-
0x0808192b08080819, 0x0808192b08081908, 0x0808192b08190808,
|
4876
|
-
0x0808192b082b192b, 0x0808192b19080808, 0x0808192b1908082b,
|
4877
|
-
0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
|
4878
|
-
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b,
|
4879
|
-
0x08082b0808190819, 0x08082b0808191908, 0x08082b08082b0808,
|
4880
|
-
0x08082b08082b1919, 0x08082b0819080819, 0x08082b0819081908,
|
4881
|
-
0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
|
4882
|
-
0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
4883
|
-
0x08082b1908081908, 0x08082b1908190808, 0x08082b1919080808,
|
4884
|
-
0x08082b192b080819, 0x08082b192b082b19, 0x08082b2b08080808,
|
4885
|
-
0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
|
4886
|
-
0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908,
|
4887
|
-
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808,
|
4888
|
-
0x081908080819082b, 0x0819080808191919, 0x0819080808192b08,
|
4889
|
-
0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
|
4890
|
-
0x081908081908082b, 0x0819080819081919, 0x0819080819082b08,
|
4891
|
-
0x0819080819190819, 0x0819080819191908, 0x08190808192b0808,
|
4892
|
-
0x08190808192b2b2b, 0x081908082b080819, 0x081908082b081908,
|
4893
|
-
0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
|
4894
|
-
0x0819081908081919, 0x0819081908082b08, 0x0819081908190819,
|
4895
|
-
0x0819081908191908, 0x08190819082b0808, 0x0819081919080819,
|
4896
|
-
0x0819081919081908, 0x0819081919190808, 0x081908192b080808,
|
4897
|
-
0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
|
4898
|
-
0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808,
|
4899
|
-
0x0819082b19080808, 0x0819082b192b0808, 0x0819190808080808,
|
4900
|
-
0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
4901
|
-
0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
|
4902
|
-
0x0819190819080819, 0x0819190819081908, 0x0819190819082b19,
|
4903
|
-
0x0819190819190808, 0x08191908192b1908, 0x081919082b080808,
|
4904
|
-
0x0819191908080819, 0x0819191908081908, 0x0819191908190808,
|
4905
|
-
0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
|
4906
|
-
0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908,
|
4907
|
-
0x08192b0808190808, 0x08192b080819082b, 0x08192b0819080808,
|
4908
|
-
0x08192b0819191908, 0x08192b082b08192b, 0x08192b1908080808,
|
4909
|
-
0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
|
4910
|
-
0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b,
|
4911
|
-
0x082b080808081919, 0x082b080808082b08, 0x082b080808082b2b,
|
4912
|
-
0x082b080808190819, 0x082b080808191908, 0x082b0808082b0808,
|
4913
|
-
0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
|
4914
|
-
0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819,
|
4915
|
-
0x082b081908081908, 0x082b081908190808, 0x082b081919080808,
|
4916
|
-
0x082b081919082b08, 0x082b0819192b1919, 0x082b082b08080808,
|
4917
|
-
0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
|
4918
|
-
0x082b190808080819, 0x082b190808081908, 0x082b190808190808,
|
4919
|
-
0x082b1908082b2b19, 0x082b190819080808, 0x082b191908080808,
|
4920
|
-
0x082b191919080819, 0x082b19191919082b, 0x082b19192b192b19,
|
4921
|
-
0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
|
4922
|
-
0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b,
|
4923
|
-
0x082b2b08082b0808, 0x082b2b0819191919, 0x082b2b082b082b08,
|
4924
|
-
0x082b2b082b2b082b, 0x082b2b19192b2b08, 0x082b2b192b190808,
|
4925
|
-
0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
|
4926
|
-
0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
4927
|
-
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19,
|
4928
|
-
0x1908080808190808, 0x190808080819082b, 0x1908080808191919,
|
4929
|
-
0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
|
4930
|
-
0x1908080819080808, 0x190808081908082b, 0x1908080819081919,
|
4931
|
-
0x1908080819082b08, 0x1908080819082b2b, 0x1908080819190819,
|
4932
|
-
0x1908080819191908, 0x19080808192b0808, 0x19080808192b1919,
|
4933
|
-
0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
|
4934
|
-
0x1908081908080808, 0x190808190808082b, 0x1908081908081919,
|
4935
|
-
0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
4936
|
-
0x19080819082b0808, 0x1908081919080819, 0x1908081919081908,
|
4937
|
-
0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
|
4938
|
-
0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908,
|
4939
|
-
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b082b2b19,
|
4940
|
-
0x1908082b19080808, 0x1908190808080808, 0x190819080808082b,
|
4941
|
-
0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
|
4942
|
-
0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808,
|
4943
|
-
0x1908190819080819, 0x1908190819081908, 0x1908190819190808,
|
4944
|
-
0x190819082b080808, 0x190819082b191908, 0x1908191908080819,
|
4945
|
-
0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
|
4946
|
-
0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808,
|
4947
|
-
0x1908192b08082b2b, 0x1908192b19081908, 0x1908192b19190808,
|
4948
|
-
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808190808,
|
4949
|
-
0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
|
4950
|
-
0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819,
|
4951
|
-
0x19082b1919081908, 0x19082b1919190808, 0x19082b19192b2b19,
|
4952
|
-
0x19082b2b08081908, 0x1919080808080808, 0x191908080808082b,
|
4953
|
-
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
|
4954
|
-
0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08,
|
4955
|
-
0x1919080819080819, 0x1919080819081908, 0x1919080819190808,
|
4956
|
-
0x191908082b080808, 0x1919081908080819, 0x1919081908081908,
|
4957
|
-
0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
|
4958
|
-
0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908,
|
4959
|
-
0x1919082b2b2b2b2b, 0x1919190808080819, 0x1919190808081908,
|
4960
|
-
0x1919190808190808, 0x19191908082b0819, 0x1919190819080808,
|
4961
|
-
0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
|
4962
|
-
0x1919191908080808, 0x1919191908082b08, 0x191919192b080808,
|
4963
|
-
0x191919192b082b08, 0x1919192b082b0819, 0x1919192b192b2b08,
|
4964
|
-
0x1919192b2b2b0819, 0x19192b0808080808, 0x19192b0808191908,
|
4965
|
-
0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
|
4966
|
-
0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b,
|
4967
|
-
0x19192b2b2b081919, 0x192b080808080819, 0x192b080808081908,
|
4968
|
-
0x192b080808190808, 0x192b080819080808, 0x192b080819191908,
|
4969
|
-
0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
|
4970
|
-
0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b,
|
4971
|
-
0x192b082b2b19082b, 0x192b190808080808, 0x192b19080819192b,
|
4972
|
-
0x192b191908190808, 0x192b191919080808, 0x192b191919081919,
|
4973
|
-
0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
|
4974
|
-
0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908,
|
4975
|
-
0x192b2b2b192b082b, 0x2b08080808080808, 0x2b0808080808082b,
|
4976
|
-
0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
4977
|
-
0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
|
4978
|
-
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808,
|
4979
|
-
0x2b0808082b080808, 0x2b0808082b08082b, 0x2b0808082b2b2b08,
|
4980
|
-
0x2b0808082b2b2b2b, 0x2b08081908080819, 0x2b08081908081908,
|
4981
|
-
0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
|
4982
|
-
0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808,
|
4983
|
-
0x2b08082b082b0808, 0x2b08082b2b080808, 0x2b08082b2b08082b,
|
4984
|
-
0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, 0x2b08190808080819,
|
4985
|
-
0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
|
4986
|
-
0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808,
|
4987
|
-
0x2b0819082b082b19, 0x2b08191908080808, 0x2b08191919081908,
|
4988
|
-
0x2b0819192b2b1919, 0x2b08192b08192b08, 0x2b08192b192b2b2b,
|
4989
|
-
0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
|
4990
|
-
0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b,
|
4991
|
-
0x2b082b082b2b2b08, 0x2b082b190808192b, 0x2b082b2b082b082b,
|
4992
|
-
0x2b082b2b2b080808, 0x2b082b2b2b082b08, 0x2b082b2b2b19192b,
|
4993
|
-
0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
|
4994
|
-
0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b,
|
4995
|
-
0x2b1908082b081908, 0x2b19081908080808, 0x2b190819082b082b,
|
4996
|
-
0x2b190819192b1908, 0x2b19082b1919192b, 0x2b19082b2b082b19,
|
4997
|
-
0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
|
4998
|
-
0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19,
|
4999
|
-
0x2b1919192b190808, 0x2b1919192b19082b, 0x2b19192b19080819,
|
5000
|
-
0x2b192b0819190819, 0x2b192b082b2b192b, 0x2b192b1919082b19,
|
5001
|
-
0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
|
5002
|
-
0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b,
|
5003
|
-
0x2b2b0808082b0808, 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808,
|
5004
|
-
0x2b2b081919190819, 0x2b2b081919192b19, 0x2b2b08192b2b192b,
|
5005
|
-
0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
|
5006
|
-
0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808,
|
5007
|
-
0x2b2b190819080808, 0x2b2b19082b191919, 0x2b2b192b192b1919,
|
5008
|
-
0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, 0x2b2b2b08082b0808,
|
5009
|
-
0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
|
5010
|
-
0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908,
|
5011
|
-
0x2b2b2b192b08192b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b,
|
5012
|
-
0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
5013
|
-
});
|
5014
|
-
|
5015
|
-
static dpct::global_memory<const uint32_t, 1> iq3xxs_grid(
|
5016
|
-
sycl::range<1>(256),
|
5017
|
-
{
|
5018
|
-
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e,
|
5019
|
-
0x04041404, 0x04041414, 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c,
|
5020
|
-
0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, 0x040c140c, 0x040c142c,
|
5021
|
-
0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
5022
|
-
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c,
|
5023
|
-
0x04141c1c, 0x04141c3e, 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c,
|
5024
|
-
0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, 0x041c3e04, 0x04240c1c,
|
5025
|
-
0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
5026
|
-
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04,
|
5027
|
-
0x043e0c24, 0x043e0c34, 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c,
|
5028
|
-
0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, 0x0c041c04, 0x0c041c14,
|
5029
|
-
0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
5030
|
-
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14,
|
5031
|
-
0x0c14140c, 0x0c141c04, 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404,
|
5032
|
-
0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, 0x0c24042c, 0x0c242c04,
|
5033
|
-
0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
5034
|
-
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404,
|
5035
|
-
0x14041414, 0x14041434, 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c,
|
5036
|
-
0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, 0x140c1c04, 0x140c341c,
|
5037
|
-
0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
5038
|
-
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c,
|
5039
|
-
0x141c0c04, 0x141c0c24, 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c,
|
5040
|
-
0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, 0x143e040c, 0x143e041c,
|
5041
|
-
0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
5042
|
-
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414,
|
5043
|
-
0x1c0c1404, 0x1c0c1c0c, 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c,
|
5044
|
-
0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, 0x1c1c0c0c, 0x1c1c1c1c,
|
5045
|
-
0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
5046
|
-
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404,
|
5047
|
-
0x24040424, 0x24040c3e, 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e,
|
5048
|
-
0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, 0x24143404, 0x24143434,
|
5049
|
-
0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
5050
|
-
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04,
|
5051
|
-
0x2c040c14, 0x2c04240c, 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434,
|
5052
|
-
0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, 0x2c1c0414, 0x2c1c2c1c,
|
5053
|
-
0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
5054
|
-
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434,
|
5055
|
-
0x34043424, 0x340c140c, 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04,
|
5056
|
-
0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, 0x34341c1c, 0x343e041c,
|
5057
|
-
0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
5058
|
-
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14,
|
5059
|
-
0x3e1c0404, 0x3e1c0c2c, 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c,
|
5060
|
-
0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
5061
|
-
});
|
5062
|
-
|
5063
|
-
static dpct::global_memory<const uint8_t, 1> ksigns_iq2xs(
|
5064
|
-
sycl::range<1>(128),
|
5065
|
-
{
|
5066
|
-
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12,
|
5067
|
-
141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153,
|
5068
|
-
154, 27, 156, 29, 30, 159, 160, 33, 34, 163, 36, 165, 166,
|
5069
|
-
39, 40, 169, 170, 43, 172, 45, 46, 175, 48, 177, 178, 51,
|
5070
|
-
180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, 192,
|
5071
|
-
65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77,
|
5072
|
-
78, 207, 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90,
|
5073
|
-
219, 92, 221, 222, 95, 96, 225, 226, 99, 228, 101, 102, 231,
|
5074
|
-
232, 105, 106, 235, 108, 237, 238, 111, 240, 113, 114, 243, 116,
|
5075
|
-
245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
|
5076
|
-
});
|
5077
|
-
|
5078
|
-
static dpct::global_memory<const uint64_t, 1>
|
5079
|
-
ksigns64(sycl::range<1>(128),
|
5080
|
-
{
|
5081
|
-
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00,
|
5082
|
-
0x000000000000ffff, 0xff00000000ff0000, 0x0000000000ff00ff,
|
5083
|
-
0x0000000000ffff00, 0xff00000000ffffff, 0xff000000ff000000,
|
5084
|
-
0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
|
5085
|
-
0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00,
|
5086
|
-
0x00000000ffffffff, 0xff0000ff00000000, 0x000000ff000000ff,
|
5087
|
-
0x000000ff0000ff00, 0xff0000ff0000ffff, 0x000000ff00ff0000,
|
5088
|
-
0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
|
5089
|
-
0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00,
|
5090
|
-
0x000000ffff00ffff, 0xff0000ffffff0000, 0x000000ffffff00ff,
|
5091
|
-
0x000000ffffffff00, 0xff0000ffffffffff, 0xff00ff0000000000,
|
5092
|
-
0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
|
5093
|
-
0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00,
|
5094
|
-
0x0000ff0000ffffff, 0x0000ff00ff000000, 0xff00ff00ff0000ff,
|
5095
|
-
0xff00ff00ff00ff00, 0x0000ff00ff00ffff, 0xff00ff00ffff0000,
|
5096
|
-
0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
|
5097
|
-
0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00,
|
5098
|
-
0x0000ffff0000ffff, 0xff00ffff00ff0000, 0x0000ffff00ff00ff,
|
5099
|
-
0x0000ffff00ffff00, 0xff00ffff00ffffff, 0xff00ffffff000000,
|
5100
|
-
0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
|
5101
|
-
0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00,
|
5102
|
-
0x0000ffffffffffff, 0xffff000000000000, 0x00ff0000000000ff,
|
5103
|
-
0x00ff00000000ff00, 0xffff00000000ffff, 0x00ff000000ff0000,
|
5104
|
-
0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
|
5105
|
-
0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00,
|
5106
|
-
0x00ff0000ff00ffff, 0xffff0000ffff0000, 0x00ff0000ffff00ff,
|
5107
|
-
0x00ff0000ffffff00, 0xffff0000ffffffff, 0x00ff00ff00000000,
|
5108
|
-
0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
|
5109
|
-
0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
|
5110
|
-
0xffff00ff00ffffff, 0xffff00ffff000000, 0x00ff00ffff0000ff,
|
5111
|
-
0x00ff00ffff00ff00, 0xffff00ffff00ffff, 0x00ff00ffffff0000,
|
5112
|
-
0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
|
5113
|
-
0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00,
|
5114
|
-
0x00ffff000000ffff, 0xffffff0000ff0000, 0x00ffff0000ff00ff,
|
5115
|
-
0x00ffff0000ffff00, 0xffffff0000ffffff, 0xffffff00ff000000,
|
5116
|
-
0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
|
5117
|
-
0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00,
|
5118
|
-
0x00ffff00ffffffff, 0xffffffff00000000, 0x00ffffff000000ff,
|
5119
|
-
0x00ffffff0000ff00, 0xffffffff0000ffff, 0x00ffffff00ff0000,
|
5120
|
-
0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
|
5121
|
-
0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00,
|
5122
|
-
0x00ffffffff00ffff, 0xffffffffffff0000, 0x00ffffffffff00ff,
|
5123
|
-
0x00ffffffffffff00, 0xffffffffffffffff,
|
5124
|
-
});
|
5125
|
-
//#endif
|
5126
|
-
|
5127
|
-
static dpct::global_memory<const uint8_t, 1>
|
5128
|
-
kmask_iq2xs(sycl::range<1>(8), {1, 2, 4, 8, 16, 32, 64, 128});
|
5129
|
-
|
5130
4599
|
template<typename dst_t>
|
5131
4600
|
static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
5132
4601
|
const sycl::nd_item<3> &item_ct1,
|
@@ -5213,6 +4682,65 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
5213
4682
|
|
5214
4683
|
}
|
5215
4684
|
|
4685
|
+
template<typename dst_t>
|
4686
|
+
static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
4687
|
+
const sycl::nd_item<3> &item_ct1,
|
4688
|
+
const uint32_t *iq3s_grid,
|
4689
|
+
const uint8_t *ksigns_iq2xs,
|
4690
|
+
const uint8_t *kmask_iq2xs) {
|
4691
|
+
|
4692
|
+
const int i = item_ct1.get_group(2);
|
4693
|
+
const block_iq3_s * x = (const block_iq3_s *) vx;
|
4694
|
+
|
4695
|
+
const int tid = item_ct1.get_local_id(2);
|
4696
|
+
#if QK_K == 256
|
4697
|
+
const int il = tid/8; // 0...3
|
4698
|
+
const int ib = tid%8; // 0...7
|
4699
|
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
4700
|
+
const uint8_t * qs = x[i].qs + 8*ib;
|
4701
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + qs[2*il+0]);
|
4702
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + qs[2*il+1]);
|
4703
|
+
const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
|
4704
|
+
const uint8_t signs = x[i].signs[4*ib + il];
|
4705
|
+
for (int j = 0; j < 4; ++j) {
|
4706
|
+
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4707
|
+
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4708
|
+
}
|
4709
|
+
#else
|
4710
|
+
assert(false);
|
4711
|
+
#endif
|
4712
|
+
|
4713
|
+
}
|
4714
|
+
|
4715
|
+
template<typename dst_t>
|
4716
|
+
static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
4717
|
+
const sycl::nd_item<3> &item_ct1,
|
4718
|
+
const uint32_t *iq1s_grid,
|
4719
|
+
const uint8_t *ksigns_iq2xs,
|
4720
|
+
const uint8_t *kmask_iq2xs) {
|
4721
|
+
const int i = item_ct1.get_group(2);
|
4722
|
+
const block_iq1_s * x = (const block_iq1_s *) vx;
|
4723
|
+
|
4724
|
+
const int tid = item_ct1.get_local_id(2);
|
4725
|
+
#if QK_K == 256
|
4726
|
+
const int il = tid/8; // 0...3
|
4727
|
+
const int ib = tid%8; // 0...7
|
4728
|
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
4729
|
+
const uint8_t * qs = x[i].qs + 8*ib;
|
4730
|
+
const uint8_t * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]);
|
4731
|
+
const uint8_t * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]);
|
4732
|
+
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 0xf) + 1);
|
4733
|
+
const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7];
|
4734
|
+
for (int j = 0; j < 4; ++j) {
|
4735
|
+
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4736
|
+
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4737
|
+
}
|
4738
|
+
#else
|
4739
|
+
assert(false);
|
4740
|
+
#endif
|
4741
|
+
|
4742
|
+
}
|
4743
|
+
|
5216
4744
|
/*
|
5217
4745
|
DPCT1110:4: The total declared local variable size in device function
|
5218
4746
|
dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
|
@@ -8059,6 +7587,75 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
8059
7587
|
#endif
|
8060
7588
|
}
|
8061
7589
|
|
7590
|
+
// Partial dot product between one 32-value slice of an IQ3_S block and the
// matching Q8_1 block.
//
//   vbq       - pointer to a single block_iq3_s
//   bq8_1     - Q8_1 blocks aligned with `vbq`; entry `iqs` is the one used
//   iqs       - index of the 32-value slice inside the IQ3_S block
//   iq3s_grid - lookup table, indexed with 9 bits (8 from qs, 1 from qh)
//   ksigns64  - not used by this function
//
// Returns the scaled dot product, or 0 (after assert(false)) when the build
// lacks DP4A support or QK_K != 256.
static __dpct_inline__ float
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                   const uint32_t *iq3s_grid, const uint64_t *ksigns64) {
#if DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
    const block_iq3_s * blk = (const block_iq3_s *) vbq;

    const int ib32 = iqs;
    const uint8_t * qs = blk->qs + 8*ib32;
    const int8_t * q8 = bq8_1[ib32].qs;

    int sumi = 0;
    for (int l = 0; l < 4; ++l, q8 += 8) {
        const auto sign_bits = blk->signs[4*ib32 + l];

        // Low half: 4 grid bytes selected by qs[2l] plus bit (2l) of qh.
        const auto idx_lo = qs[2*l+0] | ((blk->qh[ib32] << (8 - 2*l)) & 256);
        // Expand the low nibble of the sign byte into a per-byte 0x00/0xff mask.
        uint32_t mask_lo = dpct::vectorized_binary<sycl::uchar4>(
            ((sign_bits & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
        // (v ^ m) - m negates each byte whose mask is 0xff.
        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
            iq3s_grid[idx_lo] ^ mask_lo, mask_lo, std::minus<>());

        // High half: same scheme with qs[2l+1], bit (2l+1) of qh and the high nibble.
        const auto idx_hi = qs[2*l+1] | ((blk->qh[ib32] << (7 - 2*l)) & 256);
        uint32_t mask_hi = dpct::vectorized_binary<sycl::uchar4>(
            ((sign_bits >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
            iq3s_grid[idx_hi] ^ mask_hi, mask_hi, std::minus<>());

        sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
        sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
    }

    // Block scale * odd multiplier from a 4-bit nibble of `scales` * Q8_1 scale.
    const float d = (float)blk->d * (1 + 2*((blk->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * bq8_1[ib32].ds[0];
    return d * sumi;
#else
    assert(false);
    return 0.f;
#endif
#else
    assert(false);
    return 0.f;
#endif
}
|
7629
|
+
|
7630
|
+
// Partial dot product between one slice of an IQ1_S block and the matching
// Q8_1 block.
//
//   vbq       - pointer to a single block_iq1_s
//   bq8_1     - Q8_1 blocks aligned with `vbq`; entry `iqs` is the one used
//   iqs       - slice index inside the IQ1_S block
//   iq1s_grid - lookup table of 32-bit entries
//   ksigns64  - 64-bit sign masks, read here as pairs of 32-bit words
//
// Returns the scaled dot product, or 0 (after assert(false)) if QK_K != 256.
//
// NOTE(review): `qs` is a uint8_t pointer, so `qs[l] >> 8` always evaluates
// to 0 and only entry 0 of ksigns64 is ever selected. Likewise `grid[1]`
// reads the table entry that follows the indexed one. Both are kept exactly
// as written; confirm against the reference IQ1_S decoder before relying on
// this kernel.
static __dpct_inline__ float
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                   const uint32_t *iq1s_grid, const uint64_t *ksigns64) {
#if QK_K == 256
    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

    const int ib32 = iqs;
    const uint8_t * qs = bq1->qs + 4*ib32;
    const int8_t * q8 = bq8_1[ib32].qs;

    int sumi = 0;
    for (int l = 0; l < 4; ++l, q8 += 8) {
        const uint32_t * grid  = (const uint32_t *)(iq1s_grid + qs[l]);
        const uint32_t * signs = (const uint32_t *)(ksigns64 + (qs[l] >> 8));

        // (v ^ m) - m negates each byte whose mask byte is 0xff.
        const uint32_t sign_lo = signs[0];
        const uint32_t sign_hi = signs[1];
        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
            grid[0] ^ sign_lo, sign_lo, std::minus<>());
        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
            grid[1] ^ sign_hi, sign_hi, std::minus<>());

        sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
        sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
    }

    const float d = (float)bq1->d * bq8_1[ib32].ds[0] * 0.25f;
    return d * sumi;
#else
    assert(false);
    return 0.f;
#endif
}
|
8062
7659
|
|
8063
7660
|
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
|
8064
7661
|
int mmq_y, int nwarps, load_tiles_sycl_t load_tiles, int vdr,
|
@@ -8824,6 +8421,98 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void
|
|
8824
8421
|
}
|
8825
8422
|
}
|
8826
8423
|
|
8424
|
+
template <int qk, int qi, typename block_q_t, int vdr>
|
8425
|
+
static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
|
8426
|
+
const sycl::nd_item<3> &item_ct1,
|
8427
|
+
const uint32_t *iq3s_grid_ptr, const uint64_t *ksigns64_ptr ) {
|
8428
|
+
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8429
|
+
item_ct1.get_local_id(1);
|
8430
|
+
|
8431
|
+
if (row >= nrows) {
|
8432
|
+
return;
|
8433
|
+
}
|
8434
|
+
|
8435
|
+
const int blocks_per_row = ncols / qk;
|
8436
|
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
8437
|
+
|
8438
|
+
// partial sum for each thread
|
8439
|
+
float tmp = 0.0f;
|
8440
|
+
|
8441
|
+
const block_q_t * x = (const block_q_t *) vx;
|
8442
|
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
8443
|
+
|
8444
|
+
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
|
8445
|
+
i += blocks_per_warp) {
|
8446
|
+
const int ibx = row*blocks_per_row + i; // x block index
|
8447
|
+
|
8448
|
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
8449
|
+
|
8450
|
+
const int iqs =
|
8451
|
+
vdr *
|
8452
|
+
(item_ct1.get_local_id(2) %
|
8453
|
+
(qi / vdr)); // x block quant index when casting the quants to int
|
8454
|
+
|
8455
|
+
tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid_ptr, ksigns64_ptr);
|
8456
|
+
}
|
8457
|
+
|
8458
|
+
// sum up partial sums and write back result
|
8459
|
+
#pragma unroll
|
8460
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
8461
|
+
tmp +=
|
8462
|
+
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
8463
|
+
}
|
8464
|
+
|
8465
|
+
if (item_ct1.get_local_id(2) == 0) {
|
8466
|
+
dst[row] = tmp;
|
8467
|
+
}
|
8468
|
+
}
|
8469
|
+
|
8470
|
+
template <int qk, int qi, typename block_q_t, int vdr>
|
8471
|
+
static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
|
8472
|
+
const sycl::nd_item<3> &item_ct1,
|
8473
|
+
const uint32_t *iq1s_grid_ptr, const uint64_t *ksigns64_ptr ) {
|
8474
|
+
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8475
|
+
item_ct1.get_local_id(1);
|
8476
|
+
|
8477
|
+
if (row >= nrows) {
|
8478
|
+
return;
|
8479
|
+
}
|
8480
|
+
|
8481
|
+
const int blocks_per_row = ncols / qk;
|
8482
|
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
8483
|
+
|
8484
|
+
// partial sum for each thread
|
8485
|
+
float tmp = 0.0f;
|
8486
|
+
|
8487
|
+
const block_q_t * x = (const block_q_t *) vx;
|
8488
|
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
8489
|
+
|
8490
|
+
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
|
8491
|
+
i += blocks_per_warp) {
|
8492
|
+
const int ibx = row*blocks_per_row + i; // x block index
|
8493
|
+
|
8494
|
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
8495
|
+
|
8496
|
+
const int iqs =
|
8497
|
+
vdr *
|
8498
|
+
(item_ct1.get_local_id(2) %
|
8499
|
+
(qi / vdr)); // x block quant index when casting the quants to int
|
8500
|
+
|
8501
|
+
tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_ptr, ksigns64_ptr);
|
8502
|
+
}
|
8503
|
+
|
8504
|
+
// sum up partial sums and write back result
|
8505
|
+
#pragma unroll
|
8506
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
8507
|
+
tmp +=
|
8508
|
+
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
8509
|
+
}
|
8510
|
+
|
8511
|
+
if (item_ct1.get_local_id(2) == 0) {
|
8512
|
+
dst[row] = tmp;
|
8513
|
+
}
|
8514
|
+
}
|
8515
|
+
|
8827
8516
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
8828
8517
|
static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
|
8829
8518
|
const sycl::nd_item<3> &item_ct1) {
|
@@ -10509,6 +10198,64 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
|
|
10509
10198
|
}
|
10510
10199
|
}
|
10511
10200
|
|
10201
|
+
template <typename dst_t>
|
10202
|
+
static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
|
10203
|
+
dpct::queue_ptr stream) {
|
10204
|
+
const int nb = k / QK_K;
|
10205
|
+
{
|
10206
|
+
iq3s_grid.init(*stream);
|
10207
|
+
ksigns_iq2xs.init(*stream);
|
10208
|
+
kmask_iq2xs.init(*stream);
|
10209
|
+
|
10210
|
+
dpct::has_capability_or_fail(stream->get_device(),
|
10211
|
+
{sycl::aspect::fp16});
|
10212
|
+
|
10213
|
+
stream->submit([&](sycl::handler &cgh) {
|
10214
|
+
auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
|
10215
|
+
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
|
10216
|
+
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
|
10217
|
+
|
10218
|
+
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10219
|
+
sycl::range<3>(1, 1, 32),
|
10220
|
+
sycl::range<3>(1, 1, 32)),
|
10221
|
+
[=](sycl::nd_item<3> item_ct1) {
|
10222
|
+
dequantize_block_iq3_s(
|
10223
|
+
vx, y, item_ct1, iq3s_grid_ptr_ct1,
|
10224
|
+
ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
|
10225
|
+
});
|
10226
|
+
});
|
10227
|
+
}
|
10228
|
+
}
|
10229
|
+
|
10230
|
+
template <typename dst_t>
|
10231
|
+
static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
|
10232
|
+
dpct::queue_ptr stream) {
|
10233
|
+
const int nb = k / QK_K;
|
10234
|
+
{
|
10235
|
+
iq1s_grid_gpu.init(*stream);
|
10236
|
+
ksigns_iq2xs.init(*stream);
|
10237
|
+
kmask_iq2xs.init(*stream);
|
10238
|
+
|
10239
|
+
dpct::has_capability_or_fail(stream->get_device(),
|
10240
|
+
{sycl::aspect::fp16});
|
10241
|
+
|
10242
|
+
stream->submit([&](sycl::handler &cgh) {
|
10243
|
+
auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
|
10244
|
+
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
|
10245
|
+
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
|
10246
|
+
|
10247
|
+
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10248
|
+
sycl::range<3>(1, 1, 32),
|
10249
|
+
sycl::range<3>(1, 1, 32)),
|
10250
|
+
[=](sycl::nd_item<3> item_ct1) {
|
10251
|
+
dequantize_block_iq1_s(
|
10252
|
+
vx, y, item_ct1, iq1s_grid_ptr_ct1,
|
10253
|
+
ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
|
10254
|
+
});
|
10255
|
+
});
|
10256
|
+
}
|
10257
|
+
}
|
10258
|
+
|
10512
10259
|
template <typename src_t, typename dst_t>
|
10513
10260
|
static void convert_unary_sycl(const void *__restrict__ vx,
|
10514
10261
|
dst_t *__restrict__ y, const int k,
|
@@ -10559,6 +10306,10 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try {
|
|
10559
10306
|
return dequantize_row_iq2_xs_sycl;
|
10560
10307
|
case GGML_TYPE_IQ3_XXS:
|
10561
10308
|
return dequantize_row_iq3_xxs_sycl;
|
10309
|
+
case GGML_TYPE_IQ3_S:
|
10310
|
+
return dequantize_row_iq3_s_sycl;
|
10311
|
+
case GGML_TYPE_IQ1_S:
|
10312
|
+
return dequantize_row_iq1_s_sycl;
|
10562
10313
|
case GGML_TYPE_F32:
|
10563
10314
|
return convert_unary_sycl<float>;
|
10564
10315
|
default:
|
@@ -10599,6 +10350,10 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
|
|
10599
10350
|
return dequantize_row_iq2_xs_sycl;
|
10600
10351
|
case GGML_TYPE_IQ3_XXS:
|
10601
10352
|
return dequantize_row_iq3_xxs_sycl;
|
10353
|
+
case GGML_TYPE_IQ3_S:
|
10354
|
+
return dequantize_row_iq3_s_sycl;
|
10355
|
+
case GGML_TYPE_IQ1_S:
|
10356
|
+
return dequantize_row_iq1_s_sycl;
|
10602
10357
|
case GGML_TYPE_F16:
|
10603
10358
|
return convert_unary_sycl<sycl::half>;
|
10604
10359
|
default:
|
@@ -11188,6 +10943,61 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
|
|
11188
10943
|
}
|
11189
10944
|
}
|
11190
10945
|
|
10946
|
+
static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
|
10947
|
+
float *dst, const int ncols,
|
10948
|
+
const int nrows,
|
10949
|
+
dpct::queue_ptr stream) {
|
10950
|
+
GGML_ASSERT(ncols % QK_K == 0);
|
10951
|
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
10952
|
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10953
|
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10954
|
+
{
|
10955
|
+
iq3s_grid.init(*stream);
|
10956
|
+
ksigns64.init(*stream);
|
10957
|
+
|
10958
|
+
stream->submit([&](sycl::handler &cgh) {
|
10959
|
+
auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
|
10960
|
+
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10961
|
+
|
10962
|
+
cgh.parallel_for(
|
10963
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
10964
|
+
[=](sycl::nd_item<3> item_ct1)
|
10965
|
+
[[intel::reqd_sub_group_size(32)]] {
|
10966
|
+
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
|
10967
|
+
vx, vy, dst, ncols, nrows, item_ct1,
|
10968
|
+
iq3s_grid_ptr_ct1, ksigns64_ptr_ct1);
|
10969
|
+
});
|
10970
|
+
});
|
10971
|
+
}
|
10972
|
+
}
|
10973
|
+
|
10974
|
+
static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
|
10975
|
+
float *dst, const int ncols,
|
10976
|
+
const int nrows,
|
10977
|
+
dpct::queue_ptr stream) {
|
10978
|
+
GGML_ASSERT(ncols % QK_K == 0);
|
10979
|
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
10980
|
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10981
|
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10982
|
+
{
|
10983
|
+
iq1s_grid_gpu.init(*stream);
|
10984
|
+
ksigns64.init(*stream);
|
10985
|
+
|
10986
|
+
stream->submit([&](sycl::handler &cgh) {
|
10987
|
+
auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
|
10988
|
+
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10989
|
+
|
10990
|
+
cgh.parallel_for(
|
10991
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
10992
|
+
[=](sycl::nd_item<3> item_ct1)
|
10993
|
+
[[intel::reqd_sub_group_size(32)]] {
|
10994
|
+
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
|
10995
|
+
vx, vy, dst, ncols, nrows, item_ct1,
|
10996
|
+
iq1s_grid_ptr_ct1, ksigns64_ptr_ct1);
|
10997
|
+
});
|
10998
|
+
});
|
10999
|
+
}
|
11000
|
+
}
|
11191
11001
|
|
11192
11002
|
static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
11193
11003
|
float *dst, const int ncols_x,
|
@@ -13936,8 +13746,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
|
|
13936
13746
|
case GGML_TYPE_Q5_K:
|
13937
13747
|
case GGML_TYPE_IQ2_XXS:
|
13938
13748
|
case GGML_TYPE_IQ2_XS:
|
13749
|
+
case GGML_TYPE_IQ1_S:
|
13939
13750
|
case GGML_TYPE_IQ3_XXS:
|
13940
13751
|
return max_compute_capability >= VER_GEN9 ? 128 : 64;
|
13752
|
+
case GGML_TYPE_IQ3_S:
|
13753
|
+
return max_compute_capability >= VER_GEN9 ? 128 : 64;
|
13941
13754
|
case GGML_TYPE_Q6_K:
|
13942
13755
|
return 64;
|
13943
13756
|
default:
|
@@ -13998,6 +13811,12 @@ inline void ggml_sycl_op_mul_mat_vec_q(
|
|
13998
13811
|
case GGML_TYPE_IQ3_XXS:
|
13999
13812
|
mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
14000
13813
|
break;
|
13814
|
+
case GGML_TYPE_IQ3_S:
|
13815
|
+
mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
13816
|
+
break;
|
13817
|
+
case GGML_TYPE_IQ1_S:
|
13818
|
+
mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
13819
|
+
break;
|
14001
13820
|
default:
|
14002
13821
|
GGML_ASSERT(false);
|
14003
13822
|
break;
|
@@ -17343,9 +17162,8 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
|
17343
17162
|
return false;
|
17344
17163
|
}
|
17345
17164
|
ggml_type a_type = a->type;
|
17346
|
-
if (a_type ==
|
17347
|
-
a_type ==
|
17348
|
-
a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
|
17165
|
+
if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S ||
|
17166
|
+
a_type == GGML_TYPE_IQ4_XS) {
|
17349
17167
|
return false;
|
17350
17168
|
}
|
17351
17169
|
return true;
|
@@ -17440,13 +17258,18 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
|
17440
17258
|
/* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
|
17441
17259
|
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
|
17442
17260
|
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
|
17443
|
-
/* .cpy_tensor_async = */ ggml_backend_sycl_cpy_tensor_async,
|
17261
|
+
/* .cpy_tensor_async = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface
|
17444
17262
|
/* .synchronize = */ ggml_backend_sycl_synchronize,
|
17445
17263
|
/* .graph_plan_create = */ NULL,
|
17446
17264
|
/* .graph_plan_free = */ NULL,
|
17447
17265
|
/* .graph_plan_compute = */ NULL,
|
17448
17266
|
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
17449
17267
|
/* .supports_op = */ ggml_backend_sycl_supports_op,
|
17268
|
+
/* .event_new = */ NULL,
|
17269
|
+
/* .event_free = */ NULL,
|
17270
|
+
/* .event_record = */ NULL,
|
17271
|
+
/* .event_wait = */ NULL,
|
17272
|
+
/* .event_synchronize = */ NULL,
|
17450
17273
|
};
|
17451
17274
|
|
17452
17275
|
static ggml_guid_t ggml_backend_sycl_guid() {
|