llama_cpp 0.14.0 → 0.14.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +71 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +9 -0
- data/vendor/tmp/llama.cpp/Makefile +28 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +358 -135
- data/vendor/tmp/llama.cpp/ggml-backend.h +41 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +187 -1033
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +42 -20
- data/vendor/tmp/llama.cpp/ggml-metal.metal +44 -910
- data/vendor/tmp/llama.cpp/ggml-quants.c +457 -1074
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +388 -565
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +6 -39
- data/vendor/tmp/llama.cpp/ggml.c +509 -343
- data/vendor/tmp/llama.cpp/ggml.h +61 -47
- data/vendor/tmp/llama.cpp/llama.cpp +1446 -687
- data/vendor/tmp/llama.cpp/llama.h +25 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
@@ -202,24 +202,29 @@ namespace dpct
|
|
202
202
|
// Version string has the following format:
|
203
203
|
// a. OpenCL<space><major.minor><space><vendor-specific-information>
|
204
204
|
// b. <major.minor>
|
205
|
+
// c. <AmdGcnArchName> e.g gfx1030
|
205
206
|
std::string ver;
|
206
207
|
ver = dev.get_info<sycl::info::device::version>();
|
207
208
|
std::string::size_type i = 0;
|
208
|
-
while (i < ver.size())
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
i++;
|
209
|
+
while (i < ver.size()) {
|
210
|
+
if (isdigit(ver[i]))
|
211
|
+
break;
|
212
|
+
i++;
|
213
213
|
}
|
214
214
|
major = std::stoi(&(ver[i]));
|
215
|
-
while (i < ver.size())
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
215
|
+
while (i < ver.size()) {
|
216
|
+
if (ver[i] == '.')
|
217
|
+
break;
|
218
|
+
i++;
|
219
|
+
}
|
220
|
+
if (i < ver.size()) {
|
221
|
+
// a. and b.
|
222
|
+
i++;
|
223
|
+
minor = std::stoi(&(ver[i]));
|
224
|
+
} else {
|
225
|
+
// c.
|
226
|
+
minor = 0;
|
220
227
|
}
|
221
|
-
i++;
|
222
|
-
minor = std::stoi(&(ver[i]));
|
223
228
|
}
|
224
229
|
|
225
230
|
template <typename tag, typename T>
|
@@ -3144,6 +3149,9 @@ namespace dpct
|
|
3144
3149
|
|
3145
3150
|
} // COPY from DPCT head files
|
3146
3151
|
|
3152
|
+
#define GGML_COMMON_DECL_SYCL
|
3153
|
+
#define GGML_COMMON_IMPL_SYCL
|
3154
|
+
#include "ggml-common.h"
|
3147
3155
|
|
3148
3156
|
static int g_ggml_sycl_debug=0;
|
3149
3157
|
#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
|
@@ -3310,66 +3318,6 @@ typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0,
|
|
3310
3318
|
const float *src1_dd, float *dst_dd,
|
3311
3319
|
const dpct::queue_ptr &main_stream);
|
3312
3320
|
|
3313
|
-
// QK = number of values after dequantization
|
3314
|
-
// QR = QK / number of values before dequantization
|
3315
|
-
// QI = number of 32 bit integers before dequantization
|
3316
|
-
|
3317
|
-
#define QK4_0 32
|
3318
|
-
#define QR4_0 2
|
3319
|
-
#define QI4_0 (QK4_0 / (4 * QR4_0))
|
3320
|
-
typedef struct dpct_type_block_q4_0 {
|
3321
|
-
sycl::half d; // delta
|
3322
|
-
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
3323
|
-
} block_q4_0;
|
3324
|
-
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
3325
|
-
|
3326
|
-
#define QK4_1 32
|
3327
|
-
#define QR4_1 2
|
3328
|
-
#define QI4_1 (QK4_1 / (4 * QR4_1))
|
3329
|
-
typedef struct dpct_type_block_q4_1 {
|
3330
|
-
sycl::half2 dm; // dm.x = delta, dm.y = min
|
3331
|
-
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
3332
|
-
} block_q4_1;
|
3333
|
-
static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
3334
|
-
|
3335
|
-
#define QK5_0 32
|
3336
|
-
#define QR5_0 2
|
3337
|
-
#define QI5_0 (QK5_0 / (4 * QR5_0))
|
3338
|
-
typedef struct dpct_type_block_q5_0 {
|
3339
|
-
sycl::half d; // delta
|
3340
|
-
uint8_t qh[4]; // 5-th bit of quants
|
3341
|
-
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
3342
|
-
} block_q5_0;
|
3343
|
-
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
|
3344
|
-
|
3345
|
-
#define QK5_1 32
|
3346
|
-
#define QR5_1 2
|
3347
|
-
#define QI5_1 (QK5_1 / (4 * QR5_1))
|
3348
|
-
typedef struct dpct_type_block_q5_1 {
|
3349
|
-
sycl::half2 dm; // dm.x = delta, dm.y = min
|
3350
|
-
uint8_t qh[4]; // 5-th bit of quants
|
3351
|
-
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
3352
|
-
} block_q5_1;
|
3353
|
-
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
|
3354
|
-
|
3355
|
-
#define QK8_0 32
|
3356
|
-
#define QR8_0 1
|
3357
|
-
#define QI8_0 (QK8_0 / (4 * QR8_0))
|
3358
|
-
typedef struct dpct_type_block_q8_0 {
|
3359
|
-
sycl::half d; // delta
|
3360
|
-
int8_t qs[QK8_0]; // quants
|
3361
|
-
} block_q8_0;
|
3362
|
-
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
3363
|
-
|
3364
|
-
#define QK8_1 32
|
3365
|
-
#define QR8_1 1
|
3366
|
-
#define QI8_1 (QK8_1 / (4 * QR8_1))
|
3367
|
-
typedef struct dpct_type_block_q8_1 {
|
3368
|
-
sycl::half2 ds; // ds.x = delta, ds.y = sum
|
3369
|
-
int8_t qs[QK8_0]; // quants
|
3370
|
-
} block_q8_1;
|
3371
|
-
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
|
3372
|
-
|
3373
3321
|
typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
|
3374
3322
|
typedef void (*allocate_tiles_sycl_t)(int **x_ql, sycl::half2 **x_dm,
|
3375
3323
|
int **x_qh, int **x_sc);
|
@@ -3386,112 +3334,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
|
|
3386
3334
|
const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms,
|
3387
3335
|
const int &i, const int &j, const int &k);
|
3388
3336
|
|
3389
|
-
//================================= k-quants
|
3390
|
-
|
3391
|
-
#ifdef GGML_QKK_64
|
3392
|
-
#define QK_K 64
|
3393
|
-
#define K_SCALE_SIZE 4
|
3394
|
-
#else
|
3395
|
-
#define QK_K 256
|
3396
|
-
#define K_SCALE_SIZE 12
|
3397
|
-
#endif
|
3398
|
-
|
3399
|
-
#define QR2_K 4
|
3400
|
-
#define QI2_K (QK_K / (4*QR2_K))
|
3401
|
-
typedef struct dpct_type_block_q2_K {
|
3402
|
-
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
3403
|
-
uint8_t qs[QK_K/4]; // quants
|
3404
|
-
sycl::half2 dm; // super-block scale for quantized scales/mins
|
3405
|
-
} block_q2_K;
|
3406
|
-
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
3407
|
-
|
3408
|
-
#define QR3_K 4
|
3409
|
-
#define QI3_K (QK_K / (4*QR3_K))
|
3410
|
-
typedef struct dpct_type_block_q3_K {
|
3411
|
-
uint8_t hmask[QK_K/8]; // quants - high bit
|
3412
|
-
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
3413
|
-
#ifdef GGML_QKK_64
|
3414
|
-
uint8_t scales[2]; // scales, quantized with 8 bits
|
3415
|
-
#else
|
3416
|
-
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
|
3417
|
-
#endif
|
3418
|
-
sycl::half d; // super-block scale
|
3419
|
-
} block_q3_K;
|
3420
|
-
//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
|
3421
|
-
|
3422
|
-
#define QR4_K 2
|
3423
|
-
#define QI4_K (QK_K / (4*QR4_K))
|
3424
|
-
#ifdef GGML_QKK_64
|
3425
|
-
typedef struct {
|
3426
|
-
sycl::half dm[2]; // super-block scales/mins
|
3427
|
-
uint8_t scales[2]; // 4-bit block scales/mins
|
3428
|
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
3429
|
-
} block_q4_K;
|
3430
|
-
static_assert(sizeof(block_q4_K) == sizeof(sycl::half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
3431
|
-
#else
|
3432
|
-
typedef struct dpct_type_block_q4_K {
|
3433
|
-
sycl::half2 dm; // super-block scale for quantized scales/mins
|
3434
|
-
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
3435
|
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
3436
|
-
} block_q4_K;
|
3437
|
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
3438
|
-
#endif
|
3439
|
-
|
3440
|
-
#define QR5_K 2
|
3441
|
-
#define QI5_K (QK_K / (4*QR5_K))
|
3442
|
-
#ifdef GGML_QKK_64
|
3443
|
-
typedef struct {
|
3444
|
-
sycl::half d; // super-block scale
|
3445
|
-
int8_t scales[QK_K/16]; // block scales
|
3446
|
-
uint8_t qh[QK_K/8]; // quants, high bit
|
3447
|
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
3448
|
-
} block_q5_K;
|
3449
|
-
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
3450
|
-
#else
|
3451
|
-
typedef struct dpct_type_block_q5_K {
|
3452
|
-
sycl::half2 dm; // super-block scale for quantized scales/mins
|
3453
|
-
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
3454
|
-
uint8_t qh[QK_K/8]; // quants, high bit
|
3455
|
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
3456
|
-
} block_q5_K;
|
3457
|
-
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
3458
|
-
#endif
|
3459
|
-
|
3460
|
-
#define QR6_K 2
|
3461
|
-
#define QI6_K (QK_K / (4*QR6_K))
|
3462
|
-
typedef struct dpct_type_block_q6_K {
|
3463
|
-
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
3464
|
-
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
3465
|
-
int8_t scales[QK_K/16]; // scales
|
3466
|
-
sycl::half d; // delta
|
3467
|
-
} block_q6_K;
|
3468
|
-
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
|
3469
|
-
|
3470
|
-
#define QR2_XXS 8
|
3471
|
-
#define QI2_XXS (QK_K / (4*QR2_XXS))
|
3472
|
-
typedef struct dpct_type_block_iq2_xxs {
|
3473
|
-
sycl::half d;
|
3474
|
-
uint16_t qs[QK_K/8];
|
3475
|
-
} block_iq2_xxs;
|
3476
|
-
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
|
3477
|
-
|
3478
|
-
#define QR2_XS 8
|
3479
|
-
#define QI2_XS (QK_K / (4*QR2_XS))
|
3480
|
-
typedef struct dpct_type_block_iq2_xs {
|
3481
|
-
sycl::half d;
|
3482
|
-
uint16_t qs[QK_K/8];
|
3483
|
-
uint8_t scales[QK_K/32];
|
3484
|
-
} block_iq2_xs;
|
3485
|
-
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
3486
|
-
|
3487
|
-
#define QR3_XXS 8
|
3488
|
-
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
3489
|
-
typedef struct dpct_type_block_iq3_xxs {
|
3490
|
-
sycl::half d;
|
3491
|
-
uint8_t qs[3*(QK_K/8)];
|
3492
|
-
} block_iq3_xxs;
|
3493
|
-
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
3494
|
-
|
3495
3337
|
#define WARP_SIZE 32
|
3496
3338
|
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
3497
3339
|
|
@@ -3609,7 +3451,7 @@ class sycl_gpu_mgr {
|
|
3609
3451
|
dpct::device_info prop;
|
3610
3452
|
dpct::get_device_info(prop, device);
|
3611
3453
|
if (max_compute_units == prop.get_max_compute_units() &&
|
3612
|
-
|
3454
|
+
is_ext_oneapi_device(device)) {
|
3613
3455
|
gpus.push_back(id);
|
3614
3456
|
devices.push_back(device);
|
3615
3457
|
work_group_size = prop.get_max_work_group_size();
|
@@ -3642,6 +3484,15 @@ class sycl_gpu_mgr {
|
|
3642
3484
|
assert(false);
|
3643
3485
|
return -1;
|
3644
3486
|
}
|
3487
|
+
|
3488
|
+
bool is_ext_oneapi_device(const sycl::device &dev) {
|
3489
|
+
sycl::backend dev_backend = dev.get_backend();
|
3490
|
+
if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
|
3491
|
+
dev_backend == sycl::backend::ext_oneapi_cuda ||
|
3492
|
+
dev_backend == sycl::backend::ext_oneapi_hip)
|
3493
|
+
return true;
|
3494
|
+
return false;
|
3495
|
+
}
|
3645
3496
|
};
|
3646
3497
|
|
3647
3498
|
static sycl_gpu_mgr *g_sycl_gpu_mgr = NULL;
|
@@ -4745,388 +4596,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
4745
4596
|
#endif
|
4746
4597
|
}
|
4747
4598
|
|
4748
|
-
static dpct::global_memory<const uint64_t, 1>
|
4749
|
-
iq2xxs_grid(sycl::range<1>(256),
|
4750
|
-
{
|
4751
|
-
0x0808080808080808, 0x080808080808082b, 0x0808080808081919,
|
4752
|
-
0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819,
|
4753
|
-
0x0808080808191908, 0x08080808082b0808, 0x08080808082b082b,
|
4754
|
-
0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
|
4755
|
-
0x0808080819081908, 0x0808080819190808, 0x0808080819192b08,
|
4756
|
-
0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
|
4757
|
-
0x080808082b08082b, 0x080808082b082b2b, 0x080808082b2b082b,
|
4758
|
-
0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
|
4759
|
-
0x0808081908191919, 0x0808081919080808, 0x080808192b081908,
|
4760
|
-
0x080808192b192b08, 0x0808082b08080808, 0x0808082b0808082b,
|
4761
|
-
0x0808082b082b082b, 0x0808082b2b08082b, 0x0808190808080819,
|
4762
|
-
0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
|
4763
|
-
0x08081908082b1908, 0x0808190819080808, 0x080819081908082b,
|
4764
|
-
0x0808190819082b08, 0x08081908192b0808, 0x080819082b080819,
|
4765
|
-
0x080819082b081908, 0x080819082b190808, 0x080819082b2b1908,
|
4766
|
-
0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
|
4767
|
-
0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19,
|
4768
|
-
0x080819192b080808, 0x080819192b190819, 0x0808192b08082b19,
|
4769
|
-
0x0808192b08190808, 0x0808192b19080808, 0x0808192b2b081908,
|
4770
|
-
0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
|
4771
|
-
0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08,
|
4772
|
-
0x08082b0819080819, 0x08082b0819081908, 0x08082b0819190808,
|
4773
|
-
0x08082b081919082b, 0x08082b082b082b08, 0x08082b1908081908,
|
4774
|
-
0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
|
4775
|
-
0x0819080808080819, 0x0819080808081908, 0x0819080808190808,
|
4776
|
-
0x08190808082b0819, 0x0819080819080808, 0x08190808192b0808,
|
4777
|
-
0x081908082b081908, 0x081908082b190808, 0x081908082b191919,
|
4778
|
-
0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
|
4779
|
-
0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808,
|
4780
|
-
0x0819082b082b1908, 0x0819082b19081919, 0x0819190808080808,
|
4781
|
-
0x0819190808082b08, 0x08191908082b0808, 0x08191908082b1919,
|
4782
|
-
0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
|
4783
|
-
0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b,
|
4784
|
-
0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
|
4785
|
-
0x08192b0819080808, 0x08192b082b080819, 0x08192b1908080808,
|
4786
|
-
0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
|
4787
|
-
0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b,
|
4788
|
-
0x082b080819081908, 0x082b0808192b0819, 0x082b08082b080808,
|
4789
|
-
0x082b08082b08082b, 0x082b0819082b2b19, 0x082b081919082b08,
|
4790
|
-
0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
|
4791
|
-
0x082b190808081908, 0x082b190808190808, 0x082b190819080808,
|
4792
|
-
0x082b19081919192b, 0x082b191908080808, 0x082b191919080819,
|
4793
|
-
0x082b1919192b1908, 0x082b192b2b190808, 0x082b2b0808082b08,
|
4794
|
-
0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
|
4795
|
-
0x1908080808080819, 0x1908080808081908, 0x1908080808190808,
|
4796
|
-
0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
|
4797
|
-
0x1908080819080808, 0x1908080819082b08, 0x190808081919192b,
|
4798
|
-
0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
|
4799
|
-
0x190808082b190808, 0x1908081908080808, 0x19080819082b0808,
|
4800
|
-
0x19080819192b0819, 0x190808192b080808, 0x190808192b081919,
|
4801
|
-
0x1908082b08080819, 0x1908082b08190808, 0x1908082b19082b08,
|
4802
|
-
0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
|
4803
|
-
0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808,
|
4804
|
-
0x190819082b192b19, 0x190819190819082b, 0x19081919082b1908,
|
4805
|
-
0x1908192b08080808, 0x19082b0808080819, 0x19082b0808081908,
|
4806
|
-
0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
|
4807
|
-
0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819,
|
4808
|
-
0x19082b192b08082b, 0x19082b2b19081919, 0x19082b2b2b190808,
|
4809
|
-
0x1919080808080808, 0x1919080808082b08, 0x1919080808190819,
|
4810
|
-
0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
|
4811
|
-
0x191908082b082b08, 0x1919081908081908, 0x191908191908082b,
|
4812
|
-
0x191908192b2b1908, 0x1919082b2b190819, 0x191919082b190808,
|
4813
|
-
0x191919082b19082b, 0x1919191908082b2b, 0x1919192b08080819,
|
4814
|
-
0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
|
4815
|
-
0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808,
|
4816
|
-
0x19192b2b08082b08, 0x192b080808081908, 0x192b080808190808,
|
4817
|
-
0x192b080819080808, 0x192b0808192b2b08, 0x192b081908080808,
|
4818
|
-
0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
|
4819
|
-
0x192b190808080808, 0x192b190808081919, 0x192b191908190808,
|
4820
|
-
0x192b19190819082b, 0x192b19192b081908, 0x192b2b081908082b,
|
4821
|
-
0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808082b2b,
|
4822
|
-
0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
|
4823
|
-
0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819,
|
4824
|
-
0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808,
|
4825
|
-
0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808,
|
4826
|
-
0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
|
4827
|
-
0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808,
|
4828
|
-
0x2b082b080808082b, 0x2b082b1908081908, 0x2b082b2b08190819,
|
4829
|
-
0x2b19080808081908, 0x2b19080808190808, 0x2b190808082b1908,
|
4830
|
-
0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
|
4831
|
-
0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808,
|
4832
|
-
0x2b191908082b082b, 0x2b19190819081908, 0x2b19191919190819,
|
4833
|
-
0x2b192b082b080819, 0x2b192b19082b0808, 0x2b2b08080808082b,
|
4834
|
-
0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
|
4835
|
-
0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808,
|
4836
|
-
0x2b2b2b1908081908,
|
4837
|
-
});
|
4838
|
-
|
4839
|
-
static dpct::global_memory<const uint64_t, 1>
|
4840
|
-
iq2xs_grid(sycl::range<1>(512),
|
4841
|
-
{
|
4842
|
-
0x0808080808080808, 0x080808080808082b, 0x0808080808081919,
|
4843
|
-
0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819,
|
4844
|
-
0x0808080808191908, 0x080808080819192b, 0x0808080808192b19,
|
4845
|
-
0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
4846
|
-
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908,
|
4847
|
-
0x080808081908192b, 0x0808080819082b19, 0x0808080819190808,
|
4848
|
-
0x080808081919082b, 0x0808080819191919, 0x0808080819192b08,
|
4849
|
-
0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
|
4850
|
-
0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08,
|
4851
|
-
0x080808082b190819, 0x080808082b191908, 0x080808082b192b19,
|
4852
|
-
0x080808082b2b0808, 0x0808081908080819, 0x0808081908081908,
|
4853
|
-
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
|
4854
|
-
0x080808190819082b, 0x0808081908191919, 0x0808081908192b08,
|
4855
|
-
0x0808081908192b2b, 0x08080819082b0819, 0x08080819082b1908,
|
4856
|
-
0x0808081919080808, 0x080808191908082b, 0x0808081919081919,
|
4857
|
-
0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
|
4858
|
-
0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819,
|
4859
|
-
0x080808192b081908, 0x080808192b190808, 0x0808082b08080808,
|
4860
|
-
0x0808082b0808082b, 0x0808082b08081919, 0x0808082b08082b08,
|
4861
|
-
0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
|
4862
|
-
0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808,
|
4863
|
-
0x0808082b19191919, 0x0808082b2b080808, 0x0808082b2b082b2b,
|
4864
|
-
0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
4865
|
-
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
|
4866
|
-
0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819,
|
4867
|
-
0x08081908082b1908, 0x0808190819080808, 0x080819081908082b,
|
4868
|
-
0x0808190819081919, 0x0808190819082b08, 0x0808190819190819,
|
4869
|
-
0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
|
4870
|
-
0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
|
4871
|
-
0x0808191908080808, 0x080819190808082b, 0x0808191908081919,
|
4872
|
-
0x0808191908082b08, 0x0808191908190819, 0x0808191908191908,
|
4873
|
-
0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
|
4874
|
-
0x0808191919190808, 0x08081919192b0819, 0x080819192b080808,
|
4875
|
-
0x0808192b08080819, 0x0808192b08081908, 0x0808192b08190808,
|
4876
|
-
0x0808192b082b192b, 0x0808192b19080808, 0x0808192b1908082b,
|
4877
|
-
0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
|
4878
|
-
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b,
|
4879
|
-
0x08082b0808190819, 0x08082b0808191908, 0x08082b08082b0808,
|
4880
|
-
0x08082b08082b1919, 0x08082b0819080819, 0x08082b0819081908,
|
4881
|
-
0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
|
4882
|
-
0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
4883
|
-
0x08082b1908081908, 0x08082b1908190808, 0x08082b1919080808,
|
4884
|
-
0x08082b192b080819, 0x08082b192b082b19, 0x08082b2b08080808,
|
4885
|
-
0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
|
4886
|
-
0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908,
|
4887
|
-
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808,
|
4888
|
-
0x081908080819082b, 0x0819080808191919, 0x0819080808192b08,
|
4889
|
-
0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
|
4890
|
-
0x081908081908082b, 0x0819080819081919, 0x0819080819082b08,
|
4891
|
-
0x0819080819190819, 0x0819080819191908, 0x08190808192b0808,
|
4892
|
-
0x08190808192b2b2b, 0x081908082b080819, 0x081908082b081908,
|
4893
|
-
0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
|
4894
|
-
0x0819081908081919, 0x0819081908082b08, 0x0819081908190819,
|
4895
|
-
0x0819081908191908, 0x08190819082b0808, 0x0819081919080819,
|
4896
|
-
0x0819081919081908, 0x0819081919190808, 0x081908192b080808,
|
4897
|
-
0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
|
4898
|
-
0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808,
|
4899
|
-
0x0819082b19080808, 0x0819082b192b0808, 0x0819190808080808,
|
4900
|
-
0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
4901
|
-
0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
|
4902
|
-
0x0819190819080819, 0x0819190819081908, 0x0819190819082b19,
|
4903
|
-
0x0819190819190808, 0x08191908192b1908, 0x081919082b080808,
|
4904
|
-
0x0819191908080819, 0x0819191908081908, 0x0819191908190808,
|
4905
|
-
0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
|
4906
|
-
0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908,
|
4907
|
-
0x08192b0808190808, 0x08192b080819082b, 0x08192b0819080808,
|
4908
|
-
0x08192b0819191908, 0x08192b082b08192b, 0x08192b1908080808,
|
4909
|
-
0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
|
4910
|
-
0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b,
|
4911
|
-
0x082b080808081919, 0x082b080808082b08, 0x082b080808082b2b,
|
4912
|
-
0x082b080808190819, 0x082b080808191908, 0x082b0808082b0808,
|
4913
|
-
0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
|
4914
|
-
0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819,
|
4915
|
-
0x082b081908081908, 0x082b081908190808, 0x082b081919080808,
|
4916
|
-
0x082b081919082b08, 0x082b0819192b1919, 0x082b082b08080808,
|
4917
|
-
0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
|
4918
|
-
0x082b190808080819, 0x082b190808081908, 0x082b190808190808,
|
4919
|
-
0x082b1908082b2b19, 0x082b190819080808, 0x082b191908080808,
|
4920
|
-
0x082b191919080819, 0x082b19191919082b, 0x082b19192b192b19,
|
4921
|
-
0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
|
4922
|
-
0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b,
|
4923
|
-
0x082b2b08082b0808, 0x082b2b0819191919, 0x082b2b082b082b08,
|
4924
|
-
0x082b2b082b2b082b, 0x082b2b19192b2b08, 0x082b2b192b190808,
|
4925
|
-
0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
|
4926
|
-
0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
4927
|
-
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19,
|
4928
|
-
0x1908080808190808, 0x190808080819082b, 0x1908080808191919,
|
4929
|
-
0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
|
4930
|
-
0x1908080819080808, 0x190808081908082b, 0x1908080819081919,
|
4931
|
-
0x1908080819082b08, 0x1908080819082b2b, 0x1908080819190819,
|
4932
|
-
0x1908080819191908, 0x19080808192b0808, 0x19080808192b1919,
|
4933
|
-
0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
|
4934
|
-
0x1908081908080808, 0x190808190808082b, 0x1908081908081919,
|
4935
|
-
0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
4936
|
-
0x19080819082b0808, 0x1908081919080819, 0x1908081919081908,
|
4937
|
-
0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
|
4938
|
-
0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908,
|
4939
|
-
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b082b2b19,
|
4940
|
-
0x1908082b19080808, 0x1908190808080808, 0x190819080808082b,
|
4941
|
-
0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
|
4942
|
-
0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808,
|
4943
|
-
0x1908190819080819, 0x1908190819081908, 0x1908190819190808,
|
4944
|
-
0x190819082b080808, 0x190819082b191908, 0x1908191908080819,
|
4945
|
-
0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
|
4946
|
-
0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808,
|
4947
|
-
0x1908192b08082b2b, 0x1908192b19081908, 0x1908192b19190808,
|
4948
|
-
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808190808,
|
4949
|
-
0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
|
4950
|
-
0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819,
|
4951
|
-
0x19082b1919081908, 0x19082b1919190808, 0x19082b19192b2b19,
|
4952
|
-
0x19082b2b08081908, 0x1919080808080808, 0x191908080808082b,
|
4953
|
-
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
|
4954
|
-
0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08,
|
4955
|
-
0x1919080819080819, 0x1919080819081908, 0x1919080819190808,
|
4956
|
-
0x191908082b080808, 0x1919081908080819, 0x1919081908081908,
|
4957
|
-
0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
|
4958
|
-
0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908,
|
4959
|
-
0x1919082b2b2b2b2b, 0x1919190808080819, 0x1919190808081908,
|
4960
|
-
0x1919190808190808, 0x19191908082b0819, 0x1919190819080808,
|
4961
|
-
0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
|
4962
|
-
0x1919191908080808, 0x1919191908082b08, 0x191919192b080808,
|
4963
|
-
0x191919192b082b08, 0x1919192b082b0819, 0x1919192b192b2b08,
|
4964
|
-
0x1919192b2b2b0819, 0x19192b0808080808, 0x19192b0808191908,
|
4965
|
-
0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
|
4966
|
-
0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b,
|
4967
|
-
0x19192b2b2b081919, 0x192b080808080819, 0x192b080808081908,
|
4968
|
-
0x192b080808190808, 0x192b080819080808, 0x192b080819191908,
|
4969
|
-
0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
|
4970
|
-
0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b,
|
4971
|
-
0x192b082b2b19082b, 0x192b190808080808, 0x192b19080819192b,
|
4972
|
-
0x192b191908190808, 0x192b191919080808, 0x192b191919081919,
|
4973
|
-
0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
|
4974
|
-
0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908,
|
4975
|
-
0x192b2b2b192b082b, 0x2b08080808080808, 0x2b0808080808082b,
|
4976
|
-
0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
4977
|
-
0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
|
4978
|
-
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808,
|
4979
|
-
0x2b0808082b080808, 0x2b0808082b08082b, 0x2b0808082b2b2b08,
|
4980
|
-
0x2b0808082b2b2b2b, 0x2b08081908080819, 0x2b08081908081908,
|
4981
|
-
0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
|
4982
|
-
0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808,
|
4983
|
-
0x2b08082b082b0808, 0x2b08082b2b080808, 0x2b08082b2b08082b,
|
4984
|
-
0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, 0x2b08190808080819,
|
4985
|
-
0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
|
4986
|
-
0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808,
|
4987
|
-
0x2b0819082b082b19, 0x2b08191908080808, 0x2b08191919081908,
|
4988
|
-
0x2b0819192b2b1919, 0x2b08192b08192b08, 0x2b08192b192b2b2b,
|
4989
|
-
0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
|
4990
|
-
0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b,
|
4991
|
-
0x2b082b082b2b2b08, 0x2b082b190808192b, 0x2b082b2b082b082b,
|
4992
|
-
0x2b082b2b2b080808, 0x2b082b2b2b082b08, 0x2b082b2b2b19192b,
|
4993
|
-
0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
|
4994
|
-
0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b,
|
4995
|
-
0x2b1908082b081908, 0x2b19081908080808, 0x2b190819082b082b,
|
4996
|
-
0x2b190819192b1908, 0x2b19082b1919192b, 0x2b19082b2b082b19,
|
4997
|
-
0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
|
4998
|
-
0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19,
|
4999
|
-
0x2b1919192b190808, 0x2b1919192b19082b, 0x2b19192b19080819,
|
5000
|
-
0x2b192b0819190819, 0x2b192b082b2b192b, 0x2b192b1919082b19,
|
5001
|
-
0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
|
5002
|
-
0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b,
|
5003
|
-
0x2b2b0808082b0808, 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808,
|
5004
|
-
0x2b2b081919190819, 0x2b2b081919192b19, 0x2b2b08192b2b192b,
|
5005
|
-
0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
|
5006
|
-
0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808,
|
5007
|
-
0x2b2b190819080808, 0x2b2b19082b191919, 0x2b2b192b192b1919,
|
5008
|
-
0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, 0x2b2b2b08082b0808,
|
5009
|
-
0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
|
5010
|
-
0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908,
|
5011
|
-
0x2b2b2b192b08192b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b,
|
5012
|
-
0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
5013
|
-
});
|
5014
|
-
|
5015
|
-
static dpct::global_memory<const uint32_t, 1> iq3xxs_grid(
|
5016
|
-
sycl::range<1>(256),
|
5017
|
-
{
|
5018
|
-
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e,
|
5019
|
-
0x04041404, 0x04041414, 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c,
|
5020
|
-
0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, 0x040c140c, 0x040c142c,
|
5021
|
-
0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
5022
|
-
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c,
|
5023
|
-
0x04141c1c, 0x04141c3e, 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c,
|
5024
|
-
0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, 0x041c3e04, 0x04240c1c,
|
5025
|
-
0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
5026
|
-
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04,
|
5027
|
-
0x043e0c24, 0x043e0c34, 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c,
|
5028
|
-
0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, 0x0c041c04, 0x0c041c14,
|
5029
|
-
0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
5030
|
-
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14,
|
5031
|
-
0x0c14140c, 0x0c141c04, 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404,
|
5032
|
-
0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, 0x0c24042c, 0x0c242c04,
|
5033
|
-
0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
5034
|
-
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404,
|
5035
|
-
0x14041414, 0x14041434, 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c,
|
5036
|
-
0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, 0x140c1c04, 0x140c341c,
|
5037
|
-
0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
5038
|
-
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c,
|
5039
|
-
0x141c0c04, 0x141c0c24, 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c,
|
5040
|
-
0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, 0x143e040c, 0x143e041c,
|
5041
|
-
0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
5042
|
-
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414,
|
5043
|
-
0x1c0c1404, 0x1c0c1c0c, 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c,
|
5044
|
-
0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, 0x1c1c0c0c, 0x1c1c1c1c,
|
5045
|
-
0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
5046
|
-
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404,
|
5047
|
-
0x24040424, 0x24040c3e, 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e,
|
5048
|
-
0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, 0x24143404, 0x24143434,
|
5049
|
-
0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
5050
|
-
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04,
|
5051
|
-
0x2c040c14, 0x2c04240c, 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434,
|
5052
|
-
0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, 0x2c1c0414, 0x2c1c2c1c,
|
5053
|
-
0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
5054
|
-
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434,
|
5055
|
-
0x34043424, 0x340c140c, 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04,
|
5056
|
-
0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, 0x34341c1c, 0x343e041c,
|
5057
|
-
0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
5058
|
-
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14,
|
5059
|
-
0x3e1c0404, 0x3e1c0c2c, 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c,
|
5060
|
-
0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
5061
|
-
});
|
5062
|
-
|
5063
|
-
static dpct::global_memory<const uint8_t, 1> ksigns_iq2xs(
|
5064
|
-
sycl::range<1>(128),
|
5065
|
-
{
|
5066
|
-
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12,
|
5067
|
-
141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153,
|
5068
|
-
154, 27, 156, 29, 30, 159, 160, 33, 34, 163, 36, 165, 166,
|
5069
|
-
39, 40, 169, 170, 43, 172, 45, 46, 175, 48, 177, 178, 51,
|
5070
|
-
180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, 192,
|
5071
|
-
65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77,
|
5072
|
-
78, 207, 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90,
|
5073
|
-
219, 92, 221, 222, 95, 96, 225, 226, 99, 228, 101, 102, 231,
|
5074
|
-
232, 105, 106, 235, 108, 237, 238, 111, 240, 113, 114, 243, 116,
|
5075
|
-
245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
|
5076
|
-
});
|
5077
|
-
|
5078
|
-
static dpct::global_memory<const uint64_t, 1>
|
5079
|
-
ksigns64(sycl::range<1>(128),
|
5080
|
-
{
|
5081
|
-
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00,
|
5082
|
-
0x000000000000ffff, 0xff00000000ff0000, 0x0000000000ff00ff,
|
5083
|
-
0x0000000000ffff00, 0xff00000000ffffff, 0xff000000ff000000,
|
5084
|
-
0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
|
5085
|
-
0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00,
|
5086
|
-
0x00000000ffffffff, 0xff0000ff00000000, 0x000000ff000000ff,
|
5087
|
-
0x000000ff0000ff00, 0xff0000ff0000ffff, 0x000000ff00ff0000,
|
5088
|
-
0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
|
5089
|
-
0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00,
|
5090
|
-
0x000000ffff00ffff, 0xff0000ffffff0000, 0x000000ffffff00ff,
|
5091
|
-
0x000000ffffffff00, 0xff0000ffffffffff, 0xff00ff0000000000,
|
5092
|
-
0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
|
5093
|
-
0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00,
|
5094
|
-
0x0000ff0000ffffff, 0x0000ff00ff000000, 0xff00ff00ff0000ff,
|
5095
|
-
0xff00ff00ff00ff00, 0x0000ff00ff00ffff, 0xff00ff00ffff0000,
|
5096
|
-
0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
|
5097
|
-
0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00,
|
5098
|
-
0x0000ffff0000ffff, 0xff00ffff00ff0000, 0x0000ffff00ff00ff,
|
5099
|
-
0x0000ffff00ffff00, 0xff00ffff00ffffff, 0xff00ffffff000000,
|
5100
|
-
0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
|
5101
|
-
0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00,
|
5102
|
-
0x0000ffffffffffff, 0xffff000000000000, 0x00ff0000000000ff,
|
5103
|
-
0x00ff00000000ff00, 0xffff00000000ffff, 0x00ff000000ff0000,
|
5104
|
-
0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
|
5105
|
-
0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00,
|
5106
|
-
0x00ff0000ff00ffff, 0xffff0000ffff0000, 0x00ff0000ffff00ff,
|
5107
|
-
0x00ff0000ffffff00, 0xffff0000ffffffff, 0x00ff00ff00000000,
|
5108
|
-
0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
|
5109
|
-
0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
|
5110
|
-
0xffff00ff00ffffff, 0xffff00ffff000000, 0x00ff00ffff0000ff,
|
5111
|
-
0x00ff00ffff00ff00, 0xffff00ffff00ffff, 0x00ff00ffffff0000,
|
5112
|
-
0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
|
5113
|
-
0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00,
|
5114
|
-
0x00ffff000000ffff, 0xffffff0000ff0000, 0x00ffff0000ff00ff,
|
5115
|
-
0x00ffff0000ffff00, 0xffffff0000ffffff, 0xffffff00ff000000,
|
5116
|
-
0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
|
5117
|
-
0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00,
|
5118
|
-
0x00ffff00ffffffff, 0xffffffff00000000, 0x00ffffff000000ff,
|
5119
|
-
0x00ffffff0000ff00, 0xffffffff0000ffff, 0x00ffffff00ff0000,
|
5120
|
-
0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
|
5121
|
-
0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00,
|
5122
|
-
0x00ffffffff00ffff, 0xffffffffffff0000, 0x00ffffffffff00ff,
|
5123
|
-
0x00ffffffffffff00, 0xffffffffffffffff,
|
5124
|
-
});
|
5125
|
-
//#endif
|
5126
|
-
|
5127
|
-
static dpct::global_memory<const uint8_t, 1>
|
5128
|
-
kmask_iq2xs(sycl::range<1>(8), {1, 2, 4, 8, 16, 32, 64, 128});
|
5129
|
-
|
5130
4599
|
template<typename dst_t>
|
5131
4600
|
static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
5132
4601
|
const sycl::nd_item<3> &item_ct1,
|
@@ -5213,6 +4682,65 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
5213
4682
|
|
5214
4683
|
}
|
5215
4684
|
|
4685
|
+
template<typename dst_t>
|
4686
|
+
static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
4687
|
+
const sycl::nd_item<3> &item_ct1,
|
4688
|
+
const uint32_t *iq3s_grid,
|
4689
|
+
const uint8_t *ksigns_iq2xs,
|
4690
|
+
const uint8_t *kmask_iq2xs) {
|
4691
|
+
|
4692
|
+
const int i = item_ct1.get_group(2);
|
4693
|
+
const block_iq3_s * x = (const block_iq3_s *) vx;
|
4694
|
+
|
4695
|
+
const int tid = item_ct1.get_local_id(2);
|
4696
|
+
#if QK_K == 256
|
4697
|
+
const int il = tid/8; // 0...3
|
4698
|
+
const int ib = tid%8; // 0...7
|
4699
|
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
4700
|
+
const uint8_t * qs = x[i].qs + 8*ib;
|
4701
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + qs[2*il+0]);
|
4702
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + qs[2*il+1]);
|
4703
|
+
const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
|
4704
|
+
const uint8_t signs = x[i].signs[4*ib + il];
|
4705
|
+
for (int j = 0; j < 4; ++j) {
|
4706
|
+
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4707
|
+
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4708
|
+
}
|
4709
|
+
#else
|
4710
|
+
assert(false);
|
4711
|
+
#endif
|
4712
|
+
|
4713
|
+
}
|
4714
|
+
|
4715
|
+
template<typename dst_t>
|
4716
|
+
static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
4717
|
+
const sycl::nd_item<3> &item_ct1,
|
4718
|
+
const uint32_t *iq1s_grid,
|
4719
|
+
const uint8_t *ksigns_iq2xs,
|
4720
|
+
const uint8_t *kmask_iq2xs) {
|
4721
|
+
const int i = item_ct1.get_group(2);
|
4722
|
+
const block_iq1_s * x = (const block_iq1_s *) vx;
|
4723
|
+
|
4724
|
+
const int tid = item_ct1.get_local_id(2);
|
4725
|
+
#if QK_K == 256
|
4726
|
+
const int il = tid/8; // 0...3
|
4727
|
+
const int ib = tid%8; // 0...7
|
4728
|
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
4729
|
+
const uint8_t * qs = x[i].qs + 8*ib;
|
4730
|
+
const uint8_t * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]);
|
4731
|
+
const uint8_t * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]);
|
4732
|
+
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 0xf) + 1);
|
4733
|
+
const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7];
|
4734
|
+
for (int j = 0; j < 4; ++j) {
|
4735
|
+
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4736
|
+
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4737
|
+
}
|
4738
|
+
#else
|
4739
|
+
assert(false);
|
4740
|
+
#endif
|
4741
|
+
|
4742
|
+
}
|
4743
|
+
|
5216
4744
|
/*
|
5217
4745
|
DPCT1110:4: The total declared local variable size in device function
|
5218
4746
|
dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
|
@@ -8059,6 +7587,75 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
8059
7587
|
#endif
|
8060
7588
|
}
|
8061
7589
|
|
7590
|
+
static __dpct_inline__ float
|
7591
|
+
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
7592
|
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
7593
|
+
const uint32_t *iq3s_grid, const uint64_t *ksigns64) {
|
7594
|
+
#if DPCT_COMPATIBILITY_TEMP >= \
|
7595
|
+
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
7596
|
+
#if QK_K == 256
|
7597
|
+
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
7598
|
+
|
7599
|
+
const int ib32 = iqs;
|
7600
|
+
const uint8_t * qs = bq2->qs + 8*ib32;
|
7601
|
+
const int8_t * q8 = bq8_1[ib32].qs;
|
7602
|
+
int sumi = 0;
|
7603
|
+
for (int l = 0; l < 4; ++l) {
|
7604
|
+
const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
|
7605
|
+
const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
|
7606
|
+
uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
|
7607
|
+
((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
|
7608
|
+
uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
|
7609
|
+
((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
|
7610
|
+
const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
|
7611
|
+
grid1[0] ^ signs0, signs0, std::minus<>());
|
7612
|
+
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
7613
|
+
grid2[0] ^ signs1, signs1, std::minus<>());
|
7614
|
+
sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
|
7615
|
+
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
|
7616
|
+
q8 += 8;
|
7617
|
+
}
|
7618
|
+
const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * bq8_1[ib32].ds[0];
|
7619
|
+
return d * sumi;
|
7620
|
+
#else
|
7621
|
+
assert(false);
|
7622
|
+
return 0.f;
|
7623
|
+
#endif
|
7624
|
+
#else
|
7625
|
+
assert(false);
|
7626
|
+
return 0.f;
|
7627
|
+
#endif
|
7628
|
+
}
|
7629
|
+
|
7630
|
+
static __dpct_inline__ float
|
7631
|
+
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
7632
|
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
7633
|
+
const uint32_t *iq1s_grid, const uint64_t *ksigns64) {
|
7634
|
+
#if QK_K == 256
|
7635
|
+
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
7636
|
+
|
7637
|
+
const int ib32 = iqs;
|
7638
|
+
const uint8_t * qs = bq1->qs + 4*ib32;
|
7639
|
+
const int8_t * q8 = bq8_1[ib32].qs;
|
7640
|
+
int sumi = 0;
|
7641
|
+
for (int l = 0; l < 4; ++l) {
|
7642
|
+
const uint32_t * grid = (const uint32_t *)(iq1s_grid + qs[l]);
|
7643
|
+
const uint32_t * signs = (const uint32_t *)(ksigns64 + (qs[l] >> 8));
|
7644
|
+
const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
|
7645
|
+
grid[0] ^ signs[0], signs[0], std::minus<>());
|
7646
|
+
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
7647
|
+
grid[1] ^ signs[1], signs[1], std::minus<>());
|
7648
|
+
sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
|
7649
|
+
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
|
7650
|
+
q8 += 8;
|
7651
|
+
}
|
7652
|
+
const float d = (float)bq1->d * bq8_1[ib32].ds[0] * 0.25f;
|
7653
|
+
return d * sumi;
|
7654
|
+
#else
|
7655
|
+
assert(false);
|
7656
|
+
return 0.f;
|
7657
|
+
#endif
|
7658
|
+
}
|
8062
7659
|
|
8063
7660
|
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
|
8064
7661
|
int mmq_y, int nwarps, load_tiles_sycl_t load_tiles, int vdr,
|
@@ -8824,6 +8421,98 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void
|
|
8824
8421
|
}
|
8825
8422
|
}
|
8826
8423
|
|
8424
|
+
template <int qk, int qi, typename block_q_t, int vdr>
|
8425
|
+
static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
|
8426
|
+
const sycl::nd_item<3> &item_ct1,
|
8427
|
+
const uint32_t *iq3s_grid_ptr, const uint64_t *ksigns64_ptr ) {
|
8428
|
+
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8429
|
+
item_ct1.get_local_id(1);
|
8430
|
+
|
8431
|
+
if (row >= nrows) {
|
8432
|
+
return;
|
8433
|
+
}
|
8434
|
+
|
8435
|
+
const int blocks_per_row = ncols / qk;
|
8436
|
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
8437
|
+
|
8438
|
+
// partial sum for each thread
|
8439
|
+
float tmp = 0.0f;
|
8440
|
+
|
8441
|
+
const block_q_t * x = (const block_q_t *) vx;
|
8442
|
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
8443
|
+
|
8444
|
+
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
|
8445
|
+
i += blocks_per_warp) {
|
8446
|
+
const int ibx = row*blocks_per_row + i; // x block index
|
8447
|
+
|
8448
|
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
8449
|
+
|
8450
|
+
const int iqs =
|
8451
|
+
vdr *
|
8452
|
+
(item_ct1.get_local_id(2) %
|
8453
|
+
(qi / vdr)); // x block quant index when casting the quants to int
|
8454
|
+
|
8455
|
+
tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid_ptr, ksigns64_ptr);
|
8456
|
+
}
|
8457
|
+
|
8458
|
+
// sum up partial sums and write back result
|
8459
|
+
#pragma unroll
|
8460
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
8461
|
+
tmp +=
|
8462
|
+
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
8463
|
+
}
|
8464
|
+
|
8465
|
+
if (item_ct1.get_local_id(2) == 0) {
|
8466
|
+
dst[row] = tmp;
|
8467
|
+
}
|
8468
|
+
}
|
8469
|
+
|
8470
|
+
template <int qk, int qi, typename block_q_t, int vdr>
|
8471
|
+
static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
|
8472
|
+
const sycl::nd_item<3> &item_ct1,
|
8473
|
+
const uint32_t *iq1s_grid_ptr, const uint64_t *ksigns64_ptr ) {
|
8474
|
+
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8475
|
+
item_ct1.get_local_id(1);
|
8476
|
+
|
8477
|
+
if (row >= nrows) {
|
8478
|
+
return;
|
8479
|
+
}
|
8480
|
+
|
8481
|
+
const int blocks_per_row = ncols / qk;
|
8482
|
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
8483
|
+
|
8484
|
+
// partial sum for each thread
|
8485
|
+
float tmp = 0.0f;
|
8486
|
+
|
8487
|
+
const block_q_t * x = (const block_q_t *) vx;
|
8488
|
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
8489
|
+
|
8490
|
+
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
|
8491
|
+
i += blocks_per_warp) {
|
8492
|
+
const int ibx = row*blocks_per_row + i; // x block index
|
8493
|
+
|
8494
|
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
8495
|
+
|
8496
|
+
const int iqs =
|
8497
|
+
vdr *
|
8498
|
+
(item_ct1.get_local_id(2) %
|
8499
|
+
(qi / vdr)); // x block quant index when casting the quants to int
|
8500
|
+
|
8501
|
+
tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_ptr, ksigns64_ptr);
|
8502
|
+
}
|
8503
|
+
|
8504
|
+
// sum up partial sums and write back result
|
8505
|
+
#pragma unroll
|
8506
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
8507
|
+
tmp +=
|
8508
|
+
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
8509
|
+
}
|
8510
|
+
|
8511
|
+
if (item_ct1.get_local_id(2) == 0) {
|
8512
|
+
dst[row] = tmp;
|
8513
|
+
}
|
8514
|
+
}
|
8515
|
+
|
8827
8516
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
8828
8517
|
static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
|
8829
8518
|
const sycl::nd_item<3> &item_ct1) {
|
@@ -10509,6 +10198,64 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
|
|
10509
10198
|
}
|
10510
10199
|
}
|
10511
10200
|
|
10201
|
+
template <typename dst_t>
|
10202
|
+
static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
|
10203
|
+
dpct::queue_ptr stream) {
|
10204
|
+
const int nb = k / QK_K;
|
10205
|
+
{
|
10206
|
+
iq3s_grid.init(*stream);
|
10207
|
+
ksigns_iq2xs.init(*stream);
|
10208
|
+
kmask_iq2xs.init(*stream);
|
10209
|
+
|
10210
|
+
dpct::has_capability_or_fail(stream->get_device(),
|
10211
|
+
{sycl::aspect::fp16});
|
10212
|
+
|
10213
|
+
stream->submit([&](sycl::handler &cgh) {
|
10214
|
+
auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
|
10215
|
+
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
|
10216
|
+
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
|
10217
|
+
|
10218
|
+
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10219
|
+
sycl::range<3>(1, 1, 32),
|
10220
|
+
sycl::range<3>(1, 1, 32)),
|
10221
|
+
[=](sycl::nd_item<3> item_ct1) {
|
10222
|
+
dequantize_block_iq3_s(
|
10223
|
+
vx, y, item_ct1, iq3s_grid_ptr_ct1,
|
10224
|
+
ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
|
10225
|
+
});
|
10226
|
+
});
|
10227
|
+
}
|
10228
|
+
}
|
10229
|
+
|
10230
|
+
template <typename dst_t>
|
10231
|
+
static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
|
10232
|
+
dpct::queue_ptr stream) {
|
10233
|
+
const int nb = k / QK_K;
|
10234
|
+
{
|
10235
|
+
iq1s_grid_gpu.init(*stream);
|
10236
|
+
ksigns_iq2xs.init(*stream);
|
10237
|
+
kmask_iq2xs.init(*stream);
|
10238
|
+
|
10239
|
+
dpct::has_capability_or_fail(stream->get_device(),
|
10240
|
+
{sycl::aspect::fp16});
|
10241
|
+
|
10242
|
+
stream->submit([&](sycl::handler &cgh) {
|
10243
|
+
auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
|
10244
|
+
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
|
10245
|
+
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
|
10246
|
+
|
10247
|
+
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10248
|
+
sycl::range<3>(1, 1, 32),
|
10249
|
+
sycl::range<3>(1, 1, 32)),
|
10250
|
+
[=](sycl::nd_item<3> item_ct1) {
|
10251
|
+
dequantize_block_iq1_s(
|
10252
|
+
vx, y, item_ct1, iq1s_grid_ptr_ct1,
|
10253
|
+
ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
|
10254
|
+
});
|
10255
|
+
});
|
10256
|
+
}
|
10257
|
+
}
|
10258
|
+
|
10512
10259
|
template <typename src_t, typename dst_t>
|
10513
10260
|
static void convert_unary_sycl(const void *__restrict__ vx,
|
10514
10261
|
dst_t *__restrict__ y, const int k,
|
@@ -10559,6 +10306,10 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try {
|
|
10559
10306
|
return dequantize_row_iq2_xs_sycl;
|
10560
10307
|
case GGML_TYPE_IQ3_XXS:
|
10561
10308
|
return dequantize_row_iq3_xxs_sycl;
|
10309
|
+
case GGML_TYPE_IQ3_S:
|
10310
|
+
return dequantize_row_iq3_s_sycl;
|
10311
|
+
case GGML_TYPE_IQ1_S:
|
10312
|
+
return dequantize_row_iq1_s_sycl;
|
10562
10313
|
case GGML_TYPE_F32:
|
10563
10314
|
return convert_unary_sycl<float>;
|
10564
10315
|
default:
|
@@ -10599,6 +10350,10 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
|
|
10599
10350
|
return dequantize_row_iq2_xs_sycl;
|
10600
10351
|
case GGML_TYPE_IQ3_XXS:
|
10601
10352
|
return dequantize_row_iq3_xxs_sycl;
|
10353
|
+
case GGML_TYPE_IQ3_S:
|
10354
|
+
return dequantize_row_iq3_s_sycl;
|
10355
|
+
case GGML_TYPE_IQ1_S:
|
10356
|
+
return dequantize_row_iq1_s_sycl;
|
10602
10357
|
case GGML_TYPE_F16:
|
10603
10358
|
return convert_unary_sycl<sycl::half>;
|
10604
10359
|
default:
|
@@ -11188,6 +10943,61 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
|
|
11188
10943
|
}
|
11189
10944
|
}
|
11190
10945
|
|
10946
|
+
static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
|
10947
|
+
float *dst, const int ncols,
|
10948
|
+
const int nrows,
|
10949
|
+
dpct::queue_ptr stream) {
|
10950
|
+
GGML_ASSERT(ncols % QK_K == 0);
|
10951
|
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
10952
|
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10953
|
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10954
|
+
{
|
10955
|
+
iq3s_grid.init(*stream);
|
10956
|
+
ksigns64.init(*stream);
|
10957
|
+
|
10958
|
+
stream->submit([&](sycl::handler &cgh) {
|
10959
|
+
auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
|
10960
|
+
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10961
|
+
|
10962
|
+
cgh.parallel_for(
|
10963
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
10964
|
+
[=](sycl::nd_item<3> item_ct1)
|
10965
|
+
[[intel::reqd_sub_group_size(32)]] {
|
10966
|
+
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
|
10967
|
+
vx, vy, dst, ncols, nrows, item_ct1,
|
10968
|
+
iq3s_grid_ptr_ct1, ksigns64_ptr_ct1);
|
10969
|
+
});
|
10970
|
+
});
|
10971
|
+
}
|
10972
|
+
}
|
10973
|
+
|
10974
|
+
static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
|
10975
|
+
float *dst, const int ncols,
|
10976
|
+
const int nrows,
|
10977
|
+
dpct::queue_ptr stream) {
|
10978
|
+
GGML_ASSERT(ncols % QK_K == 0);
|
10979
|
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
10980
|
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10981
|
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10982
|
+
{
|
10983
|
+
iq1s_grid_gpu.init(*stream);
|
10984
|
+
ksigns64.init(*stream);
|
10985
|
+
|
10986
|
+
stream->submit([&](sycl::handler &cgh) {
|
10987
|
+
auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
|
10988
|
+
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10989
|
+
|
10990
|
+
cgh.parallel_for(
|
10991
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
10992
|
+
[=](sycl::nd_item<3> item_ct1)
|
10993
|
+
[[intel::reqd_sub_group_size(32)]] {
|
10994
|
+
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
|
10995
|
+
vx, vy, dst, ncols, nrows, item_ct1,
|
10996
|
+
iq1s_grid_ptr_ct1, ksigns64_ptr_ct1);
|
10997
|
+
});
|
10998
|
+
});
|
10999
|
+
}
|
11000
|
+
}
|
11191
11001
|
|
11192
11002
|
static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
11193
11003
|
float *dst, const int ncols_x,
|
@@ -13936,8 +13746,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
|
|
13936
13746
|
case GGML_TYPE_Q5_K:
|
13937
13747
|
case GGML_TYPE_IQ2_XXS:
|
13938
13748
|
case GGML_TYPE_IQ2_XS:
|
13749
|
+
case GGML_TYPE_IQ1_S:
|
13939
13750
|
case GGML_TYPE_IQ3_XXS:
|
13940
13751
|
return max_compute_capability >= VER_GEN9 ? 128 : 64;
|
13752
|
+
case GGML_TYPE_IQ3_S:
|
13753
|
+
return max_compute_capability >= VER_GEN9 ? 128 : 64;
|
13941
13754
|
case GGML_TYPE_Q6_K:
|
13942
13755
|
return 64;
|
13943
13756
|
default:
|
@@ -13998,6 +13811,12 @@ inline void ggml_sycl_op_mul_mat_vec_q(
|
|
13998
13811
|
case GGML_TYPE_IQ3_XXS:
|
13999
13812
|
mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
14000
13813
|
break;
|
13814
|
+
case GGML_TYPE_IQ3_S:
|
13815
|
+
mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
13816
|
+
break;
|
13817
|
+
case GGML_TYPE_IQ1_S:
|
13818
|
+
mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
13819
|
+
break;
|
14001
13820
|
default:
|
14002
13821
|
GGML_ASSERT(false);
|
14003
13822
|
break;
|
@@ -17343,9 +17162,8 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
|
17343
17162
|
return false;
|
17344
17163
|
}
|
17345
17164
|
ggml_type a_type = a->type;
|
17346
|
-
if (a_type ==
|
17347
|
-
a_type ==
|
17348
|
-
a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
|
17165
|
+
if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S ||
|
17166
|
+
a_type == GGML_TYPE_IQ4_XS) {
|
17349
17167
|
return false;
|
17350
17168
|
}
|
17351
17169
|
return true;
|
@@ -17440,13 +17258,18 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
|
17440
17258
|
/* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
|
17441
17259
|
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
|
17442
17260
|
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
|
17443
|
-
/* .cpy_tensor_async = */ ggml_backend_sycl_cpy_tensor_async,
|
17261
|
+
/* .cpy_tensor_async = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface
|
17444
17262
|
/* .synchronize = */ ggml_backend_sycl_synchronize,
|
17445
17263
|
/* .graph_plan_create = */ NULL,
|
17446
17264
|
/* .graph_plan_free = */ NULL,
|
17447
17265
|
/* .graph_plan_compute = */ NULL,
|
17448
17266
|
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
17449
17267
|
/* .supports_op = */ ggml_backend_sycl_supports_op,
|
17268
|
+
/* .event_new = */ NULL,
|
17269
|
+
/* .event_free = */ NULL,
|
17270
|
+
/* .event_record = */ NULL,
|
17271
|
+
/* .event_wait = */ NULL,
|
17272
|
+
/* .event_synchronize = */ NULL,
|
17450
17273
|
};
|
17451
17274
|
|
17452
17275
|
static ggml_guid_t ggml_backend_sycl_guid() {
|