llama_cpp 0.15.1 → 0.15.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -20
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +87 -37
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +47 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +13 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +177 -190
- data/vendor/tmp/llama.cpp/ggml-metal.metal +97 -505
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +3660 -2057
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1155 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +60 -639
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +203 -224
- data/vendor/tmp/llama.cpp/ggml.c +1168 -1470
- data/vendor/tmp/llama.cpp/ggml.h +67 -44
- data/vendor/tmp/llama.cpp/llama.cpp +1371 -944
- data/vendor/tmp/llama.cpp/llama.h +13 -3
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +5 -3
@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
|
|
3154
3154
|
#define SYCL_SCALE_BLOCK_SIZE 256
|
3155
3155
|
#define SYCL_CLAMP_BLOCK_SIZE 256
|
3156
3156
|
#define SYCL_ROPE_BLOCK_SIZE 256
|
3157
|
-
#define SYCL_ALIBI_BLOCK_SIZE 32
|
3158
3157
|
#define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
|
3159
3158
|
#define SYCL_QUANTIZE_BLOCK_SIZE 256
|
3160
3159
|
#define SYCL_DEQUANTIZE_BLOCK_SIZE 256
|
@@ -3848,21 +3847,27 @@ static void concat_f32(const float *x,const float *y, float *dst, const int ne
|
|
3848
3847
|
}
|
3849
3848
|
}
|
3850
3849
|
|
3851
|
-
static void upscale_f32(const float *x, float *dst, const int
|
3852
|
-
const
|
3853
|
-
|
3854
|
-
|
3855
|
-
|
3856
|
-
|
3850
|
+
static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
3851
|
+
const int nb02, const int nb03, const int ne10, const int ne11,
|
3852
|
+
const int ne12, const int ne13, const float sf0, const float sf1,
|
3853
|
+
const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
|
3854
|
+
int index = item_ct1.get_local_id(0) +
|
3855
|
+
item_ct1.get_group(0) * item_ct1.get_local_range(0);
|
3856
|
+
if (index >= ne10 * ne11 * ne12 * ne13) {
|
3857
3857
|
return;
|
3858
3858
|
}
|
3859
3859
|
// operation
|
3860
|
-
int
|
3861
|
-
int
|
3862
|
-
int
|
3863
|
-
int
|
3864
|
-
|
3865
|
-
|
3860
|
+
int i10 = index % ne10;
|
3861
|
+
int i11 = (index / ne10) % ne11;
|
3862
|
+
int i12 = (index / (ne10 * ne11)) % ne12;
|
3863
|
+
int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
|
3864
|
+
|
3865
|
+
int i00 = i10 / sf0;
|
3866
|
+
int i01 = i11 / sf1;
|
3867
|
+
int i02 = i12 / sf2;
|
3868
|
+
int i03 = i13 / sf3;
|
3869
|
+
|
3870
|
+
dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
3866
3871
|
}
|
3867
3872
|
|
3868
3873
|
static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
|
@@ -4192,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
|
|
4192
4197
|
const block_q2_K * x = (const block_q2_K *) vx;
|
4193
4198
|
|
4194
4199
|
const int tid = item_ct1.get_local_id(2);
|
4195
|
-
#if QK_K == 256
|
4196
4200
|
const int n = tid/32;
|
4197
4201
|
const int l = tid - 32*n;
|
4198
4202
|
const int is = 8*n + l/16;
|
@@ -4206,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
|
|
4206
4210
|
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
4207
4211
|
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
4208
4212
|
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
4209
|
-
#else
|
4210
|
-
const int is = tid/16; // 0 or 1
|
4211
|
-
const int il = tid%16; // 0...15
|
4212
|
-
const uint8_t q = x[i].qs[il] >> (2*is);
|
4213
|
-
dst_t * y = yy + i*QK_K + 16*is + il;
|
4214
|
-
|
4215
|
-
float dall = x[i].dm[0];
|
4216
|
-
float dmin = x[i].dm[1];
|
4217
|
-
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
4218
|
-
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
4219
|
-
#endif
|
4220
|
-
|
4221
4213
|
}
|
4222
4214
|
|
4223
4215
|
template<typename dst_t>
|
@@ -4227,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
|
4227
4219
|
const int i = item_ct1.get_group(2);
|
4228
4220
|
const block_q3_K * x = (const block_q3_K *) vx;
|
4229
4221
|
|
4230
|
-
#if QK_K == 256
|
4231
4222
|
const int r = item_ct1.get_local_id(2) / 4;
|
4232
4223
|
const int tid = r/2;
|
4233
4224
|
const int is0 = r%2;
|
@@ -4251,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
|
4251
4242
|
const uint8_t * hm = x[i].hmask;
|
4252
4243
|
|
4253
4244
|
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
4254
|
-
#else
|
4255
|
-
const int tid = item_ct1.get_local_id(2);
|
4256
|
-
const int is = tid/16; // 0 or 1
|
4257
|
-
const int il = tid%16; // 0...15
|
4258
|
-
const int im = il/8; // 0...1
|
4259
|
-
const int in = il%8; // 0...7
|
4260
|
-
|
4261
|
-
dst_t * y = yy + i*QK_K + 16*is + il;
|
4262
|
-
|
4263
|
-
const uint8_t q = x[i].qs[il] >> (2*is);
|
4264
|
-
const uint8_t h = x[i].hmask[in] >> (2*is + im);
|
4265
|
-
const float d = (float)x[i].d;
|
4266
|
-
|
4267
|
-
if (is == 0) {
|
4268
|
-
y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
4269
|
-
y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
4270
|
-
} else {
|
4271
|
-
y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
4272
|
-
y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
4273
|
-
}
|
4274
|
-
#endif
|
4275
|
-
|
4276
4245
|
}
|
4277
4246
|
|
4278
|
-
#if QK_K == 256
|
4279
4247
|
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
4280
4248
|
if (j < 4) {
|
4281
4249
|
d = q[j] & 63; m = q[j + 4] & 63;
|
@@ -4284,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
|
|
4284
4252
|
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
4285
4253
|
}
|
4286
4254
|
}
|
4287
|
-
#endif
|
4288
4255
|
|
4289
4256
|
template<typename dst_t>
|
4290
4257
|
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
@@ -4293,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
|
4293
4260
|
|
4294
4261
|
const int i = item_ct1.get_group(2);
|
4295
4262
|
|
4296
|
-
#if QK_K == 256
|
4297
4263
|
// assume 32 threads
|
4298
4264
|
const int tid = item_ct1.get_local_id(2);
|
4299
4265
|
const int il = tid/8;
|
@@ -4317,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
|
4317
4283
|
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
4318
4284
|
y[l +32] = d2 * (q[l] >> 4) - m2;
|
4319
4285
|
}
|
4320
|
-
#else
|
4321
|
-
const int tid = item_ct1.get_local_id(2);
|
4322
|
-
const uint8_t * q = x[i].qs;
|
4323
|
-
dst_t * y = yy + i*QK_K;
|
4324
|
-
const float d = (float)x[i].dm[0];
|
4325
|
-
const float m = (float)x[i].dm[1];
|
4326
|
-
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
4327
|
-
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
4328
|
-
#endif
|
4329
4286
|
}
|
4330
4287
|
|
4331
4288
|
template<typename dst_t>
|
@@ -4335,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
|
4335
4292
|
|
4336
4293
|
const int i = item_ct1.get_group(2);
|
4337
4294
|
|
4338
|
-
#if QK_K == 256
|
4339
4295
|
// assume 64 threads - this is very slightly better than the one below
|
4340
4296
|
const int tid = item_ct1.get_local_id(2);
|
4341
4297
|
const int il = tid/16; // il is in 0...3
|
@@ -4362,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
|
4362
4318
|
hm <<= 1;
|
4363
4319
|
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
4364
4320
|
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
4365
|
-
#else
|
4366
|
-
const int tid = item_ct1.get_local_id(2);
|
4367
|
-
const uint8_t q = x[i].qs[tid];
|
4368
|
-
const int im = tid/8; // 0...3
|
4369
|
-
const int in = tid%8; // 0...7
|
4370
|
-
const int is = tid/16; // 0 or 1
|
4371
|
-
const uint8_t h = x[i].qh[in] >> im;
|
4372
|
-
const float d = x[i].d;
|
4373
|
-
dst_t * y = yy + i*QK_K + tid;
|
4374
|
-
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
4375
|
-
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
4376
|
-
#endif
|
4377
4321
|
}
|
4378
4322
|
|
4379
4323
|
template<typename dst_t>
|
@@ -4382,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
4382
4326
|
const block_q6_K * x = (const block_q6_K *) vx;
|
4383
4327
|
|
4384
4328
|
const int i = item_ct1.get_group(2);
|
4385
|
-
#if QK_K == 256
|
4386
4329
|
|
4387
4330
|
// assume 64 threads - this is very slightly better than the one below
|
4388
4331
|
const int tid = item_ct1.get_local_id(2);
|
@@ -4402,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
4402
4345
|
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
4403
4346
|
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
4404
4347
|
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
4405
|
-
#else
|
4406
|
-
|
4407
|
-
// assume 32 threads
|
4408
|
-
const int tid = item_ct1.get_local_id(2);
|
4409
|
-
const int ip = tid/16; // 0 or 1
|
4410
|
-
const int il = tid - 16*ip; // 0...15
|
4411
|
-
|
4412
|
-
dst_t * y = yy + i*QK_K + 16*ip + il;
|
4413
|
-
|
4414
|
-
const float d = x[i].d;
|
4415
|
-
|
4416
|
-
const uint8_t ql = x[i].ql[16*ip + il];
|
4417
|
-
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
4418
|
-
const int8_t * sc = x[i].scales;
|
4419
|
-
|
4420
|
-
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
4421
|
-
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
4422
|
-
#endif
|
4423
4348
|
}
|
4424
4349
|
|
4425
4350
|
template<typename dst_t>
|
@@ -4433,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
|
4433
4358
|
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
4434
4359
|
|
4435
4360
|
const int tid = item_ct1.get_local_id(2);
|
4436
|
-
#if QK_K == 256
|
4437
4361
|
const int il = tid/8; // 0...3
|
4438
4362
|
const int ib = tid%8; // 0...7
|
4439
4363
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4444,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
|
4444
4368
|
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
4445
4369
|
const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
|
4446
4370
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
|
4447
|
-
#else
|
4448
|
-
assert(false);
|
4449
|
-
#endif
|
4450
|
-
|
4451
4371
|
}
|
4452
4372
|
|
4453
4373
|
template<typename dst_t>
|
@@ -4461,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
4461
4381
|
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
4462
4382
|
|
4463
4383
|
const int tid = item_ct1.get_local_id(2);
|
4464
|
-
#if QK_K == 256
|
4465
4384
|
const int il = tid/8; // 0...3
|
4466
4385
|
const int ib = tid%8; // 0...7
|
4467
4386
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4470,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
4470
4389
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
4471
4390
|
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
4472
4391
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
4473
|
-
#else
|
4474
|
-
assert(false);
|
4475
|
-
#endif
|
4476
|
-
|
4477
4392
|
}
|
4478
4393
|
|
4479
4394
|
template <typename dst_t>
|
@@ -4485,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4485
4400
|
const block_iq2_s * x = (const block_iq2_s *) vx;
|
4486
4401
|
|
4487
4402
|
const int tid = item_ct1.get_local_id(2);
|
4488
|
-
#if QK_K == 256
|
4489
4403
|
const int il = tid/8; // 0...3
|
4490
4404
|
const int ib = tid%8; // 0...7
|
4491
4405
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4493,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4493
4407
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
4494
4408
|
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
4495
4409
|
#pragma unroll
|
4496
|
-
for (int j = 0; j < 8; ++j)
|
4410
|
+
for (int j = 0; j < 8; ++j) {
|
4497
4411
|
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
4498
|
-
|
4499
|
-
assert(false);
|
4500
|
-
|
4501
|
-
#endif
|
4502
|
-
|
4412
|
+
}
|
4503
4413
|
}
|
4504
4414
|
|
4505
4415
|
template<typename dst_t>
|
@@ -4513,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
4513
4423
|
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
4514
4424
|
|
4515
4425
|
const int tid = item_ct1.get_local_id(2);
|
4516
|
-
#if QK_K == 256
|
4517
4426
|
const int il = tid/8; // 0...3
|
4518
4427
|
const int ib = tid%8; // 0...7
|
4519
4428
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4528,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
4528
4437
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4529
4438
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4530
4439
|
}
|
4531
|
-
#else
|
4532
|
-
assert(false);
|
4533
|
-
#endif
|
4534
|
-
|
4535
4440
|
}
|
4536
4441
|
|
4537
4442
|
template <typename dst_t>
|
@@ -4544,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4544
4449
|
const block_iq3_s * x = (const block_iq3_s *) vx;
|
4545
4450
|
|
4546
4451
|
const int tid = item_ct1.get_local_id(2);
|
4547
|
-
#if QK_K == 256
|
4548
4452
|
const int il = tid/8; // 0...3
|
4549
4453
|
const int ib = tid%8; // 0...7
|
4550
4454
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4558,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4558
4462
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4559
4463
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4560
4464
|
}
|
4561
|
-
#else
|
4562
|
-
assert(false);
|
4563
|
-
#endif
|
4564
|
-
|
4565
4465
|
}
|
4566
4466
|
|
4567
4467
|
template <typename dst_t>
|
@@ -4574,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4574
4474
|
const block_iq1_s * x = (const block_iq1_s *) vx;
|
4575
4475
|
|
4576
4476
|
const int tid = item_ct1.get_local_id(2);
|
4577
|
-
#if QK_K == 256
|
4578
4477
|
const int il = tid/8; // 0...3
|
4579
4478
|
const int ib = tid%8; // 0...7
|
4580
4479
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4588,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4588
4487
|
for (int j = 0; j < 8; ++j) {
|
4589
4488
|
y[j] = d * (q[j] + delta);
|
4590
4489
|
}
|
4591
|
-
#else
|
4592
|
-
assert(false);
|
4593
|
-
#endif
|
4594
|
-
|
4595
4490
|
}
|
4596
4491
|
|
4597
4492
|
template <typename dst_t>
|
@@ -4604,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4604
4499
|
const block_iq1_m * x = (const block_iq1_m *) vx;
|
4605
4500
|
|
4606
4501
|
const int tid = item_ct1.get_local_id(2);
|
4607
|
-
#if QK_K == 256
|
4608
4502
|
const int il = tid/8; // 0...3
|
4609
4503
|
const int ib = tid%8; // 0...7
|
4610
4504
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4622,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4622
4516
|
for (int j = 0; j < 8; ++j) {
|
4623
4517
|
y[j] = d * (q[j] + delta);
|
4624
4518
|
}
|
4625
|
-
#else
|
4626
|
-
assert(false);
|
4627
|
-
#endif
|
4628
|
-
|
4629
4519
|
}
|
4630
4520
|
|
4631
4521
|
template <typename dst_t>
|
@@ -4699,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
|
4699
4589
|
|
4700
4590
|
float tmp = 0; // partial sum for thread in warp
|
4701
4591
|
|
4702
|
-
#if QK_K == 256
|
4703
4592
|
const int tid =
|
4704
4593
|
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
4705
4594
|
const int ix =
|
@@ -4750,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
|
4750
4639
|
tmp += dall * sum1 - dmin * sum2;
|
4751
4640
|
|
4752
4641
|
}
|
4753
|
-
#else
|
4754
|
-
const int tid = item_ct1.get_local_id(2) /
|
4755
|
-
(2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
4756
|
-
const int ix = item_ct1.get_local_id(2) %
|
4757
|
-
(2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
4758
|
-
const int offset = tid * K_QUANTS_PER_ITERATION;
|
4759
|
-
|
4760
|
-
uint32_t uaux[2];
|
4761
|
-
const uint8_t * d = (const uint8_t *)uaux;
|
4762
|
-
|
4763
|
-
|
4764
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
4765
|
-
|
4766
|
-
const float * y = yy + i * QK_K + offset;
|
4767
|
-
const uint8_t * q = x[i].qs + offset;
|
4768
|
-
const uint32_t * s = (const uint32_t *)x[i].scales;
|
4769
|
-
|
4770
|
-
uaux[0] = s[0] & 0x0f0f0f0f;
|
4771
|
-
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
4772
|
-
|
4773
|
-
const sycl::float2 dall =
|
4774
|
-
x[i].dm.convert<float, sycl::rounding_mode::automatic>();
|
4775
|
-
|
4776
|
-
float sum1 = 0, sum2 = 0;
|
4777
|
-
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
4778
|
-
const uint8_t ql = q[l];
|
4779
|
-
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
4780
|
-
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
4781
|
-
+ y[l+32] * d[2] * ((ql >> 4) & 3)
|
4782
|
-
+ y[l+48] * d[3] * ((ql >> 6) & 3);
|
4783
|
-
sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
|
4784
|
-
}
|
4785
|
-
tmp += dall.x() * sum1 - dall.y() * sum2;
|
4786
|
-
}
|
4787
|
-
|
4788
|
-
#endif
|
4789
4642
|
|
4790
4643
|
// sum up partial sums and write back result
|
4791
4644
|
#pragma unroll
|
@@ -4823,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
|
4823
4676
|
|
4824
4677
|
float tmp = 0; // partial sum for thread in warp
|
4825
4678
|
|
4826
|
-
#if QK_K == 256
|
4827
|
-
|
4828
4679
|
const uint16_t kmask1 = 0x0303;
|
4829
4680
|
const uint16_t kmask2 = 0x0f0f;
|
4830
4681
|
|
@@ -4877,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
|
4877
4728
|
tmp += d * sum;
|
4878
4729
|
|
4879
4730
|
}
|
4880
|
-
#else
|
4881
|
-
|
4882
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
4883
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
4884
|
-
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
4885
|
-
const int in = offset/8; // 0 or 1
|
4886
|
-
const int im = offset%8; // 0...7
|
4887
|
-
|
4888
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
4889
|
-
|
4890
|
-
const float * y = yy + i * QK_K + offset;
|
4891
|
-
const uint8_t * q = x[i].qs + offset;
|
4892
|
-
const uint8_t * s = x[i].scales;
|
4893
|
-
|
4894
|
-
const float dall = (float)x[i].d;
|
4895
|
-
|
4896
|
-
float sum = 0;
|
4897
|
-
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
4898
|
-
const uint8_t hl = x[i].hmask[im+l] >> in;
|
4899
|
-
const uint8_t ql = q[l];
|
4900
|
-
sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
|
4901
|
-
+ y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
|
4902
|
-
+ y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
|
4903
|
-
+ y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
|
4904
|
-
}
|
4905
|
-
tmp += sum;
|
4906
|
-
}
|
4907
|
-
#endif
|
4908
4731
|
|
4909
4732
|
// sum up partial sums and write back result
|
4910
4733
|
#pragma unroll
|
@@ -4939,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
|
4939
4762
|
|
4940
4763
|
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
4941
4764
|
|
4942
|
-
#if QK_K == 256
|
4943
4765
|
const uint16_t kmask1 = 0x3f3f;
|
4944
4766
|
const uint16_t kmask2 = 0x0f0f;
|
4945
4767
|
const uint16_t kmask3 = 0xc0c0;
|
@@ -5028,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
|
5028
4850
|
#endif
|
5029
4851
|
|
5030
4852
|
}
|
5031
|
-
#else
|
5032
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
5033
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
5034
|
-
|
5035
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
5036
|
-
|
5037
|
-
uint16_t aux16[2];
|
5038
|
-
const uint8_t * s = (const uint8_t *)aux16;
|
5039
|
-
|
5040
|
-
float tmp = 0;
|
5041
|
-
|
5042
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
5043
|
-
const uint8_t * q = x[i].qs + step;
|
5044
|
-
const float * y = yy + i*QK_K + step;
|
5045
|
-
const uint16_t * a = (const uint16_t *)x[i].scales;
|
5046
|
-
aux16[0] = a[0] & 0x0f0f;
|
5047
|
-
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
5048
|
-
const float d = (float)x[i].dm[0];
|
5049
|
-
const float m = (float)x[i].dm[1];
|
5050
|
-
float sum = 0.f;
|
5051
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
5052
|
-
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
5053
|
-
+ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
|
5054
|
-
+ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
|
5055
|
-
+ y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
|
5056
|
-
}
|
5057
|
-
tmp += sum;
|
5058
|
-
}
|
5059
|
-
|
5060
|
-
#endif
|
5061
4853
|
|
5062
4854
|
// sum up partial sums and write back result
|
5063
4855
|
#pragma unroll
|
@@ -5092,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
|
5092
4884
|
|
5093
4885
|
float tmp = 0; // partial sum for thread in warp
|
5094
4886
|
|
5095
|
-
#if QK_K == 256
|
5096
4887
|
const uint16_t kmask1 = 0x3f3f;
|
5097
4888
|
const uint16_t kmask2 = 0x0f0f;
|
5098
4889
|
const uint16_t kmask3 = 0xc0c0;
|
@@ -5169,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
|
5169
4960
|
dmin * smin;
|
5170
4961
|
}
|
5171
4962
|
|
5172
|
-
#else
|
5173
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
5174
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
5175
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
5176
|
-
const int im = step/8;
|
5177
|
-
const int in = step%8;
|
5178
|
-
|
5179
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
5180
|
-
const uint8_t * q = x[i].qs + step;
|
5181
|
-
const int8_t * s = x[i].scales;
|
5182
|
-
const float * y = yy + i*QK_K + step;
|
5183
|
-
const float d = x[i].d;
|
5184
|
-
float sum = 0.f;
|
5185
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
5186
|
-
const uint8_t h = x[i].qh[in+j] >> im;
|
5187
|
-
sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
|
5188
|
-
+ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
|
5189
|
-
+ y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
|
5190
|
-
+ y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
|
5191
|
-
}
|
5192
|
-
tmp += sum;
|
5193
|
-
}
|
5194
|
-
#endif
|
5195
|
-
|
5196
4963
|
// sum up partial sums and write back result
|
5197
4964
|
#pragma unroll
|
5198
4965
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
@@ -5219,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
|
5219
4986
|
|
5220
4987
|
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
5221
4988
|
|
5222
|
-
#if QK_K == 256
|
5223
|
-
|
5224
4989
|
const int tid =
|
5225
4990
|
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
5226
4991
|
const int ix =
|
@@ -5277,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
|
5277
5042
|
|
5278
5043
|
}
|
5279
5044
|
|
5280
|
-
#else
|
5281
|
-
|
5282
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
|
5283
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
|
5284
|
-
|
5285
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
5286
|
-
|
5287
|
-
float tmp = 0; // partial sum for thread in warp
|
5288
|
-
|
5289
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
5290
|
-
|
5291
|
-
const float * y = yy + i * QK_K + step;
|
5292
|
-
const uint8_t * ql = x[i].ql + step;
|
5293
|
-
const uint8_t * qh = x[i].qh + step;
|
5294
|
-
const int8_t * s = x[i].scales;
|
5295
|
-
|
5296
|
-
const float d = x[i+0].d;
|
5297
|
-
|
5298
|
-
float sum = 0;
|
5299
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
5300
|
-
sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
|
5301
|
-
+ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
|
5302
|
-
+ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
|
5303
|
-
+ y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
|
5304
|
-
}
|
5305
|
-
tmp += sum;
|
5306
|
-
|
5307
|
-
}
|
5308
|
-
|
5309
|
-
#endif
|
5310
|
-
|
5311
5045
|
// sum up partial sums and write back result
|
5312
5046
|
#pragma unroll
|
5313
5047
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
@@ -6852,7 +6586,6 @@ static __dpct_inline__ float
|
|
6852
6586
|
vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
6853
6587
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
6854
6588
|
|
6855
|
-
#ifndef GGML_QKK_64
|
6856
6589
|
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
6857
6590
|
|
6858
6591
|
int v[2];
|
@@ -6894,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
|
6894
6627
|
}
|
6895
6628
|
|
6896
6629
|
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
6897
|
-
|
6898
|
-
#else
|
6899
|
-
|
6900
|
-
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
6901
|
-
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
6902
|
-
|
6903
|
-
float sumf_d = 0.0f;
|
6904
|
-
float sumf_m = 0.0f;
|
6905
|
-
|
6906
|
-
uint16_t aux16[2];
|
6907
|
-
const uint8_t * s = (const uint8_t *)aux16;
|
6908
|
-
|
6909
|
-
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
6910
|
-
aux16[0] = a[0] & 0x0f0f;
|
6911
|
-
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
6912
|
-
|
6913
|
-
const float dall = bq4_K->dm[0];
|
6914
|
-
const float dmin = bq4_K->dm[1];
|
6915
|
-
|
6916
|
-
const float d8_1 = bq8_1[0].ds[0];
|
6917
|
-
const float d8_2 = bq8_1[1].ds[1];
|
6918
|
-
|
6919
|
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
6920
|
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
6921
|
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
6922
|
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
6923
|
-
|
6924
|
-
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
6925
|
-
const int v1 = q4[0];
|
6926
|
-
const int v2 = q4[4];
|
6927
|
-
|
6928
|
-
const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
6929
|
-
const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
6930
|
-
const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
|
6931
|
-
const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
|
6932
|
-
|
6933
|
-
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
6934
|
-
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
6935
|
-
|
6936
|
-
return dall * sumf_d - dmin * sumf_m;
|
6937
|
-
|
6938
|
-
#else
|
6939
|
-
bad_arch();
|
6940
|
-
#endif // __SYCL_ARCH__ >= VER_4VEC
|
6941
|
-
|
6942
|
-
#endif
|
6943
6630
|
}
|
6944
6631
|
|
6945
6632
|
template <int mmq_y>
|
@@ -6998,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
|
6998
6685
|
|
6999
6686
|
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
7000
6687
|
|
7001
|
-
#if QK_K == 256
|
7002
6688
|
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
7003
|
-
#else
|
7004
|
-
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
|
7005
|
-
#endif
|
7006
6689
|
}
|
7007
6690
|
|
7008
6691
|
#pragma unroll
|
@@ -7045,7 +6728,6 @@ static __dpct_inline__ float
|
|
7045
6728
|
vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
7046
6729
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7047
6730
|
|
7048
|
-
#ifndef GGML_QKK_64
|
7049
6731
|
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
7050
6732
|
|
7051
6733
|
int vl[2];
|
@@ -7087,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
|
7087
6769
|
}
|
7088
6770
|
|
7089
6771
|
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
7090
|
-
|
7091
|
-
#else
|
7092
|
-
|
7093
|
-
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
7094
|
-
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
7095
|
-
|
7096
|
-
const int8_t * s = bq5_K->scales;
|
7097
|
-
|
7098
|
-
const float d = bq5_K->d;
|
7099
|
-
|
7100
|
-
const float d8_1 = bq8_1[0].ds[0];
|
7101
|
-
const float d8_2 = bq8_1[1].ds[1];
|
7102
|
-
|
7103
|
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
7104
|
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
7105
|
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
7106
|
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
7107
|
-
|
7108
|
-
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
7109
|
-
const int vl1 = ql[0];
|
7110
|
-
const int vl2 = ql[4];
|
7111
|
-
|
7112
|
-
const int step = 4 * (iqs/2); // 0, 4, 8, 12
|
7113
|
-
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
|
7114
|
-
const int in = step%8; // 0, 4, 0, 4
|
7115
|
-
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
7116
|
-
|
7117
|
-
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
|
7118
|
-
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
|
7119
|
-
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
7120
|
-
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
7121
|
-
|
7122
|
-
const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
|
7123
|
-
+ d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
|
7124
|
-
|
7125
|
-
return d * sumf_d;
|
7126
|
-
|
7127
|
-
#else
|
7128
|
-
bad_arch();
|
7129
|
-
#endif // __SYCL_ARCH__ >= VER_4VEC
|
7130
|
-
|
7131
|
-
#endif
|
7132
6772
|
}
|
7133
6773
|
|
7134
6774
|
template <int mmq_y>
|
@@ -7200,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
|
7200
6840
|
|
7201
6841
|
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
7202
6842
|
|
7203
|
-
#if QK_K == 256
|
7204
6843
|
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
7205
|
-
#endif
|
7206
6844
|
}
|
7207
6845
|
|
7208
6846
|
#pragma unroll
|
@@ -7382,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|
7382
7020
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
7383
7021
|
const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
|
7384
7022
|
const uint8_t *kmask_iq2xs) {
|
7385
|
-
#if QK_K == 256
|
7386
7023
|
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
7387
7024
|
|
7388
7025
|
#if QR2_XXS == 8
|
@@ -7423,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|
7423
7060
|
}
|
7424
7061
|
return d * (sumi1 + sumi2);
|
7425
7062
|
#endif
|
7426
|
-
#else
|
7427
|
-
assert(false);
|
7428
|
-
return 0.f;
|
7429
|
-
#endif
|
7430
7063
|
}
|
7431
7064
|
|
7432
7065
|
static __dpct_inline__ float
|
@@ -7435,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
7435
7068
|
const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
|
7436
7069
|
#if DPCT_COMPATIBILITY_TEMP >= \
|
7437
7070
|
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
7438
|
-
#if QK_K == 256
|
7439
7071
|
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
7440
7072
|
|
7441
7073
|
const int ib32 = iqs;
|
@@ -7473,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
7473
7105
|
assert(false);
|
7474
7106
|
return 0.f;
|
7475
7107
|
#endif
|
7476
|
-
#else
|
7477
|
-
assert(false);
|
7478
|
-
return 0.f;
|
7479
|
-
#endif
|
7480
7108
|
}
|
7481
7109
|
|
7482
7110
|
static __dpct_inline__ float
|
7483
7111
|
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
7484
7112
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7485
|
-
#if QK_K == 256
|
7486
7113
|
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
7487
7114
|
|
7488
7115
|
const int ib32 = iqs;
|
@@ -7526,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
|
7526
7153
|
}
|
7527
7154
|
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
|
7528
7155
|
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
7529
|
-
#else
|
7530
|
-
assert(false);
|
7531
|
-
#endif
|
7532
7156
|
}
|
7533
7157
|
|
7534
7158
|
static __dpct_inline__ float
|
@@ -7537,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
7537
7161
|
const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
|
7538
7162
|
#if DPCT_COMPATIBILITY_TEMP >= \
|
7539
7163
|
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
7540
|
-
#if QK_K == 256
|
7541
7164
|
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
7542
7165
|
|
7543
7166
|
const int ib32 = iqs;
|
@@ -7565,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
7565
7188
|
assert(false);
|
7566
7189
|
return 0.f;
|
7567
7190
|
#endif
|
7568
|
-
#else
|
7569
|
-
assert(false);
|
7570
|
-
return 0.f;
|
7571
|
-
#endif
|
7572
7191
|
}
|
7573
7192
|
|
7574
7193
|
static __dpct_inline__ float
|
7575
7194
|
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
7576
7195
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
7577
7196
|
const uint32_t *iq3s_grid) {
|
7578
|
-
#if QK_K == 256
|
7579
7197
|
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
7580
7198
|
|
7581
7199
|
const int ib32 = iqs;
|
@@ -7604,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|
7604
7222
|
(1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
|
7605
7223
|
bq8_1[ib32].ds[0];
|
7606
7224
|
return d * sumi;
|
7607
|
-
#else
|
7608
|
-
assert(false);
|
7609
|
-
#endif
|
7610
7225
|
}
|
7611
7226
|
|
7612
7227
|
static __dpct_inline__ float
|
7613
7228
|
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
7614
7229
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
7615
7230
|
const uint32_t *iq1s_grid_gpu) {
|
7616
|
-
#if QK_K == 256
|
7617
7231
|
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
7618
7232
|
|
7619
7233
|
const int ib32 = iqs;
|
@@ -7632,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
|
7632
7246
|
const float d = d1q * bq8_1[ib32].ds[0];
|
7633
7247
|
const float m = d1q * bq8_1[ib32].ds[1];
|
7634
7248
|
return d * sumi + m * delta;
|
7635
|
-
#else
|
7636
|
-
assert(false);
|
7637
|
-
#endif
|
7638
7249
|
}
|
7639
7250
|
|
7640
7251
|
static __dpct_inline__ float
|
7641
7252
|
vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
7642
7253
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7643
|
-
#if QK_K == 256
|
7644
7254
|
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
7645
7255
|
|
7646
7256
|
const int ib32 = iqs;
|
@@ -7665,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
|
7665
7275
|
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
7666
7276
|
const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
|
7667
7277
|
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
7668
|
-
#else
|
7669
|
-
assert(false);
|
7670
|
-
#endif
|
7671
7278
|
}
|
7672
7279
|
|
7673
7280
|
static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
|
@@ -7715,7 +7322,6 @@ static __dpct_inline__ float
|
|
7715
7322
|
vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
7716
7323
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7717
7324
|
|
7718
|
-
#if QK_K == 256
|
7719
7325
|
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
7720
7326
|
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
7721
7327
|
|
@@ -7733,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
|
7733
7339
|
sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
|
7734
7340
|
}
|
7735
7341
|
return d * (sumi1 + sumi2);
|
7736
|
-
#else
|
7737
|
-
assert(false);
|
7738
|
-
#endif
|
7739
7342
|
}
|
7740
7343
|
|
7741
7344
|
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
|
@@ -9316,32 +8919,6 @@ static void rope_glm_f32(
|
|
9316
8919
|
dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
|
9317
8920
|
}
|
9318
8921
|
|
9319
|
-
static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
|
9320
|
-
const int n_heads_log2_floor, const float m0, const float m1,
|
9321
|
-
const sycl::nd_item<3> &item_ct1) {
|
9322
|
-
const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
9323
|
-
item_ct1.get_local_id(2);
|
9324
|
-
|
9325
|
-
if (col >= ncols) {
|
9326
|
-
return;
|
9327
|
-
}
|
9328
|
-
|
9329
|
-
const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
9330
|
-
item_ct1.get_local_id(1);
|
9331
|
-
const int i = row*ncols + col;
|
9332
|
-
|
9333
|
-
const int k = row/k_rows;
|
9334
|
-
|
9335
|
-
float m_k;
|
9336
|
-
if (k < n_heads_log2_floor) {
|
9337
|
-
m_k = dpct::pow(m0, k + 1);
|
9338
|
-
} else {
|
9339
|
-
m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
|
9340
|
-
}
|
9341
|
-
|
9342
|
-
dst[i] = col * m_k + x[i];
|
9343
|
-
}
|
9344
|
-
|
9345
8922
|
static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
|
9346
8923
|
const sycl::nd_item<3> &item_ct1) {
|
9347
8924
|
const int row = item_ct1.get_group(1);
|
@@ -9443,7 +9020,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
|
|
9443
9020
|
|
9444
9021
|
|
9445
9022
|
template <bool vals_smem, int ncols_template, int block_size_template>
|
9446
|
-
static void soft_max_f32(const float * x, const float * mask,
|
9023
|
+
static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
|
9447
9024
|
const int nrows_y, const float scale, const float max_bias, const float m0,
|
9448
9025
|
const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
|
9449
9026
|
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
|
@@ -9457,7 +9034,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
|
|
9457
9034
|
const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
9458
9035
|
const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
9459
9036
|
|
9460
|
-
float slope =
|
9037
|
+
float slope = 1.0f;
|
9461
9038
|
|
9462
9039
|
// ALiBi
|
9463
9040
|
if (max_bias > 0.0f) {
|
@@ -9482,7 +9059,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
|
|
9482
9059
|
const int ix = rowx*ncols + col;
|
9483
9060
|
const int iy = rowy*ncols + col;
|
9484
9061
|
|
9485
|
-
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f)
|
9062
|
+
const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);
|
9486
9063
|
|
9487
9064
|
vals[col] = val;
|
9488
9065
|
max_val = sycl::max(max_val, val);
|
@@ -10112,18 +9689,17 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
|
10112
9689
|
});
|
10113
9690
|
}
|
10114
9691
|
|
10115
|
-
static void upscale_f32_sycl(const float *x, float *dst, const int
|
10116
|
-
const int
|
10117
|
-
const int
|
10118
|
-
|
10119
|
-
int
|
10120
|
-
|
9692
|
+
static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
|
9693
|
+
const int nb02, const int nb03, const int ne10, const int ne11,
|
9694
|
+
const int ne12, const int ne13, const float sf0, const float sf1,
|
9695
|
+
const float sf2, const float sf3, dpct::queue_ptr stream) {
|
9696
|
+
int dst_size = ne10 * ne11 * ne12 * ne13;
|
9697
|
+
int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
|
9698
|
+
sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
|
10121
9699
|
stream->parallel_for(
|
10122
|
-
sycl::nd_range<
|
10123
|
-
|
10124
|
-
|
10125
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10126
|
-
upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
|
9700
|
+
sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
|
9701
|
+
[=](sycl::nd_item<1> item_ct1) {
|
9702
|
+
upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
|
10127
9703
|
});
|
10128
9704
|
}
|
10129
9705
|
|
@@ -10225,7 +9801,6 @@ template <typename dst_t>
|
|
10225
9801
|
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
10226
9802
|
dpct::queue_ptr stream) {
|
10227
9803
|
const int nb = k / QK_K;
|
10228
|
-
#if QK_K == 256
|
10229
9804
|
{
|
10230
9805
|
dpct::has_capability_or_fail(stream->get_device(),
|
10231
9806
|
{sycl::aspect::fp16});
|
@@ -10237,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10237
9812
|
dequantize_block_q2_K(vx, y, item_ct1);
|
10238
9813
|
});
|
10239
9814
|
}
|
10240
|
-
#else
|
10241
|
-
{
|
10242
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
10243
|
-
{sycl::aspect::fp16});
|
10244
|
-
|
10245
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10246
|
-
sycl::range<3>(1, 1, 32),
|
10247
|
-
sycl::range<3>(1, 1, 32)),
|
10248
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10249
|
-
dequantize_block_q2_K(vx, y, item_ct1);
|
10250
|
-
});
|
10251
|
-
}
|
10252
|
-
|
10253
|
-
#endif
|
10254
9815
|
}
|
10255
9816
|
|
10256
9817
|
template <typename dst_t>
|
10257
9818
|
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
10258
9819
|
dpct::queue_ptr stream) {
|
10259
9820
|
const int nb = k / QK_K;
|
10260
|
-
#if QK_K == 256
|
10261
9821
|
{
|
10262
9822
|
dpct::has_capability_or_fail(stream->get_device(),
|
10263
9823
|
{sycl::aspect::fp16});
|
@@ -10269,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10269
9829
|
dequantize_block_q3_K(vx, y, item_ct1);
|
10270
9830
|
});
|
10271
9831
|
}
|
10272
|
-
#else
|
10273
|
-
{
|
10274
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
10275
|
-
{sycl::aspect::fp16});
|
10276
|
-
|
10277
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10278
|
-
sycl::range<3>(1, 1, 32),
|
10279
|
-
sycl::range<3>(1, 1, 32)),
|
10280
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10281
|
-
dequantize_block_q3_K(vx, y, item_ct1);
|
10282
|
-
});
|
10283
|
-
}
|
10284
|
-
#endif
|
10285
9832
|
}
|
10286
9833
|
|
10287
9834
|
template <typename dst_t>
|
@@ -10342,7 +9889,6 @@ template <typename dst_t>
|
|
10342
9889
|
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
10343
9890
|
dpct::queue_ptr stream) {
|
10344
9891
|
const int nb = k / QK_K;
|
10345
|
-
#if QK_K == 256
|
10346
9892
|
{
|
10347
9893
|
dpct::has_capability_or_fail(stream->get_device(),
|
10348
9894
|
{sycl::aspect::fp16});
|
@@ -10354,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10354
9900
|
dequantize_block_q5_K(vx, y, item_ct1);
|
10355
9901
|
});
|
10356
9902
|
}
|
10357
|
-
#else
|
10358
|
-
{
|
10359
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
10360
|
-
{sycl::aspect::fp16});
|
10361
|
-
|
10362
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10363
|
-
sycl::range<3>(1, 1, 32),
|
10364
|
-
sycl::range<3>(1, 1, 32)),
|
10365
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10366
|
-
dequantize_block_q5_K(vx, y, item_ct1);
|
10367
|
-
});
|
10368
|
-
}
|
10369
|
-
|
10370
|
-
#endif
|
10371
9903
|
}
|
10372
9904
|
|
10373
9905
|
template <typename dst_t>
|
10374
9906
|
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
10375
9907
|
dpct::queue_ptr stream) {
|
10376
9908
|
const int nb = k / QK_K;
|
10377
|
-
#if QK_K == 256
|
10378
9909
|
{
|
10379
9910
|
dpct::has_capability_or_fail(stream->get_device(),
|
10380
9911
|
{sycl::aspect::fp16});
|
@@ -10386,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10386
9917
|
dequantize_block_q6_K(vx, y, item_ct1);
|
10387
9918
|
});
|
10388
9919
|
}
|
10389
|
-
#else
|
10390
|
-
{
|
10391
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
10392
|
-
{sycl::aspect::fp16});
|
10393
|
-
|
10394
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10395
|
-
sycl::range<3>(1, 1, 32),
|
10396
|
-
sycl::range<3>(1, 1, 32)),
|
10397
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10398
|
-
dequantize_block_q6_K(vx, y, item_ct1);
|
10399
|
-
});
|
10400
|
-
}
|
10401
|
-
|
10402
|
-
#endif
|
10403
9920
|
}
|
10404
9921
|
|
10405
9922
|
template <typename dst_t>
|
@@ -10551,9 +10068,6 @@ template <typename dst_t>
|
|
10551
10068
|
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
10552
10069
|
dpct::queue_ptr stream) {
|
10553
10070
|
const int nb = (k + QK_K - 1) / QK_K;
|
10554
|
-
#if QK_K == 64
|
10555
|
-
dequantize_row_iq4_nl_sycl(vx, y, k, stream);
|
10556
|
-
#else
|
10557
10071
|
{
|
10558
10072
|
dpct::has_capability_or_fail(stream->get_device(),
|
10559
10073
|
{sycl::aspect::fp16});
|
@@ -10568,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
|
10568
10082
|
});
|
10569
10083
|
});
|
10570
10084
|
}
|
10571
|
-
#endif
|
10572
10085
|
}
|
10573
10086
|
|
10574
10087
|
|
@@ -12073,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
12073
11586
|
const int nrows_y, const int nrows_dst,
|
12074
11587
|
dpct::queue_ptr stream) try {
|
12075
11588
|
|
12076
|
-
#if QK_K == 256
|
12077
|
-
|
12078
11589
|
int id;
|
12079
11590
|
SYCL_CHECK(
|
12080
11591
|
CHECK_TRY_ERROR(id = get_current_device_id()));
|
@@ -12189,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
12189
11700
|
});
|
12190
11701
|
}
|
12191
11702
|
}
|
12192
|
-
#endif
|
12193
11703
|
}
|
12194
11704
|
catch (sycl::exception const &exc) {
|
12195
11705
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
@@ -12964,20 +12474,6 @@ static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
|
|
12964
12474
|
});
|
12965
12475
|
}
|
12966
12476
|
|
12967
|
-
static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
|
12968
|
-
const int nrows, const int k_rows,
|
12969
|
-
const int n_heads_log2_floor, const float m0,
|
12970
|
-
const float m1, dpct::queue_ptr stream) {
|
12971
|
-
const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
|
12972
|
-
const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
|
12973
|
-
const sycl::range<3> block_nums(1, nrows, num_blocks_x);
|
12974
|
-
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
12975
|
-
[=](sycl::nd_item<3> item_ct1) {
|
12976
|
-
alibi_f32(x, dst, ncols, k_rows,
|
12977
|
-
n_heads_log2_floor, m0, m1, item_ct1);
|
12978
|
-
});
|
12979
|
-
}
|
12980
|
-
|
12981
12477
|
static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
|
12982
12478
|
const int nrows, dpct::queue_ptr stream) {
|
12983
12479
|
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
@@ -13058,7 +12554,7 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
|
|
13058
12554
|
}
|
13059
12555
|
|
13060
12556
|
template <bool vals_smem, int ncols_template, int block_size_template>
|
13061
|
-
static void soft_max_f32_submitter(const float * x, const float * mask,
|
12557
|
+
static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
|
13062
12558
|
const int nrows_y, const float scale, const float max_bias, const float m0,
|
13063
12559
|
const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
|
13064
12560
|
const size_t n_local_scratch, dpct::queue_ptr stream) {
|
@@ -13068,7 +12564,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
|
|
13068
12564
|
cgh.parallel_for(
|
13069
12565
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
13070
12566
|
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
13071
|
-
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask,
|
12567
|
+
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
|
13072
12568
|
nrows_y, scale, max_bias, m0,
|
13073
12569
|
m1, n_head_log2, item_ct1,
|
13074
12570
|
local_buf_acc.get_pointer());
|
@@ -13076,7 +12572,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
|
|
13076
12572
|
});
|
13077
12573
|
}
|
13078
12574
|
|
13079
|
-
static void soft_max_f32_sycl(const float * x, const float * mask,
|
12575
|
+
static void soft_max_f32_sycl(const float * x, const float * mask,
|
13080
12576
|
float * dst, const int ncols_x, const int nrows_x,
|
13081
12577
|
const int nrows_y, const float scale, const float max_bias,
|
13082
12578
|
dpct::queue_ptr stream) {
|
@@ -13098,60 +12594,60 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
|
|
13098
12594
|
const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
|
13099
12595
|
if (n_local_scratch*sizeof(float) < local_mem_size) {
|
13100
12596
|
if (ncols_x > max_block_size) {
|
13101
|
-
soft_max_f32_submitter<true, 0, 0>(x, mask,
|
12597
|
+
soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
|
13102
12598
|
max_bias, m0, m1, n_head_log2, block_nums,
|
13103
12599
|
block_dims, n_local_scratch, stream);
|
13104
12600
|
return;
|
13105
12601
|
}
|
13106
12602
|
switch (ncols_x) {
|
13107
12603
|
case 32:
|
13108
|
-
soft_max_f32_submitter<true, 32, 32>(x, mask,
|
12604
|
+
soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
|
13109
12605
|
max_bias, m0, m1, n_head_log2, block_nums,
|
13110
12606
|
block_dims, n_local_scratch, stream);
|
13111
12607
|
break;
|
13112
12608
|
case 64:
|
13113
|
-
soft_max_f32_submitter<true, 64, 64>(x, mask,
|
12609
|
+
soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
|
13114
12610
|
max_bias, m0, m1, n_head_log2, block_nums,
|
13115
12611
|
block_dims, n_local_scratch, stream);
|
13116
12612
|
break;
|
13117
12613
|
case 128:
|
13118
|
-
soft_max_f32_submitter<true, 128, 128>(x, mask,
|
12614
|
+
soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
|
13119
12615
|
max_bias, m0, m1, n_head_log2, block_nums,
|
13120
12616
|
block_dims, n_local_scratch, stream);
|
13121
12617
|
break;
|
13122
12618
|
case 256:
|
13123
|
-
soft_max_f32_submitter<true, 256, 256>(x, mask,
|
12619
|
+
soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
|
13124
12620
|
max_bias, m0, m1, n_head_log2, block_nums,
|
13125
12621
|
block_dims, n_local_scratch, stream);
|
13126
12622
|
break;
|
13127
12623
|
case 512:
|
13128
|
-
soft_max_f32_submitter<true, 512, 512>(x, mask,
|
12624
|
+
soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
|
13129
12625
|
max_bias, m0, m1, n_head_log2, block_nums,
|
13130
12626
|
block_dims, n_local_scratch, stream);
|
13131
12627
|
break;
|
13132
12628
|
case 1024:
|
13133
|
-
soft_max_f32_submitter<true, 1024, 1024>(x, mask,
|
12629
|
+
soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
|
13134
12630
|
max_bias, m0, m1, n_head_log2, block_nums,
|
13135
12631
|
block_dims, n_local_scratch, stream);
|
13136
12632
|
break;
|
13137
12633
|
case 2048:
|
13138
|
-
soft_max_f32_submitter<true, 2048, 1024>(x, mask,
|
12634
|
+
soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
|
13139
12635
|
max_bias, m0, m1, n_head_log2, block_nums,
|
13140
12636
|
block_dims, n_local_scratch, stream);
|
13141
12637
|
break;
|
13142
12638
|
case 4096:
|
13143
|
-
soft_max_f32_submitter<true, 4096, 1024>(x, mask,
|
12639
|
+
soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
|
13144
12640
|
max_bias, m0, m1, n_head_log2, block_nums,
|
13145
12641
|
block_dims, n_local_scratch, stream);
|
13146
12642
|
break;
|
13147
12643
|
default:
|
13148
|
-
soft_max_f32_submitter<true, 0, 0>(x, mask,
|
12644
|
+
soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
|
13149
12645
|
max_bias, m0, m1, n_head_log2, block_nums,
|
13150
12646
|
block_dims, n_local_scratch, stream);
|
13151
12647
|
break;
|
13152
12648
|
}
|
13153
12649
|
} else {
|
13154
|
-
soft_max_f32_submitter<false, 0, 0>(x, mask,
|
12650
|
+
soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
|
13155
12651
|
max_bias, m0, m1, n_head_log2, block_nums,
|
13156
12652
|
block_dims, WARP_SIZE, stream);
|
13157
12653
|
}
|
@@ -14026,11 +13522,15 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
|
|
14026
13522
|
|
14027
13523
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
14028
13524
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
14029
|
-
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
14030
13525
|
|
14031
|
-
const
|
13526
|
+
const float sf0 = (float)dst->ne[0]/src0->ne[0];
|
13527
|
+
const float sf1 = (float)dst->ne[1]/src0->ne[1];
|
13528
|
+
const float sf2 = (float)dst->ne[2]/src0->ne[2];
|
13529
|
+
const float sf3 = (float)dst->ne[3]/src0->ne[3];
|
14032
13530
|
|
14033
|
-
upscale_f32_sycl(src0_dd, dst_dd, src0->
|
13531
|
+
upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
13532
|
+
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
|
13533
|
+
main_stream);
|
14034
13534
|
|
14035
13535
|
(void) src1;
|
14036
13536
|
(void) dst;
|
@@ -14486,6 +13986,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
14486
13986
|
ggml_tensor *dst, const float *src0_dd,
|
14487
13987
|
const float *src1_dd, float *dst_dd,
|
14488
13988
|
const dpct::queue_ptr &main_stream) {
|
13989
|
+
#pragma message("TODO: implement phi3 frequency factors support")
|
13990
|
+
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
|
13991
|
+
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
|
14489
13992
|
|
14490
13993
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
14491
13994
|
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
@@ -14562,36 +14065,6 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
14562
14065
|
(void) src1_dd;
|
14563
14066
|
}
|
14564
14067
|
|
14565
|
-
inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
|
14566
|
-
ggml_tensor *dst, const float *src0_dd,
|
14567
|
-
const float *src1_dd, float *dst_dd,
|
14568
|
-
const dpct::queue_ptr &main_stream) {
|
14569
|
-
|
14570
|
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
14571
|
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
14572
|
-
|
14573
|
-
GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
|
14574
|
-
const int64_t nrows = ggml_nrows(src0);
|
14575
|
-
|
14576
|
-
//const int n_past = ((int32_t *) dst->op_params)[0];
|
14577
|
-
const int n_head = ((int32_t *) dst->op_params)[1];
|
14578
|
-
float max_bias;
|
14579
|
-
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
|
14580
|
-
|
14581
|
-
//GGML_ASSERT(ne01 + n_past == ne00);
|
14582
|
-
GGML_ASSERT(n_head == ne02);
|
14583
|
-
|
14584
|
-
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
14585
|
-
|
14586
|
-
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
14587
|
-
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
14588
|
-
|
14589
|
-
alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
|
14590
|
-
|
14591
|
-
(void) src1;
|
14592
|
-
(void) src1_dd;
|
14593
|
-
}
|
14594
|
-
|
14595
14068
|
static void ggml_sycl_op_pool2d(const ggml_tensor *src0,
|
14596
14069
|
const ggml_tensor *src1, ggml_tensor *dst,
|
14597
14070
|
const float *src0_dd, const float *src1_dd,
|
@@ -14746,12 +14219,9 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
|
|
14746
14219
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
14747
14220
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
14748
14221
|
|
14749
|
-
|
14750
|
-
|
14751
|
-
#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
|
14222
|
+
#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
|
14752
14223
|
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
|
14753
14224
|
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
|
14754
|
-
GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
|
14755
14225
|
|
14756
14226
|
const int64_t ne00 = src0->ne[0];
|
14757
14227
|
const int64_t nrows_x = ggml_nrows(src0);
|
@@ -14763,25 +14233,7 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
|
|
14763
14233
|
memcpy(&scale, dst->op_params + 0, sizeof(float));
|
14764
14234
|
memcpy(&max_bias, dst->op_params + 1, sizeof(float));
|
14765
14235
|
|
14766
|
-
|
14767
|
-
float * src2_dd = nullptr;
|
14768
|
-
sycl_pool_alloc<float> src2_f;
|
14769
|
-
|
14770
|
-
const bool use_src2 = src2 != nullptr;
|
14771
|
-
|
14772
|
-
if (use_src2) {
|
14773
|
-
const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
|
14774
|
-
|
14775
|
-
if (src2_on_device) {
|
14776
|
-
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
|
14777
|
-
src2_dd = (float *) src2_extra->data_device[g_main_device];
|
14778
|
-
} else {
|
14779
|
-
src2_dd = src2_f.alloc(ggml_nelements(src2));
|
14780
|
-
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
|
14781
|
-
}
|
14782
|
-
}
|
14783
|
-
|
14784
|
-
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
|
14236
|
+
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
|
14785
14237
|
nrows_x, nrows_y, scale, max_bias, main_stream);
|
14786
14238
|
}
|
14787
14239
|
|
@@ -15656,26 +15108,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
15656
15108
|
const int64_t r2 = ne12/ne02;
|
15657
15109
|
const int64_t r3 = ne13/ne03;
|
15658
15110
|
|
15659
|
-
#if 0
|
15660
|
-
// use syclGemmEx
|
15661
|
-
{
|
15662
|
-
for (int i13 = 0; i13 < ne13; ++i13) {
|
15663
|
-
for (int i12 = 0; i12 < ne12; ++i12) {
|
15664
|
-
int i03 = i13 / r3;
|
15665
|
-
int i02 = i12 / r2;
|
15666
|
-
|
15667
|
-
SYCL_CHECK(
|
15668
|
-
syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
15669
|
-
ne01, ne11, ne10,
|
15670
|
-
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , SYCL_R_16F, nb01/sizeof(half),
|
15671
|
-
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
|
15672
|
-
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
|
15673
|
-
cu_compute_type,
|
15674
|
-
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
15675
|
-
}
|
15676
|
-
}
|
15677
|
-
}
|
15678
|
-
#else
|
15679
15111
|
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
|
15680
15112
|
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
15681
15113
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
@@ -15687,7 +15119,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
15687
15119
|
nb11 / nb10, nb12 / nb10, beta,
|
15688
15120
|
(char *)dst_t, cu_data_type, ne01, nb2 / nb0,
|
15689
15121
|
ne12 * ne13, cu_compute_type)));
|
15690
|
-
g_sycl_handles[g_main_device]->wait();
|
15691
15122
|
} else {
|
15692
15123
|
const int ne23 = ne12*ne13;
|
15693
15124
|
|
@@ -15718,7 +15149,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
15718
15149
|
nb02, nb03, nb12_scaled, nb13_scaled,
|
15719
15150
|
nbd2, nbd3, r2, r3, item_ct1);
|
15720
15151
|
});
|
15721
|
-
})
|
15152
|
+
});
|
15722
15153
|
}
|
15723
15154
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
15724
15155
|
*g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
|
@@ -15729,9 +15160,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
15729
15160
|
dpct::library_data_t::real_half, nb11 / nb10, beta,
|
15730
15161
|
(void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
|
15731
15162
|
cu_compute_type)));
|
15732
|
-
g_sycl_handles[g_main_device]->wait();
|
15733
15163
|
}
|
15734
|
-
#endif
|
15735
15164
|
|
15736
15165
|
if (no_mixed_dtypes) {
|
15737
15166
|
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
|
@@ -16232,10 +15661,6 @@ static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, g
|
|
16232
15661
|
ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
|
16233
15662
|
}
|
16234
15663
|
|
16235
|
-
static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
16236
|
-
ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
|
16237
|
-
}
|
16238
|
-
|
16239
15664
|
static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
16240
15665
|
ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d);
|
16241
15666
|
}
|
@@ -16612,9 +16037,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
16612
16037
|
case GGML_OP_ROPE:
|
16613
16038
|
func = ggml_sycl_rope;
|
16614
16039
|
break;
|
16615
|
-
case GGML_OP_ALIBI:
|
16616
|
-
func = ggml_sycl_alibi;
|
16617
|
-
break;
|
16618
16040
|
case GGML_OP_IM2COL:
|
16619
16041
|
func = ggml_sycl_im2col;
|
16620
16042
|
break;
|
@@ -17744,7 +17166,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
|
17744
17166
|
case GGML_OP_DIAG_MASK_INF:
|
17745
17167
|
case GGML_OP_SOFT_MAX:
|
17746
17168
|
case GGML_OP_ROPE:
|
17747
|
-
case GGML_OP_ALIBI:
|
17748
17169
|
case GGML_OP_IM2COL:
|
17749
17170
|
case GGML_OP_POOL_2D:
|
17750
17171
|
case GGML_OP_SUM_ROWS:
|