llama_cpp 0.15.2 → 0.15.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
- data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
- data/vendor/tmp/llama.cpp/ggml.c +376 -758
- data/vendor/tmp/llama.cpp/ggml.h +39 -27
- data/vendor/tmp/llama.cpp/llama.cpp +823 -593
- data/vendor/tmp/llama.cpp/llama.h +10 -3
- metadata +3 -3
@@ -3847,21 +3847,27 @@ static void concat_f32(const float *x,const float *y, float *dst, const int ne
|
|
3847
3847
|
}
|
3848
3848
|
}
|
3849
3849
|
|
3850
|
-
static void upscale_f32(const float *x, float *dst, const int
|
3851
|
-
const
|
3852
|
-
|
3853
|
-
|
3854
|
-
|
3855
|
-
|
3850
|
+
static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
3851
|
+
const int nb02, const int nb03, const int ne10, const int ne11,
|
3852
|
+
const int ne12, const int ne13, const float sf0, const float sf1,
|
3853
|
+
const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
|
3854
|
+
int index = item_ct1.get_local_id(0) +
|
3855
|
+
item_ct1.get_group(0) * item_ct1.get_local_range(0);
|
3856
|
+
if (index >= ne10 * ne11 * ne12 * ne13) {
|
3856
3857
|
return;
|
3857
3858
|
}
|
3858
3859
|
// operation
|
3859
|
-
int
|
3860
|
-
int
|
3861
|
-
int
|
3862
|
-
int
|
3863
|
-
|
3864
|
-
|
3860
|
+
int i10 = index % ne10;
|
3861
|
+
int i11 = (index / ne10) % ne11;
|
3862
|
+
int i12 = (index / (ne10 * ne11)) % ne12;
|
3863
|
+
int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
|
3864
|
+
|
3865
|
+
int i00 = i10 / sf0;
|
3866
|
+
int i01 = i11 / sf1;
|
3867
|
+
int i02 = i12 / sf2;
|
3868
|
+
int i03 = i13 / sf3;
|
3869
|
+
|
3870
|
+
dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
3865
3871
|
}
|
3866
3872
|
|
3867
3873
|
static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
|
@@ -4191,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
|
|
4191
4197
|
const block_q2_K * x = (const block_q2_K *) vx;
|
4192
4198
|
|
4193
4199
|
const int tid = item_ct1.get_local_id(2);
|
4194
|
-
#if QK_K == 256
|
4195
4200
|
const int n = tid/32;
|
4196
4201
|
const int l = tid - 32*n;
|
4197
4202
|
const int is = 8*n + l/16;
|
@@ -4205,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
|
|
4205
4210
|
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
4206
4211
|
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
4207
4212
|
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
4208
|
-
#else
|
4209
|
-
const int is = tid/16; // 0 or 1
|
4210
|
-
const int il = tid%16; // 0...15
|
4211
|
-
const uint8_t q = x[i].qs[il] >> (2*is);
|
4212
|
-
dst_t * y = yy + i*QK_K + 16*is + il;
|
4213
|
-
|
4214
|
-
float dall = x[i].dm[0];
|
4215
|
-
float dmin = x[i].dm[1];
|
4216
|
-
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
4217
|
-
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
4218
|
-
#endif
|
4219
|
-
|
4220
4213
|
}
|
4221
4214
|
|
4222
4215
|
template<typename dst_t>
|
@@ -4226,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
|
4226
4219
|
const int i = item_ct1.get_group(2);
|
4227
4220
|
const block_q3_K * x = (const block_q3_K *) vx;
|
4228
4221
|
|
4229
|
-
#if QK_K == 256
|
4230
4222
|
const int r = item_ct1.get_local_id(2) / 4;
|
4231
4223
|
const int tid = r/2;
|
4232
4224
|
const int is0 = r%2;
|
@@ -4250,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
|
4250
4242
|
const uint8_t * hm = x[i].hmask;
|
4251
4243
|
|
4252
4244
|
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
4253
|
-
#else
|
4254
|
-
const int tid = item_ct1.get_local_id(2);
|
4255
|
-
const int is = tid/16; // 0 or 1
|
4256
|
-
const int il = tid%16; // 0...15
|
4257
|
-
const int im = il/8; // 0...1
|
4258
|
-
const int in = il%8; // 0...7
|
4259
|
-
|
4260
|
-
dst_t * y = yy + i*QK_K + 16*is + il;
|
4261
|
-
|
4262
|
-
const uint8_t q = x[i].qs[il] >> (2*is);
|
4263
|
-
const uint8_t h = x[i].hmask[in] >> (2*is + im);
|
4264
|
-
const float d = (float)x[i].d;
|
4265
|
-
|
4266
|
-
if (is == 0) {
|
4267
|
-
y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
4268
|
-
y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
4269
|
-
} else {
|
4270
|
-
y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
4271
|
-
y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
4272
|
-
}
|
4273
|
-
#endif
|
4274
|
-
|
4275
4245
|
}
|
4276
4246
|
|
4277
|
-
#if QK_K == 256
|
4278
4247
|
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
4279
4248
|
if (j < 4) {
|
4280
4249
|
d = q[j] & 63; m = q[j + 4] & 63;
|
@@ -4283,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
|
|
4283
4252
|
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
4284
4253
|
}
|
4285
4254
|
}
|
4286
|
-
#endif
|
4287
4255
|
|
4288
4256
|
template<typename dst_t>
|
4289
4257
|
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
@@ -4292,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
|
4292
4260
|
|
4293
4261
|
const int i = item_ct1.get_group(2);
|
4294
4262
|
|
4295
|
-
#if QK_K == 256
|
4296
4263
|
// assume 32 threads
|
4297
4264
|
const int tid = item_ct1.get_local_id(2);
|
4298
4265
|
const int il = tid/8;
|
@@ -4316,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
|
4316
4283
|
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
4317
4284
|
y[l +32] = d2 * (q[l] >> 4) - m2;
|
4318
4285
|
}
|
4319
|
-
#else
|
4320
|
-
const int tid = item_ct1.get_local_id(2);
|
4321
|
-
const uint8_t * q = x[i].qs;
|
4322
|
-
dst_t * y = yy + i*QK_K;
|
4323
|
-
const float d = (float)x[i].dm[0];
|
4324
|
-
const float m = (float)x[i].dm[1];
|
4325
|
-
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
4326
|
-
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
4327
|
-
#endif
|
4328
4286
|
}
|
4329
4287
|
|
4330
4288
|
template<typename dst_t>
|
@@ -4334,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
|
4334
4292
|
|
4335
4293
|
const int i = item_ct1.get_group(2);
|
4336
4294
|
|
4337
|
-
#if QK_K == 256
|
4338
4295
|
// assume 64 threads - this is very slightly better than the one below
|
4339
4296
|
const int tid = item_ct1.get_local_id(2);
|
4340
4297
|
const int il = tid/16; // il is in 0...3
|
@@ -4361,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
|
4361
4318
|
hm <<= 1;
|
4362
4319
|
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
4363
4320
|
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
4364
|
-
#else
|
4365
|
-
const int tid = item_ct1.get_local_id(2);
|
4366
|
-
const uint8_t q = x[i].qs[tid];
|
4367
|
-
const int im = tid/8; // 0...3
|
4368
|
-
const int in = tid%8; // 0...7
|
4369
|
-
const int is = tid/16; // 0 or 1
|
4370
|
-
const uint8_t h = x[i].qh[in] >> im;
|
4371
|
-
const float d = x[i].d;
|
4372
|
-
dst_t * y = yy + i*QK_K + tid;
|
4373
|
-
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
4374
|
-
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
4375
|
-
#endif
|
4376
4321
|
}
|
4377
4322
|
|
4378
4323
|
template<typename dst_t>
|
@@ -4381,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
4381
4326
|
const block_q6_K * x = (const block_q6_K *) vx;
|
4382
4327
|
|
4383
4328
|
const int i = item_ct1.get_group(2);
|
4384
|
-
#if QK_K == 256
|
4385
4329
|
|
4386
4330
|
// assume 64 threads - this is very slightly better than the one below
|
4387
4331
|
const int tid = item_ct1.get_local_id(2);
|
@@ -4401,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
4401
4345
|
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
4402
4346
|
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
4403
4347
|
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
4404
|
-
#else
|
4405
|
-
|
4406
|
-
// assume 32 threads
|
4407
|
-
const int tid = item_ct1.get_local_id(2);
|
4408
|
-
const int ip = tid/16; // 0 or 1
|
4409
|
-
const int il = tid - 16*ip; // 0...15
|
4410
|
-
|
4411
|
-
dst_t * y = yy + i*QK_K + 16*ip + il;
|
4412
|
-
|
4413
|
-
const float d = x[i].d;
|
4414
|
-
|
4415
|
-
const uint8_t ql = x[i].ql[16*ip + il];
|
4416
|
-
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
4417
|
-
const int8_t * sc = x[i].scales;
|
4418
|
-
|
4419
|
-
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
4420
|
-
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
4421
|
-
#endif
|
4422
4348
|
}
|
4423
4349
|
|
4424
4350
|
template<typename dst_t>
|
@@ -4432,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
|
4432
4358
|
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
4433
4359
|
|
4434
4360
|
const int tid = item_ct1.get_local_id(2);
|
4435
|
-
#if QK_K == 256
|
4436
4361
|
const int il = tid/8; // 0...3
|
4437
4362
|
const int ib = tid%8; // 0...7
|
4438
4363
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4443,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
|
4443
4368
|
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
4444
4369
|
const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
|
4445
4370
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
|
4446
|
-
#else
|
4447
|
-
assert(false);
|
4448
|
-
#endif
|
4449
|
-
|
4450
4371
|
}
|
4451
4372
|
|
4452
4373
|
template<typename dst_t>
|
@@ -4460,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
4460
4381
|
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
4461
4382
|
|
4462
4383
|
const int tid = item_ct1.get_local_id(2);
|
4463
|
-
#if QK_K == 256
|
4464
4384
|
const int il = tid/8; // 0...3
|
4465
4385
|
const int ib = tid%8; // 0...7
|
4466
4386
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4469,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
4469
4389
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
4470
4390
|
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
4471
4391
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
4472
|
-
#else
|
4473
|
-
assert(false);
|
4474
|
-
#endif
|
4475
|
-
|
4476
4392
|
}
|
4477
4393
|
|
4478
4394
|
template <typename dst_t>
|
@@ -4484,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4484
4400
|
const block_iq2_s * x = (const block_iq2_s *) vx;
|
4485
4401
|
|
4486
4402
|
const int tid = item_ct1.get_local_id(2);
|
4487
|
-
#if QK_K == 256
|
4488
4403
|
const int il = tid/8; // 0...3
|
4489
4404
|
const int ib = tid%8; // 0...7
|
4490
4405
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4492,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4492
4407
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
4493
4408
|
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
4494
4409
|
#pragma unroll
|
4495
|
-
for (int j = 0; j < 8; ++j)
|
4410
|
+
for (int j = 0; j < 8; ++j) {
|
4496
4411
|
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
4497
|
-
|
4498
|
-
assert(false);
|
4499
|
-
|
4500
|
-
#endif
|
4501
|
-
|
4412
|
+
}
|
4502
4413
|
}
|
4503
4414
|
|
4504
4415
|
template<typename dst_t>
|
@@ -4512,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
4512
4423
|
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
4513
4424
|
|
4514
4425
|
const int tid = item_ct1.get_local_id(2);
|
4515
|
-
#if QK_K == 256
|
4516
4426
|
const int il = tid/8; // 0...3
|
4517
4427
|
const int ib = tid%8; // 0...7
|
4518
4428
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4527,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
4527
4437
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4528
4438
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4529
4439
|
}
|
4530
|
-
#else
|
4531
|
-
assert(false);
|
4532
|
-
#endif
|
4533
|
-
|
4534
4440
|
}
|
4535
4441
|
|
4536
4442
|
template <typename dst_t>
|
@@ -4543,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4543
4449
|
const block_iq3_s * x = (const block_iq3_s *) vx;
|
4544
4450
|
|
4545
4451
|
const int tid = item_ct1.get_local_id(2);
|
4546
|
-
#if QK_K == 256
|
4547
4452
|
const int il = tid/8; // 0...3
|
4548
4453
|
const int ib = tid%8; // 0...7
|
4549
4454
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4557,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4557
4462
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4558
4463
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4559
4464
|
}
|
4560
|
-
#else
|
4561
|
-
assert(false);
|
4562
|
-
#endif
|
4563
|
-
|
4564
4465
|
}
|
4565
4466
|
|
4566
4467
|
template <typename dst_t>
|
@@ -4573,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4573
4474
|
const block_iq1_s * x = (const block_iq1_s *) vx;
|
4574
4475
|
|
4575
4476
|
const int tid = item_ct1.get_local_id(2);
|
4576
|
-
#if QK_K == 256
|
4577
4477
|
const int il = tid/8; // 0...3
|
4578
4478
|
const int ib = tid%8; // 0...7
|
4579
4479
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4587,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4587
4487
|
for (int j = 0; j < 8; ++j) {
|
4588
4488
|
y[j] = d * (q[j] + delta);
|
4589
4489
|
}
|
4590
|
-
#else
|
4591
|
-
assert(false);
|
4592
|
-
#endif
|
4593
|
-
|
4594
4490
|
}
|
4595
4491
|
|
4596
4492
|
template <typename dst_t>
|
@@ -4603,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4603
4499
|
const block_iq1_m * x = (const block_iq1_m *) vx;
|
4604
4500
|
|
4605
4501
|
const int tid = item_ct1.get_local_id(2);
|
4606
|
-
#if QK_K == 256
|
4607
4502
|
const int il = tid/8; // 0...3
|
4608
4503
|
const int ib = tid%8; // 0...7
|
4609
4504
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4621,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4621
4516
|
for (int j = 0; j < 8; ++j) {
|
4622
4517
|
y[j] = d * (q[j] + delta);
|
4623
4518
|
}
|
4624
|
-
#else
|
4625
|
-
assert(false);
|
4626
|
-
#endif
|
4627
|
-
|
4628
4519
|
}
|
4629
4520
|
|
4630
4521
|
template <typename dst_t>
|
@@ -4698,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
|
4698
4589
|
|
4699
4590
|
float tmp = 0; // partial sum for thread in warp
|
4700
4591
|
|
4701
|
-
#if QK_K == 256
|
4702
4592
|
const int tid =
|
4703
4593
|
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
4704
4594
|
const int ix =
|
@@ -4749,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
|
4749
4639
|
tmp += dall * sum1 - dmin * sum2;
|
4750
4640
|
|
4751
4641
|
}
|
4752
|
-
#else
|
4753
|
-
const int tid = item_ct1.get_local_id(2) /
|
4754
|
-
(2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
4755
|
-
const int ix = item_ct1.get_local_id(2) %
|
4756
|
-
(2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
4757
|
-
const int offset = tid * K_QUANTS_PER_ITERATION;
|
4758
|
-
|
4759
|
-
uint32_t uaux[2];
|
4760
|
-
const uint8_t * d = (const uint8_t *)uaux;
|
4761
|
-
|
4762
|
-
|
4763
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
4764
|
-
|
4765
|
-
const float * y = yy + i * QK_K + offset;
|
4766
|
-
const uint8_t * q = x[i].qs + offset;
|
4767
|
-
const uint32_t * s = (const uint32_t *)x[i].scales;
|
4768
|
-
|
4769
|
-
uaux[0] = s[0] & 0x0f0f0f0f;
|
4770
|
-
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
4771
|
-
|
4772
|
-
const sycl::float2 dall =
|
4773
|
-
x[i].dm.convert<float, sycl::rounding_mode::automatic>();
|
4774
|
-
|
4775
|
-
float sum1 = 0, sum2 = 0;
|
4776
|
-
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
4777
|
-
const uint8_t ql = q[l];
|
4778
|
-
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
4779
|
-
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
4780
|
-
+ y[l+32] * d[2] * ((ql >> 4) & 3)
|
4781
|
-
+ y[l+48] * d[3] * ((ql >> 6) & 3);
|
4782
|
-
sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
|
4783
|
-
}
|
4784
|
-
tmp += dall.x() * sum1 - dall.y() * sum2;
|
4785
|
-
}
|
4786
|
-
|
4787
|
-
#endif
|
4788
4642
|
|
4789
4643
|
// sum up partial sums and write back result
|
4790
4644
|
#pragma unroll
|
@@ -4822,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
|
4822
4676
|
|
4823
4677
|
float tmp = 0; // partial sum for thread in warp
|
4824
4678
|
|
4825
|
-
#if QK_K == 256
|
4826
|
-
|
4827
4679
|
const uint16_t kmask1 = 0x0303;
|
4828
4680
|
const uint16_t kmask2 = 0x0f0f;
|
4829
4681
|
|
@@ -4876,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
|
4876
4728
|
tmp += d * sum;
|
4877
4729
|
|
4878
4730
|
}
|
4879
|
-
#else
|
4880
|
-
|
4881
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
4882
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
4883
|
-
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
4884
|
-
const int in = offset/8; // 0 or 1
|
4885
|
-
const int im = offset%8; // 0...7
|
4886
|
-
|
4887
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
4888
|
-
|
4889
|
-
const float * y = yy + i * QK_K + offset;
|
4890
|
-
const uint8_t * q = x[i].qs + offset;
|
4891
|
-
const uint8_t * s = x[i].scales;
|
4892
|
-
|
4893
|
-
const float dall = (float)x[i].d;
|
4894
|
-
|
4895
|
-
float sum = 0;
|
4896
|
-
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
4897
|
-
const uint8_t hl = x[i].hmask[im+l] >> in;
|
4898
|
-
const uint8_t ql = q[l];
|
4899
|
-
sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
|
4900
|
-
+ y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
|
4901
|
-
+ y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
|
4902
|
-
+ y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
|
4903
|
-
}
|
4904
|
-
tmp += sum;
|
4905
|
-
}
|
4906
|
-
#endif
|
4907
4731
|
|
4908
4732
|
// sum up partial sums and write back result
|
4909
4733
|
#pragma unroll
|
@@ -4938,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
|
4938
4762
|
|
4939
4763
|
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
4940
4764
|
|
4941
|
-
#if QK_K == 256
|
4942
4765
|
const uint16_t kmask1 = 0x3f3f;
|
4943
4766
|
const uint16_t kmask2 = 0x0f0f;
|
4944
4767
|
const uint16_t kmask3 = 0xc0c0;
|
@@ -5027,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
|
5027
4850
|
#endif
|
5028
4851
|
|
5029
4852
|
}
|
5030
|
-
#else
|
5031
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
5032
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
5033
|
-
|
5034
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
5035
|
-
|
5036
|
-
uint16_t aux16[2];
|
5037
|
-
const uint8_t * s = (const uint8_t *)aux16;
|
5038
|
-
|
5039
|
-
float tmp = 0;
|
5040
|
-
|
5041
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
5042
|
-
const uint8_t * q = x[i].qs + step;
|
5043
|
-
const float * y = yy + i*QK_K + step;
|
5044
|
-
const uint16_t * a = (const uint16_t *)x[i].scales;
|
5045
|
-
aux16[0] = a[0] & 0x0f0f;
|
5046
|
-
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
5047
|
-
const float d = (float)x[i].dm[0];
|
5048
|
-
const float m = (float)x[i].dm[1];
|
5049
|
-
float sum = 0.f;
|
5050
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
5051
|
-
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
5052
|
-
+ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
|
5053
|
-
+ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
|
5054
|
-
+ y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
|
5055
|
-
}
|
5056
|
-
tmp += sum;
|
5057
|
-
}
|
5058
|
-
|
5059
|
-
#endif
|
5060
4853
|
|
5061
4854
|
// sum up partial sums and write back result
|
5062
4855
|
#pragma unroll
|
@@ -5091,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
|
5091
4884
|
|
5092
4885
|
float tmp = 0; // partial sum for thread in warp
|
5093
4886
|
|
5094
|
-
#if QK_K == 256
|
5095
4887
|
const uint16_t kmask1 = 0x3f3f;
|
5096
4888
|
const uint16_t kmask2 = 0x0f0f;
|
5097
4889
|
const uint16_t kmask3 = 0xc0c0;
|
@@ -5168,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
|
5168
4960
|
dmin * smin;
|
5169
4961
|
}
|
5170
4962
|
|
5171
|
-
#else
|
5172
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
5173
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
5174
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
5175
|
-
const int im = step/8;
|
5176
|
-
const int in = step%8;
|
5177
|
-
|
5178
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
5179
|
-
const uint8_t * q = x[i].qs + step;
|
5180
|
-
const int8_t * s = x[i].scales;
|
5181
|
-
const float * y = yy + i*QK_K + step;
|
5182
|
-
const float d = x[i].d;
|
5183
|
-
float sum = 0.f;
|
5184
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
5185
|
-
const uint8_t h = x[i].qh[in+j] >> im;
|
5186
|
-
sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
|
5187
|
-
+ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
|
5188
|
-
+ y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
|
5189
|
-
+ y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
|
5190
|
-
}
|
5191
|
-
tmp += sum;
|
5192
|
-
}
|
5193
|
-
#endif
|
5194
|
-
|
5195
4963
|
// sum up partial sums and write back result
|
5196
4964
|
#pragma unroll
|
5197
4965
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
@@ -5218,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
|
5218
4986
|
|
5219
4987
|
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
5220
4988
|
|
5221
|
-
#if QK_K == 256
|
5222
|
-
|
5223
4989
|
const int tid =
|
5224
4990
|
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
5225
4991
|
const int ix =
|
@@ -5276,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
|
5276
5042
|
|
5277
5043
|
}
|
5278
5044
|
|
5279
|
-
#else
|
5280
|
-
|
5281
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
|
5282
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
|
5283
|
-
|
5284
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
5285
|
-
|
5286
|
-
float tmp = 0; // partial sum for thread in warp
|
5287
|
-
|
5288
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
5289
|
-
|
5290
|
-
const float * y = yy + i * QK_K + step;
|
5291
|
-
const uint8_t * ql = x[i].ql + step;
|
5292
|
-
const uint8_t * qh = x[i].qh + step;
|
5293
|
-
const int8_t * s = x[i].scales;
|
5294
|
-
|
5295
|
-
const float d = x[i+0].d;
|
5296
|
-
|
5297
|
-
float sum = 0;
|
5298
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
5299
|
-
sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
|
5300
|
-
+ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
|
5301
|
-
+ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
|
5302
|
-
+ y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
|
5303
|
-
}
|
5304
|
-
tmp += sum;
|
5305
|
-
|
5306
|
-
}
|
5307
|
-
|
5308
|
-
#endif
|
5309
|
-
|
5310
5045
|
// sum up partial sums and write back result
|
5311
5046
|
#pragma unroll
|
5312
5047
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
@@ -6851,7 +6586,6 @@ static __dpct_inline__ float
|
|
6851
6586
|
vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
6852
6587
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
6853
6588
|
|
6854
|
-
#ifndef GGML_QKK_64
|
6855
6589
|
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
6856
6590
|
|
6857
6591
|
int v[2];
|
@@ -6893,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
|
6893
6627
|
}
|
6894
6628
|
|
6895
6629
|
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
6896
|
-
|
6897
|
-
#else
|
6898
|
-
|
6899
|
-
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
6900
|
-
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
6901
|
-
|
6902
|
-
float sumf_d = 0.0f;
|
6903
|
-
float sumf_m = 0.0f;
|
6904
|
-
|
6905
|
-
uint16_t aux16[2];
|
6906
|
-
const uint8_t * s = (const uint8_t *)aux16;
|
6907
|
-
|
6908
|
-
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
6909
|
-
aux16[0] = a[0] & 0x0f0f;
|
6910
|
-
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
6911
|
-
|
6912
|
-
const float dall = bq4_K->dm[0];
|
6913
|
-
const float dmin = bq4_K->dm[1];
|
6914
|
-
|
6915
|
-
const float d8_1 = bq8_1[0].ds[0];
|
6916
|
-
const float d8_2 = bq8_1[1].ds[1];
|
6917
|
-
|
6918
|
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
6919
|
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
6920
|
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
6921
|
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
6922
|
-
|
6923
|
-
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
6924
|
-
const int v1 = q4[0];
|
6925
|
-
const int v2 = q4[4];
|
6926
|
-
|
6927
|
-
const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
6928
|
-
const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
6929
|
-
const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
|
6930
|
-
const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
|
6931
|
-
|
6932
|
-
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
6933
|
-
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
6934
|
-
|
6935
|
-
return dall * sumf_d - dmin * sumf_m;
|
6936
|
-
|
6937
|
-
#else
|
6938
|
-
bad_arch();
|
6939
|
-
#endif // __SYCL_ARCH__ >= VER_4VEC
|
6940
|
-
|
6941
|
-
#endif
|
6942
6630
|
}
|
6943
6631
|
|
6944
6632
|
template <int mmq_y>
|
@@ -6997,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
|
6997
6685
|
|
6998
6686
|
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
6999
6687
|
|
7000
|
-
#if QK_K == 256
|
7001
6688
|
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
7002
|
-
#else
|
7003
|
-
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
|
7004
|
-
#endif
|
7005
6689
|
}
|
7006
6690
|
|
7007
6691
|
#pragma unroll
|
@@ -7044,7 +6728,6 @@ static __dpct_inline__ float
|
|
7044
6728
|
vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
7045
6729
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7046
6730
|
|
7047
|
-
#ifndef GGML_QKK_64
|
7048
6731
|
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
7049
6732
|
|
7050
6733
|
int vl[2];
|
@@ -7086,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
|
7086
6769
|
}
|
7087
6770
|
|
7088
6771
|
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
7089
|
-
|
7090
|
-
#else
|
7091
|
-
|
7092
|
-
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
7093
|
-
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
7094
|
-
|
7095
|
-
const int8_t * s = bq5_K->scales;
|
7096
|
-
|
7097
|
-
const float d = bq5_K->d;
|
7098
|
-
|
7099
|
-
const float d8_1 = bq8_1[0].ds[0];
|
7100
|
-
const float d8_2 = bq8_1[1].ds[1];
|
7101
|
-
|
7102
|
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
7103
|
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
7104
|
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
7105
|
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
7106
|
-
|
7107
|
-
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
7108
|
-
const int vl1 = ql[0];
|
7109
|
-
const int vl2 = ql[4];
|
7110
|
-
|
7111
|
-
const int step = 4 * (iqs/2); // 0, 4, 8, 12
|
7112
|
-
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
|
7113
|
-
const int in = step%8; // 0, 4, 0, 4
|
7114
|
-
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
7115
|
-
|
7116
|
-
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
|
7117
|
-
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
|
7118
|
-
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
7119
|
-
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
7120
|
-
|
7121
|
-
const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
|
7122
|
-
+ d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
|
7123
|
-
|
7124
|
-
return d * sumf_d;
|
7125
|
-
|
7126
|
-
#else
|
7127
|
-
bad_arch();
|
7128
|
-
#endif // __SYCL_ARCH__ >= VER_4VEC
|
7129
|
-
|
7130
|
-
#endif
|
7131
6772
|
}
|
7132
6773
|
|
7133
6774
|
template <int mmq_y>
|
@@ -7199,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
|
7199
6840
|
|
7200
6841
|
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
7201
6842
|
|
7202
|
-
#if QK_K == 256
|
7203
6843
|
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
7204
|
-
#endif
|
7205
6844
|
}
|
7206
6845
|
|
7207
6846
|
#pragma unroll
|
@@ -7381,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|
7381
7020
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
7382
7021
|
const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
|
7383
7022
|
const uint8_t *kmask_iq2xs) {
|
7384
|
-
#if QK_K == 256
|
7385
7023
|
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
7386
7024
|
|
7387
7025
|
#if QR2_XXS == 8
|
@@ -7422,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|
7422
7060
|
}
|
7423
7061
|
return d * (sumi1 + sumi2);
|
7424
7062
|
#endif
|
7425
|
-
#else
|
7426
|
-
assert(false);
|
7427
|
-
return 0.f;
|
7428
|
-
#endif
|
7429
7063
|
}
|
7430
7064
|
|
7431
7065
|
static __dpct_inline__ float
|
@@ -7434,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
7434
7068
|
const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
|
7435
7069
|
#if DPCT_COMPATIBILITY_TEMP >= \
|
7436
7070
|
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
7437
|
-
#if QK_K == 256
|
7438
7071
|
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
7439
7072
|
|
7440
7073
|
const int ib32 = iqs;
|
@@ -7472,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
7472
7105
|
assert(false);
|
7473
7106
|
return 0.f;
|
7474
7107
|
#endif
|
7475
|
-
#else
|
7476
|
-
assert(false);
|
7477
|
-
return 0.f;
|
7478
|
-
#endif
|
7479
7108
|
}
|
7480
7109
|
|
7481
7110
|
static __dpct_inline__ float
|
7482
7111
|
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
7483
7112
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7484
|
-
#if QK_K == 256
|
7485
7113
|
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
7486
7114
|
|
7487
7115
|
const int ib32 = iqs;
|
@@ -7525,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
|
7525
7153
|
}
|
7526
7154
|
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
|
7527
7155
|
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
7528
|
-
#else
|
7529
|
-
assert(false);
|
7530
|
-
#endif
|
7531
7156
|
}
|
7532
7157
|
|
7533
7158
|
static __dpct_inline__ float
|
@@ -7536,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
7536
7161
|
const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
|
7537
7162
|
#if DPCT_COMPATIBILITY_TEMP >= \
|
7538
7163
|
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
7539
|
-
#if QK_K == 256
|
7540
7164
|
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
7541
7165
|
|
7542
7166
|
const int ib32 = iqs;
|
@@ -7564,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
7564
7188
|
assert(false);
|
7565
7189
|
return 0.f;
|
7566
7190
|
#endif
|
7567
|
-
#else
|
7568
|
-
assert(false);
|
7569
|
-
return 0.f;
|
7570
|
-
#endif
|
7571
7191
|
}
|
7572
7192
|
|
7573
7193
|
static __dpct_inline__ float
|
7574
7194
|
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
7575
7195
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
7576
7196
|
const uint32_t *iq3s_grid) {
|
7577
|
-
#if QK_K == 256
|
7578
7197
|
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
7579
7198
|
|
7580
7199
|
const int ib32 = iqs;
|
@@ -7603,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|
7603
7222
|
(1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
|
7604
7223
|
bq8_1[ib32].ds[0];
|
7605
7224
|
return d * sumi;
|
7606
|
-
#else
|
7607
|
-
assert(false);
|
7608
|
-
#endif
|
7609
7225
|
}
|
7610
7226
|
|
7611
7227
|
static __dpct_inline__ float
|
7612
7228
|
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
7613
7229
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
7614
7230
|
const uint32_t *iq1s_grid_gpu) {
|
7615
|
-
#if QK_K == 256
|
7616
7231
|
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
7617
7232
|
|
7618
7233
|
const int ib32 = iqs;
|
@@ -7631,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
|
7631
7246
|
const float d = d1q * bq8_1[ib32].ds[0];
|
7632
7247
|
const float m = d1q * bq8_1[ib32].ds[1];
|
7633
7248
|
return d * sumi + m * delta;
|
7634
|
-
#else
|
7635
|
-
assert(false);
|
7636
|
-
#endif
|
7637
7249
|
}
|
7638
7250
|
|
7639
7251
|
static __dpct_inline__ float
|
7640
7252
|
vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
7641
7253
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7642
|
-
#if QK_K == 256
|
7643
7254
|
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
7644
7255
|
|
7645
7256
|
const int ib32 = iqs;
|
@@ -7664,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
|
7664
7275
|
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
7665
7276
|
const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
|
7666
7277
|
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
7667
|
-
#else
|
7668
|
-
assert(false);
|
7669
|
-
#endif
|
7670
7278
|
}
|
7671
7279
|
|
7672
7280
|
static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
|
@@ -7714,7 +7322,6 @@ static __dpct_inline__ float
|
|
7714
7322
|
vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
7715
7323
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7716
7324
|
|
7717
|
-
#if QK_K == 256
|
7718
7325
|
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
7719
7326
|
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
7720
7327
|
|
@@ -7732,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
|
7732
7339
|
sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
|
7733
7340
|
}
|
7734
7341
|
return d * (sumi1 + sumi2);
|
7735
|
-
#else
|
7736
|
-
assert(false);
|
7737
|
-
#endif
|
7738
7342
|
}
|
7739
7343
|
|
7740
7344
|
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
|
@@ -10085,18 +9689,17 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
|
10085
9689
|
});
|
10086
9690
|
}
|
10087
9691
|
|
10088
|
-
static void upscale_f32_sycl(const float *x, float *dst, const int
|
10089
|
-
const int
|
10090
|
-
const int
|
10091
|
-
|
10092
|
-
int
|
10093
|
-
|
9692
|
+
static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
|
9693
|
+
const int nb02, const int nb03, const int ne10, const int ne11,
|
9694
|
+
const int ne12, const int ne13, const float sf0, const float sf1,
|
9695
|
+
const float sf2, const float sf3, dpct::queue_ptr stream) {
|
9696
|
+
int dst_size = ne10 * ne11 * ne12 * ne13;
|
9697
|
+
int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
|
9698
|
+
sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
|
10094
9699
|
stream->parallel_for(
|
10095
|
-
sycl::nd_range<
|
10096
|
-
|
10097
|
-
|
10098
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10099
|
-
upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
|
9700
|
+
sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
|
9701
|
+
[=](sycl::nd_item<1> item_ct1) {
|
9702
|
+
upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
|
10100
9703
|
});
|
10101
9704
|
}
|
10102
9705
|
|
@@ -10198,7 +9801,6 @@ template <typename dst_t>
|
|
10198
9801
|
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
10199
9802
|
dpct::queue_ptr stream) {
|
10200
9803
|
const int nb = k / QK_K;
|
10201
|
-
#if QK_K == 256
|
10202
9804
|
{
|
10203
9805
|
dpct::has_capability_or_fail(stream->get_device(),
|
10204
9806
|
{sycl::aspect::fp16});
|
@@ -10210,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10210
9812
|
dequantize_block_q2_K(vx, y, item_ct1);
|
10211
9813
|
});
|
10212
9814
|
}
|
10213
|
-
#else
|
10214
|
-
{
|
10215
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
10216
|
-
{sycl::aspect::fp16});
|
10217
|
-
|
10218
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10219
|
-
sycl::range<3>(1, 1, 32),
|
10220
|
-
sycl::range<3>(1, 1, 32)),
|
10221
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10222
|
-
dequantize_block_q2_K(vx, y, item_ct1);
|
10223
|
-
});
|
10224
|
-
}
|
10225
|
-
|
10226
|
-
#endif
|
10227
9815
|
}
|
10228
9816
|
|
10229
9817
|
template <typename dst_t>
|
10230
9818
|
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
10231
9819
|
dpct::queue_ptr stream) {
|
10232
9820
|
const int nb = k / QK_K;
|
10233
|
-
#if QK_K == 256
|
10234
9821
|
{
|
10235
9822
|
dpct::has_capability_or_fail(stream->get_device(),
|
10236
9823
|
{sycl::aspect::fp16});
|
@@ -10242,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10242
9829
|
dequantize_block_q3_K(vx, y, item_ct1);
|
10243
9830
|
});
|
10244
9831
|
}
|
10245
|
-
#else
|
10246
|
-
{
|
10247
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
10248
|
-
{sycl::aspect::fp16});
|
10249
|
-
|
10250
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10251
|
-
sycl::range<3>(1, 1, 32),
|
10252
|
-
sycl::range<3>(1, 1, 32)),
|
10253
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10254
|
-
dequantize_block_q3_K(vx, y, item_ct1);
|
10255
|
-
});
|
10256
|
-
}
|
10257
|
-
#endif
|
10258
9832
|
}
|
10259
9833
|
|
10260
9834
|
template <typename dst_t>
|
@@ -10315,7 +9889,6 @@ template <typename dst_t>
|
|
10315
9889
|
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
10316
9890
|
dpct::queue_ptr stream) {
|
10317
9891
|
const int nb = k / QK_K;
|
10318
|
-
#if QK_K == 256
|
10319
9892
|
{
|
10320
9893
|
dpct::has_capability_or_fail(stream->get_device(),
|
10321
9894
|
{sycl::aspect::fp16});
|
@@ -10327,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10327
9900
|
dequantize_block_q5_K(vx, y, item_ct1);
|
10328
9901
|
});
|
10329
9902
|
}
|
10330
|
-
#else
|
10331
|
-
{
|
10332
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
10333
|
-
{sycl::aspect::fp16});
|
10334
|
-
|
10335
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10336
|
-
sycl::range<3>(1, 1, 32),
|
10337
|
-
sycl::range<3>(1, 1, 32)),
|
10338
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10339
|
-
dequantize_block_q5_K(vx, y, item_ct1);
|
10340
|
-
});
|
10341
|
-
}
|
10342
|
-
|
10343
|
-
#endif
|
10344
9903
|
}
|
10345
9904
|
|
10346
9905
|
template <typename dst_t>
|
10347
9906
|
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
10348
9907
|
dpct::queue_ptr stream) {
|
10349
9908
|
const int nb = k / QK_K;
|
10350
|
-
#if QK_K == 256
|
10351
9909
|
{
|
10352
9910
|
dpct::has_capability_or_fail(stream->get_device(),
|
10353
9911
|
{sycl::aspect::fp16});
|
@@ -10359,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10359
9917
|
dequantize_block_q6_K(vx, y, item_ct1);
|
10360
9918
|
});
|
10361
9919
|
}
|
10362
|
-
#else
|
10363
|
-
{
|
10364
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
10365
|
-
{sycl::aspect::fp16});
|
10366
|
-
|
10367
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10368
|
-
sycl::range<3>(1, 1, 32),
|
10369
|
-
sycl::range<3>(1, 1, 32)),
|
10370
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10371
|
-
dequantize_block_q6_K(vx, y, item_ct1);
|
10372
|
-
});
|
10373
|
-
}
|
10374
|
-
|
10375
|
-
#endif
|
10376
9920
|
}
|
10377
9921
|
|
10378
9922
|
template <typename dst_t>
|
@@ -10524,9 +10068,6 @@ template <typename dst_t>
|
|
10524
10068
|
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
10525
10069
|
dpct::queue_ptr stream) {
|
10526
10070
|
const int nb = (k + QK_K - 1) / QK_K;
|
10527
|
-
#if QK_K == 64
|
10528
|
-
dequantize_row_iq4_nl_sycl(vx, y, k, stream);
|
10529
|
-
#else
|
10530
10071
|
{
|
10531
10072
|
dpct::has_capability_or_fail(stream->get_device(),
|
10532
10073
|
{sycl::aspect::fp16});
|
@@ -10541,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
|
10541
10082
|
});
|
10542
10083
|
});
|
10543
10084
|
}
|
10544
|
-
#endif
|
10545
10085
|
}
|
10546
10086
|
|
10547
10087
|
|
@@ -12046,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
12046
11586
|
const int nrows_y, const int nrows_dst,
|
12047
11587
|
dpct::queue_ptr stream) try {
|
12048
11588
|
|
12049
|
-
#if QK_K == 256
|
12050
|
-
|
12051
11589
|
int id;
|
12052
11590
|
SYCL_CHECK(
|
12053
11591
|
CHECK_TRY_ERROR(id = get_current_device_id()));
|
@@ -12162,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
12162
11700
|
});
|
12163
11701
|
}
|
12164
11702
|
}
|
12165
|
-
#endif
|
12166
11703
|
}
|
12167
11704
|
catch (sycl::exception const &exc) {
|
12168
11705
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
@@ -13985,15 +13522,15 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
|
|
13985
13522
|
|
13986
13523
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
13987
13524
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
13988
|
-
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
13989
|
-
|
13990
|
-
#pragma message("TODO: generalize upscale operator")
|
13991
|
-
#pragma message(" https://github.com/ggerganov/ggml/pull/814")
|
13992
|
-
GGML_ASSERT(false && "TODO: generalize upscale operator");
|
13993
13525
|
|
13994
|
-
const
|
13526
|
+
const float sf0 = (float)dst->ne[0]/src0->ne[0];
|
13527
|
+
const float sf1 = (float)dst->ne[1]/src0->ne[1];
|
13528
|
+
const float sf2 = (float)dst->ne[2]/src0->ne[2];
|
13529
|
+
const float sf3 = (float)dst->ne[3]/src0->ne[3];
|
13995
13530
|
|
13996
|
-
upscale_f32_sycl(src0_dd, dst_dd, src0->
|
13531
|
+
upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
13532
|
+
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
|
13533
|
+
main_stream);
|
13997
13534
|
|
13998
13535
|
(void) src1;
|
13999
13536
|
(void) dst;
|
@@ -14449,6 +13986,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
14449
13986
|
ggml_tensor *dst, const float *src0_dd,
|
14450
13987
|
const float *src1_dd, float *dst_dd,
|
14451
13988
|
const dpct::queue_ptr &main_stream) {
|
13989
|
+
#pragma message("TODO: implement phi3 frequency factors support")
|
13990
|
+
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
|
13991
|
+
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
|
14452
13992
|
|
14453
13993
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
14454
13994
|
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|