llama_cpp 0.15.2 → 0.15.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
- data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
- data/vendor/tmp/llama.cpp/ggml.c +376 -758
- data/vendor/tmp/llama.cpp/ggml.h +39 -27
- data/vendor/tmp/llama.cpp/llama.cpp +823 -593
- data/vendor/tmp/llama.cpp/llama.h +10 -3
- metadata +3 -3
@@ -3847,21 +3847,27 @@ static void concat_f32(const float *x,const float *y, float *dst, const int ne
|
|
3847
3847
|
}
|
3848
3848
|
}
|
3849
3849
|
|
3850
|
-
static void upscale_f32(const float *x, float *dst, const int
|
3851
|
-
const
|
3852
|
-
|
3853
|
-
|
3854
|
-
|
3855
|
-
|
3850
|
+
static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
3851
|
+
const int nb02, const int nb03, const int ne10, const int ne11,
|
3852
|
+
const int ne12, const int ne13, const float sf0, const float sf1,
|
3853
|
+
const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
|
3854
|
+
int index = item_ct1.get_local_id(0) +
|
3855
|
+
item_ct1.get_group(0) * item_ct1.get_local_range(0);
|
3856
|
+
if (index >= ne10 * ne11 * ne12 * ne13) {
|
3856
3857
|
return;
|
3857
3858
|
}
|
3858
3859
|
// operation
|
3859
|
-
int
|
3860
|
-
int
|
3861
|
-
int
|
3862
|
-
int
|
3863
|
-
|
3864
|
-
|
3860
|
+
int i10 = index % ne10;
|
3861
|
+
int i11 = (index / ne10) % ne11;
|
3862
|
+
int i12 = (index / (ne10 * ne11)) % ne12;
|
3863
|
+
int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
|
3864
|
+
|
3865
|
+
int i00 = i10 / sf0;
|
3866
|
+
int i01 = i11 / sf1;
|
3867
|
+
int i02 = i12 / sf2;
|
3868
|
+
int i03 = i13 / sf3;
|
3869
|
+
|
3870
|
+
dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
3865
3871
|
}
|
3866
3872
|
|
3867
3873
|
static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
|
@@ -4191,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
|
|
4191
4197
|
const block_q2_K * x = (const block_q2_K *) vx;
|
4192
4198
|
|
4193
4199
|
const int tid = item_ct1.get_local_id(2);
|
4194
|
-
#if QK_K == 256
|
4195
4200
|
const int n = tid/32;
|
4196
4201
|
const int l = tid - 32*n;
|
4197
4202
|
const int is = 8*n + l/16;
|
@@ -4205,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
|
|
4205
4210
|
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
4206
4211
|
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
4207
4212
|
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
4208
|
-
#else
|
4209
|
-
const int is = tid/16; // 0 or 1
|
4210
|
-
const int il = tid%16; // 0...15
|
4211
|
-
const uint8_t q = x[i].qs[il] >> (2*is);
|
4212
|
-
dst_t * y = yy + i*QK_K + 16*is + il;
|
4213
|
-
|
4214
|
-
float dall = x[i].dm[0];
|
4215
|
-
float dmin = x[i].dm[1];
|
4216
|
-
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
4217
|
-
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
4218
|
-
#endif
|
4219
|
-
|
4220
4213
|
}
|
4221
4214
|
|
4222
4215
|
template<typename dst_t>
|
@@ -4226,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
|
4226
4219
|
const int i = item_ct1.get_group(2);
|
4227
4220
|
const block_q3_K * x = (const block_q3_K *) vx;
|
4228
4221
|
|
4229
|
-
#if QK_K == 256
|
4230
4222
|
const int r = item_ct1.get_local_id(2) / 4;
|
4231
4223
|
const int tid = r/2;
|
4232
4224
|
const int is0 = r%2;
|
@@ -4250,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
|
4250
4242
|
const uint8_t * hm = x[i].hmask;
|
4251
4243
|
|
4252
4244
|
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
4253
|
-
#else
|
4254
|
-
const int tid = item_ct1.get_local_id(2);
|
4255
|
-
const int is = tid/16; // 0 or 1
|
4256
|
-
const int il = tid%16; // 0...15
|
4257
|
-
const int im = il/8; // 0...1
|
4258
|
-
const int in = il%8; // 0...7
|
4259
|
-
|
4260
|
-
dst_t * y = yy + i*QK_K + 16*is + il;
|
4261
|
-
|
4262
|
-
const uint8_t q = x[i].qs[il] >> (2*is);
|
4263
|
-
const uint8_t h = x[i].hmask[in] >> (2*is + im);
|
4264
|
-
const float d = (float)x[i].d;
|
4265
|
-
|
4266
|
-
if (is == 0) {
|
4267
|
-
y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
4268
|
-
y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
4269
|
-
} else {
|
4270
|
-
y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
4271
|
-
y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
4272
|
-
}
|
4273
|
-
#endif
|
4274
|
-
|
4275
4245
|
}
|
4276
4246
|
|
4277
|
-
#if QK_K == 256
|
4278
4247
|
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
4279
4248
|
if (j < 4) {
|
4280
4249
|
d = q[j] & 63; m = q[j + 4] & 63;
|
@@ -4283,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
|
|
4283
4252
|
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
4284
4253
|
}
|
4285
4254
|
}
|
4286
|
-
#endif
|
4287
4255
|
|
4288
4256
|
template<typename dst_t>
|
4289
4257
|
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
@@ -4292,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
|
4292
4260
|
|
4293
4261
|
const int i = item_ct1.get_group(2);
|
4294
4262
|
|
4295
|
-
#if QK_K == 256
|
4296
4263
|
// assume 32 threads
|
4297
4264
|
const int tid = item_ct1.get_local_id(2);
|
4298
4265
|
const int il = tid/8;
|
@@ -4316,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
|
4316
4283
|
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
4317
4284
|
y[l +32] = d2 * (q[l] >> 4) - m2;
|
4318
4285
|
}
|
4319
|
-
#else
|
4320
|
-
const int tid = item_ct1.get_local_id(2);
|
4321
|
-
const uint8_t * q = x[i].qs;
|
4322
|
-
dst_t * y = yy + i*QK_K;
|
4323
|
-
const float d = (float)x[i].dm[0];
|
4324
|
-
const float m = (float)x[i].dm[1];
|
4325
|
-
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
4326
|
-
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
4327
|
-
#endif
|
4328
4286
|
}
|
4329
4287
|
|
4330
4288
|
template<typename dst_t>
|
@@ -4334,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
|
4334
4292
|
|
4335
4293
|
const int i = item_ct1.get_group(2);
|
4336
4294
|
|
4337
|
-
#if QK_K == 256
|
4338
4295
|
// assume 64 threads - this is very slightly better than the one below
|
4339
4296
|
const int tid = item_ct1.get_local_id(2);
|
4340
4297
|
const int il = tid/16; // il is in 0...3
|
@@ -4361,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
|
4361
4318
|
hm <<= 1;
|
4362
4319
|
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
4363
4320
|
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
4364
|
-
#else
|
4365
|
-
const int tid = item_ct1.get_local_id(2);
|
4366
|
-
const uint8_t q = x[i].qs[tid];
|
4367
|
-
const int im = tid/8; // 0...3
|
4368
|
-
const int in = tid%8; // 0...7
|
4369
|
-
const int is = tid/16; // 0 or 1
|
4370
|
-
const uint8_t h = x[i].qh[in] >> im;
|
4371
|
-
const float d = x[i].d;
|
4372
|
-
dst_t * y = yy + i*QK_K + tid;
|
4373
|
-
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
4374
|
-
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
4375
|
-
#endif
|
4376
4321
|
}
|
4377
4322
|
|
4378
4323
|
template<typename dst_t>
|
@@ -4381,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
4381
4326
|
const block_q6_K * x = (const block_q6_K *) vx;
|
4382
4327
|
|
4383
4328
|
const int i = item_ct1.get_group(2);
|
4384
|
-
#if QK_K == 256
|
4385
4329
|
|
4386
4330
|
// assume 64 threads - this is very slightly better than the one below
|
4387
4331
|
const int tid = item_ct1.get_local_id(2);
|
@@ -4401,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
4401
4345
|
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
4402
4346
|
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
4403
4347
|
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
4404
|
-
#else
|
4405
|
-
|
4406
|
-
// assume 32 threads
|
4407
|
-
const int tid = item_ct1.get_local_id(2);
|
4408
|
-
const int ip = tid/16; // 0 or 1
|
4409
|
-
const int il = tid - 16*ip; // 0...15
|
4410
|
-
|
4411
|
-
dst_t * y = yy + i*QK_K + 16*ip + il;
|
4412
|
-
|
4413
|
-
const float d = x[i].d;
|
4414
|
-
|
4415
|
-
const uint8_t ql = x[i].ql[16*ip + il];
|
4416
|
-
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
4417
|
-
const int8_t * sc = x[i].scales;
|
4418
|
-
|
4419
|
-
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
4420
|
-
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
4421
|
-
#endif
|
4422
4348
|
}
|
4423
4349
|
|
4424
4350
|
template<typename dst_t>
|
@@ -4432,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
|
4432
4358
|
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
4433
4359
|
|
4434
4360
|
const int tid = item_ct1.get_local_id(2);
|
4435
|
-
#if QK_K == 256
|
4436
4361
|
const int il = tid/8; // 0...3
|
4437
4362
|
const int ib = tid%8; // 0...7
|
4438
4363
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4443,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
|
4443
4368
|
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
4444
4369
|
const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
|
4445
4370
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
|
4446
|
-
#else
|
4447
|
-
assert(false);
|
4448
|
-
#endif
|
4449
|
-
|
4450
4371
|
}
|
4451
4372
|
|
4452
4373
|
template<typename dst_t>
|
@@ -4460,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
4460
4381
|
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
4461
4382
|
|
4462
4383
|
const int tid = item_ct1.get_local_id(2);
|
4463
|
-
#if QK_K == 256
|
4464
4384
|
const int il = tid/8; // 0...3
|
4465
4385
|
const int ib = tid%8; // 0...7
|
4466
4386
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4469,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
4469
4389
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
4470
4390
|
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
4471
4391
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
4472
|
-
#else
|
4473
|
-
assert(false);
|
4474
|
-
#endif
|
4475
|
-
|
4476
4392
|
}
|
4477
4393
|
|
4478
4394
|
template <typename dst_t>
|
@@ -4484,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4484
4400
|
const block_iq2_s * x = (const block_iq2_s *) vx;
|
4485
4401
|
|
4486
4402
|
const int tid = item_ct1.get_local_id(2);
|
4487
|
-
#if QK_K == 256
|
4488
4403
|
const int il = tid/8; // 0...3
|
4489
4404
|
const int ib = tid%8; // 0...7
|
4490
4405
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4492,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4492
4407
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
4493
4408
|
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
4494
4409
|
#pragma unroll
|
4495
|
-
for (int j = 0; j < 8; ++j)
|
4410
|
+
for (int j = 0; j < 8; ++j) {
|
4496
4411
|
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
4497
|
-
|
4498
|
-
assert(false);
|
4499
|
-
|
4500
|
-
#endif
|
4501
|
-
|
4412
|
+
}
|
4502
4413
|
}
|
4503
4414
|
|
4504
4415
|
template<typename dst_t>
|
@@ -4512,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
4512
4423
|
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
4513
4424
|
|
4514
4425
|
const int tid = item_ct1.get_local_id(2);
|
4515
|
-
#if QK_K == 256
|
4516
4426
|
const int il = tid/8; // 0...3
|
4517
4427
|
const int ib = tid%8; // 0...7
|
4518
4428
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4527,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
4527
4437
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4528
4438
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4529
4439
|
}
|
4530
|
-
#else
|
4531
|
-
assert(false);
|
4532
|
-
#endif
|
4533
|
-
|
4534
4440
|
}
|
4535
4441
|
|
4536
4442
|
template <typename dst_t>
|
@@ -4543,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4543
4449
|
const block_iq3_s * x = (const block_iq3_s *) vx;
|
4544
4450
|
|
4545
4451
|
const int tid = item_ct1.get_local_id(2);
|
4546
|
-
#if QK_K == 256
|
4547
4452
|
const int il = tid/8; // 0...3
|
4548
4453
|
const int ib = tid%8; // 0...7
|
4549
4454
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4557,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4557
4462
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4558
4463
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4559
4464
|
}
|
4560
|
-
#else
|
4561
|
-
assert(false);
|
4562
|
-
#endif
|
4563
|
-
|
4564
4465
|
}
|
4565
4466
|
|
4566
4467
|
template <typename dst_t>
|
@@ -4573,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4573
4474
|
const block_iq1_s * x = (const block_iq1_s *) vx;
|
4574
4475
|
|
4575
4476
|
const int tid = item_ct1.get_local_id(2);
|
4576
|
-
#if QK_K == 256
|
4577
4477
|
const int il = tid/8; // 0...3
|
4578
4478
|
const int ib = tid%8; // 0...7
|
4579
4479
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4587,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4587
4487
|
for (int j = 0; j < 8; ++j) {
|
4588
4488
|
y[j] = d * (q[j] + delta);
|
4589
4489
|
}
|
4590
|
-
#else
|
4591
|
-
assert(false);
|
4592
|
-
#endif
|
4593
|
-
|
4594
4490
|
}
|
4595
4491
|
|
4596
4492
|
template <typename dst_t>
|
@@ -4603,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4603
4499
|
const block_iq1_m * x = (const block_iq1_m *) vx;
|
4604
4500
|
|
4605
4501
|
const int tid = item_ct1.get_local_id(2);
|
4606
|
-
#if QK_K == 256
|
4607
4502
|
const int il = tid/8; // 0...3
|
4608
4503
|
const int ib = tid%8; // 0...7
|
4609
4504
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
@@ -4621,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
4621
4516
|
for (int j = 0; j < 8; ++j) {
|
4622
4517
|
y[j] = d * (q[j] + delta);
|
4623
4518
|
}
|
4624
|
-
#else
|
4625
|
-
assert(false);
|
4626
|
-
#endif
|
4627
|
-
|
4628
4519
|
}
|
4629
4520
|
|
4630
4521
|
template <typename dst_t>
|
@@ -4698,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
|
4698
4589
|
|
4699
4590
|
float tmp = 0; // partial sum for thread in warp
|
4700
4591
|
|
4701
|
-
#if QK_K == 256
|
4702
4592
|
const int tid =
|
4703
4593
|
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
4704
4594
|
const int ix =
|
@@ -4749,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
|
4749
4639
|
tmp += dall * sum1 - dmin * sum2;
|
4750
4640
|
|
4751
4641
|
}
|
4752
|
-
#else
|
4753
|
-
const int tid = item_ct1.get_local_id(2) /
|
4754
|
-
(2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
4755
|
-
const int ix = item_ct1.get_local_id(2) %
|
4756
|
-
(2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
4757
|
-
const int offset = tid * K_QUANTS_PER_ITERATION;
|
4758
|
-
|
4759
|
-
uint32_t uaux[2];
|
4760
|
-
const uint8_t * d = (const uint8_t *)uaux;
|
4761
|
-
|
4762
|
-
|
4763
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
4764
|
-
|
4765
|
-
const float * y = yy + i * QK_K + offset;
|
4766
|
-
const uint8_t * q = x[i].qs + offset;
|
4767
|
-
const uint32_t * s = (const uint32_t *)x[i].scales;
|
4768
|
-
|
4769
|
-
uaux[0] = s[0] & 0x0f0f0f0f;
|
4770
|
-
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
4771
|
-
|
4772
|
-
const sycl::float2 dall =
|
4773
|
-
x[i].dm.convert<float, sycl::rounding_mode::automatic>();
|
4774
|
-
|
4775
|
-
float sum1 = 0, sum2 = 0;
|
4776
|
-
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
4777
|
-
const uint8_t ql = q[l];
|
4778
|
-
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
4779
|
-
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
4780
|
-
+ y[l+32] * d[2] * ((ql >> 4) & 3)
|
4781
|
-
+ y[l+48] * d[3] * ((ql >> 6) & 3);
|
4782
|
-
sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
|
4783
|
-
}
|
4784
|
-
tmp += dall.x() * sum1 - dall.y() * sum2;
|
4785
|
-
}
|
4786
|
-
|
4787
|
-
#endif
|
4788
4642
|
|
4789
4643
|
// sum up partial sums and write back result
|
4790
4644
|
#pragma unroll
|
@@ -4822,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
|
4822
4676
|
|
4823
4677
|
float tmp = 0; // partial sum for thread in warp
|
4824
4678
|
|
4825
|
-
#if QK_K == 256
|
4826
|
-
|
4827
4679
|
const uint16_t kmask1 = 0x0303;
|
4828
4680
|
const uint16_t kmask2 = 0x0f0f;
|
4829
4681
|
|
@@ -4876,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
|
4876
4728
|
tmp += d * sum;
|
4877
4729
|
|
4878
4730
|
}
|
4879
|
-
#else
|
4880
|
-
|
4881
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
4882
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
4883
|
-
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
4884
|
-
const int in = offset/8; // 0 or 1
|
4885
|
-
const int im = offset%8; // 0...7
|
4886
|
-
|
4887
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
4888
|
-
|
4889
|
-
const float * y = yy + i * QK_K + offset;
|
4890
|
-
const uint8_t * q = x[i].qs + offset;
|
4891
|
-
const uint8_t * s = x[i].scales;
|
4892
|
-
|
4893
|
-
const float dall = (float)x[i].d;
|
4894
|
-
|
4895
|
-
float sum = 0;
|
4896
|
-
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
4897
|
-
const uint8_t hl = x[i].hmask[im+l] >> in;
|
4898
|
-
const uint8_t ql = q[l];
|
4899
|
-
sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
|
4900
|
-
+ y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
|
4901
|
-
+ y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
|
4902
|
-
+ y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
|
4903
|
-
}
|
4904
|
-
tmp += sum;
|
4905
|
-
}
|
4906
|
-
#endif
|
4907
4731
|
|
4908
4732
|
// sum up partial sums and write back result
|
4909
4733
|
#pragma unroll
|
@@ -4938,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
|
4938
4762
|
|
4939
4763
|
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
4940
4764
|
|
4941
|
-
#if QK_K == 256
|
4942
4765
|
const uint16_t kmask1 = 0x3f3f;
|
4943
4766
|
const uint16_t kmask2 = 0x0f0f;
|
4944
4767
|
const uint16_t kmask3 = 0xc0c0;
|
@@ -5027,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
|
5027
4850
|
#endif
|
5028
4851
|
|
5029
4852
|
}
|
5030
|
-
#else
|
5031
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
5032
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
5033
|
-
|
5034
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
5035
|
-
|
5036
|
-
uint16_t aux16[2];
|
5037
|
-
const uint8_t * s = (const uint8_t *)aux16;
|
5038
|
-
|
5039
|
-
float tmp = 0;
|
5040
|
-
|
5041
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
5042
|
-
const uint8_t * q = x[i].qs + step;
|
5043
|
-
const float * y = yy + i*QK_K + step;
|
5044
|
-
const uint16_t * a = (const uint16_t *)x[i].scales;
|
5045
|
-
aux16[0] = a[0] & 0x0f0f;
|
5046
|
-
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
5047
|
-
const float d = (float)x[i].dm[0];
|
5048
|
-
const float m = (float)x[i].dm[1];
|
5049
|
-
float sum = 0.f;
|
5050
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
5051
|
-
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
5052
|
-
+ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
|
5053
|
-
+ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
|
5054
|
-
+ y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
|
5055
|
-
}
|
5056
|
-
tmp += sum;
|
5057
|
-
}
|
5058
|
-
|
5059
|
-
#endif
|
5060
4853
|
|
5061
4854
|
// sum up partial sums and write back result
|
5062
4855
|
#pragma unroll
|
@@ -5091,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
|
5091
4884
|
|
5092
4885
|
float tmp = 0; // partial sum for thread in warp
|
5093
4886
|
|
5094
|
-
#if QK_K == 256
|
5095
4887
|
const uint16_t kmask1 = 0x3f3f;
|
5096
4888
|
const uint16_t kmask2 = 0x0f0f;
|
5097
4889
|
const uint16_t kmask3 = 0xc0c0;
|
@@ -5168,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
|
5168
4960
|
dmin * smin;
|
5169
4961
|
}
|
5170
4962
|
|
5171
|
-
#else
|
5172
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
5173
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
5174
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
5175
|
-
const int im = step/8;
|
5176
|
-
const int in = step%8;
|
5177
|
-
|
5178
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
5179
|
-
const uint8_t * q = x[i].qs + step;
|
5180
|
-
const int8_t * s = x[i].scales;
|
5181
|
-
const float * y = yy + i*QK_K + step;
|
5182
|
-
const float d = x[i].d;
|
5183
|
-
float sum = 0.f;
|
5184
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
5185
|
-
const uint8_t h = x[i].qh[in+j] >> im;
|
5186
|
-
sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
|
5187
|
-
+ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
|
5188
|
-
+ y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
|
5189
|
-
+ y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
|
5190
|
-
}
|
5191
|
-
tmp += sum;
|
5192
|
-
}
|
5193
|
-
#endif
|
5194
|
-
|
5195
4963
|
// sum up partial sums and write back result
|
5196
4964
|
#pragma unroll
|
5197
4965
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
@@ -5218,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
|
5218
4986
|
|
5219
4987
|
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
5220
4988
|
|
5221
|
-
#if QK_K == 256
|
5222
|
-
|
5223
4989
|
const int tid =
|
5224
4990
|
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
5225
4991
|
const int ix =
|
@@ -5276,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
|
5276
5042
|
|
5277
5043
|
}
|
5278
5044
|
|
5279
|
-
#else
|
5280
|
-
|
5281
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
|
5282
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
|
5283
|
-
|
5284
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
5285
|
-
|
5286
|
-
float tmp = 0; // partial sum for thread in warp
|
5287
|
-
|
5288
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
5289
|
-
|
5290
|
-
const float * y = yy + i * QK_K + step;
|
5291
|
-
const uint8_t * ql = x[i].ql + step;
|
5292
|
-
const uint8_t * qh = x[i].qh + step;
|
5293
|
-
const int8_t * s = x[i].scales;
|
5294
|
-
|
5295
|
-
const float d = x[i+0].d;
|
5296
|
-
|
5297
|
-
float sum = 0;
|
5298
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
5299
|
-
sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
|
5300
|
-
+ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
|
5301
|
-
+ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
|
5302
|
-
+ y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
|
5303
|
-
}
|
5304
|
-
tmp += sum;
|
5305
|
-
|
5306
|
-
}
|
5307
|
-
|
5308
|
-
#endif
|
5309
|
-
|
5310
5045
|
// sum up partial sums and write back result
|
5311
5046
|
#pragma unroll
|
5312
5047
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
@@ -6851,7 +6586,6 @@ static __dpct_inline__ float
|
|
6851
6586
|
vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
6852
6587
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
6853
6588
|
|
6854
|
-
#ifndef GGML_QKK_64
|
6855
6589
|
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
6856
6590
|
|
6857
6591
|
int v[2];
|
@@ -6893,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
|
6893
6627
|
}
|
6894
6628
|
|
6895
6629
|
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
6896
|
-
|
6897
|
-
#else
|
6898
|
-
|
6899
|
-
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
6900
|
-
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
6901
|
-
|
6902
|
-
float sumf_d = 0.0f;
|
6903
|
-
float sumf_m = 0.0f;
|
6904
|
-
|
6905
|
-
uint16_t aux16[2];
|
6906
|
-
const uint8_t * s = (const uint8_t *)aux16;
|
6907
|
-
|
6908
|
-
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
6909
|
-
aux16[0] = a[0] & 0x0f0f;
|
6910
|
-
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
6911
|
-
|
6912
|
-
const float dall = bq4_K->dm[0];
|
6913
|
-
const float dmin = bq4_K->dm[1];
|
6914
|
-
|
6915
|
-
const float d8_1 = bq8_1[0].ds[0];
|
6916
|
-
const float d8_2 = bq8_1[1].ds[1];
|
6917
|
-
|
6918
|
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
6919
|
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
6920
|
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
6921
|
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
6922
|
-
|
6923
|
-
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
6924
|
-
const int v1 = q4[0];
|
6925
|
-
const int v2 = q4[4];
|
6926
|
-
|
6927
|
-
const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
6928
|
-
const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
6929
|
-
const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
|
6930
|
-
const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
|
6931
|
-
|
6932
|
-
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
6933
|
-
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
6934
|
-
|
6935
|
-
return dall * sumf_d - dmin * sumf_m;
|
6936
|
-
|
6937
|
-
#else
|
6938
|
-
bad_arch();
|
6939
|
-
#endif // __SYCL_ARCH__ >= VER_4VEC
|
6940
|
-
|
6941
|
-
#endif
|
6942
6630
|
}
|
6943
6631
|
|
6944
6632
|
template <int mmq_y>
|
@@ -6997,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
|
6997
6685
|
|
6998
6686
|
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
6999
6687
|
|
7000
|
-
#if QK_K == 256
|
7001
6688
|
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
7002
|
-
#else
|
7003
|
-
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
|
7004
|
-
#endif
|
7005
6689
|
}
|
7006
6690
|
|
7007
6691
|
#pragma unroll
|
@@ -7044,7 +6728,6 @@ static __dpct_inline__ float
|
|
7044
6728
|
vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
7045
6729
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7046
6730
|
|
7047
|
-
#ifndef GGML_QKK_64
|
7048
6731
|
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
7049
6732
|
|
7050
6733
|
int vl[2];
|
@@ -7086,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
|
7086
6769
|
}
|
7087
6770
|
|
7088
6771
|
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
7089
|
-
|
7090
|
-
#else
|
7091
|
-
|
7092
|
-
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
7093
|
-
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
7094
|
-
|
7095
|
-
const int8_t * s = bq5_K->scales;
|
7096
|
-
|
7097
|
-
const float d = bq5_K->d;
|
7098
|
-
|
7099
|
-
const float d8_1 = bq8_1[0].ds[0];
|
7100
|
-
const float d8_2 = bq8_1[1].ds[1];
|
7101
|
-
|
7102
|
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
7103
|
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
7104
|
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
7105
|
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
7106
|
-
|
7107
|
-
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
7108
|
-
const int vl1 = ql[0];
|
7109
|
-
const int vl2 = ql[4];
|
7110
|
-
|
7111
|
-
const int step = 4 * (iqs/2); // 0, 4, 8, 12
|
7112
|
-
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
|
7113
|
-
const int in = step%8; // 0, 4, 0, 4
|
7114
|
-
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
7115
|
-
|
7116
|
-
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
|
7117
|
-
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
|
7118
|
-
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
7119
|
-
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
7120
|
-
|
7121
|
-
const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
|
7122
|
-
+ d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
|
7123
|
-
|
7124
|
-
return d * sumf_d;
|
7125
|
-
|
7126
|
-
#else
|
7127
|
-
bad_arch();
|
7128
|
-
#endif // __SYCL_ARCH__ >= VER_4VEC
|
7129
|
-
|
7130
|
-
#endif
|
7131
6772
|
}
|
7132
6773
|
|
7133
6774
|
template <int mmq_y>
|
@@ -7199,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
|
7199
6840
|
|
7200
6841
|
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
7201
6842
|
|
7202
|
-
#if QK_K == 256
|
7203
6843
|
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
7204
|
-
#endif
|
7205
6844
|
}
|
7206
6845
|
|
7207
6846
|
#pragma unroll
|
@@ -7381,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|
7381
7020
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
7382
7021
|
const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
|
7383
7022
|
const uint8_t *kmask_iq2xs) {
|
7384
|
-
#if QK_K == 256
|
7385
7023
|
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
7386
7024
|
|
7387
7025
|
#if QR2_XXS == 8
|
@@ -7422,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|
7422
7060
|
}
|
7423
7061
|
return d * (sumi1 + sumi2);
|
7424
7062
|
#endif
|
7425
|
-
#else
|
7426
|
-
assert(false);
|
7427
|
-
return 0.f;
|
7428
|
-
#endif
|
7429
7063
|
}
|
7430
7064
|
|
7431
7065
|
static __dpct_inline__ float
|
@@ -7434,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
7434
7068
|
const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
|
7435
7069
|
#if DPCT_COMPATIBILITY_TEMP >= \
|
7436
7070
|
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
7437
|
-
#if QK_K == 256
|
7438
7071
|
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
7439
7072
|
|
7440
7073
|
const int ib32 = iqs;
|
@@ -7472,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
7472
7105
|
assert(false);
|
7473
7106
|
return 0.f;
|
7474
7107
|
#endif
|
7475
|
-
#else
|
7476
|
-
assert(false);
|
7477
|
-
return 0.f;
|
7478
|
-
#endif
|
7479
7108
|
}
|
7480
7109
|
|
7481
7110
|
static __dpct_inline__ float
|
7482
7111
|
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
7483
7112
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7484
|
-
#if QK_K == 256
|
7485
7113
|
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
7486
7114
|
|
7487
7115
|
const int ib32 = iqs;
|
@@ -7525,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
|
7525
7153
|
}
|
7526
7154
|
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
|
7527
7155
|
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
7528
|
-
#else
|
7529
|
-
assert(false);
|
7530
|
-
#endif
|
7531
7156
|
}
|
7532
7157
|
|
7533
7158
|
static __dpct_inline__ float
|
@@ -7536,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
7536
7161
|
const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
|
7537
7162
|
#if DPCT_COMPATIBILITY_TEMP >= \
|
7538
7163
|
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
7539
|
-
#if QK_K == 256
|
7540
7164
|
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
7541
7165
|
|
7542
7166
|
const int ib32 = iqs;
|
@@ -7564,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
7564
7188
|
assert(false);
|
7565
7189
|
return 0.f;
|
7566
7190
|
#endif
|
7567
|
-
#else
|
7568
|
-
assert(false);
|
7569
|
-
return 0.f;
|
7570
|
-
#endif
|
7571
7191
|
}
|
7572
7192
|
|
7573
7193
|
static __dpct_inline__ float
|
7574
7194
|
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
7575
7195
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
7576
7196
|
const uint32_t *iq3s_grid) {
|
7577
|
-
#if QK_K == 256
|
7578
7197
|
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
7579
7198
|
|
7580
7199
|
const int ib32 = iqs;
|
@@ -7603,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|
7603
7222
|
(1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
|
7604
7223
|
bq8_1[ib32].ds[0];
|
7605
7224
|
return d * sumi;
|
7606
|
-
#else
|
7607
|
-
assert(false);
|
7608
|
-
#endif
|
7609
7225
|
}
|
7610
7226
|
|
7611
7227
|
static __dpct_inline__ float
|
7612
7228
|
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
7613
7229
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
7614
7230
|
const uint32_t *iq1s_grid_gpu) {
|
7615
|
-
#if QK_K == 256
|
7616
7231
|
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
7617
7232
|
|
7618
7233
|
const int ib32 = iqs;
|
@@ -7631,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
|
7631
7246
|
const float d = d1q * bq8_1[ib32].ds[0];
|
7632
7247
|
const float m = d1q * bq8_1[ib32].ds[1];
|
7633
7248
|
return d * sumi + m * delta;
|
7634
|
-
#else
|
7635
|
-
assert(false);
|
7636
|
-
#endif
|
7637
7249
|
}
|
7638
7250
|
|
7639
7251
|
static __dpct_inline__ float
|
7640
7252
|
vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
7641
7253
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7642
|
-
#if QK_K == 256
|
7643
7254
|
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
7644
7255
|
|
7645
7256
|
const int ib32 = iqs;
|
@@ -7664,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
|
7664
7275
|
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
7665
7276
|
const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
|
7666
7277
|
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
7667
|
-
#else
|
7668
|
-
assert(false);
|
7669
|
-
#endif
|
7670
7278
|
}
|
7671
7279
|
|
7672
7280
|
static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
|
@@ -7714,7 +7322,6 @@ static __dpct_inline__ float
|
|
7714
7322
|
vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
7715
7323
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7716
7324
|
|
7717
|
-
#if QK_K == 256
|
7718
7325
|
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
7719
7326
|
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
7720
7327
|
|
@@ -7732,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
|
7732
7339
|
sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
|
7733
7340
|
}
|
7734
7341
|
return d * (sumi1 + sumi2);
|
7735
|
-
#else
|
7736
|
-
assert(false);
|
7737
|
-
#endif
|
7738
7342
|
}
|
7739
7343
|
|
7740
7344
|
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
|
@@ -10085,18 +9689,17 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
|
10085
9689
|
});
|
10086
9690
|
}
|
10087
9691
|
|
10088
|
-
static void upscale_f32_sycl(const float *x, float *dst, const int
|
10089
|
-
const int
|
10090
|
-
const int
|
10091
|
-
|
10092
|
-
int
|
10093
|
-
|
9692
|
+
static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
|
9693
|
+
const int nb02, const int nb03, const int ne10, const int ne11,
|
9694
|
+
const int ne12, const int ne13, const float sf0, const float sf1,
|
9695
|
+
const float sf2, const float sf3, dpct::queue_ptr stream) {
|
9696
|
+
int dst_size = ne10 * ne11 * ne12 * ne13;
|
9697
|
+
int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
|
9698
|
+
sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
|
10094
9699
|
stream->parallel_for(
|
10095
|
-
sycl::nd_range<
|
10096
|
-
|
10097
|
-
|
10098
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10099
|
-
upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
|
9700
|
+
sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
|
9701
|
+
[=](sycl::nd_item<1> item_ct1) {
|
9702
|
+
upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
|
10100
9703
|
});
|
10101
9704
|
}
|
10102
9705
|
|
@@ -10198,7 +9801,6 @@ template <typename dst_t>
|
|
10198
9801
|
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
10199
9802
|
dpct::queue_ptr stream) {
|
10200
9803
|
const int nb = k / QK_K;
|
10201
|
-
#if QK_K == 256
|
10202
9804
|
{
|
10203
9805
|
dpct::has_capability_or_fail(stream->get_device(),
|
10204
9806
|
{sycl::aspect::fp16});
|
@@ -10210,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10210
9812
|
dequantize_block_q2_K(vx, y, item_ct1);
|
10211
9813
|
});
|
10212
9814
|
}
|
10213
|
-
#else
|
10214
|
-
{
|
10215
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
10216
|
-
{sycl::aspect::fp16});
|
10217
|
-
|
10218
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10219
|
-
sycl::range<3>(1, 1, 32),
|
10220
|
-
sycl::range<3>(1, 1, 32)),
|
10221
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10222
|
-
dequantize_block_q2_K(vx, y, item_ct1);
|
10223
|
-
});
|
10224
|
-
}
|
10225
|
-
|
10226
|
-
#endif
|
10227
9815
|
}
|
10228
9816
|
|
10229
9817
|
template <typename dst_t>
|
10230
9818
|
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
10231
9819
|
dpct::queue_ptr stream) {
|
10232
9820
|
const int nb = k / QK_K;
|
10233
|
-
#if QK_K == 256
|
10234
9821
|
{
|
10235
9822
|
dpct::has_capability_or_fail(stream->get_device(),
|
10236
9823
|
{sycl::aspect::fp16});
|
@@ -10242,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10242
9829
|
dequantize_block_q3_K(vx, y, item_ct1);
|
10243
9830
|
});
|
10244
9831
|
}
|
10245
|
-
#else
|
10246
|
-
{
|
10247
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
10248
|
-
{sycl::aspect::fp16});
|
10249
|
-
|
10250
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10251
|
-
sycl::range<3>(1, 1, 32),
|
10252
|
-
sycl::range<3>(1, 1, 32)),
|
10253
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10254
|
-
dequantize_block_q3_K(vx, y, item_ct1);
|
10255
|
-
});
|
10256
|
-
}
|
10257
|
-
#endif
|
10258
9832
|
}
|
10259
9833
|
|
10260
9834
|
template <typename dst_t>
|
@@ -10315,7 +9889,6 @@ template <typename dst_t>
|
|
10315
9889
|
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
10316
9890
|
dpct::queue_ptr stream) {
|
10317
9891
|
const int nb = k / QK_K;
|
10318
|
-
#if QK_K == 256
|
10319
9892
|
{
|
10320
9893
|
dpct::has_capability_or_fail(stream->get_device(),
|
10321
9894
|
{sycl::aspect::fp16});
|
@@ -10327,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10327
9900
|
dequantize_block_q5_K(vx, y, item_ct1);
|
10328
9901
|
});
|
10329
9902
|
}
|
10330
|
-
#else
|
10331
|
-
{
|
10332
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
10333
|
-
{sycl::aspect::fp16});
|
10334
|
-
|
10335
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10336
|
-
sycl::range<3>(1, 1, 32),
|
10337
|
-
sycl::range<3>(1, 1, 32)),
|
10338
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10339
|
-
dequantize_block_q5_K(vx, y, item_ct1);
|
10340
|
-
});
|
10341
|
-
}
|
10342
|
-
|
10343
|
-
#endif
|
10344
9903
|
}
|
10345
9904
|
|
10346
9905
|
template <typename dst_t>
|
10347
9906
|
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
10348
9907
|
dpct::queue_ptr stream) {
|
10349
9908
|
const int nb = k / QK_K;
|
10350
|
-
#if QK_K == 256
|
10351
9909
|
{
|
10352
9910
|
dpct::has_capability_or_fail(stream->get_device(),
|
10353
9911
|
{sycl::aspect::fp16});
|
@@ -10359,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10359
9917
|
dequantize_block_q6_K(vx, y, item_ct1);
|
10360
9918
|
});
|
10361
9919
|
}
|
10362
|
-
#else
|
10363
|
-
{
|
10364
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
10365
|
-
{sycl::aspect::fp16});
|
10366
|
-
|
10367
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10368
|
-
sycl::range<3>(1, 1, 32),
|
10369
|
-
sycl::range<3>(1, 1, 32)),
|
10370
|
-
[=](sycl::nd_item<3> item_ct1) {
|
10371
|
-
dequantize_block_q6_K(vx, y, item_ct1);
|
10372
|
-
});
|
10373
|
-
}
|
10374
|
-
|
10375
|
-
#endif
|
10376
9920
|
}
|
10377
9921
|
|
10378
9922
|
template <typename dst_t>
|
@@ -10524,9 +10068,6 @@ template <typename dst_t>
|
|
10524
10068
|
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
10525
10069
|
dpct::queue_ptr stream) {
|
10526
10070
|
const int nb = (k + QK_K - 1) / QK_K;
|
10527
|
-
#if QK_K == 64
|
10528
|
-
dequantize_row_iq4_nl_sycl(vx, y, k, stream);
|
10529
|
-
#else
|
10530
10071
|
{
|
10531
10072
|
dpct::has_capability_or_fail(stream->get_device(),
|
10532
10073
|
{sycl::aspect::fp16});
|
@@ -10541,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
|
10541
10082
|
});
|
10542
10083
|
});
|
10543
10084
|
}
|
10544
|
-
#endif
|
10545
10085
|
}
|
10546
10086
|
|
10547
10087
|
|
@@ -12046,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
12046
11586
|
const int nrows_y, const int nrows_dst,
|
12047
11587
|
dpct::queue_ptr stream) try {
|
12048
11588
|
|
12049
|
-
#if QK_K == 256
|
12050
|
-
|
12051
11589
|
int id;
|
12052
11590
|
SYCL_CHECK(
|
12053
11591
|
CHECK_TRY_ERROR(id = get_current_device_id()));
|
@@ -12162,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
12162
11700
|
});
|
12163
11701
|
}
|
12164
11702
|
}
|
12165
|
-
#endif
|
12166
11703
|
}
|
12167
11704
|
catch (sycl::exception const &exc) {
|
12168
11705
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
@@ -13985,15 +13522,15 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
|
|
13985
13522
|
|
13986
13523
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
13987
13524
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
13988
|
-
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
13989
|
-
|
13990
|
-
#pragma message("TODO: generalize upscale operator")
|
13991
|
-
#pragma message(" https://github.com/ggerganov/ggml/pull/814")
|
13992
|
-
GGML_ASSERT(false && "TODO: generalize upscale operator");
|
13993
13525
|
|
13994
|
-
const
|
13526
|
+
const float sf0 = (float)dst->ne[0]/src0->ne[0];
|
13527
|
+
const float sf1 = (float)dst->ne[1]/src0->ne[1];
|
13528
|
+
const float sf2 = (float)dst->ne[2]/src0->ne[2];
|
13529
|
+
const float sf3 = (float)dst->ne[3]/src0->ne[3];
|
13995
13530
|
|
13996
|
-
upscale_f32_sycl(src0_dd, dst_dd, src0->
|
13531
|
+
upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
13532
|
+
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
|
13533
|
+
main_stream);
|
13997
13534
|
|
13998
13535
|
(void) src1;
|
13999
13536
|
(void) dst;
|
@@ -14449,6 +13986,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
14449
13986
|
ggml_tensor *dst, const float *src0_dd,
|
14450
13987
|
const float *src1_dd, float *dst_dd,
|
14451
13988
|
const dpct::queue_ptr &main_stream) {
|
13989
|
+
#pragma message("TODO: implement phi3 frequency factors support")
|
13990
|
+
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
|
13991
|
+
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
|
14452
13992
|
|
14453
13993
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
14454
13994
|
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|