llama_cpp 0.15.1 → 0.15.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -20
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +87 -37
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +47 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +13 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +177 -190
- data/vendor/tmp/llama.cpp/ggml-metal.metal +97 -505
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +3660 -2057
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1155 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +60 -639
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +203 -224
- data/vendor/tmp/llama.cpp/ggml.c +1168 -1470
- data/vendor/tmp/llama.cpp/ggml.h +67 -44
- data/vendor/tmp/llama.cpp/llama.cpp +1371 -944
- data/vendor/tmp/llama.cpp/llama.h +13 -3
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +5 -3
The hunks shown below are from data/vendor/tmp/llama.cpp/ggml-sycl.cpp; this release drops the GGML_QKK_64 / QK_K == 64 code paths, removes the standalone ALiBi kernels, and generalizes the upscale kernel to 4-D tensors.

@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
 #define SYCL_SCALE_BLOCK_SIZE 256
 #define SYCL_CLAMP_BLOCK_SIZE 256
 #define SYCL_ROPE_BLOCK_SIZE 256
-#define SYCL_ALIBI_BLOCK_SIZE 32
 #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
 #define SYCL_QUANTIZE_BLOCK_SIZE 256
 #define SYCL_DEQUANTIZE_BLOCK_SIZE 256
@@ -3848,21 +3847,27 @@ static void concat_f32(const float *x,const float *y, float *dst, const int ne
 }
 }

-static void upscale_f32(const float *x, float *dst, const int
-                        const
-
-
-
-
+static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
+                        const int nb02, const int nb03, const int ne10, const int ne11,
+                        const int ne12, const int ne13, const float sf0, const float sf1,
+                        const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
+    int index = item_ct1.get_local_id(0) +
+                item_ct1.get_group(0) * item_ct1.get_local_range(0);
+    if (index >= ne10 * ne11 * ne12 * ne13) {
         return;
     }
     // operation
-    int
-    int
-    int
-    int
-
-
+    int i10 = index % ne10;
+    int i11 = (index / ne10) % ne11;
+    int i12 = (index / (ne10 * ne11)) % ne12;
+    int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+    int i00 = i10 / sf0;
+    int i01 = i11 / sf1;
+    int i02 = i12 / sf2;
+    int i03 = i13 / sf3;
+
+    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
 }

 static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
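The new `upscale_f32` kernel flattens the 4-D destination tensor into one index, decomposes it back into `(i10, i11, i12, i13)` with modular arithmetic, and divides each coordinate by a per-dimension scale factor to find the source cell. A minimal CPU re-statement of that index math (plain C++, illustrative only; parameter names mirror the kernel above):

```cpp
// Nearest-neighbor 4-D upscale lookup: decompose a flat destination index,
// then map each destination coordinate back to the source via the scale
// factors, exactly as each work-item does in the SYCL kernel above.
static float upscale_at(const float *x, int index,
                        int nb00, int nb01, int nb02, int nb03,  // source strides in bytes
                        int ne10, int ne11, int ne12,            // destination extents
                        float sf0, float sf1, float sf2, float sf3) {
    const int i10 = index % ne10;
    const int i11 = (index / ne10) % ne11;
    const int i12 = (index / (ne10 * ne11)) % ne12;
    const int i13 = index / (ne10 * ne11 * ne12);

    const int i00 = (int)(i10 / sf0);  // truncation picks the source cell
    const int i01 = (int)(i11 / sf1);
    const int i02 = (int)(i12 / sf2);
    const int i03 = (int)(i13 / sf3);

    return *(const float *)((const char *)x +
                            i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
}
```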
@@ -4192,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
     const block_q2_K * x = (const block_q2_K *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int n = tid/32;
     const int l = tid - 32*n;
     const int is = 8*n + l/16;

@@ -4206,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
     y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int is = tid/16;  // 0 or 1
-    const int il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    float dall = x[i].dm[0];
-    float dmin = x[i].dm[1];
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
 }

 template<typename dst_t>
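Each Q2_K scale byte packs a 4-bit sub-block scale in the low nibble and a 4-bit min in the high nibble, so a value dequantizes as `dall*(scale & 0xF)*q - dmin*(scale >> 4)`. A hedged scalar sketch of that unpacking (plain C++, not the library's API; names are illustrative):

```cpp
#include <cstdint>

// Dequantize one 2-bit quant given the packed scale byte of its sub-block.
// dall/dmin are the block-wide super-scales; `packed` holds four 2-bit
// quants per byte and `which` selects one of them.
static float dequant_q2(float dall, float dmin, uint8_t scale_byte,
                        uint8_t packed, int which /* 0..3 */) {
    const int q   = (packed >> (2 * which)) & 3;  // extract the 2-bit quant
    const float d = dall * (scale_byte & 0xF);    // low nibble: scale
    const float m = dmin * (scale_byte >> 4);     // high nibble: min
    return d * q - m;
}
```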
@@ -4227,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
     const int i = item_ct1.get_group(2);
     const block_q3_K * x = (const block_q3_K *) vx;

-#if QK_K == 256
     const int r = item_ct1.get_local_id(2) / 4;
     const int tid = r/2;
     const int is0 = r%2;

@@ -4251,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
     const uint8_t * hm = x[i].hmask;

     for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const int is  = tid/16; // 0 or 1
-    const int il  = tid%16; // 0...15
-    const int im  = il/8;   // 0...1
-    const int in  = il%8;   // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
 }

-#if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
     if (j < 4) {
         d = q[j] & 63; m = q[j + 4] & 63;

@@ -4284,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
         m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
     }
 }
-#endif

 template<typename dst_t>
 static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,

@@ -4293,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri

     const int i = item_ct1.get_group(2);

-#if QK_K == 256
     // assume 32 threads
     const int tid = item_ct1.get_local_id(2);
     const int il  = tid/8;

@@ -4317,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
         y[l + 0] = d1 * (q[l] & 0xF) - m1;
         y[l +32] = d2 * (q[l] >>  4) - m2;
     }
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
 }

 template<typename dst_t>

@@ -4335,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri

     const int i = item_ct1.get_group(2);

-#if QK_K == 256
     // assume 64 threads - this is very slightly better than the one below
     const int tid = item_ct1.get_local_id(2);
     const int il  = tid/16;   // il is in 0...3

@@ -4362,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
     hm <<= 1;
     y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
     y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const uint8_t q = x[i].qs[tid];
-    const int im = tid/8;  // 0...3
-    const int in = tid%8;  // 0...7
-    const int is = tid/16; // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
 }

 template<typename dst_t>

@@ -4382,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
     const block_q6_K * x = (const block_q6_K *) vx;

     const int i = item_ct1.get_group(2);
-#if QK_K == 256

     // assume 64 threads - this is very slightly better than the one below
     const int tid = item_ct1.get_local_id(2);

@@ -4402,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
     y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
     y[64] = d * sc[4] * ((int8_t)((ql[ 0] >>  4) | (((qh >> 4) & 3) << 4)) - 32);
     y[96] = d * sc[6] * ((int8_t)((ql[32] >>  4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int tid = item_ct1.get_local_id(2);
-    const int ip  = tid/16;         // 0 or 1
-    const int il  = tid - 16*ip;    // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t  ql = x[i].ql[16*ip + il];
-    const uint8_t  qh = x[i].qh[il] >> (2*ip);
-    const int8_t * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql >>  4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
 }

 template<typename dst_t>
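Q6_K stores each 6-bit quant split across two arrays: a 4-bit low part in `ql` and a 2-bit high part in `qh`; the kernel reassembles them and recenters by 32. The same bit surgery as a standalone sketch (plain C++, one value at a time, names illustrative):

```cpp
#include <cstdint>

// Reassemble a 6-bit quant from its 4-bit low nibble and 2-bit high part,
// recenter around zero, and apply the sub-block scale, mirroring the
// q6_K kernel above.
static float dequant_q6(float d, int8_t sc, uint8_t ql_nibble, uint8_t qh_2bits) {
    const int8_t q = (int8_t)((ql_nibble & 0xF) | ((qh_2bits & 3) << 4)) - 32;
    return d * sc * q;
}
```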
@@ -4433,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
     const block_iq2_xxs * x = (const block_iq2_xxs *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;

@@ -4444,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
     const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
 }

 template<typename dst_t>

@@ -4461,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
     const block_iq2_xs * x = (const block_iq2_xs *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;

@@ -4470,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>

@@ -4485,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq2_s * x = (const block_iq2_s *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;

@@ -4493,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
 #pragma unroll
-    for (int j = 0; j < 8; ++j)
+    for (int j = 0; j < 8; ++j) {
         y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-
-    assert(false);
-
-#endif
-
+    }
 }

 template<typename dst_t>

@@ -4513,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
     const block_iq3_xxs * x = (const block_iq3_xxs *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;

@@ -4528,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>

@@ -4544,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq3_s * x = (const block_iq3_s *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;

@@ -4558,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>

@@ -4574,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq1_s * x = (const block_iq1_s *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;

@@ -4588,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>

@@ -4604,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq1_m * x = (const block_iq1_m *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;

@@ -4622,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>

@@ -4699,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,

     float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
     const int tid =
         item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
     const int ix =

@@ -4750,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
         tmp += dall * sum1 - dmin * sum2;

     }
-#else
-    const int tid = item_ct1.get_local_id(2) /
-                    (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
-    const int ix = item_ct1.get_local_id(2) %
-                   (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const sycl::float2 dall =
-            x[i].dm.convert<float, sycl::rounding_mode::automatic>();
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x() * sum1 - dall.y() * sum2;
-    }
-
-#endif

     // sum up partial sums and write back result
 #pragma unroll

@@ -4823,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,

     float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
-
     const uint16_t kmask1 = 0x0303;
     const uint16_t kmask2 = 0x0f0f;


@@ -4877,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
         tmp += d * sum;

     }
-#else
-
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;          // 0...15 or 0...14
-    const int in = offset/8;  // 0 or 1
-    const int im = offset%8;  // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif

     // sum up partial sums and write back result
 #pragma unroll

@@ -4939,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,

     const block_q4_K * x = (const block_q4_K *)vx + ib0;

-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;

@@ -5028,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
 #endif

     }
-#else
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif

     // sum up partial sums and write back result
 #pragma unroll

@@ -5092,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,

     float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;

@@ -5169,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
                dmin * smin;
     }

-#else
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {

@@ -5219,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa

     const block_q6_K * x = (const block_q6_K *)vx + ib0;

-#if QK_K == 256
-
     const int tid =
         item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
     const int ix =

@@ -5277,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa

     }

-#else
-
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {

@@ -6852,7 +6586,6 @@ static __dpct_inline__ float
 vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

-#ifndef GGML_QKK_64
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;

     int v[2];

@@ -6894,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
     }

     return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = bq8_1[0].ds[0];
-    const float d8_2 = bq8_1[1].ds[1];
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
-    const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
 }

 template <int mmq_y>
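The deleted fallback leaned on `dpct::dp4a`, which multiplies four packed signed 8-bit lanes of two 32-bit words and accumulates into a 32-bit sum. A portable scalar equivalent, as a sketch (plain C++; real backends use the hardware instruction):

```cpp
#include <cstdint>

// Scalar stand-in for dp4a: treat a and b as four packed signed bytes,
// multiply lane-wise, and accumulate into c.
static int32_t dp4a_ref(int32_t a, int32_t b, int32_t c) {
    for (int lane = 0; lane < 4; ++lane) {
        const int8_t av = (int8_t)(a >> (8 * lane));
        const int8_t bv = (int8_t)(b >> (8 * lane));
        c += (int32_t)av * (int32_t)bv;
    }
    return c;
}
```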
@@ -6998,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,

         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

-#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
     }

 #pragma unroll

@@ -7045,7 +6728,6 @@ static __dpct_inline__ float
 vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

-#ifndef GGML_QKK_64
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;

     int vl[2];

@@ -7087,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
     }

     return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = bq8_1[0].ds[0];
-    const float d8_2 = bq8_1[1].ds[1];
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in = step%8; // 0, 4, 0, 4
-    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
 }

 template <int mmq_y>

@@ -7200,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,

         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

-#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
     }

 #pragma unroll

@@ -7382,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
                      const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                      const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
                      const uint8_t *kmask_iq2xs) {
-#if QK_K == 256
     const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;

 #if QR2_XXS == 8

@@ -7423,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
     }
     return d * (sumi1 + sumi2);
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }

 static __dpct_inline__ float

@@ -7435,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
                     const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
 #if DPCT_COMPATIBILITY_TEMP >= \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;

     const int ib32 = iqs;

@@ -7473,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
     assert(false);
     return 0.f;
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }

 static __dpct_inline__ float
 vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
     const block_iq2_s * bq2 = (const block_iq2_s *) vbq;

     const int ib32 = iqs;

@@ -7526,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
     }
     const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
     return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
-    assert(false);
-#endif
 }

 static __dpct_inline__ float

@@ -7537,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
                      const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
 #if DPCT_COMPATIBILITY_TEMP >= \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;

     const int ib32 = iqs;

@@ -7565,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
     assert(false);
     return 0.f;
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }

 static __dpct_inline__ float
 vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                    const uint32_t *iq3s_grid) {
-#if QK_K == 256
     const block_iq3_s * bq2 = (const block_iq3_s *) vbq;

     const int ib32 = iqs;

@@ -7604,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
               (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
               bq8_1[ib32].ds[0];
     return d * sumi;
-#else
-    assert(false);
-#endif
 }

 static __dpct_inline__ float
 vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                    const uint32_t *iq1s_grid_gpu) {
-#if QK_K == 256
     const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

     const int ib32 = iqs;

@@ -7632,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
     const float d = d1q * bq8_1[ib32].ds[0];
     const float m = d1q * bq8_1[ib32].ds[1];
     return d * sumi + m * delta;
-#else
-    assert(false);
-#endif
 }

 static __dpct_inline__ float
 vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
     const block_iq1_m * bq1 = (const block_iq1_m *) vbq;

     const int ib32 = iqs;

@@ -7665,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
     scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
     const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
     return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
-#else
-    assert(false);
-#endif
 }

 static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,

@@ -7715,7 +7322,6 @@ static __dpct_inline__ float
 vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
                     const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

-#if QK_K == 256
     const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
     const uint8_t * values = (const uint8_t *)kvalues_iq4nl;


@@ -7733,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
         sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
     }
     return d * (sumi1 + sumi2);
-#else
-    assert(false);
-#endif
 }

 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,

@@ -9316,32 +8919,6 @@ static void rope_glm_f32(
     dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
 }

-static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
-                      const int n_heads_log2_floor, const float m0, const float m1,
-                      const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i = row*ncols + col;
-
-    const int k = row/k_rows;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = dpct::pow(m0, k + 1);
-    } else {
-        m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    dst[i] = col * m_k + x[i];
-}
-
 static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
                            const sycl::nd_item<3> &item_ct1) {
     const int row = item_ct1.get_group(1);

@@ -9443,7 +9020,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con


 template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32(const float * x, const float * mask,
+static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
                          const int nrows_y, const float scale, const float max_bias, const float m0,
                          const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
     const int ncols = ncols_template == 0 ? ncols_par : ncols_template;

@@ -9457,7 +9034,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
     const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
     const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;

-    float slope =
+    float slope = 1.0f;

     // ALiBi
     if (max_bias > 0.0f) {

@@ -9482,7 +9059,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
         const int ix = rowx*ncols + col;
         const int iy = rowy*ncols + col;

-        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f)
+        const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);

         vals[col] = val;
         max_val = sycl::max(max_val, val);
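With the standalone `alibi_f32` kernel gone, the ALiBi bias is folded into softmax: each head derives a slope from `m0`/`m1` and applies it via `slope*mask[iy]`. A sketch of the slope selection, following the scheme visible in the removed kernel and the new `soft_max_f32` (plain C++; `n_head_log2` is the largest power of two not exceeding the head count):

```cpp
#include <cmath>
#include <cstdint>

// Per-head ALiBi slope: heads below the power-of-two cutoff use successive
// powers of m0, the remainder use odd powers of m1.
static float alibi_slope(uint32_t head, uint32_t n_head_log2, float m0, float m1) {
    if (head < n_head_log2) {
        return std::pow(m0, (float)(head + 1));
    }
    return std::pow(m1, (float)(2 * (head - n_head_log2) + 1));
}
```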
@@ -10112,18 +9689,17 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
         });
 }

-static void upscale_f32_sycl(const float *x, float *dst, const int
-                             const int
-                             const int
-
-    int
-
+static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
+                             const int nb02, const int nb03, const int ne10, const int ne11,
+                             const int ne12, const int ne13, const float sf0, const float sf1,
+                             const float sf2, const float sf3, dpct::queue_ptr stream) {
+    int dst_size = ne10 * ne11 * ne12 * ne13;
+    int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
+    sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
     stream->parallel_for(
-        sycl::nd_range<
-
-
-        [=](sycl::nd_item<3> item_ct1) {
-            upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
+        sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<1> item_ct1) {
+            upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
         });
 }


@@ -10225,7 +9801,6 @@ template <typename dst_t>
 static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});

@@ -10237,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q2_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q2_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }

 template <typename dst_t>
 static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
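These wrappers all share one launch shape: `nb` work-groups of 32 (or 64) work-items, expressed as an `nd_range<3>` whose global size is the group count multiplied by the local size. A minimal, self-contained sketch of that pattern (standalone SYCL; the kernel body here is a hypothetical placeholder):

```cpp
#include <sycl/sycl.hpp>

// One work-group per block, 64 work-items each — the same nd_range shape
// the dequantize_row_*_sycl wrappers use.
void launch_example(sycl::queue &q, float *y, int nb) {
    q.parallel_for(
        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64),
                          sycl::range<3>(1, 1, 64)),
        [=](sycl::nd_item<3> item) {
            const int block = item.get_group(2);
            const int tid   = item.get_local_id(2);
            y[block * 64 + tid] = 0.0f;  // placeholder kernel body
        });
}
```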
@@ -10269,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q3_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q3_K(vx, y, item_ct1);
-                             });
-    }
-#endif
 }

 template <typename dst_t>

@@ -10342,7 +9889,6 @@ template <typename dst_t>
 static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});

@@ -10354,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q5_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q5_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }

 template <typename dst_t>
 static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});

@@ -10386,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q6_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q6_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }

 template <typename dst_t>

@@ -10551,9 +10068,6 @@ template <typename dst_t>
 static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
                                        dpct::queue_ptr stream) {
     const int nb = (k + QK_K - 1) / QK_K;
-#if QK_K == 64
-    dequantize_row_iq4_nl_sycl(vx, y, k, stream);
-#else
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});

@@ -10568,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
             });
         });
     }
-#endif
 }


@@ -12073,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
                                         const int nrows_y, const int nrows_dst,
                                         dpct::queue_ptr stream) try {

-#if QK_K == 256
-
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));

@@ -12189,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
             });
         }
     }
-#endif
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__

@@ -12964,20 +12474,6 @@ static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
         });
     }

-static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
-                           const int nrows, const int k_rows,
-                           const int n_heads_log2_floor, const float m0,
-                           const float m1, dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
-    const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             alibi_f32(x, dst, ncols, k_rows,
-                                       n_heads_log2_floor, m0, m1, item_ct1);
-                         });
-}
-
 static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
                               const int nrows, dpct::queue_ptr stream) {
     const sycl::range<3> block_dims(1, 1, WARP_SIZE);

@@ -13058,7 +12554,7 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
 }

 template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32_submitter(const float * x, const float * mask,
+static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
                                    const int nrows_y, const float scale, const float max_bias, const float m0,
                                    const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
                                    const size_t n_local_scratch, dpct::queue_ptr stream) {

@@ -13068,7 +12564,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask,
+                soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
                                                                              nrows_y, scale, max_bias, m0,
                                                                              m1, n_head_log2, item_ct1,
                                                                              local_buf_acc.get_pointer());

@@ -13076,7 +12572,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
         });
     }

-static void soft_max_f32_sycl(const float * x, const float * mask,
+static void soft_max_f32_sycl(const float * x, const float * mask,
                               float * dst, const int ncols_x, const int nrows_x,
                               const int nrows_y, const float scale, const float max_bias,
                               dpct::queue_ptr stream) {

@@ -13098,60 +12594,60 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
     const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
     if (n_local_scratch*sizeof(float) < local_mem_size) {
         if (ncols_x > max_block_size) {
-            soft_max_f32_submitter<true, 0, 0>(x, mask,
+            soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                                max_bias, m0, m1, n_head_log2, block_nums,
                                                block_dims, n_local_scratch, stream);
             return;
         }
         switch (ncols_x) {
             case 32:
-                soft_max_f32_submitter<true, 32, 32>(x, mask,
+                soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
                                                      max_bias, m0, m1, n_head_log2, block_nums,
                                                      block_dims, n_local_scratch, stream);
                 break;
             case 64:
-                soft_max_f32_submitter<true, 64, 64>(x, mask,
+                soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
                                                      max_bias, m0, m1, n_head_log2, block_nums,
                                                      block_dims, n_local_scratch, stream);
                 break;
             case 128:
-                soft_max_f32_submitter<true, 128, 128>(x, mask,
+                soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
                                                        max_bias, m0, m1, n_head_log2, block_nums,
                                                        block_dims, n_local_scratch, stream);
                 break;
             case 256:
-                soft_max_f32_submitter<true, 256, 256>(x, mask,
+                soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
                                                        max_bias, m0, m1, n_head_log2, block_nums,
                                                        block_dims, n_local_scratch, stream);
                 break;
             case 512:
-                soft_max_f32_submitter<true, 512, 512>(x, mask,
+                soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
                                                        max_bias, m0, m1, n_head_log2, block_nums,
                                                        block_dims, n_local_scratch, stream);
                 break;
             case 1024:
-                soft_max_f32_submitter<true, 1024, 1024>(x, mask,
+                soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                          max_bias, m0, m1, n_head_log2, block_nums,
                                                          block_dims, n_local_scratch, stream);
                 break;
             case 2048:
-                soft_max_f32_submitter<true, 2048, 1024>(x, mask,
+                soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                          max_bias, m0, m1, n_head_log2, block_nums,
                                                          block_dims, n_local_scratch, stream);
                 break;
             case 4096:
-                soft_max_f32_submitter<true, 4096, 1024>(x, mask,
+                soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                          max_bias, m0, m1, n_head_log2, block_nums,
                                                          block_dims, n_local_scratch, stream);
                 break;
             default:
-                soft_max_f32_submitter<true, 0, 0>(x, mask,
+                soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                                    max_bias, m0, m1, n_head_log2, block_nums,
                                                    block_dims, n_local_scratch, stream);
                 break;
         }
     } else {
-        soft_max_f32_submitter<false, 0, 0>(x, mask,
+        soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                             max_bias, m0, m1, n_head_log2, block_nums,
                                             block_dims, WARP_SIZE, stream);
     }

@@ -14026,11 +13522,15 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,

     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

-    const
+    const float sf0 = (float)dst->ne[0]/src0->ne[0];
+    const float sf1 = (float)dst->ne[1]/src0->ne[1];
+    const float sf2 = (float)dst->ne[2]/src0->ne[2];
+    const float sf3 = (float)dst->ne[3]/src0->ne[3];

-    upscale_f32_sycl(src0_dd, dst_dd, src0->
+    upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                     dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
+                     main_stream);

     (void) src1;
     (void) dst;
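On the host side, `ggml_sycl_op_upscale` now derives a float scale factor per dimension as the ratio of destination to source extent, which is what lets the kernel handle arbitrary 4-D upscaling instead of one integer factor on a 3-D tensor. The ratio computation in isolation (plain C++; `dst_ne`/`src_ne` are stand-ins for the tensor extents):

```cpp
#include <cstdint>

// Per-dimension scale factors sf = ne_dst / ne_src: dividing a destination
// coordinate by sf recovers its source cell, as upscale_f32 does per item.
struct UpscaleFactors { float sf0, sf1, sf2, sf3; };

static UpscaleFactors make_factors(const int64_t dst_ne[4], const int64_t src_ne[4]) {
    return { (float)dst_ne[0] / src_ne[0], (float)dst_ne[1] / src_ne[1],
             (float)dst_ne[2] / src_ne[2], (float)dst_ne[3] / src_ne[3] };
}
```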
@@ -14486,6 +13986,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
14486
13986
|
ggml_tensor *dst, const float *src0_dd,
|
14487
13987
|
const float *src1_dd, float *dst_dd,
|
14488
13988
|
const dpct::queue_ptr &main_stream) {
|
13989
|
+
#pragma message("TODO: implement phi3 frequency factors support")
|
13990
|
+
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
|
13991
|
+
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
|
14489
13992
|
|
14490
13993
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
14491
13994
|
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
@@ -14562,36 +14065,6 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     (void) src1_dd;
 }
 
-inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    //GGML_ASSERT(ne01 + n_past == ne00);
-    GGML_ASSERT(n_head == ne02);
-
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
-
-    (void) src1;
-    (void) src1_dd;
-}
-
 static void ggml_sycl_op_pool2d(const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_dd, const float *src1_dd,
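This removes the standalone `GGML_OP_ALIBI` kernel from the SYCL backend: ALiBi bias is now applied inside the fused `soft_max` path via `max_bias` rather than as a separate op. The slope schedule the deleted code computed survives in that fused path; the standalone sketch below reproduces the arithmetic (the per-head combination follows the convention used in ggml's soft_max implementation; the input values are illustrative):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const int   n_head   = 12;
    const float max_bias = 8.0f;  // e.g. what a model stores in op_params

    // largest power of two <= n_head
    const int n_heads_log2_floor = 1 << (int) std::floor(std::log2((float) n_head));

    const float m0 = std::pow(2.0f, -max_bias          / n_heads_log2_floor);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

    for (int h = 0; h < n_head; ++h) {
        // heads beyond the power-of-two floor interpolate with the m1 base
        const float slope = h < n_heads_log2_floor
            ? std::pow(m0, (float) (h + 1))
            : std::pow(m1, (float) (2 * (h - n_heads_log2_floor) + 1));
        std::printf("head %2d: slope %.6f\n", h, slope);
    }
}
```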
@@ -14746,12 +14219,9 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
-
-#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
+#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
     GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-    GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
 
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows_x = ggml_nrows(src0);
@@ -14763,25 +14233,7 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
     memcpy(&scale, dst->op_params + 0, sizeof(float));
     memcpy(&max_bias, dst->op_params + 1, sizeof(float));
 
-
-    float * src2_dd = nullptr;
-    sycl_pool_alloc<float> src2_f;
-
-    const bool use_src2 = src2 != nullptr;
-
-    if (use_src2) {
-        const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
-
-        if (src2_on_device) {
-            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
-            src2_dd = (float *) src2_extra->data_device[g_main_device];
-        } else {
-            src2_dd = src2_f.alloc(ggml_nelements(src2));
-            SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
-        }
-    }
-
-    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
+    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
                       nrows_x, nrows_y, scale, max_bias, main_stream);
 }
 
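With the positions tensor gone, `ggml_sycl_op_soft_max` no longer stages an optional `src2` buffer on the device; the kernel receives only the logits, an optional mask, and the scalar `scale`/`max_bias` parameters. A minimal single-row sketch of what the fused kernel computes, assuming one head with ALiBi slope `slope` (illustrative values throughout):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const float scale = 1.0f;  // dst->op_params[0]
    const float slope = 0.5f;  // derived from max_bias inside the kernel
    std::vector<float> x    = {0.1f, 0.2f, 0.3f, 0.4f};
    std::vector<float> mask = {0.0f, 0.0f, -INFINITY, -INFINITY};  // causal mask row
    std::vector<float> dst(x.size());

    // one pass: fuse scaling and mask bias, tracking the running max
    float maxv = -INFINITY;
    for (size_t i = 0; i < x.size(); ++i) {
        dst[i] = x[i] * scale + slope * mask[i];
        maxv   = std::fmax(maxv, dst[i]);
    }
    float sum = 0.0f;
    for (size_t i = 0; i < x.size(); ++i) {
        dst[i] = std::exp(dst[i] - maxv);
        sum += dst[i];
    }
    for (size_t i = 0; i < x.size(); ++i) {
        dst[i] /= sum;  // masked positions end up at exactly 0
    }
    std::printf("softmax: %g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);
}
```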
@@ -15656,26 +15108,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
     const int64_t r2 = ne12/ne02;
     const int64_t r3 = ne13/ne03;
 
-#if 0
-    // use syclGemmEx
-    {
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                SYCL_CHECK(
-                        syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                                   ne01, ne11, ne10,
-                                   alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3], SYCL_R_16F, nb01/sizeof(half),
-                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
-                                   beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
-                                   cu_compute_type,
-                                   CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-            }
-        }
-    }
-#else
     if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
@@ -15687,7 +15119,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
             nb11 / nb10, nb12 / nb10, beta,
             (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
             ne12 * ne13, cu_compute_type)));
-        g_sycl_handles[g_main_device]->wait();
     } else {
         const int ne23 = ne12*ne13;
 
@@ -15718,7 +15149,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
                           nb02, nb03, nb12_scaled, nb13_scaled,
                           nbd2, nbd3, r2, r3, item_ct1);
                 });
-        })
+        });
     }
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
         *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@@ -15729,9 +15160,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
         dpct::library_data_t::real_half, nb11 / nb10, beta,
         (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
         cu_compute_type)));
-        g_sycl_handles[g_main_device]->wait();
     }
-#endif
 
     if (no_mixed_dtypes) {
         const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
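Both explicit `g_sycl_handles[g_main_device]->wait()` calls after the batched GEMMs are dropped (along with the dead `#if 0` `syclGemmEx` reference block), so the submissions stay asynchronous on the queue and the host no longer stalls between them. A sketch of the difference, assuming an in-order `sycl::queue` and USM device memory (names and values here are illustrative, not backend code):

```cpp
#include <sycl/sycl.hpp>

int main() {
    // in-order queue: kernels run in submission order without explicit waits
    sycl::queue q{sycl::property::queue::in_order{}};
    const size_t n = 1024;
    float * data = sycl::malloc_device<float>(n, q);

    q.fill(data, 1.0f, n);
    q.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> i) { data[i] *= 2.0f; });
    // no q.wait() between submissions: the queue keeps the device busy while
    // the host continues; synchronize once, when results are actually needed
    q.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> i) { data[i] += 1.0f; });
    q.wait();

    sycl::free(data, q);
}
```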
@@ -16232,10 +15661,6 @@ static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
 }
 
-static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
-}
-
 static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d);
 }
@@ -16612,9 +16037,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ROPE:
            func = ggml_sycl_rope;
            break;
-        case GGML_OP_ALIBI:
-           func = ggml_sycl_alibi;
-           break;
         case GGML_OP_IM2COL:
            func = ggml_sycl_im2col;
            break;
@@ -17744,7 +17166,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS: