@fugood/llama.node 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/CMakeLists.txt +72 -46
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +732 -752
- package/src/llama.cpp/common/common.h +47 -41
- package/src/llama.cpp/common/grammar-parser.cpp +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +89 -7
- package/src/llama.cpp/common/sampling.h +5 -0
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +24 -16
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
- package/src/llama.cpp/examples/server/server.cpp +21 -9
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +0 -1
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +4 -0
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3700 -2041
- package/src/llama.cpp/ggml-rpc.cpp +188 -56
- package/src/llama.cpp/ggml-sycl.cpp +99 -530
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +202 -225
- package/src/llama.cpp/ggml.c +1034 -1154
- package/src/llama.cpp/ggml.h +59 -31
- package/src/llama.cpp/llama.cpp +859 -609
- package/src/llama.cpp/llama.h +19 -6
- package/src/llama.cpp/requirements.txt +0 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
|
@@ -3847,21 +3847,27 @@ static void concat_f32(const float *x,const float *y, float *dst, const int ne
|
|
|
3847
3847
|
}
|
|
3848
3848
|
}
|
|
3849
3849
|
|
|
3850
|
-
static void upscale_f32(const float *x, float *dst, const int
|
|
3851
|
-
const
|
|
3852
|
-
|
|
3853
|
-
|
|
3854
|
-
|
|
3855
|
-
|
|
3850
|
+
static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
|
3851
|
+
const int nb02, const int nb03, const int ne10, const int ne11,
|
|
3852
|
+
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
3853
|
+
const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
|
|
3854
|
+
int index = item_ct1.get_local_id(0) +
|
|
3855
|
+
item_ct1.get_group(0) * item_ct1.get_local_range(0);
|
|
3856
|
+
if (index >= ne10 * ne11 * ne12 * ne13) {
|
|
3856
3857
|
return;
|
|
3857
3858
|
}
|
|
3858
3859
|
// operation
|
|
3859
|
-
int
|
|
3860
|
-
int
|
|
3861
|
-
int
|
|
3862
|
-
int
|
|
3863
|
-
|
|
3864
|
-
|
|
3860
|
+
int i10 = index % ne10;
|
|
3861
|
+
int i11 = (index / ne10) % ne11;
|
|
3862
|
+
int i12 = (index / (ne10 * ne11)) % ne12;
|
|
3863
|
+
int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
|
|
3864
|
+
|
|
3865
|
+
int i00 = i10 / sf0;
|
|
3866
|
+
int i01 = i11 / sf1;
|
|
3867
|
+
int i02 = i12 / sf2;
|
|
3868
|
+
int i03 = i13 / sf3;
|
|
3869
|
+
|
|
3870
|
+
dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
|
3865
3871
|
}
|
|
3866
3872
|
|
|
3867
3873
|
static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
|
|
@@ -4191,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4191
4197
|
const block_q2_K * x = (const block_q2_K *) vx;
|
|
4192
4198
|
|
|
4193
4199
|
const int tid = item_ct1.get_local_id(2);
|
|
4194
|
-
#if QK_K == 256
|
|
4195
4200
|
const int n = tid/32;
|
|
4196
4201
|
const int l = tid - 32*n;
|
|
4197
4202
|
const int is = 8*n + l/16;
|
|
@@ -4205,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4205
4210
|
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
|
4206
4211
|
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
|
4207
4212
|
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
|
4208
|
-
#else
|
|
4209
|
-
const int is = tid/16; // 0 or 1
|
|
4210
|
-
const int il = tid%16; // 0...15
|
|
4211
|
-
const uint8_t q = x[i].qs[il] >> (2*is);
|
|
4212
|
-
dst_t * y = yy + i*QK_K + 16*is + il;
|
|
4213
|
-
|
|
4214
|
-
float dall = x[i].dm[0];
|
|
4215
|
-
float dmin = x[i].dm[1];
|
|
4216
|
-
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
|
4217
|
-
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
|
4218
|
-
#endif
|
|
4219
|
-
|
|
4220
4213
|
}
|
|
4221
4214
|
|
|
4222
4215
|
template<typename dst_t>
|
|
@@ -4226,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4226
4219
|
const int i = item_ct1.get_group(2);
|
|
4227
4220
|
const block_q3_K * x = (const block_q3_K *) vx;
|
|
4228
4221
|
|
|
4229
|
-
#if QK_K == 256
|
|
4230
4222
|
const int r = item_ct1.get_local_id(2) / 4;
|
|
4231
4223
|
const int tid = r/2;
|
|
4232
4224
|
const int is0 = r%2;
|
|
@@ -4250,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4250
4242
|
const uint8_t * hm = x[i].hmask;
|
|
4251
4243
|
|
|
4252
4244
|
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
|
4253
|
-
#else
|
|
4254
|
-
const int tid = item_ct1.get_local_id(2);
|
|
4255
|
-
const int is = tid/16; // 0 or 1
|
|
4256
|
-
const int il = tid%16; // 0...15
|
|
4257
|
-
const int im = il/8; // 0...1
|
|
4258
|
-
const int in = il%8; // 0...7
|
|
4259
|
-
|
|
4260
|
-
dst_t * y = yy + i*QK_K + 16*is + il;
|
|
4261
|
-
|
|
4262
|
-
const uint8_t q = x[i].qs[il] >> (2*is);
|
|
4263
|
-
const uint8_t h = x[i].hmask[in] >> (2*is + im);
|
|
4264
|
-
const float d = (float)x[i].d;
|
|
4265
|
-
|
|
4266
|
-
if (is == 0) {
|
|
4267
|
-
y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
|
4268
|
-
y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
|
4269
|
-
} else {
|
|
4270
|
-
y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
|
4271
|
-
y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
|
4272
|
-
}
|
|
4273
|
-
#endif
|
|
4274
|
-
|
|
4275
4245
|
}
|
|
4276
4246
|
|
|
4277
|
-
#if QK_K == 256
|
|
4278
4247
|
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
|
4279
4248
|
if (j < 4) {
|
|
4280
4249
|
d = q[j] & 63; m = q[j + 4] & 63;
|
|
@@ -4283,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
|
|
|
4283
4252
|
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
|
4284
4253
|
}
|
|
4285
4254
|
}
|
|
4286
|
-
#endif
|
|
4287
4255
|
|
|
4288
4256
|
template<typename dst_t>
|
|
4289
4257
|
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
|
@@ -4292,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4292
4260
|
|
|
4293
4261
|
const int i = item_ct1.get_group(2);
|
|
4294
4262
|
|
|
4295
|
-
#if QK_K == 256
|
|
4296
4263
|
// assume 32 threads
|
|
4297
4264
|
const int tid = item_ct1.get_local_id(2);
|
|
4298
4265
|
const int il = tid/8;
|
|
@@ -4316,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4316
4283
|
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
|
4317
4284
|
y[l +32] = d2 * (q[l] >> 4) - m2;
|
|
4318
4285
|
}
|
|
4319
|
-
#else
|
|
4320
|
-
const int tid = item_ct1.get_local_id(2);
|
|
4321
|
-
const uint8_t * q = x[i].qs;
|
|
4322
|
-
dst_t * y = yy + i*QK_K;
|
|
4323
|
-
const float d = (float)x[i].dm[0];
|
|
4324
|
-
const float m = (float)x[i].dm[1];
|
|
4325
|
-
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
|
4326
|
-
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
|
4327
|
-
#endif
|
|
4328
4286
|
}
|
|
4329
4287
|
|
|
4330
4288
|
template<typename dst_t>
|
|
@@ -4334,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4334
4292
|
|
|
4335
4293
|
const int i = item_ct1.get_group(2);
|
|
4336
4294
|
|
|
4337
|
-
#if QK_K == 256
|
|
4338
4295
|
// assume 64 threads - this is very slightly better than the one below
|
|
4339
4296
|
const int tid = item_ct1.get_local_id(2);
|
|
4340
4297
|
const int il = tid/16; // il is in 0...3
|
|
@@ -4361,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4361
4318
|
hm <<= 1;
|
|
4362
4319
|
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
|
4363
4320
|
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
|
4364
|
-
#else
|
|
4365
|
-
const int tid = item_ct1.get_local_id(2);
|
|
4366
|
-
const uint8_t q = x[i].qs[tid];
|
|
4367
|
-
const int im = tid/8; // 0...3
|
|
4368
|
-
const int in = tid%8; // 0...7
|
|
4369
|
-
const int is = tid/16; // 0 or 1
|
|
4370
|
-
const uint8_t h = x[i].qh[in] >> im;
|
|
4371
|
-
const float d = x[i].d;
|
|
4372
|
-
dst_t * y = yy + i*QK_K + tid;
|
|
4373
|
-
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
|
4374
|
-
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
|
4375
|
-
#endif
|
|
4376
4321
|
}
|
|
4377
4322
|
|
|
4378
4323
|
template<typename dst_t>
|
|
@@ -4381,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4381
4326
|
const block_q6_K * x = (const block_q6_K *) vx;
|
|
4382
4327
|
|
|
4383
4328
|
const int i = item_ct1.get_group(2);
|
|
4384
|
-
#if QK_K == 256
|
|
4385
4329
|
|
|
4386
4330
|
// assume 64 threads - this is very slightly better than the one below
|
|
4387
4331
|
const int tid = item_ct1.get_local_id(2);
|
|
@@ -4401,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4401
4345
|
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
|
4402
4346
|
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
|
4403
4347
|
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
|
4404
|
-
#else
|
|
4405
|
-
|
|
4406
|
-
// assume 32 threads
|
|
4407
|
-
const int tid = item_ct1.get_local_id(2);
|
|
4408
|
-
const int ip = tid/16; // 0 or 1
|
|
4409
|
-
const int il = tid - 16*ip; // 0...15
|
|
4410
|
-
|
|
4411
|
-
dst_t * y = yy + i*QK_K + 16*ip + il;
|
|
4412
|
-
|
|
4413
|
-
const float d = x[i].d;
|
|
4414
|
-
|
|
4415
|
-
const uint8_t ql = x[i].ql[16*ip + il];
|
|
4416
|
-
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
|
4417
|
-
const int8_t * sc = x[i].scales;
|
|
4418
|
-
|
|
4419
|
-
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
|
4420
|
-
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
|
4421
|
-
#endif
|
|
4422
4348
|
}
|
|
4423
4349
|
|
|
4424
4350
|
template<typename dst_t>
|
|
@@ -4432,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
|
|
4432
4358
|
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
|
4433
4359
|
|
|
4434
4360
|
const int tid = item_ct1.get_local_id(2);
|
|
4435
|
-
#if QK_K == 256
|
|
4436
4361
|
const int il = tid/8; // 0...3
|
|
4437
4362
|
const int ib = tid%8; // 0...7
|
|
4438
4363
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4443,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
|
|
4443
4368
|
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
|
4444
4369
|
const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
|
|
4445
4370
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
|
|
4446
|
-
#else
|
|
4447
|
-
assert(false);
|
|
4448
|
-
#endif
|
|
4449
|
-
|
|
4450
4371
|
}
|
|
4451
4372
|
|
|
4452
4373
|
template<typename dst_t>
|
|
@@ -4460,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
|
4460
4381
|
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
|
4461
4382
|
|
|
4462
4383
|
const int tid = item_ct1.get_local_id(2);
|
|
4463
|
-
#if QK_K == 256
|
|
4464
4384
|
const int il = tid/8; // 0...3
|
|
4465
4385
|
const int ib = tid%8; // 0...7
|
|
4466
4386
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4469,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
|
4469
4389
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
|
4470
4390
|
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
|
4471
4391
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
|
4472
|
-
#else
|
|
4473
|
-
assert(false);
|
|
4474
|
-
#endif
|
|
4475
|
-
|
|
4476
4392
|
}
|
|
4477
4393
|
|
|
4478
4394
|
template <typename dst_t>
|
|
@@ -4484,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4484
4400
|
const block_iq2_s * x = (const block_iq2_s *) vx;
|
|
4485
4401
|
|
|
4486
4402
|
const int tid = item_ct1.get_local_id(2);
|
|
4487
|
-
#if QK_K == 256
|
|
4488
4403
|
const int il = tid/8; // 0...3
|
|
4489
4404
|
const int ib = tid%8; // 0...7
|
|
4490
4405
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4492,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4492
4407
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
|
4493
4408
|
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
|
4494
4409
|
#pragma unroll
|
|
4495
|
-
for (int j = 0; j < 8; ++j)
|
|
4410
|
+
for (int j = 0; j < 8; ++j) {
|
|
4496
4411
|
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
|
4497
|
-
|
|
4498
|
-
assert(false);
|
|
4499
|
-
|
|
4500
|
-
#endif
|
|
4501
|
-
|
|
4412
|
+
}
|
|
4502
4413
|
}
|
|
4503
4414
|
|
|
4504
4415
|
template<typename dst_t>
|
|
@@ -4512,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
|
4512
4423
|
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
|
4513
4424
|
|
|
4514
4425
|
const int tid = item_ct1.get_local_id(2);
|
|
4515
|
-
#if QK_K == 256
|
|
4516
4426
|
const int il = tid/8; // 0...3
|
|
4517
4427
|
const int ib = tid%8; // 0...7
|
|
4518
4428
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4527,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
|
4527
4437
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
|
4528
4438
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
|
4529
4439
|
}
|
|
4530
|
-
#else
|
|
4531
|
-
assert(false);
|
|
4532
|
-
#endif
|
|
4533
|
-
|
|
4534
4440
|
}
|
|
4535
4441
|
|
|
4536
4442
|
template <typename dst_t>
|
|
@@ -4543,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4543
4449
|
const block_iq3_s * x = (const block_iq3_s *) vx;
|
|
4544
4450
|
|
|
4545
4451
|
const int tid = item_ct1.get_local_id(2);
|
|
4546
|
-
#if QK_K == 256
|
|
4547
4452
|
const int il = tid/8; // 0...3
|
|
4548
4453
|
const int ib = tid%8; // 0...7
|
|
4549
4454
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4557,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4557
4462
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
|
4558
4463
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
|
4559
4464
|
}
|
|
4560
|
-
#else
|
|
4561
|
-
assert(false);
|
|
4562
|
-
#endif
|
|
4563
|
-
|
|
4564
4465
|
}
|
|
4565
4466
|
|
|
4566
4467
|
template <typename dst_t>
|
|
@@ -4573,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4573
4474
|
const block_iq1_s * x = (const block_iq1_s *) vx;
|
|
4574
4475
|
|
|
4575
4476
|
const int tid = item_ct1.get_local_id(2);
|
|
4576
|
-
#if QK_K == 256
|
|
4577
4477
|
const int il = tid/8; // 0...3
|
|
4578
4478
|
const int ib = tid%8; // 0...7
|
|
4579
4479
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4587,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4587
4487
|
for (int j = 0; j < 8; ++j) {
|
|
4588
4488
|
y[j] = d * (q[j] + delta);
|
|
4589
4489
|
}
|
|
4590
|
-
#else
|
|
4591
|
-
assert(false);
|
|
4592
|
-
#endif
|
|
4593
|
-
|
|
4594
4490
|
}
|
|
4595
4491
|
|
|
4596
4492
|
template <typename dst_t>
|
|
@@ -4603,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4603
4499
|
const block_iq1_m * x = (const block_iq1_m *) vx;
|
|
4604
4500
|
|
|
4605
4501
|
const int tid = item_ct1.get_local_id(2);
|
|
4606
|
-
#if QK_K == 256
|
|
4607
4502
|
const int il = tid/8; // 0...3
|
|
4608
4503
|
const int ib = tid%8; // 0...7
|
|
4609
4504
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4621,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4621
4516
|
for (int j = 0; j < 8; ++j) {
|
|
4622
4517
|
y[j] = d * (q[j] + delta);
|
|
4623
4518
|
}
|
|
4624
|
-
#else
|
|
4625
|
-
assert(false);
|
|
4626
|
-
#endif
|
|
4627
|
-
|
|
4628
4519
|
}
|
|
4629
4520
|
|
|
4630
4521
|
template <typename dst_t>
|
|
@@ -4698,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
|
|
4698
4589
|
|
|
4699
4590
|
float tmp = 0; // partial sum for thread in warp
|
|
4700
4591
|
|
|
4701
|
-
#if QK_K == 256
|
|
4702
4592
|
const int tid =
|
|
4703
4593
|
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
|
4704
4594
|
const int ix =
|
|
@@ -4749,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
|
|
4749
4639
|
tmp += dall * sum1 - dmin * sum2;
|
|
4750
4640
|
|
|
4751
4641
|
}
|
|
4752
|
-
#else
|
|
4753
|
-
const int tid = item_ct1.get_local_id(2) /
|
|
4754
|
-
(2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
|
4755
|
-
const int ix = item_ct1.get_local_id(2) %
|
|
4756
|
-
(2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
|
4757
|
-
const int offset = tid * K_QUANTS_PER_ITERATION;
|
|
4758
|
-
|
|
4759
|
-
uint32_t uaux[2];
|
|
4760
|
-
const uint8_t * d = (const uint8_t *)uaux;
|
|
4761
|
-
|
|
4762
|
-
|
|
4763
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
4764
|
-
|
|
4765
|
-
const float * y = yy + i * QK_K + offset;
|
|
4766
|
-
const uint8_t * q = x[i].qs + offset;
|
|
4767
|
-
const uint32_t * s = (const uint32_t *)x[i].scales;
|
|
4768
|
-
|
|
4769
|
-
uaux[0] = s[0] & 0x0f0f0f0f;
|
|
4770
|
-
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
|
4771
|
-
|
|
4772
|
-
const sycl::float2 dall =
|
|
4773
|
-
x[i].dm.convert<float, sycl::rounding_mode::automatic>();
|
|
4774
|
-
|
|
4775
|
-
float sum1 = 0, sum2 = 0;
|
|
4776
|
-
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
|
4777
|
-
const uint8_t ql = q[l];
|
|
4778
|
-
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
|
4779
|
-
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
|
4780
|
-
+ y[l+32] * d[2] * ((ql >> 4) & 3)
|
|
4781
|
-
+ y[l+48] * d[3] * ((ql >> 6) & 3);
|
|
4782
|
-
sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
|
|
4783
|
-
}
|
|
4784
|
-
tmp += dall.x() * sum1 - dall.y() * sum2;
|
|
4785
|
-
}
|
|
4786
|
-
|
|
4787
|
-
#endif
|
|
4788
4642
|
|
|
4789
4643
|
// sum up partial sums and write back result
|
|
4790
4644
|
#pragma unroll
|
|
@@ -4822,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
|
|
4822
4676
|
|
|
4823
4677
|
float tmp = 0; // partial sum for thread in warp
|
|
4824
4678
|
|
|
4825
|
-
#if QK_K == 256
|
|
4826
|
-
|
|
4827
4679
|
const uint16_t kmask1 = 0x0303;
|
|
4828
4680
|
const uint16_t kmask2 = 0x0f0f;
|
|
4829
4681
|
|
|
@@ -4876,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
|
|
4876
4728
|
tmp += d * sum;
|
|
4877
4729
|
|
|
4878
4730
|
}
|
|
4879
|
-
#else
|
|
4880
|
-
|
|
4881
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
|
4882
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
|
4883
|
-
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
|
4884
|
-
const int in = offset/8; // 0 or 1
|
|
4885
|
-
const int im = offset%8; // 0...7
|
|
4886
|
-
|
|
4887
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
4888
|
-
|
|
4889
|
-
const float * y = yy + i * QK_K + offset;
|
|
4890
|
-
const uint8_t * q = x[i].qs + offset;
|
|
4891
|
-
const uint8_t * s = x[i].scales;
|
|
4892
|
-
|
|
4893
|
-
const float dall = (float)x[i].d;
|
|
4894
|
-
|
|
4895
|
-
float sum = 0;
|
|
4896
|
-
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
|
4897
|
-
const uint8_t hl = x[i].hmask[im+l] >> in;
|
|
4898
|
-
const uint8_t ql = q[l];
|
|
4899
|
-
sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
|
|
4900
|
-
+ y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
|
|
4901
|
-
+ y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
|
|
4902
|
-
+ y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
|
|
4903
|
-
}
|
|
4904
|
-
tmp += sum;
|
|
4905
|
-
}
|
|
4906
|
-
#endif
|
|
4907
4731
|
|
|
4908
4732
|
// sum up partial sums and write back result
|
|
4909
4733
|
#pragma unroll
|
|
@@ -4938,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
|
|
4938
4762
|
|
|
4939
4763
|
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
|
4940
4764
|
|
|
4941
|
-
#if QK_K == 256
|
|
4942
4765
|
const uint16_t kmask1 = 0x3f3f;
|
|
4943
4766
|
const uint16_t kmask2 = 0x0f0f;
|
|
4944
4767
|
const uint16_t kmask3 = 0xc0c0;
|
|
@@ -5027,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
|
|
5027
4850
|
#endif
|
|
5028
4851
|
|
|
5029
4852
|
}
|
|
5030
|
-
#else
|
|
5031
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
|
5032
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
|
5033
|
-
|
|
5034
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
|
5035
|
-
|
|
5036
|
-
uint16_t aux16[2];
|
|
5037
|
-
const uint8_t * s = (const uint8_t *)aux16;
|
|
5038
|
-
|
|
5039
|
-
float tmp = 0;
|
|
5040
|
-
|
|
5041
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
5042
|
-
const uint8_t * q = x[i].qs + step;
|
|
5043
|
-
const float * y = yy + i*QK_K + step;
|
|
5044
|
-
const uint16_t * a = (const uint16_t *)x[i].scales;
|
|
5045
|
-
aux16[0] = a[0] & 0x0f0f;
|
|
5046
|
-
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
|
5047
|
-
const float d = (float)x[i].dm[0];
|
|
5048
|
-
const float m = (float)x[i].dm[1];
|
|
5049
|
-
float sum = 0.f;
|
|
5050
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
|
5051
|
-
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
|
5052
|
-
+ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
|
|
5053
|
-
+ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
|
|
5054
|
-
+ y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
|
|
5055
|
-
}
|
|
5056
|
-
tmp += sum;
|
|
5057
|
-
}
|
|
5058
|
-
|
|
5059
|
-
#endif
|
|
5060
4853
|
|
|
5061
4854
|
// sum up partial sums and write back result
|
|
5062
4855
|
#pragma unroll
|
|
@@ -5091,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
|
|
5091
4884
|
|
|
5092
4885
|
float tmp = 0; // partial sum for thread in warp
|
|
5093
4886
|
|
|
5094
|
-
#if QK_K == 256
|
|
5095
4887
|
const uint16_t kmask1 = 0x3f3f;
|
|
5096
4888
|
const uint16_t kmask2 = 0x0f0f;
|
|
5097
4889
|
const uint16_t kmask3 = 0xc0c0;
|
|
@@ -5168,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
|
|
5168
4960
|
dmin * smin;
|
|
5169
4961
|
}
|
|
5170
4962
|
|
|
5171
|
-
#else
|
|
5172
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
|
5173
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
|
5174
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
|
5175
|
-
const int im = step/8;
|
|
5176
|
-
const int in = step%8;
|
|
5177
|
-
|
|
5178
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
5179
|
-
const uint8_t * q = x[i].qs + step;
|
|
5180
|
-
const int8_t * s = x[i].scales;
|
|
5181
|
-
const float * y = yy + i*QK_K + step;
|
|
5182
|
-
const float d = x[i].d;
|
|
5183
|
-
float sum = 0.f;
|
|
5184
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
|
5185
|
-
const uint8_t h = x[i].qh[in+j] >> im;
|
|
5186
|
-
sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
|
|
5187
|
-
+ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
|
|
5188
|
-
+ y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
|
|
5189
|
-
+ y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
|
|
5190
|
-
}
|
|
5191
|
-
tmp += sum;
|
|
5192
|
-
}
|
|
5193
|
-
#endif
|
|
5194
|
-
|
|
5195
4963
|
// sum up partial sums and write back result
|
|
5196
4964
|
#pragma unroll
|
|
5197
4965
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
@@ -5218,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
|
|
5218
4986
|
|
|
5219
4987
|
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
|
5220
4988
|
|
|
5221
|
-
#if QK_K == 256
|
|
5222
|
-
|
|
5223
4989
|
const int tid =
|
|
5224
4990
|
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
|
5225
4991
|
const int ix =
|
|
@@ -5276,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
|
|
5276
5042
|
|
|
5277
5043
|
}
|
|
5278
5044
|
|
|
5279
|
-
#else
|
|
5280
|
-
|
|
5281
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
|
|
5282
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
|
|
5283
|
-
|
|
5284
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
|
5285
|
-
|
|
5286
|
-
float tmp = 0; // partial sum for thread in warp
|
|
5287
|
-
|
|
5288
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
5289
|
-
|
|
5290
|
-
const float * y = yy + i * QK_K + step;
|
|
5291
|
-
const uint8_t * ql = x[i].ql + step;
|
|
5292
|
-
const uint8_t * qh = x[i].qh + step;
|
|
5293
|
-
const int8_t * s = x[i].scales;
|
|
5294
|
-
|
|
5295
|
-
const float d = x[i+0].d;
|
|
5296
|
-
|
|
5297
|
-
float sum = 0;
|
|
5298
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
|
5299
|
-
sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
|
|
5300
|
-
+ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
|
|
5301
|
-
+ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
|
|
5302
|
-
+ y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
|
|
5303
|
-
}
|
|
5304
|
-
tmp += sum;
|
|
5305
|
-
|
|
5306
|
-
}
|
|
5307
|
-
|
|
5308
|
-
#endif
|
|
5309
|
-
|
|
5310
5045
|
// sum up partial sums and write back result
|
|
5311
5046
|
#pragma unroll
|
|
5312
5047
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
@@ -6851,7 +6586,6 @@ static __dpct_inline__ float
|
|
|
6851
6586
|
vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
|
6852
6587
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
|
6853
6588
|
|
|
6854
|
-
#ifndef GGML_QKK_64
|
|
6855
6589
|
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
|
6856
6590
|
|
|
6857
6591
|
int v[2];
|
|
@@ -6893,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
|
|
6893
6627
|
}
|
|
6894
6628
|
|
|
6895
6629
|
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
|
6896
|
-
|
|
6897
|
-
#else
|
|
6898
|
-
|
|
6899
|
-
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
|
6900
|
-
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
|
6901
|
-
|
|
6902
|
-
float sumf_d = 0.0f;
|
|
6903
|
-
float sumf_m = 0.0f;
|
|
6904
|
-
|
|
6905
|
-
uint16_t aux16[2];
|
|
6906
|
-
const uint8_t * s = (const uint8_t *)aux16;
|
|
6907
|
-
|
|
6908
|
-
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
|
6909
|
-
aux16[0] = a[0] & 0x0f0f;
|
|
6910
|
-
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
|
6911
|
-
|
|
6912
|
-
const float dall = bq4_K->dm[0];
|
|
6913
|
-
const float dmin = bq4_K->dm[1];
|
|
6914
|
-
|
|
6915
|
-
const float d8_1 = bq8_1[0].ds[0];
|
|
6916
|
-
const float d8_2 = bq8_1[1].ds[1];
|
|
6917
|
-
|
|
6918
|
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
|
6919
|
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
|
6920
|
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
|
6921
|
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
|
6922
|
-
|
|
6923
|
-
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
|
6924
|
-
const int v1 = q4[0];
|
|
6925
|
-
const int v2 = q4[4];
|
|
6926
|
-
|
|
6927
|
-
const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
|
6928
|
-
const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
|
6929
|
-
const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
|
|
6930
|
-
const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
|
|
6931
|
-
|
|
6932
|
-
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
|
6933
|
-
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
|
6934
|
-
|
|
6935
|
-
return dall * sumf_d - dmin * sumf_m;
|
|
6936
|
-
|
|
6937
|
-
#else
|
|
6938
|
-
bad_arch();
|
|
6939
|
-
#endif // __SYCL_ARCH__ >= VER_4VEC
|
|
6940
|
-
|
|
6941
|
-
#endif
|
|
6942
6630
|
}
|
|
6943
6631
|
|
|
6944
6632
|
template <int mmq_y>
|
|
@@ -6997,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
|
|
6997
6685
|
|
|
6998
6686
|
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
|
6999
6687
|
|
|
7000
|
-
#if QK_K == 256
|
|
7001
6688
|
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
|
7002
|
-
#else
|
|
7003
|
-
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
|
|
7004
|
-
#endif
|
|
7005
6689
|
}
|
|
7006
6690
|
|
|
7007
6691
|
#pragma unroll
|
|
@@ -7044,7 +6728,6 @@ static __dpct_inline__ float
|
|
|
7044
6728
|
vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
|
7045
6729
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
|
7046
6730
|
|
|
7047
|
-
#ifndef GGML_QKK_64
|
|
7048
6731
|
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
|
7049
6732
|
|
|
7050
6733
|
int vl[2];
|
|
@@ -7086,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
|
|
7086
6769
|
}
|
|
7087
6770
|
|
|
7088
6771
|
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
|
7089
|
-
|
|
7090
|
-
#else
|
|
7091
|
-
|
|
7092
|
-
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
|
7093
|
-
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
|
7094
|
-
|
|
7095
|
-
const int8_t * s = bq5_K->scales;
|
|
7096
|
-
|
|
7097
|
-
const float d = bq5_K->d;
|
|
7098
|
-
|
|
7099
|
-
const float d8_1 = bq8_1[0].ds[0];
|
|
7100
|
-
const float d8_2 = bq8_1[1].ds[1];
|
|
7101
|
-
|
|
7102
|
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
|
7103
|
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
|
7104
|
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
|
7105
|
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
|
7106
|
-
|
|
7107
|
-
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
|
7108
|
-
const int vl1 = ql[0];
|
|
7109
|
-
const int vl2 = ql[4];
|
|
7110
|
-
|
|
7111
|
-
const int step = 4 * (iqs/2); // 0, 4, 8, 12
|
|
7112
|
-
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
|
|
7113
|
-
const int in = step%8; // 0, 4, 0, 4
|
|
7114
|
-
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
|
7115
|
-
|
|
7116
|
-
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
|
|
7117
|
-
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
|
|
7118
|
-
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
|
7119
|
-
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
|
7120
|
-
|
|
7121
|
-
const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
|
|
7122
|
-
+ d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
|
|
7123
|
-
|
|
7124
|
-
return d * sumf_d;
|
|
7125
|
-
|
|
7126
|
-
#else
|
|
7127
|
-
bad_arch();
|
|
7128
|
-
#endif // __SYCL_ARCH__ >= VER_4VEC
|
|
7129
|
-
|
|
7130
|
-
#endif
|
|
7131
6772
|
}
|
|
7132
6773
|
|
|
7133
6774
|
template <int mmq_y>
|
|
@@ -7199,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
|
|
7199
6840
|
|
|
7200
6841
|
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
|
7201
6842
|
|
|
7202
|
-
#if QK_K == 256
|
|
7203
6843
|
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
|
7204
|
-
#endif
|
|
7205
6844
|
}
|
|
7206
6845
|
|
|
7207
6846
|
#pragma unroll
|
|
@@ -7381,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|
|
7381
7020
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
|
7382
7021
|
const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
|
|
7383
7022
|
const uint8_t *kmask_iq2xs) {
|
|
7384
|
-
#if QK_K == 256
|
|
7385
7023
|
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
|
7386
7024
|
|
|
7387
7025
|
#if QR2_XXS == 8
|
|
@@ -7422,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|
|
7422
7060
|
}
|
|
7423
7061
|
return d * (sumi1 + sumi2);
|
|
7424
7062
|
#endif
|
|
7425
|
-
#else
|
|
7426
|
-
assert(false);
|
|
7427
|
-
return 0.f;
|
|
7428
|
-
#endif
|
|
7429
7063
|
}
|
|
7430
7064
|
|
|
7431
7065
|
static __dpct_inline__ float
|
|
@@ -7434,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
|
7434
7068
|
const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
|
|
7435
7069
|
#if DPCT_COMPATIBILITY_TEMP >= \
|
|
7436
7070
|
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
|
7437
|
-
#if QK_K == 256
|
|
7438
7071
|
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
|
7439
7072
|
|
|
7440
7073
|
const int ib32 = iqs;
|
|
@@ -7472,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
|
7472
7105
|
assert(false);
|
|
7473
7106
|
return 0.f;
|
|
7474
7107
|
#endif
|
|
7475
|
-
#else
|
|
7476
|
-
assert(false);
|
|
7477
|
-
return 0.f;
|
|
7478
|
-
#endif
|
|
7479
7108
|
}
|
|
7480
7109
|
|
|
7481
7110
|
static __dpct_inline__ float
|
|
7482
7111
|
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
|
7483
7112
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
|
7484
|
-
#if QK_K == 256
|
|
7485
7113
|
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
|
7486
7114
|
|
|
7487
7115
|
const int ib32 = iqs;
|
|
@@ -7525,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
|
|
7525
7153
|
}
|
|
7526
7154
|
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
|
|
7527
7155
|
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
|
7528
|
-
#else
|
|
7529
|
-
assert(false);
|
|
7530
|
-
#endif
|
|
7531
7156
|
}
|
|
7532
7157
|
|
|
7533
7158
|
static __dpct_inline__ float
|
|
@@ -7536,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
|
7536
7161
|
const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
|
|
7537
7162
|
#if DPCT_COMPATIBILITY_TEMP >= \
|
|
7538
7163
|
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
|
7539
|
-
#if QK_K == 256
|
|
7540
7164
|
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
|
7541
7165
|
|
|
7542
7166
|
const int ib32 = iqs;
|
|
@@ -7564,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
|
7564
7188
|
assert(false);
|
|
7565
7189
|
return 0.f;
|
|
7566
7190
|
#endif
|
|
7567
|
-
#else
|
|
7568
|
-
assert(false);
|
|
7569
|
-
return 0.f;
|
|
7570
|
-
#endif
|
|
7571
7191
|
}
|
|
7572
7192
|
|
|
7573
7193
|
static __dpct_inline__ float
|
|
7574
7194
|
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|
7575
7195
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
|
7576
7196
|
const uint32_t *iq3s_grid) {
|
|
7577
|
-
#if QK_K == 256
|
|
7578
7197
|
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
|
7579
7198
|
|
|
7580
7199
|
const int ib32 = iqs;
|
|
@@ -7603,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|
|
7603
7222
|
(1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
|
|
7604
7223
|
bq8_1[ib32].ds[0];
|
|
7605
7224
|
return d * sumi;
|
|
7606
|
-
#else
|
|
7607
|
-
assert(false);
|
|
7608
|
-
#endif
|
|
7609
7225
|
}
|
|
7610
7226
|
|
|
7611
7227
|
static __dpct_inline__ float
|
|
7612
7228
|
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
|
7613
7229
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
|
7614
7230
|
const uint32_t *iq1s_grid_gpu) {
|
|
7615
|
-
#if QK_K == 256
|
|
7616
7231
|
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
|
7617
7232
|
|
|
7618
7233
|
const int ib32 = iqs;
|
|
@@ -7631,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
|
|
7631
7246
|
const float d = d1q * bq8_1[ib32].ds[0];
|
|
7632
7247
|
const float m = d1q * bq8_1[ib32].ds[1];
|
|
7633
7248
|
return d * sumi + m * delta;
|
|
7634
|
-
#else
|
|
7635
|
-
assert(false);
|
|
7636
|
-
#endif
|
|
7637
7249
|
}
|
|
7638
7250
|
|
|
7639
7251
|
static __dpct_inline__ float
|
|
7640
7252
|
vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
|
7641
7253
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
|
7642
|
-
#if QK_K == 256
|
|
7643
7254
|
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
|
7644
7255
|
|
|
7645
7256
|
const int ib32 = iqs;
|
|
@@ -7664,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
|
|
7664
7275
|
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
|
7665
7276
|
const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
|
|
7666
7277
|
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
|
7667
|
-
#else
|
|
7668
|
-
assert(false);
|
|
7669
|
-
#endif
|
|
7670
7278
|
}
|
|
7671
7279
|
|
|
7672
7280
|
static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
|
|
@@ -7714,7 +7322,6 @@ static __dpct_inline__ float
|
|
|
7714
7322
|
vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
|
7715
7323
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
|
7716
7324
|
|
|
7717
|
-
#if QK_K == 256
|
|
7718
7325
|
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
|
7719
7326
|
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
|
7720
7327
|
|
|
@@ -7732,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
|
|
7732
7339
|
sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
|
|
7733
7340
|
}
|
|
7734
7341
|
return d * (sumi1 + sumi2);
|
|
7735
|
-
#else
|
|
7736
|
-
assert(false);
|
|
7737
|
-
#endif
|
|
7738
7342
|
}
|
|
7739
7343
|
|
|
7740
7344
|
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
|
|
@@ -9226,12 +8830,11 @@ static void rope(
|
|
|
9226
8830
|
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
|
9227
8831
|
}
|
|
9228
8832
|
|
|
9229
|
-
template<typename T, bool has_pos>
|
|
8833
|
+
template<typename T, bool has_pos, bool has_freq_facs>
|
|
9230
8834
|
static void rope_neox(
|
|
9231
8835
|
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
|
9232
|
-
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
|
|
9233
|
-
,
|
|
9234
|
-
const sycl::nd_item<3> &item_ct1) {
|
|
8836
|
+
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims,
|
|
8837
|
+
const float * freq_factors, const sycl::nd_item<3> &item_ct1) {
|
|
9235
8838
|
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
|
9236
8839
|
item_ct1.get_local_id(1));
|
|
9237
8840
|
|
|
@@ -9259,8 +8862,10 @@ static void rope_neox(
|
|
|
9259
8862
|
float cur_rot = inv_ndims * ic - ib;
|
|
9260
8863
|
|
|
9261
8864
|
const int p = has_pos ? pos[i2] : 0;
|
|
8865
|
+
const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
|
|
8866
|
+
|
|
9262
8867
|
const float theta_base =
|
|
9263
|
-
p * freq_scale * dpct::pow(theta_scale, col / 2.0f);
|
|
8868
|
+
p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor;
|
|
9264
8869
|
|
|
9265
8870
|
float cos_theta, sin_theta;
|
|
9266
8871
|
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
|
@@ -10085,18 +9690,17 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
|
|
10085
9690
|
});
|
|
10086
9691
|
}
|
|
10087
9692
|
|
|
10088
|
-
static void upscale_f32_sycl(const float *x, float *dst, const int
|
|
10089
|
-
const int
|
|
10090
|
-
const int
|
|
10091
|
-
|
|
10092
|
-
int
|
|
10093
|
-
|
|
9693
|
+
static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
|
|
9694
|
+
const int nb02, const int nb03, const int ne10, const int ne11,
|
|
9695
|
+
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
9696
|
+
const float sf2, const float sf3, dpct::queue_ptr stream) {
|
|
9697
|
+
int dst_size = ne10 * ne11 * ne12 * ne13;
|
|
9698
|
+
int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
|
|
9699
|
+
sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
|
|
10094
9700
|
stream->parallel_for(
|
|
10095
|
-
sycl::nd_range<
|
|
10096
|
-
|
|
10097
|
-
|
|
10098
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
10099
|
-
upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
|
|
9701
|
+
sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
|
|
9702
|
+
[=](sycl::nd_item<1> item_ct1) {
|
|
9703
|
+
upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
|
|
10100
9704
|
});
|
|
10101
9705
|
}
|
|
10102
9706
|
|
|
@@ -10198,7 +9802,6 @@ template <typename dst_t>
|
|
|
10198
9802
|
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10199
9803
|
dpct::queue_ptr stream) {
|
|
10200
9804
|
const int nb = k / QK_K;
|
|
10201
|
-
#if QK_K == 256
|
|
10202
9805
|
{
|
|
10203
9806
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
10204
9807
|
{sycl::aspect::fp16});
|
|
@@ -10210,27 +9813,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
|
|
10210
9813
|
dequantize_block_q2_K(vx, y, item_ct1);
|
|
10211
9814
|
});
|
|
10212
9815
|
}
|
|
10213
|
-
#else
|
|
10214
|
-
{
|
|
10215
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
|
10216
|
-
{sycl::aspect::fp16});
|
|
10217
|
-
|
|
10218
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
|
10219
|
-
sycl::range<3>(1, 1, 32),
|
|
10220
|
-
sycl::range<3>(1, 1, 32)),
|
|
10221
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
10222
|
-
dequantize_block_q2_K(vx, y, item_ct1);
|
|
10223
|
-
});
|
|
10224
|
-
}
|
|
10225
|
-
|
|
10226
|
-
#endif
|
|
10227
9816
|
}
|
|
10228
9817
|
|
|
10229
9818
|
template <typename dst_t>
|
|
10230
9819
|
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10231
9820
|
dpct::queue_ptr stream) {
|
|
10232
9821
|
const int nb = k / QK_K;
|
|
10233
|
-
#if QK_K == 256
|
|
10234
9822
|
{
|
|
10235
9823
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
10236
9824
|
{sycl::aspect::fp16});
|
|
@@ -10242,19 +9830,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
|
|
10242
9830
|
dequantize_block_q3_K(vx, y, item_ct1);
|
|
10243
9831
|
});
|
|
10244
9832
|
}
|
|
10245
|
-
#else
|
|
10246
|
-
{
|
|
10247
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
|
10248
|
-
{sycl::aspect::fp16});
|
|
10249
|
-
|
|
10250
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
|
10251
|
-
sycl::range<3>(1, 1, 32),
|
|
10252
|
-
sycl::range<3>(1, 1, 32)),
|
|
10253
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
10254
|
-
dequantize_block_q3_K(vx, y, item_ct1);
|
|
10255
|
-
});
|
|
10256
|
-
}
|
|
10257
|
-
#endif
|
|
10258
9833
|
}
|
|
10259
9834
|
|
|
10260
9835
|
template <typename dst_t>
|
|
@@ -10315,7 +9890,6 @@ template <typename dst_t>
|
|
|
10315
9890
|
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10316
9891
|
dpct::queue_ptr stream) {
|
|
10317
9892
|
const int nb = k / QK_K;
|
|
10318
|
-
#if QK_K == 256
|
|
10319
9893
|
{
|
|
10320
9894
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
10321
9895
|
{sycl::aspect::fp16});
|
|
@@ -10327,27 +9901,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
|
|
10327
9901
|
dequantize_block_q5_K(vx, y, item_ct1);
|
|
10328
9902
|
});
|
|
10329
9903
|
}
|
|
10330
|
-
#else
|
|
10331
|
-
{
|
|
10332
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
|
10333
|
-
{sycl::aspect::fp16});
|
|
10334
|
-
|
|
10335
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
|
10336
|
-
sycl::range<3>(1, 1, 32),
|
|
10337
|
-
sycl::range<3>(1, 1, 32)),
|
|
10338
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
10339
|
-
dequantize_block_q5_K(vx, y, item_ct1);
|
|
10340
|
-
});
|
|
10341
|
-
}
|
|
10342
|
-
|
|
10343
|
-
#endif
|
|
10344
9904
|
}
|
|
10345
9905
|
|
|
10346
9906
|
template <typename dst_t>
|
|
10347
9907
|
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10348
9908
|
dpct::queue_ptr stream) {
|
|
10349
9909
|
const int nb = k / QK_K;
|
|
10350
|
-
#if QK_K == 256
|
|
10351
9910
|
{
|
|
10352
9911
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
10353
9912
|
{sycl::aspect::fp16});
|
|
@@ -10359,20 +9918,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
|
|
10359
9918
|
dequantize_block_q6_K(vx, y, item_ct1);
|
|
10360
9919
|
});
|
|
10361
9920
|
}
|
|
10362
|
-
#else
|
|
10363
|
-
{
|
|
10364
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
|
10365
|
-
{sycl::aspect::fp16});
|
|
10366
|
-
|
|
10367
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
|
10368
|
-
sycl::range<3>(1, 1, 32),
|
|
10369
|
-
sycl::range<3>(1, 1, 32)),
|
|
10370
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
10371
|
-
dequantize_block_q6_K(vx, y, item_ct1);
|
|
10372
|
-
});
|
|
10373
|
-
}
|
|
10374
|
-
|
|
10375
|
-
#endif
|
|
10376
9921
|
}
|
|
10377
9922
|
|
|
10378
9923
|
template <typename dst_t>
|
|
@@ -10524,9 +10069,6 @@ template <typename dst_t>
|
|
|
10524
10069
|
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
|
10525
10070
|
dpct::queue_ptr stream) {
|
|
10526
10071
|
const int nb = (k + QK_K - 1) / QK_K;
|
|
10527
|
-
#if QK_K == 64
|
|
10528
|
-
dequantize_row_iq4_nl_sycl(vx, y, k, stream);
|
|
10529
|
-
#else
|
|
10530
10072
|
{
|
|
10531
10073
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
10532
10074
|
{sycl::aspect::fp16});
|
|
@@ -10541,7 +10083,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
|
|
10541
10083
|
});
|
|
10542
10084
|
});
|
|
10543
10085
|
}
|
|
10544
|
-
#endif
|
|
10545
10086
|
}
|
|
10546
10087
|
|
|
10547
10088
|
|
|
@@ -12046,8 +11587,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
|
12046
11587
|
const int nrows_y, const int nrows_dst,
|
|
12047
11588
|
dpct::queue_ptr stream) try {
|
|
12048
11589
|
|
|
12049
|
-
#if QK_K == 256
|
|
12050
|
-
|
|
12051
11590
|
int id;
|
|
12052
11591
|
SYCL_CHECK(
|
|
12053
11592
|
CHECK_TRY_ERROR(id = get_current_device_id()));
|
|
@@ -12162,7 +11701,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
|
12162
11701
|
});
|
|
12163
11702
|
}
|
|
12164
11703
|
}
|
|
12165
|
-
#endif
|
|
12166
11704
|
}
|
|
12167
11705
|
catch (sycl::exception const &exc) {
|
|
12168
11706
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
|
@@ -12876,7 +12414,7 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
|
|
|
12876
12414
|
const int32_t *pos, float freq_scale,
|
|
12877
12415
|
int p_delta_rows, float freq_base, float ext_factor,
|
|
12878
12416
|
float attn_factor, rope_corr_dims corr_dims,
|
|
12879
|
-
dpct::queue_ptr stream) {
|
|
12417
|
+
const float * freq_factors, dpct::queue_ptr stream) {
|
|
12880
12418
|
GGML_ASSERT(ncols % 2 == 0);
|
|
12881
12419
|
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
|
12882
12420
|
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
|
|
@@ -12886,38 +12424,48 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
|
|
|
12886
12424
|
const float inv_ndims = -1.0f / n_dims;
|
|
12887
12425
|
|
|
12888
12426
|
if (pos == nullptr) {
|
|
12889
|
-
/*
|
|
12890
|
-
DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
|
|
12891
|
-
the limit. To get the device limit, query
|
|
12892
|
-
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
12893
|
-
*/
|
|
12894
12427
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
12895
12428
|
{sycl::aspect::fp16});
|
|
12896
|
-
|
|
12897
|
-
|
|
12898
|
-
|
|
12899
|
-
|
|
12900
|
-
|
|
12901
|
-
|
|
12902
|
-
|
|
12903
|
-
|
|
12904
|
-
|
|
12429
|
+
if (freq_factors == nullptr) {
|
|
12430
|
+
stream->parallel_for(
|
|
12431
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
12432
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
12433
|
+
rope_neox<T, false, false>(x, dst, ncols, n_dims, pos, freq_scale,
|
|
12434
|
+
p_delta_rows, ext_factor, attn_factor,
|
|
12435
|
+
corr_dims, theta_scale, inv_ndims, freq_factors,
|
|
12436
|
+
item_ct1);
|
|
12437
|
+
});
|
|
12438
|
+
} else {
|
|
12439
|
+
stream->parallel_for(
|
|
12440
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
12441
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
12442
|
+
rope_neox<T, false, true>(x, dst, ncols, n_dims, pos, freq_scale,
|
|
12443
|
+
p_delta_rows, ext_factor, attn_factor,
|
|
12444
|
+
corr_dims, theta_scale, inv_ndims, freq_factors,
|
|
12445
|
+
item_ct1);
|
|
12446
|
+
});
|
|
12447
|
+
}
|
|
12905
12448
|
} else {
|
|
12906
|
-
/*
|
|
12907
|
-
DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
|
|
12908
|
-
the limit. To get the device limit, query
|
|
12909
|
-
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
12910
|
-
*/
|
|
12911
12449
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
12912
12450
|
{sycl::aspect::fp16});
|
|
12913
12451
|
|
|
12914
|
-
|
|
12915
|
-
|
|
12916
|
-
|
|
12917
|
-
|
|
12918
|
-
|
|
12919
|
-
|
|
12920
|
-
|
|
12452
|
+
if (freq_factors == nullptr) {
|
|
12453
|
+
stream->parallel_for(
|
|
12454
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
12455
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
12456
|
+
rope_neox<T, true, false>(x, dst, ncols, n_dims, pos, freq_scale,
|
|
12457
|
+
p_delta_rows, ext_factor, attn_factor,
|
|
12458
|
+
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
|
|
12459
|
+
});
|
|
12460
|
+
} else {
|
|
12461
|
+
stream->parallel_for(
|
|
12462
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
12463
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
12464
|
+
rope_neox<T, true, true>(x, dst, ncols, n_dims, pos, freq_scale,
|
|
12465
|
+
p_delta_rows, ext_factor, attn_factor,
|
|
12466
|
+
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
|
|
12467
|
+
});
|
|
12468
|
+
}
|
|
12921
12469
|
}
|
|
12922
12470
|
}
|
|
12923
12471
|
|
|
@@ -13964,6 +13512,10 @@ inline void ggml_sycl_op_concat(const ggml_tensor *src0,
|
|
|
13964
13512
|
const float *src0_dd, const float *src1_dd,
|
|
13965
13513
|
float *dst_dd,
|
|
13966
13514
|
const dpct::queue_ptr &main_stream) {
|
|
13515
|
+
#pragma message("TODO: generalize concat kernel for dim != 2")
|
|
13516
|
+
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7563")
|
|
13517
|
+
int dim = dst->op_params[0];
|
|
13518
|
+
GGML_ASSERT(dim != 2);
|
|
13967
13519
|
|
|
13968
13520
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
13969
13521
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
@@ -13985,11 +13537,15 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
|
|
|
13985
13537
|
|
|
13986
13538
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
13987
13539
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
13988
|
-
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
|
13989
13540
|
|
|
13990
|
-
const
|
|
13541
|
+
const float sf0 = (float)dst->ne[0]/src0->ne[0];
|
|
13542
|
+
const float sf1 = (float)dst->ne[1]/src0->ne[1];
|
|
13543
|
+
const float sf2 = (float)dst->ne[2]/src0->ne[2];
|
|
13544
|
+
const float sf3 = (float)dst->ne[3]/src0->ne[3];
|
|
13991
13545
|
|
|
13992
|
-
upscale_f32_sycl(src0_dd, dst_dd, src0->
|
|
13546
|
+
upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
|
13547
|
+
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
|
|
13548
|
+
main_stream);
|
|
13993
13549
|
|
|
13994
13550
|
(void) src1;
|
|
13995
13551
|
(void) dst;
|
|
@@ -14445,6 +14001,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
|
14445
14001
|
ggml_tensor *dst, const float *src0_dd,
|
|
14446
14002
|
const float *src1_dd, float *dst_dd,
|
|
14447
14003
|
const dpct::queue_ptr &main_stream) {
|
|
14004
|
+
const ggml_tensor * src2 = dst->src[2];
|
|
14448
14005
|
|
|
14449
14006
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
|
14450
14007
|
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
@@ -14470,6 +14027,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
|
14470
14027
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
|
14471
14028
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
|
14472
14029
|
|
|
14030
|
+
const float * freq_factors = nullptr;
|
|
14473
14031
|
const int32_t * pos = nullptr;
|
|
14474
14032
|
if ((mode & 1) == 0) {
|
|
14475
14033
|
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
|
@@ -14480,6 +14038,16 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
|
14480
14038
|
const bool is_neox = mode & 2;
|
|
14481
14039
|
const bool is_glm = mode & 4;
|
|
14482
14040
|
|
|
14041
|
+
if (is_neox) {
|
|
14042
|
+
pos = (const int32_t *) src1_dd;
|
|
14043
|
+
|
|
14044
|
+
if (src2 != nullptr) {
|
|
14045
|
+
freq_factors = (const float *) src2->data;
|
|
14046
|
+
}
|
|
14047
|
+
} else {
|
|
14048
|
+
GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
|
|
14049
|
+
}
|
|
14050
|
+
|
|
14483
14051
|
rope_corr_dims corr_dims;
|
|
14484
14052
|
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
|
|
14485
14053
|
|
|
@@ -14491,13 +14059,13 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
|
14491
14059
|
if (src0->type == GGML_TYPE_F32) {
|
|
14492
14060
|
rope_neox_sycl(
|
|
14493
14061
|
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
|
14494
|
-
attn_factor, corr_dims, main_stream
|
|
14062
|
+
attn_factor, corr_dims, freq_factors, main_stream
|
|
14495
14063
|
);
|
|
14496
14064
|
} else if (src0->type == GGML_TYPE_F16) {
|
|
14497
14065
|
rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
|
|
14498
14066
|
ne00, n_dims, nrows, pos, freq_scale, ne01,
|
|
14499
14067
|
freq_base, ext_factor, attn_factor, corr_dims,
|
|
14500
|
-
main_stream);
|
|
14068
|
+
freq_factors, main_stream);
|
|
14501
14069
|
} else {
|
|
14502
14070
|
GGML_ASSERT(false);
|
|
14503
14071
|
}
|
|
@@ -15699,6 +15267,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
|
15699
15267
|
}
|
|
15700
15268
|
} else {
|
|
15701
15269
|
bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
|
|
15270
|
+
use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
|
|
15702
15271
|
|
|
15703
15272
|
if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
|
|
15704
15273
|
use_mul_mat_q = false;
|