@fugood/llama.node 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +9 -0
- package/README.md +1 -1
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +2 -1
- package/patches/llama.patch +22 -0
- package/src/LlamaContext.cpp +2 -2
- package/src/TokenizeWorker.cpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +82 -54
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +748 -754
- package/src/llama.cpp/common/common.h +49 -41
- package/src/llama.cpp/common/grammar-parser.cpp +10 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +92 -10
- package/src/llama.cpp/common/sampling.h +6 -1
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/CMakeLists.txt +3 -0
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +29 -17
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
- package/src/llama.cpp/examples/server/server.cpp +33 -25
- package/src/llama.cpp/examples/server/utils.hpp +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +2 -3
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +13 -3
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3715 -2050
- package/src/llama.cpp/ggml-rpc.cpp +1155 -0
- package/src/llama.cpp/ggml-rpc.h +24 -0
- package/src/llama.cpp/ggml-sycl.cpp +119 -673
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +203 -224
- package/src/llama.cpp/ggml.c +1208 -1483
- package/src/llama.cpp/ggml.h +71 -46
- package/src/llama.cpp/llama.cpp +1374 -938
- package/src/llama.cpp/llama.h +22 -6
- package/src/llama.cpp/requirements.txt +0 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
|
@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
|
|
|
3154
3154
|
#define SYCL_SCALE_BLOCK_SIZE 256
|
|
3155
3155
|
#define SYCL_CLAMP_BLOCK_SIZE 256
|
|
3156
3156
|
#define SYCL_ROPE_BLOCK_SIZE 256
|
|
3157
|
-
#define SYCL_ALIBI_BLOCK_SIZE 32
|
|
3158
3157
|
#define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
|
|
3159
3158
|
#define SYCL_QUANTIZE_BLOCK_SIZE 256
|
|
3160
3159
|
#define SYCL_DEQUANTIZE_BLOCK_SIZE 256
|
|
@@ -3848,21 +3847,27 @@ static void concat_f32(const float *x,const float *y, float *dst, const int ne
|
|
|
3848
3847
|
}
|
|
3849
3848
|
}
|
|
3850
3849
|
|
|
3851
|
-
static void upscale_f32(const float *x, float *dst, const int
|
|
3852
|
-
const
|
|
3853
|
-
|
|
3854
|
-
|
|
3855
|
-
|
|
3856
|
-
|
|
3850
|
+
static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
|
3851
|
+
const int nb02, const int nb03, const int ne10, const int ne11,
|
|
3852
|
+
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
3853
|
+
const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
|
|
3854
|
+
int index = item_ct1.get_local_id(0) +
|
|
3855
|
+
item_ct1.get_group(0) * item_ct1.get_local_range(0);
|
|
3856
|
+
if (index >= ne10 * ne11 * ne12 * ne13) {
|
|
3857
3857
|
return;
|
|
3858
3858
|
}
|
|
3859
3859
|
// operation
|
|
3860
|
-
int
|
|
3861
|
-
int
|
|
3862
|
-
int
|
|
3863
|
-
int
|
|
3864
|
-
|
|
3865
|
-
|
|
3860
|
+
int i10 = index % ne10;
|
|
3861
|
+
int i11 = (index / ne10) % ne11;
|
|
3862
|
+
int i12 = (index / (ne10 * ne11)) % ne12;
|
|
3863
|
+
int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
|
|
3864
|
+
|
|
3865
|
+
int i00 = i10 / sf0;
|
|
3866
|
+
int i01 = i11 / sf1;
|
|
3867
|
+
int i02 = i12 / sf2;
|
|
3868
|
+
int i03 = i13 / sf3;
|
|
3869
|
+
|
|
3870
|
+
dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
|
3866
3871
|
}
|
|
3867
3872
|
|
|
3868
3873
|
static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
|
|
@@ -4192,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4192
4197
|
const block_q2_K * x = (const block_q2_K *) vx;
|
|
4193
4198
|
|
|
4194
4199
|
const int tid = item_ct1.get_local_id(2);
|
|
4195
|
-
#if QK_K == 256
|
|
4196
4200
|
const int n = tid/32;
|
|
4197
4201
|
const int l = tid - 32*n;
|
|
4198
4202
|
const int is = 8*n + l/16;
|
|
@@ -4206,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4206
4210
|
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
|
4207
4211
|
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
|
4208
4212
|
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
|
4209
|
-
#else
|
|
4210
|
-
const int is = tid/16; // 0 or 1
|
|
4211
|
-
const int il = tid%16; // 0...15
|
|
4212
|
-
const uint8_t q = x[i].qs[il] >> (2*is);
|
|
4213
|
-
dst_t * y = yy + i*QK_K + 16*is + il;
|
|
4214
|
-
|
|
4215
|
-
float dall = x[i].dm[0];
|
|
4216
|
-
float dmin = x[i].dm[1];
|
|
4217
|
-
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
|
4218
|
-
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
|
4219
|
-
#endif
|
|
4220
|
-
|
|
4221
4213
|
}
|
|
4222
4214
|
|
|
4223
4215
|
template<typename dst_t>
|
|
@@ -4227,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4227
4219
|
const int i = item_ct1.get_group(2);
|
|
4228
4220
|
const block_q3_K * x = (const block_q3_K *) vx;
|
|
4229
4221
|
|
|
4230
|
-
#if QK_K == 256
|
|
4231
4222
|
const int r = item_ct1.get_local_id(2) / 4;
|
|
4232
4223
|
const int tid = r/2;
|
|
4233
4224
|
const int is0 = r%2;
|
|
@@ -4251,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4251
4242
|
const uint8_t * hm = x[i].hmask;
|
|
4252
4243
|
|
|
4253
4244
|
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
|
4254
|
-
#else
|
|
4255
|
-
const int tid = item_ct1.get_local_id(2);
|
|
4256
|
-
const int is = tid/16; // 0 or 1
|
|
4257
|
-
const int il = tid%16; // 0...15
|
|
4258
|
-
const int im = il/8; // 0...1
|
|
4259
|
-
const int in = il%8; // 0...7
|
|
4260
|
-
|
|
4261
|
-
dst_t * y = yy + i*QK_K + 16*is + il;
|
|
4262
|
-
|
|
4263
|
-
const uint8_t q = x[i].qs[il] >> (2*is);
|
|
4264
|
-
const uint8_t h = x[i].hmask[in] >> (2*is + im);
|
|
4265
|
-
const float d = (float)x[i].d;
|
|
4266
|
-
|
|
4267
|
-
if (is == 0) {
|
|
4268
|
-
y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
|
4269
|
-
y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
|
4270
|
-
} else {
|
|
4271
|
-
y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
|
4272
|
-
y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
|
4273
|
-
}
|
|
4274
|
-
#endif
|
|
4275
|
-
|
|
4276
4245
|
}
|
|
4277
4246
|
|
|
4278
|
-
#if QK_K == 256
|
|
4279
4247
|
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
|
4280
4248
|
if (j < 4) {
|
|
4281
4249
|
d = q[j] & 63; m = q[j + 4] & 63;
|
|
@@ -4284,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
|
|
|
4284
4252
|
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
|
4285
4253
|
}
|
|
4286
4254
|
}
|
|
4287
|
-
#endif
|
|
4288
4255
|
|
|
4289
4256
|
template<typename dst_t>
|
|
4290
4257
|
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
|
@@ -4293,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4293
4260
|
|
|
4294
4261
|
const int i = item_ct1.get_group(2);
|
|
4295
4262
|
|
|
4296
|
-
#if QK_K == 256
|
|
4297
4263
|
// assume 32 threads
|
|
4298
4264
|
const int tid = item_ct1.get_local_id(2);
|
|
4299
4265
|
const int il = tid/8;
|
|
@@ -4317,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4317
4283
|
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
|
4318
4284
|
y[l +32] = d2 * (q[l] >> 4) - m2;
|
|
4319
4285
|
}
|
|
4320
|
-
#else
|
|
4321
|
-
const int tid = item_ct1.get_local_id(2);
|
|
4322
|
-
const uint8_t * q = x[i].qs;
|
|
4323
|
-
dst_t * y = yy + i*QK_K;
|
|
4324
|
-
const float d = (float)x[i].dm[0];
|
|
4325
|
-
const float m = (float)x[i].dm[1];
|
|
4326
|
-
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
|
4327
|
-
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
|
4328
|
-
#endif
|
|
4329
4286
|
}
|
|
4330
4287
|
|
|
4331
4288
|
template<typename dst_t>
|
|
@@ -4335,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4335
4292
|
|
|
4336
4293
|
const int i = item_ct1.get_group(2);
|
|
4337
4294
|
|
|
4338
|
-
#if QK_K == 256
|
|
4339
4295
|
// assume 64 threads - this is very slightly better than the one below
|
|
4340
4296
|
const int tid = item_ct1.get_local_id(2);
|
|
4341
4297
|
const int il = tid/16; // il is in 0...3
|
|
@@ -4362,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4362
4318
|
hm <<= 1;
|
|
4363
4319
|
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
|
4364
4320
|
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
|
4365
|
-
#else
|
|
4366
|
-
const int tid = item_ct1.get_local_id(2);
|
|
4367
|
-
const uint8_t q = x[i].qs[tid];
|
|
4368
|
-
const int im = tid/8; // 0...3
|
|
4369
|
-
const int in = tid%8; // 0...7
|
|
4370
|
-
const int is = tid/16; // 0 or 1
|
|
4371
|
-
const uint8_t h = x[i].qh[in] >> im;
|
|
4372
|
-
const float d = x[i].d;
|
|
4373
|
-
dst_t * y = yy + i*QK_K + tid;
|
|
4374
|
-
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
|
4375
|
-
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
|
4376
|
-
#endif
|
|
4377
4321
|
}
|
|
4378
4322
|
|
|
4379
4323
|
template<typename dst_t>
|
|
@@ -4382,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4382
4326
|
const block_q6_K * x = (const block_q6_K *) vx;
|
|
4383
4327
|
|
|
4384
4328
|
const int i = item_ct1.get_group(2);
|
|
4385
|
-
#if QK_K == 256
|
|
4386
4329
|
|
|
4387
4330
|
// assume 64 threads - this is very slightly better than the one below
|
|
4388
4331
|
const int tid = item_ct1.get_local_id(2);
|
|
@@ -4402,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
4402
4345
|
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
|
4403
4346
|
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
|
4404
4347
|
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
|
4405
|
-
#else
|
|
4406
|
-
|
|
4407
|
-
// assume 32 threads
|
|
4408
|
-
const int tid = item_ct1.get_local_id(2);
|
|
4409
|
-
const int ip = tid/16; // 0 or 1
|
|
4410
|
-
const int il = tid - 16*ip; // 0...15
|
|
4411
|
-
|
|
4412
|
-
dst_t * y = yy + i*QK_K + 16*ip + il;
|
|
4413
|
-
|
|
4414
|
-
const float d = x[i].d;
|
|
4415
|
-
|
|
4416
|
-
const uint8_t ql = x[i].ql[16*ip + il];
|
|
4417
|
-
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
|
4418
|
-
const int8_t * sc = x[i].scales;
|
|
4419
|
-
|
|
4420
|
-
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
|
4421
|
-
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
|
4422
|
-
#endif
|
|
4423
4348
|
}
|
|
4424
4349
|
|
|
4425
4350
|
template<typename dst_t>
|
|
@@ -4433,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
|
|
4433
4358
|
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
|
4434
4359
|
|
|
4435
4360
|
const int tid = item_ct1.get_local_id(2);
|
|
4436
|
-
#if QK_K == 256
|
|
4437
4361
|
const int il = tid/8; // 0...3
|
|
4438
4362
|
const int ib = tid%8; // 0...7
|
|
4439
4363
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4444,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
|
|
4444
4368
|
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
|
4445
4369
|
const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
|
|
4446
4370
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
|
|
4447
|
-
#else
|
|
4448
|
-
assert(false);
|
|
4449
|
-
#endif
|
|
4450
|
-
|
|
4451
4371
|
}
|
|
4452
4372
|
|
|
4453
4373
|
template<typename dst_t>
|
|
@@ -4461,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
|
4461
4381
|
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
|
4462
4382
|
|
|
4463
4383
|
const int tid = item_ct1.get_local_id(2);
|
|
4464
|
-
#if QK_K == 256
|
|
4465
4384
|
const int il = tid/8; // 0...3
|
|
4466
4385
|
const int ib = tid%8; // 0...7
|
|
4467
4386
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4470,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
|
4470
4389
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
|
4471
4390
|
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
|
4472
4391
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
|
4473
|
-
#else
|
|
4474
|
-
assert(false);
|
|
4475
|
-
#endif
|
|
4476
|
-
|
|
4477
4392
|
}
|
|
4478
4393
|
|
|
4479
4394
|
template <typename dst_t>
|
|
@@ -4485,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4485
4400
|
const block_iq2_s * x = (const block_iq2_s *) vx;
|
|
4486
4401
|
|
|
4487
4402
|
const int tid = item_ct1.get_local_id(2);
|
|
4488
|
-
#if QK_K == 256
|
|
4489
4403
|
const int il = tid/8; // 0...3
|
|
4490
4404
|
const int ib = tid%8; // 0...7
|
|
4491
4405
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4493,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4493
4407
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
|
4494
4408
|
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
|
4495
4409
|
#pragma unroll
|
|
4496
|
-
for (int j = 0; j < 8; ++j)
|
|
4410
|
+
for (int j = 0; j < 8; ++j) {
|
|
4497
4411
|
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
|
4498
|
-
|
|
4499
|
-
assert(false);
|
|
4500
|
-
|
|
4501
|
-
#endif
|
|
4502
|
-
|
|
4412
|
+
}
|
|
4503
4413
|
}
|
|
4504
4414
|
|
|
4505
4415
|
template<typename dst_t>
|
|
@@ -4513,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
|
4513
4423
|
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
|
4514
4424
|
|
|
4515
4425
|
const int tid = item_ct1.get_local_id(2);
|
|
4516
|
-
#if QK_K == 256
|
|
4517
4426
|
const int il = tid/8; // 0...3
|
|
4518
4427
|
const int ib = tid%8; // 0...7
|
|
4519
4428
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4528,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
|
4528
4437
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
|
4529
4438
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
|
4530
4439
|
}
|
|
4531
|
-
#else
|
|
4532
|
-
assert(false);
|
|
4533
|
-
#endif
|
|
4534
|
-
|
|
4535
4440
|
}
|
|
4536
4441
|
|
|
4537
4442
|
template <typename dst_t>
|
|
@@ -4544,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4544
4449
|
const block_iq3_s * x = (const block_iq3_s *) vx;
|
|
4545
4450
|
|
|
4546
4451
|
const int tid = item_ct1.get_local_id(2);
|
|
4547
|
-
#if QK_K == 256
|
|
4548
4452
|
const int il = tid/8; // 0...3
|
|
4549
4453
|
const int ib = tid%8; // 0...7
|
|
4550
4454
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4558,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4558
4462
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
|
4559
4463
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
|
4560
4464
|
}
|
|
4561
|
-
#else
|
|
4562
|
-
assert(false);
|
|
4563
|
-
#endif
|
|
4564
|
-
|
|
4565
4465
|
}
|
|
4566
4466
|
|
|
4567
4467
|
template <typename dst_t>
|
|
@@ -4574,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4574
4474
|
const block_iq1_s * x = (const block_iq1_s *) vx;
|
|
4575
4475
|
|
|
4576
4476
|
const int tid = item_ct1.get_local_id(2);
|
|
4577
|
-
#if QK_K == 256
|
|
4578
4477
|
const int il = tid/8; // 0...3
|
|
4579
4478
|
const int ib = tid%8; // 0...7
|
|
4580
4479
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4588,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4588
4487
|
for (int j = 0; j < 8; ++j) {
|
|
4589
4488
|
y[j] = d * (q[j] + delta);
|
|
4590
4489
|
}
|
|
4591
|
-
#else
|
|
4592
|
-
assert(false);
|
|
4593
|
-
#endif
|
|
4594
|
-
|
|
4595
4490
|
}
|
|
4596
4491
|
|
|
4597
4492
|
template <typename dst_t>
|
|
@@ -4604,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4604
4499
|
const block_iq1_m * x = (const block_iq1_m *) vx;
|
|
4605
4500
|
|
|
4606
4501
|
const int tid = item_ct1.get_local_id(2);
|
|
4607
|
-
#if QK_K == 256
|
|
4608
4502
|
const int il = tid/8; // 0...3
|
|
4609
4503
|
const int ib = tid%8; // 0...7
|
|
4610
4504
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
|
@@ -4622,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
|
|
4622
4516
|
for (int j = 0; j < 8; ++j) {
|
|
4623
4517
|
y[j] = d * (q[j] + delta);
|
|
4624
4518
|
}
|
|
4625
|
-
#else
|
|
4626
|
-
assert(false);
|
|
4627
|
-
#endif
|
|
4628
|
-
|
|
4629
4519
|
}
|
|
4630
4520
|
|
|
4631
4521
|
template <typename dst_t>
|
|
@@ -4699,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
|
|
4699
4589
|
|
|
4700
4590
|
float tmp = 0; // partial sum for thread in warp
|
|
4701
4591
|
|
|
4702
|
-
#if QK_K == 256
|
|
4703
4592
|
const int tid =
|
|
4704
4593
|
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
|
4705
4594
|
const int ix =
|
|
@@ -4750,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
|
|
4750
4639
|
tmp += dall * sum1 - dmin * sum2;
|
|
4751
4640
|
|
|
4752
4641
|
}
|
|
4753
|
-
#else
|
|
4754
|
-
const int tid = item_ct1.get_local_id(2) /
|
|
4755
|
-
(2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
|
4756
|
-
const int ix = item_ct1.get_local_id(2) %
|
|
4757
|
-
(2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
|
4758
|
-
const int offset = tid * K_QUANTS_PER_ITERATION;
|
|
4759
|
-
|
|
4760
|
-
uint32_t uaux[2];
|
|
4761
|
-
const uint8_t * d = (const uint8_t *)uaux;
|
|
4762
|
-
|
|
4763
|
-
|
|
4764
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
4765
|
-
|
|
4766
|
-
const float * y = yy + i * QK_K + offset;
|
|
4767
|
-
const uint8_t * q = x[i].qs + offset;
|
|
4768
|
-
const uint32_t * s = (const uint32_t *)x[i].scales;
|
|
4769
|
-
|
|
4770
|
-
uaux[0] = s[0] & 0x0f0f0f0f;
|
|
4771
|
-
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
|
4772
|
-
|
|
4773
|
-
const sycl::float2 dall =
|
|
4774
|
-
x[i].dm.convert<float, sycl::rounding_mode::automatic>();
|
|
4775
|
-
|
|
4776
|
-
float sum1 = 0, sum2 = 0;
|
|
4777
|
-
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
|
4778
|
-
const uint8_t ql = q[l];
|
|
4779
|
-
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
|
4780
|
-
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
|
4781
|
-
+ y[l+32] * d[2] * ((ql >> 4) & 3)
|
|
4782
|
-
+ y[l+48] * d[3] * ((ql >> 6) & 3);
|
|
4783
|
-
sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
|
|
4784
|
-
}
|
|
4785
|
-
tmp += dall.x() * sum1 - dall.y() * sum2;
|
|
4786
|
-
}
|
|
4787
|
-
|
|
4788
|
-
#endif
|
|
4789
4642
|
|
|
4790
4643
|
// sum up partial sums and write back result
|
|
4791
4644
|
#pragma unroll
|
|
@@ -4823,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
|
|
4823
4676
|
|
|
4824
4677
|
float tmp = 0; // partial sum for thread in warp
|
|
4825
4678
|
|
|
4826
|
-
#if QK_K == 256
|
|
4827
|
-
|
|
4828
4679
|
const uint16_t kmask1 = 0x0303;
|
|
4829
4680
|
const uint16_t kmask2 = 0x0f0f;
|
|
4830
4681
|
|
|
@@ -4877,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
|
|
4877
4728
|
tmp += d * sum;
|
|
4878
4729
|
|
|
4879
4730
|
}
|
|
4880
|
-
#else
|
|
4881
|
-
|
|
4882
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
|
4883
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
|
4884
|
-
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
|
4885
|
-
const int in = offset/8; // 0 or 1
|
|
4886
|
-
const int im = offset%8; // 0...7
|
|
4887
|
-
|
|
4888
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
4889
|
-
|
|
4890
|
-
const float * y = yy + i * QK_K + offset;
|
|
4891
|
-
const uint8_t * q = x[i].qs + offset;
|
|
4892
|
-
const uint8_t * s = x[i].scales;
|
|
4893
|
-
|
|
4894
|
-
const float dall = (float)x[i].d;
|
|
4895
|
-
|
|
4896
|
-
float sum = 0;
|
|
4897
|
-
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
|
4898
|
-
const uint8_t hl = x[i].hmask[im+l] >> in;
|
|
4899
|
-
const uint8_t ql = q[l];
|
|
4900
|
-
sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
|
|
4901
|
-
+ y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
|
|
4902
|
-
+ y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
|
|
4903
|
-
+ y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
|
|
4904
|
-
}
|
|
4905
|
-
tmp += sum;
|
|
4906
|
-
}
|
|
4907
|
-
#endif
|
|
4908
4731
|
|
|
4909
4732
|
// sum up partial sums and write back result
|
|
4910
4733
|
#pragma unroll
|
|
@@ -4939,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
|
|
4939
4762
|
|
|
4940
4763
|
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
|
4941
4764
|
|
|
4942
|
-
#if QK_K == 256
|
|
4943
4765
|
const uint16_t kmask1 = 0x3f3f;
|
|
4944
4766
|
const uint16_t kmask2 = 0x0f0f;
|
|
4945
4767
|
const uint16_t kmask3 = 0xc0c0;
|
|
@@ -5028,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
|
|
5028
4850
|
#endif
|
|
5029
4851
|
|
|
5030
4852
|
}
|
|
5031
|
-
#else
|
|
5032
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
|
5033
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
|
5034
|
-
|
|
5035
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
|
5036
|
-
|
|
5037
|
-
uint16_t aux16[2];
|
|
5038
|
-
const uint8_t * s = (const uint8_t *)aux16;
|
|
5039
|
-
|
|
5040
|
-
float tmp = 0;
|
|
5041
|
-
|
|
5042
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
5043
|
-
const uint8_t * q = x[i].qs + step;
|
|
5044
|
-
const float * y = yy + i*QK_K + step;
|
|
5045
|
-
const uint16_t * a = (const uint16_t *)x[i].scales;
|
|
5046
|
-
aux16[0] = a[0] & 0x0f0f;
|
|
5047
|
-
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
|
5048
|
-
const float d = (float)x[i].dm[0];
|
|
5049
|
-
const float m = (float)x[i].dm[1];
|
|
5050
|
-
float sum = 0.f;
|
|
5051
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
|
5052
|
-
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
|
5053
|
-
+ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
|
|
5054
|
-
+ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
|
|
5055
|
-
+ y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
|
|
5056
|
-
}
|
|
5057
|
-
tmp += sum;
|
|
5058
|
-
}
|
|
5059
|
-
|
|
5060
|
-
#endif
|
|
5061
4853
|
|
|
5062
4854
|
// sum up partial sums and write back result
|
|
5063
4855
|
#pragma unroll
|
|
@@ -5092,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
|
|
5092
4884
|
|
|
5093
4885
|
float tmp = 0; // partial sum for thread in warp
|
|
5094
4886
|
|
|
5095
|
-
#if QK_K == 256
|
|
5096
4887
|
const uint16_t kmask1 = 0x3f3f;
|
|
5097
4888
|
const uint16_t kmask2 = 0x0f0f;
|
|
5098
4889
|
const uint16_t kmask3 = 0xc0c0;
|
|
@@ -5169,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
|
|
5169
4960
|
dmin * smin;
|
|
5170
4961
|
}
|
|
5171
4962
|
|
|
5172
|
-
#else
|
|
5173
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
|
5174
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
|
5175
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
|
5176
|
-
const int im = step/8;
|
|
5177
|
-
const int in = step%8;
|
|
5178
|
-
|
|
5179
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
5180
|
-
const uint8_t * q = x[i].qs + step;
|
|
5181
|
-
const int8_t * s = x[i].scales;
|
|
5182
|
-
const float * y = yy + i*QK_K + step;
|
|
5183
|
-
const float d = x[i].d;
|
|
5184
|
-
float sum = 0.f;
|
|
5185
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
|
5186
|
-
const uint8_t h = x[i].qh[in+j] >> im;
|
|
5187
|
-
sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
|
|
5188
|
-
+ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
|
|
5189
|
-
+ y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
|
|
5190
|
-
+ y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
|
|
5191
|
-
}
|
|
5192
|
-
tmp += sum;
|
|
5193
|
-
}
|
|
5194
|
-
#endif
|
|
5195
|
-
|
|
5196
4963
|
// sum up partial sums and write back result
|
|
5197
4964
|
#pragma unroll
|
|
5198
4965
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
@@ -5219,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
|
|
5219
4986
|
|
|
5220
4987
|
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
|
5221
4988
|
|
|
5222
|
-
#if QK_K == 256
|
|
5223
|
-
|
|
5224
4989
|
const int tid =
|
|
5225
4990
|
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
|
5226
4991
|
const int ix =
|
|
@@ -5277,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
|
|
5277
5042
|
|
|
5278
5043
|
}
|
|
5279
5044
|
|
|
5280
|
-
#else
|
|
5281
|
-
|
|
5282
|
-
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
|
|
5283
|
-
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
|
|
5284
|
-
|
|
5285
|
-
const int step = tid * K_QUANTS_PER_ITERATION;
|
|
5286
|
-
|
|
5287
|
-
float tmp = 0; // partial sum for thread in warp
|
|
5288
|
-
|
|
5289
|
-
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
5290
|
-
|
|
5291
|
-
const float * y = yy + i * QK_K + step;
|
|
5292
|
-
const uint8_t * ql = x[i].ql + step;
|
|
5293
|
-
const uint8_t * qh = x[i].qh + step;
|
|
5294
|
-
const int8_t * s = x[i].scales;
|
|
5295
|
-
|
|
5296
|
-
const float d = x[i+0].d;
|
|
5297
|
-
|
|
5298
|
-
float sum = 0;
|
|
5299
|
-
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
|
5300
|
-
sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
|
|
5301
|
-
+ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
|
|
5302
|
-
+ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
|
|
5303
|
-
+ y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
|
|
5304
|
-
}
|
|
5305
|
-
tmp += sum;
|
|
5306
|
-
|
|
5307
|
-
}
|
|
5308
|
-
|
|
5309
|
-
#endif
|
|
5310
|
-
|
|
5311
5045
|
// sum up partial sums and write back result
|
|
5312
5046
|
#pragma unroll
|
|
5313
5047
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
|
@@ -6852,7 +6586,6 @@ static __dpct_inline__ float
|
|
|
6852
6586
|
vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
|
6853
6587
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
|
6854
6588
|
|
|
6855
|
-
#ifndef GGML_QKK_64
|
|
6856
6589
|
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
|
6857
6590
|
|
|
6858
6591
|
int v[2];
|
|
@@ -6894,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
|
|
6894
6627
|
}
|
|
6895
6628
|
|
|
6896
6629
|
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
|
6897
|
-
|
|
6898
|
-
#else
|
|
6899
|
-
|
|
6900
|
-
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
|
6901
|
-
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
|
6902
|
-
|
|
6903
|
-
float sumf_d = 0.0f;
|
|
6904
|
-
float sumf_m = 0.0f;
|
|
6905
|
-
|
|
6906
|
-
uint16_t aux16[2];
|
|
6907
|
-
const uint8_t * s = (const uint8_t *)aux16;
|
|
6908
|
-
|
|
6909
|
-
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
|
6910
|
-
aux16[0] = a[0] & 0x0f0f;
|
|
6911
|
-
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
|
6912
|
-
|
|
6913
|
-
const float dall = bq4_K->dm[0];
|
|
6914
|
-
const float dmin = bq4_K->dm[1];
|
|
6915
|
-
|
|
6916
|
-
const float d8_1 = bq8_1[0].ds[0];
|
|
6917
|
-
const float d8_2 = bq8_1[1].ds[1];
|
|
6918
|
-
|
|
6919
|
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
|
6920
|
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
|
6921
|
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
|
6922
|
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
|
6923
|
-
|
|
6924
|
-
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
|
6925
|
-
const int v1 = q4[0];
|
|
6926
|
-
const int v2 = q4[4];
|
|
6927
|
-
|
|
6928
|
-
const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
|
6929
|
-
const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
|
6930
|
-
const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
|
|
6931
|
-
const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
|
|
6932
|
-
|
|
6933
|
-
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
|
6934
|
-
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
|
6935
|
-
|
|
6936
|
-
return dall * sumf_d - dmin * sumf_m;
|
|
6937
|
-
|
|
6938
|
-
#else
|
|
6939
|
-
bad_arch();
|
|
6940
|
-
#endif // __SYCL_ARCH__ >= VER_4VEC
|
|
6941
|
-
|
|
6942
|
-
#endif
|
|
6943
6630
|
}
|
|
6944
6631
|
|
|
6945
6632
|
template <int mmq_y>
|
|
@@ -6998,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
|
|
6998
6685
|
|
|
6999
6686
|
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
|
7000
6687
|
|
|
7001
|
-
#if QK_K == 256
|
|
7002
6688
|
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
|
7003
|
-
#else
|
|
7004
|
-
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
|
|
7005
|
-
#endif
|
|
7006
6689
|
}
|
|
7007
6690
|
|
|
7008
6691
|
#pragma unroll
|
|
@@ -7045,7 +6728,6 @@ static __dpct_inline__ float
|
|
|
7045
6728
|
vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
|
7046
6729
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
|
7047
6730
|
|
|
7048
|
-
#ifndef GGML_QKK_64
|
|
7049
6731
|
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
|
7050
6732
|
|
|
7051
6733
|
int vl[2];
|
|
@@ -7087,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
|
|
7087
6769
|
}
|
|
7088
6770
|
|
|
7089
6771
|
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
|
7090
|
-
|
|
7091
|
-
#else
|
|
7092
|
-
|
|
7093
|
-
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
|
7094
|
-
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
|
7095
|
-
|
|
7096
|
-
const int8_t * s = bq5_K->scales;
|
|
7097
|
-
|
|
7098
|
-
const float d = bq5_K->d;
|
|
7099
|
-
|
|
7100
|
-
const float d8_1 = bq8_1[0].ds[0];
|
|
7101
|
-
const float d8_2 = bq8_1[1].ds[1];
|
|
7102
|
-
|
|
7103
|
-
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
|
7104
|
-
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
|
7105
|
-
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
|
7106
|
-
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
|
7107
|
-
|
|
7108
|
-
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
|
7109
|
-
const int vl1 = ql[0];
|
|
7110
|
-
const int vl2 = ql[4];
|
|
7111
|
-
|
|
7112
|
-
const int step = 4 * (iqs/2); // 0, 4, 8, 12
|
|
7113
|
-
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
|
|
7114
|
-
const int in = step%8; // 0, 4, 0, 4
|
|
7115
|
-
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
|
7116
|
-
|
|
7117
|
-
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
|
|
7118
|
-
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
|
|
7119
|
-
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
|
7120
|
-
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
|
7121
|
-
|
|
7122
|
-
const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
|
|
7123
|
-
+ d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
|
|
7124
|
-
|
|
7125
|
-
return d * sumf_d;
|
|
7126
|
-
|
|
7127
|
-
#else
|
|
7128
|
-
bad_arch();
|
|
7129
|
-
#endif // __SYCL_ARCH__ >= VER_4VEC
|
|
7130
|
-
|
|
7131
|
-
#endif
|
|
7132
6772
|
}
|
|
7133
6773
|
|
|
7134
6774
|
template <int mmq_y>
|
|
@@ -7200,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
|
|
7200
6840
|
|
|
7201
6841
|
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
|
7202
6842
|
|
|
7203
|
-
#if QK_K == 256
|
|
7204
6843
|
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
|
7205
|
-
#endif
|
|
7206
6844
|
}
|
|
7207
6845
|
|
|
7208
6846
|
#pragma unroll
|
|
@@ -7382,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|
|
7382
7020
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
|
7383
7021
|
const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
|
|
7384
7022
|
const uint8_t *kmask_iq2xs) {
|
|
7385
|
-
#if QK_K == 256
|
|
7386
7023
|
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
|
7387
7024
|
|
|
7388
7025
|
#if QR2_XXS == 8
|
|
@@ -7423,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|
|
7423
7060
|
}
|
|
7424
7061
|
return d * (sumi1 + sumi2);
|
|
7425
7062
|
#endif
|
|
7426
|
-
#else
|
|
7427
|
-
assert(false);
|
|
7428
|
-
return 0.f;
|
|
7429
|
-
#endif
|
|
7430
7063
|
}
|
|
7431
7064
|
|
|
7432
7065
|
static __dpct_inline__ float
|
|
@@ -7435,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
|
7435
7068
|
const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
|
|
7436
7069
|
#if DPCT_COMPATIBILITY_TEMP >= \
|
|
7437
7070
|
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
|
7438
|
-
#if QK_K == 256
|
|
7439
7071
|
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
|
7440
7072
|
|
|
7441
7073
|
const int ib32 = iqs;
|
|
@@ -7473,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
|
7473
7105
|
assert(false);
|
|
7474
7106
|
return 0.f;
|
|
7475
7107
|
#endif
|
|
7476
|
-
#else
|
|
7477
|
-
assert(false);
|
|
7478
|
-
return 0.f;
|
|
7479
|
-
#endif
|
|
7480
7108
|
}
|
|
7481
7109
|
|
|
7482
7110
|
static __dpct_inline__ float
|
|
7483
7111
|
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
|
7484
7112
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
|
7485
|
-
#if QK_K == 256
|
|
7486
7113
|
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
|
7487
7114
|
|
|
7488
7115
|
const int ib32 = iqs;
|
|
@@ -7526,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
|
|
7526
7153
|
}
|
|
7527
7154
|
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
|
|
7528
7155
|
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
|
7529
|
-
#else
|
|
7530
|
-
assert(false);
|
|
7531
|
-
#endif
|
|
7532
7156
|
}
|
|
7533
7157
|
|
|
7534
7158
|
static __dpct_inline__ float
|
|
@@ -7537,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
|
7537
7161
|
const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
|
|
7538
7162
|
#if DPCT_COMPATIBILITY_TEMP >= \
|
|
7539
7163
|
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
|
7540
|
-
#if QK_K == 256
|
|
7541
7164
|
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
|
7542
7165
|
|
|
7543
7166
|
const int ib32 = iqs;
|
|
@@ -7565,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
|
7565
7188
|
assert(false);
|
|
7566
7189
|
return 0.f;
|
|
7567
7190
|
#endif
|
|
7568
|
-
#else
|
|
7569
|
-
assert(false);
|
|
7570
|
-
return 0.f;
|
|
7571
|
-
#endif
|
|
7572
7191
|
}
|
|
7573
7192
|
|
|
7574
7193
|
static __dpct_inline__ float
|
|
7575
7194
|
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|
7576
7195
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
|
7577
7196
|
const uint32_t *iq3s_grid) {
|
|
7578
|
-
#if QK_K == 256
|
|
7579
7197
|
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
|
7580
7198
|
|
|
7581
7199
|
const int ib32 = iqs;
|
|
@@ -7604,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|
|
7604
7222
|
(1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
|
|
7605
7223
|
bq8_1[ib32].ds[0];
|
|
7606
7224
|
return d * sumi;
|
|
7607
|
-
#else
|
|
7608
|
-
assert(false);
|
|
7609
|
-
#endif
|
|
7610
7225
|
}
|
|
7611
7226
|
|
|
7612
7227
|
static __dpct_inline__ float
|
|
7613
7228
|
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
|
7614
7229
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
|
7615
7230
|
const uint32_t *iq1s_grid_gpu) {
|
|
7616
|
-
#if QK_K == 256
|
|
7617
7231
|
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
|
7618
7232
|
|
|
7619
7233
|
const int ib32 = iqs;
|
|
@@ -7632,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
|
|
7632
7246
|
const float d = d1q * bq8_1[ib32].ds[0];
|
|
7633
7247
|
const float m = d1q * bq8_1[ib32].ds[1];
|
|
7634
7248
|
return d * sumi + m * delta;
|
|
7635
|
-
#else
|
|
7636
|
-
assert(false);
|
|
7637
|
-
#endif
|
|
7638
7249
|
}
|
|
7639
7250
|
|
|
7640
7251
|
static __dpct_inline__ float
|
|
7641
7252
|
vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
|
7642
7253
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
|
7643
|
-
#if QK_K == 256
|
|
7644
7254
|
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
|
7645
7255
|
|
|
7646
7256
|
const int ib32 = iqs;
|
|
@@ -7665,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
|
|
7665
7275
|
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
|
7666
7276
|
const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
|
|
7667
7277
|
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
|
7668
|
-
#else
|
|
7669
|
-
assert(false);
|
|
7670
|
-
#endif
|
|
7671
7278
|
}
|
|
7672
7279
|
|
|
7673
7280
|
static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
|
|
@@ -7715,7 +7322,6 @@ static __dpct_inline__ float
|
|
|
7715
7322
|
vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
|
7716
7323
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
|
7717
7324
|
|
|
7718
|
-
#if QK_K == 256
|
|
7719
7325
|
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
|
7720
7326
|
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
|
7721
7327
|
|
|
@@ -7733,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
|
|
7733
7339
|
sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
|
|
7734
7340
|
}
|
|
7735
7341
|
return d * (sumi1 + sumi2);
|
|
7736
|
-
#else
|
|
7737
|
-
assert(false);
|
|
7738
|
-
#endif
|
|
7739
7342
|
}
|
|
7740
7343
|
|
|
7741
7344
|
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
|
|
@@ -9227,12 +8830,11 @@ static void rope(
|
|
|
9227
8830
|
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
|
9228
8831
|
}
|
|
9229
8832
|
|
|
9230
|
-
template<typename T, bool has_pos>
|
|
8833
|
+
template<typename T, bool has_pos, bool has_freq_facs>
|
|
9231
8834
|
static void rope_neox(
|
|
9232
8835
|
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
|
9233
|
-
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
|
|
9234
|
-
,
|
|
9235
|
-
const sycl::nd_item<3> &item_ct1) {
|
|
8836
|
+
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims,
|
|
8837
|
+
const float * freq_factors, const sycl::nd_item<3> &item_ct1) {
|
|
9236
8838
|
const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
|
9237
8839
|
item_ct1.get_local_id(1));
|
|
9238
8840
|
|
|
@@ -9260,8 +8862,10 @@ static void rope_neox(
|
|
|
9260
8862
|
float cur_rot = inv_ndims * ic - ib;
|
|
9261
8863
|
|
|
9262
8864
|
const int p = has_pos ? pos[i2] : 0;
|
|
8865
|
+
const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
|
|
8866
|
+
|
|
9263
8867
|
const float theta_base =
|
|
9264
|
-
p * freq_scale * dpct::pow(theta_scale, col / 2.0f);
|
|
8868
|
+
p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor;
|
|
9265
8869
|
|
|
9266
8870
|
float cos_theta, sin_theta;
|
|
9267
8871
|
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
|
@@ -9316,32 +8920,6 @@ static void rope_glm_f32(
|
|
|
9316
8920
|
dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
|
|
9317
8921
|
}
|
|
9318
8922
|
|
|
9319
|
-
static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
|
|
9320
|
-
const int n_heads_log2_floor, const float m0, const float m1,
|
|
9321
|
-
const sycl::nd_item<3> &item_ct1) {
|
|
9322
|
-
const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
9323
|
-
item_ct1.get_local_id(2);
|
|
9324
|
-
|
|
9325
|
-
if (col >= ncols) {
|
|
9326
|
-
return;
|
|
9327
|
-
}
|
|
9328
|
-
|
|
9329
|
-
const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
|
9330
|
-
item_ct1.get_local_id(1);
|
|
9331
|
-
const int i = row*ncols + col;
|
|
9332
|
-
|
|
9333
|
-
const int k = row/k_rows;
|
|
9334
|
-
|
|
9335
|
-
float m_k;
|
|
9336
|
-
if (k < n_heads_log2_floor) {
|
|
9337
|
-
m_k = dpct::pow(m0, k + 1);
|
|
9338
|
-
} else {
|
|
9339
|
-
m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
|
|
9340
|
-
}
|
|
9341
|
-
|
|
9342
|
-
dst[i] = col * m_k + x[i];
|
|
9343
|
-
}
|
|
9344
|
-
|
|
9345
8923
|
static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
|
|
9346
8924
|
const sycl::nd_item<3> &item_ct1) {
|
|
9347
8925
|
const int row = item_ct1.get_group(1);
|
|
@@ -9443,7 +9021,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
|
|
|
9443
9021
|
|
|
9444
9022
|
|
|
9445
9023
|
template <bool vals_smem, int ncols_template, int block_size_template>
|
|
9446
|
-
static void soft_max_f32(const float * x, const float * mask,
|
|
9024
|
+
static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
|
|
9447
9025
|
const int nrows_y, const float scale, const float max_bias, const float m0,
|
|
9448
9026
|
const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
|
|
9449
9027
|
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
|
|
@@ -9457,7 +9035,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
|
|
|
9457
9035
|
const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
|
9458
9036
|
const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
|
9459
9037
|
|
|
9460
|
-
float slope =
|
|
9038
|
+
float slope = 1.0f;
|
|
9461
9039
|
|
|
9462
9040
|
// ALiBi
|
|
9463
9041
|
if (max_bias > 0.0f) {
|
|
@@ -9482,7 +9060,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
|
|
|
9482
9060
|
const int ix = rowx*ncols + col;
|
|
9483
9061
|
const int iy = rowy*ncols + col;
|
|
9484
9062
|
|
|
9485
|
-
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f)
|
|
9063
|
+
const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);
|
|
9486
9064
|
|
|
9487
9065
|
vals[col] = val;
|
|
9488
9066
|
max_val = sycl::max(max_val, val);
|
|
@@ -10112,18 +9690,17 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
|
|
10112
9690
|
});
|
|
10113
9691
|
}
|
|
10114
9692
|
|
|
10115
|
-
static void upscale_f32_sycl(const float *x, float *dst, const int
|
|
10116
|
-
const int
|
|
10117
|
-
const int
|
|
10118
|
-
|
|
10119
|
-
int
|
|
10120
|
-
|
|
9693
|
+
static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
|
|
9694
|
+
const int nb02, const int nb03, const int ne10, const int ne11,
|
|
9695
|
+
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
9696
|
+
const float sf2, const float sf3, dpct::queue_ptr stream) {
|
|
9697
|
+
int dst_size = ne10 * ne11 * ne12 * ne13;
|
|
9698
|
+
int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
|
|
9699
|
+
sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
|
|
10121
9700
|
stream->parallel_for(
|
|
10122
|
-
sycl::nd_range<
|
|
10123
|
-
|
|
10124
|
-
|
|
10125
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
10126
|
-
upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
|
|
9701
|
+
sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
|
|
9702
|
+
[=](sycl::nd_item<1> item_ct1) {
|
|
9703
|
+
upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
|
|
10127
9704
|
});
|
|
10128
9705
|
}
|
|
10129
9706
|
|
|
@@ -10225,7 +9802,6 @@ template <typename dst_t>
|
|
|
10225
9802
|
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10226
9803
|
dpct::queue_ptr stream) {
|
|
10227
9804
|
const int nb = k / QK_K;
|
|
10228
|
-
#if QK_K == 256
|
|
10229
9805
|
{
|
|
10230
9806
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
10231
9807
|
{sycl::aspect::fp16});
|
|
@@ -10237,27 +9813,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
|
|
10237
9813
|
dequantize_block_q2_K(vx, y, item_ct1);
|
|
10238
9814
|
});
|
|
10239
9815
|
}
|
|
10240
|
-
#else
|
|
10241
|
-
{
|
|
10242
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
|
10243
|
-
{sycl::aspect::fp16});
|
|
10244
|
-
|
|
10245
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
|
10246
|
-
sycl::range<3>(1, 1, 32),
|
|
10247
|
-
sycl::range<3>(1, 1, 32)),
|
|
10248
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
10249
|
-
dequantize_block_q2_K(vx, y, item_ct1);
|
|
10250
|
-
});
|
|
10251
|
-
}
|
|
10252
|
-
|
|
10253
|
-
#endif
|
|
10254
9816
|
}
|
|
10255
9817
|
|
|
10256
9818
|
template <typename dst_t>
|
|
10257
9819
|
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10258
9820
|
dpct::queue_ptr stream) {
|
|
10259
9821
|
const int nb = k / QK_K;
|
|
10260
|
-
#if QK_K == 256
|
|
10261
9822
|
{
|
|
10262
9823
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
10263
9824
|
{sycl::aspect::fp16});
|
|
@@ -10269,19 +9830,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
|
|
10269
9830
|
dequantize_block_q3_K(vx, y, item_ct1);
|
|
10270
9831
|
});
|
|
10271
9832
|
}
|
|
10272
|
-
#else
|
|
10273
|
-
{
|
|
10274
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
|
10275
|
-
{sycl::aspect::fp16});
|
|
10276
|
-
|
|
10277
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
|
10278
|
-
sycl::range<3>(1, 1, 32),
|
|
10279
|
-
sycl::range<3>(1, 1, 32)),
|
|
10280
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
10281
|
-
dequantize_block_q3_K(vx, y, item_ct1);
|
|
10282
|
-
});
|
|
10283
|
-
}
|
|
10284
|
-
#endif
|
|
10285
9833
|
}
|
|
10286
9834
|
|
|
10287
9835
|
template <typename dst_t>
|
|
@@ -10342,7 +9890,6 @@ template <typename dst_t>
|
|
|
10342
9890
|
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10343
9891
|
dpct::queue_ptr stream) {
|
|
10344
9892
|
const int nb = k / QK_K;
|
|
10345
|
-
#if QK_K == 256
|
|
10346
9893
|
{
|
|
10347
9894
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
10348
9895
|
{sycl::aspect::fp16});
|
|
@@ -10354,27 +9901,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
|
|
10354
9901
|
dequantize_block_q5_K(vx, y, item_ct1);
|
|
10355
9902
|
});
|
|
10356
9903
|
}
|
|
10357
|
-
#else
|
|
10358
|
-
{
|
|
10359
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
|
10360
|
-
{sycl::aspect::fp16});
|
|
10361
|
-
|
|
10362
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
|
10363
|
-
sycl::range<3>(1, 1, 32),
|
|
10364
|
-
sycl::range<3>(1, 1, 32)),
|
|
10365
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
10366
|
-
dequantize_block_q5_K(vx, y, item_ct1);
|
|
10367
|
-
});
|
|
10368
|
-
}
|
|
10369
|
-
|
|
10370
|
-
#endif
|
|
10371
9904
|
}
|
|
10372
9905
|
|
|
10373
9906
|
template <typename dst_t>
|
|
10374
9907
|
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10375
9908
|
dpct::queue_ptr stream) {
|
|
10376
9909
|
const int nb = k / QK_K;
|
|
10377
|
-
#if QK_K == 256
|
|
10378
9910
|
{
|
|
10379
9911
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
10380
9912
|
{sycl::aspect::fp16});
|
|
@@ -10386,20 +9918,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
|
|
10386
9918
|
dequantize_block_q6_K(vx, y, item_ct1);
|
|
10387
9919
|
});
|
|
10388
9920
|
}
|
|
10389
|
-
#else
|
|
10390
|
-
{
|
|
10391
|
-
dpct::has_capability_or_fail(stream->get_device(),
|
|
10392
|
-
{sycl::aspect::fp16});
|
|
10393
|
-
|
|
10394
|
-
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
|
10395
|
-
sycl::range<3>(1, 1, 32),
|
|
10396
|
-
sycl::range<3>(1, 1, 32)),
|
|
10397
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
10398
|
-
dequantize_block_q6_K(vx, y, item_ct1);
|
|
10399
|
-
});
|
|
10400
|
-
}
|
|
10401
|
-
|
|
10402
|
-
#endif
|
|
10403
9921
|
}
|
|
10404
9922
|
|
|
10405
9923
|
template <typename dst_t>
|
|
@@ -10551,9 +10069,6 @@ template <typename dst_t>
|
|
|
10551
10069
|
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
|
10552
10070
|
dpct::queue_ptr stream) {
|
|
10553
10071
|
const int nb = (k + QK_K - 1) / QK_K;
|
|
10554
|
-
#if QK_K == 64
|
|
10555
|
-
dequantize_row_iq4_nl_sycl(vx, y, k, stream);
|
|
10556
|
-
#else
|
|
10557
10072
|
{
|
|
10558
10073
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
10559
10074
|
{sycl::aspect::fp16});
|
|
@@ -10568,7 +10083,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
|
|
10568
10083
|
});
|
|
10569
10084
|
});
|
|
10570
10085
|
}
|
|
10571
|
-
#endif
|
|
10572
10086
|
}
|
|
10573
10087
|
|
|
10574
10088
|
|
|
@@ -12073,8 +11587,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
|
12073
11587
|
const int nrows_y, const int nrows_dst,
|
|
12074
11588
|
dpct::queue_ptr stream) try {
|
|
12075
11589
|
|
|
12076
|
-
#if QK_K == 256
|
|
12077
|
-
|
|
12078
11590
|
int id;
|
|
12079
11591
|
SYCL_CHECK(
|
|
12080
11592
|
CHECK_TRY_ERROR(id = get_current_device_id()));
|
|
@@ -12189,7 +11701,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
|
12189
11701
|
});
|
|
12190
11702
|
}
|
|
12191
11703
|
}
|
|
12192
|
-
#endif
|
|
12193
11704
|
}
|
|
12194
11705
|
catch (sycl::exception const &exc) {
|
|
12195
11706
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
|
@@ -12903,7 +12414,7 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
|
|
|
12903
12414
|
const int32_t *pos, float freq_scale,
|
|
12904
12415
|
int p_delta_rows, float freq_base, float ext_factor,
|
|
12905
12416
|
float attn_factor, rope_corr_dims corr_dims,
|
|
12906
|
-
dpct::queue_ptr stream) {
|
|
12417
|
+
const float * freq_factors, dpct::queue_ptr stream) {
|
|
12907
12418
|
GGML_ASSERT(ncols % 2 == 0);
|
|
12908
12419
|
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
|
12909
12420
|
const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
|
|
@@ -12913,38 +12424,48 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
|
|
|
12913
12424
|
const float inv_ndims = -1.0f / n_dims;
|
|
12914
12425
|
|
|
12915
12426
|
if (pos == nullptr) {
|
|
12916
|
-
/*
|
|
12917
|
-
DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
|
|
12918
|
-
the limit. To get the device limit, query
|
|
12919
|
-
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
12920
|
-
*/
|
|
12921
12427
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
12922
12428
|
{sycl::aspect::fp16});
|
|
12923
|
-
|
|
12924
|
-
|
|
12925
|
-
|
|
12926
|
-
|
|
12927
|
-
|
|
12928
|
-
|
|
12929
|
-
|
|
12930
|
-
|
|
12931
|
-
|
|
12429
|
+
if (freq_factors == nullptr) {
|
|
12430
|
+
stream->parallel_for(
|
|
12431
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
12432
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
12433
|
+
rope_neox<T, false, false>(x, dst, ncols, n_dims, pos, freq_scale,
|
|
12434
|
+
p_delta_rows, ext_factor, attn_factor,
|
|
12435
|
+
corr_dims, theta_scale, inv_ndims, freq_factors,
|
|
12436
|
+
item_ct1);
|
|
12437
|
+
});
|
|
12438
|
+
} else {
|
|
12439
|
+
stream->parallel_for(
|
|
12440
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
12441
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
12442
|
+
rope_neox<T, false, true>(x, dst, ncols, n_dims, pos, freq_scale,
|
|
12443
|
+
p_delta_rows, ext_factor, attn_factor,
|
|
12444
|
+
corr_dims, theta_scale, inv_ndims, freq_factors,
|
|
12445
|
+
item_ct1);
|
|
12446
|
+
});
|
|
12447
|
+
}
|
|
12932
12448
|
} else {
|
|
12933
|
-
/*
|
|
12934
|
-
DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
|
|
12935
|
-
the limit. To get the device limit, query
|
|
12936
|
-
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
12937
|
-
*/
|
|
12938
12449
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
12939
12450
|
{sycl::aspect::fp16});
|
|
12940
12451
|
|
|
12941
|
-
|
|
12942
|
-
|
|
12943
|
-
|
|
12944
|
-
|
|
12945
|
-
|
|
12946
|
-
|
|
12947
|
-
|
|
12452
|
+
if (freq_factors == nullptr) {
|
|
12453
|
+
stream->parallel_for(
|
|
12454
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
12455
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
12456
|
+
rope_neox<T, true, false>(x, dst, ncols, n_dims, pos, freq_scale,
|
|
12457
|
+
p_delta_rows, ext_factor, attn_factor,
|
|
12458
|
+
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
|
|
12459
|
+
});
|
|
12460
|
+
} else {
|
|
12461
|
+
stream->parallel_for(
|
|
12462
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
12463
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
12464
|
+
rope_neox<T, true, true>(x, dst, ncols, n_dims, pos, freq_scale,
|
|
12465
|
+
p_delta_rows, ext_factor, attn_factor,
|
|
12466
|
+
corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
|
|
12467
|
+
});
|
|
12468
|
+
}
|
|
12948
12469
|
}
|
|
12949
12470
|
}
|
|
12950
12471
|
|
|
@@ -12964,20 +12485,6 @@ static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
|
|
|
12964
12485
|
});
|
|
12965
12486
|
}
|
|
12966
12487
|
|
|
12967
|
-
static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
|
|
12968
|
-
const int nrows, const int k_rows,
|
|
12969
|
-
const int n_heads_log2_floor, const float m0,
|
|
12970
|
-
const float m1, dpct::queue_ptr stream) {
|
|
12971
|
-
const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
|
|
12972
|
-
const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
|
|
12973
|
-
const sycl::range<3> block_nums(1, nrows, num_blocks_x);
|
|
12974
|
-
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
12975
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
12976
|
-
alibi_f32(x, dst, ncols, k_rows,
|
|
12977
|
-
n_heads_log2_floor, m0, m1, item_ct1);
|
|
12978
|
-
});
|
|
12979
|
-
}
|
|
12980
|
-
|
|
12981
12488
|
static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
|
|
12982
12489
|
const int nrows, dpct::queue_ptr stream) {
|
|
12983
12490
|
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
|
@@ -13058,7 +12565,7 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
|
|
|
13058
12565
|
}
|
|
13059
12566
|
|
|
13060
12567
|
template <bool vals_smem, int ncols_template, int block_size_template>
|
|
13061
|
-
static void soft_max_f32_submitter(const float * x, const float * mask,
|
|
12568
|
+
static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
|
|
13062
12569
|
const int nrows_y, const float scale, const float max_bias, const float m0,
|
|
13063
12570
|
const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
|
|
13064
12571
|
const size_t n_local_scratch, dpct::queue_ptr stream) {
|
|
@@ -13068,7 +12575,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
|
|
|
13068
12575
|
cgh.parallel_for(
|
|
13069
12576
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
13070
12577
|
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
13071
|
-
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask,
|
|
12578
|
+
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
|
|
13072
12579
|
nrows_y, scale, max_bias, m0,
|
|
13073
12580
|
m1, n_head_log2, item_ct1,
|
|
13074
12581
|
local_buf_acc.get_pointer());
|
|
@@ -13076,7 +12583,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
|
|
|
13076
12583
|
});
|
|
13077
12584
|
}
|
|
13078
12585
|
|
|
13079
|
-
static void soft_max_f32_sycl(const float * x, const float * mask,
|
|
12586
|
+
static void soft_max_f32_sycl(const float * x, const float * mask,
|
|
13080
12587
|
float * dst, const int ncols_x, const int nrows_x,
|
|
13081
12588
|
const int nrows_y, const float scale, const float max_bias,
|
|
13082
12589
|
dpct::queue_ptr stream) {
|
|
@@ -13098,60 +12605,60 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
|
|
|
13098
12605
|
const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
|
|
13099
12606
|
if (n_local_scratch*sizeof(float) < local_mem_size) {
|
|
13100
12607
|
if (ncols_x > max_block_size) {
|
|
13101
|
-
soft_max_f32_submitter<true, 0, 0>(x, mask,
|
|
12608
|
+
soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
|
|
13102
12609
|
max_bias, m0, m1, n_head_log2, block_nums,
|
|
13103
12610
|
block_dims, n_local_scratch, stream);
|
|
13104
12611
|
return;
|
|
13105
12612
|
}
|
|
13106
12613
|
switch (ncols_x) {
|
|
13107
12614
|
case 32:
|
|
13108
|
-
soft_max_f32_submitter<true, 32, 32>(x, mask,
|
|
12615
|
+
soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
|
|
13109
12616
|
max_bias, m0, m1, n_head_log2, block_nums,
|
|
13110
12617
|
block_dims, n_local_scratch, stream);
|
|
13111
12618
|
break;
|
|
13112
12619
|
case 64:
|
|
13113
|
-
soft_max_f32_submitter<true, 64, 64>(x, mask,
|
|
12620
|
+
soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
|
|
13114
12621
|
max_bias, m0, m1, n_head_log2, block_nums,
|
|
13115
12622
|
block_dims, n_local_scratch, stream);
|
|
13116
12623
|
break;
|
|
13117
12624
|
case 128:
|
|
13118
|
-
soft_max_f32_submitter<true, 128, 128>(x, mask,
|
|
12625
|
+
soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
|
|
13119
12626
|
max_bias, m0, m1, n_head_log2, block_nums,
|
|
13120
12627
|
block_dims, n_local_scratch, stream);
|
|
13121
12628
|
break;
|
|
13122
12629
|
case 256:
|
|
13123
|
-
soft_max_f32_submitter<true, 256, 256>(x, mask,
|
|
12630
|
+
soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
|
|
13124
12631
|
max_bias, m0, m1, n_head_log2, block_nums,
|
|
13125
12632
|
block_dims, n_local_scratch, stream);
|
|
13126
12633
|
break;
|
|
13127
12634
|
case 512:
|
|
13128
|
-
soft_max_f32_submitter<true, 512, 512>(x, mask,
|
|
12635
|
+
soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
|
|
13129
12636
|
max_bias, m0, m1, n_head_log2, block_nums,
|
|
13130
12637
|
block_dims, n_local_scratch, stream);
|
|
13131
12638
|
break;
|
|
13132
12639
|
case 1024:
|
|
13133
|
-
soft_max_f32_submitter<true, 1024, 1024>(x, mask,
|
|
12640
|
+
soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
|
|
13134
12641
|
max_bias, m0, m1, n_head_log2, block_nums,
|
|
13135
12642
|
block_dims, n_local_scratch, stream);
|
|
13136
12643
|
break;
|
|
13137
12644
|
case 2048:
|
|
13138
|
-
soft_max_f32_submitter<true, 2048, 1024>(x, mask,
|
|
12645
|
+
soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
|
|
13139
12646
|
max_bias, m0, m1, n_head_log2, block_nums,
|
|
13140
12647
|
block_dims, n_local_scratch, stream);
|
|
13141
12648
|
break;
|
|
13142
12649
|
case 4096:
|
|
13143
|
-
soft_max_f32_submitter<true, 4096, 1024>(x, mask,
|
|
12650
|
+
soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
|
|
13144
12651
|
max_bias, m0, m1, n_head_log2, block_nums,
|
|
13145
12652
|
block_dims, n_local_scratch, stream);
|
|
13146
12653
|
break;
|
|
13147
12654
|
default:
|
|
13148
|
-
soft_max_f32_submitter<true, 0, 0>(x, mask,
|
|
12655
|
+
soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
|
|
13149
12656
|
max_bias, m0, m1, n_head_log2, block_nums,
|
|
13150
12657
|
block_dims, n_local_scratch, stream);
|
|
13151
12658
|
break;
|
|
13152
12659
|
}
|
|
13153
12660
|
} else {
|
|
13154
|
-
soft_max_f32_submitter<false, 0, 0>(x, mask,
|
|
12661
|
+
soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
|
|
13155
12662
|
max_bias, m0, m1, n_head_log2, block_nums,
|
|
13156
12663
|
block_dims, WARP_SIZE, stream);
|
|
13157
12664
|
}
|
|
@@ -14005,6 +13512,10 @@ inline void ggml_sycl_op_concat(const ggml_tensor *src0,
|
|
|
14005
13512
|
const float *src0_dd, const float *src1_dd,
|
|
14006
13513
|
float *dst_dd,
|
|
14007
13514
|
const dpct::queue_ptr &main_stream) {
|
|
13515
|
+
#pragma message("TODO: generalize concat kernel for dim != 2")
|
|
13516
|
+
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7563")
|
|
13517
|
+
int dim = dst->op_params[0];
|
|
13518
|
+
GGML_ASSERT(dim == 2);
|
|
14008
13519
|
|
|
14009
13520
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
14010
13521
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
@@ -14026,11 +13537,15 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
|
|
|
14026
13537
|
|
|
14027
13538
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
14028
13539
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
14029
|
-
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
|
14030
13540
|
|
|
14031
|
-
const
|
|
13541
|
+
const float sf0 = (float)dst->ne[0]/src0->ne[0];
|
|
13542
|
+
const float sf1 = (float)dst->ne[1]/src0->ne[1];
|
|
13543
|
+
const float sf2 = (float)dst->ne[2]/src0->ne[2];
|
|
13544
|
+
const float sf3 = (float)dst->ne[3]/src0->ne[3];
|
|
14032
13545
|
|
|
14033
|
-
upscale_f32_sycl(src0_dd, dst_dd, src0->
|
|
13546
|
+
upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
|
13547
|
+
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
|
|
13548
|
+
main_stream);
|
|
14034
13549
|
|
|
14035
13550
|
(void) src1;
|
|
14036
13551
|
(void) dst;
|
|
@@ -14486,6 +14001,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
|
14486
14001
|
ggml_tensor *dst, const float *src0_dd,
|
|
14487
14002
|
const float *src1_dd, float *dst_dd,
|
|
14488
14003
|
const dpct::queue_ptr &main_stream) {
|
|
14004
|
+
const ggml_tensor * src2 = dst->src[2];
|
|
14489
14005
|
|
|
14490
14006
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
|
14491
14007
|
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
@@ -14511,6 +14027,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
|
14511
14027
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
|
14512
14028
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
|
14513
14029
|
|
|
14030
|
+
const float * freq_factors = nullptr;
|
|
14514
14031
|
const int32_t * pos = nullptr;
|
|
14515
14032
|
if ((mode & 1) == 0) {
|
|
14516
14033
|
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
|
@@ -14521,6 +14038,16 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
|
14521
14038
|
const bool is_neox = mode & 2;
|
|
14522
14039
|
const bool is_glm = mode & 4;
|
|
14523
14040
|
|
|
14041
|
+
if (is_neox) {
|
|
14042
|
+
pos = (const int32_t *) src1_dd;
|
|
14043
|
+
|
|
14044
|
+
if (src2 != nullptr) {
|
|
14045
|
+
freq_factors = (const float *) src2->data;
|
|
14046
|
+
}
|
|
14047
|
+
} else {
|
|
14048
|
+
GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
|
|
14049
|
+
}
|
|
14050
|
+
|
|
14524
14051
|
rope_corr_dims corr_dims;
|
|
14525
14052
|
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
|
|
14526
14053
|
|
|
@@ -14532,13 +14059,13 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
|
14532
14059
|
if (src0->type == GGML_TYPE_F32) {
|
|
14533
14060
|
rope_neox_sycl(
|
|
14534
14061
|
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
|
14535
|
-
attn_factor, corr_dims, main_stream
|
|
14062
|
+
attn_factor, corr_dims, freq_factors, main_stream
|
|
14536
14063
|
);
|
|
14537
14064
|
} else if (src0->type == GGML_TYPE_F16) {
|
|
14538
14065
|
rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
|
|
14539
14066
|
ne00, n_dims, nrows, pos, freq_scale, ne01,
|
|
14540
14067
|
freq_base, ext_factor, attn_factor, corr_dims,
|
|
14541
|
-
main_stream);
|
|
14068
|
+
freq_factors, main_stream);
|
|
14542
14069
|
} else {
|
|
14543
14070
|
GGML_ASSERT(false);
|
|
14544
14071
|
}
|
|
@@ -14562,36 +14089,6 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
|
14562
14089
|
(void) src1_dd;
|
|
14563
14090
|
}
|
|
14564
14091
|
|
|
14565
|
-
inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
14566
|
-
ggml_tensor *dst, const float *src0_dd,
|
|
14567
|
-
const float *src1_dd, float *dst_dd,
|
|
14568
|
-
const dpct::queue_ptr &main_stream) {
|
|
14569
|
-
|
|
14570
|
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
14571
|
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
14572
|
-
|
|
14573
|
-
GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
|
|
14574
|
-
const int64_t nrows = ggml_nrows(src0);
|
|
14575
|
-
|
|
14576
|
-
//const int n_past = ((int32_t *) dst->op_params)[0];
|
|
14577
|
-
const int n_head = ((int32_t *) dst->op_params)[1];
|
|
14578
|
-
float max_bias;
|
|
14579
|
-
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
|
|
14580
|
-
|
|
14581
|
-
//GGML_ASSERT(ne01 + n_past == ne00);
|
|
14582
|
-
GGML_ASSERT(n_head == ne02);
|
|
14583
|
-
|
|
14584
|
-
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
|
14585
|
-
|
|
14586
|
-
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
|
14587
|
-
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
|
14588
|
-
|
|
14589
|
-
alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
|
|
14590
|
-
|
|
14591
|
-
(void) src1;
|
|
14592
|
-
(void) src1_dd;
|
|
14593
|
-
}
|
|
14594
|
-
|
|
14595
14092
|
static void ggml_sycl_op_pool2d(const ggml_tensor *src0,
|
|
14596
14093
|
const ggml_tensor *src1, ggml_tensor *dst,
|
|
14597
14094
|
const float *src0_dd, const float *src1_dd,
|
|
@@ -14746,12 +14243,9 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
|
|
|
14746
14243
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
14747
14244
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
14748
14245
|
|
|
14749
|
-
|
|
14750
|
-
|
|
14751
|
-
#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
|
|
14246
|
+
#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
|
|
14752
14247
|
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
|
|
14753
14248
|
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
|
|
14754
|
-
GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
|
|
14755
14249
|
|
|
14756
14250
|
const int64_t ne00 = src0->ne[0];
|
|
14757
14251
|
const int64_t nrows_x = ggml_nrows(src0);
|
|
@@ -14763,25 +14257,7 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
|
|
|
14763
14257
|
memcpy(&scale, dst->op_params + 0, sizeof(float));
|
|
14764
14258
|
memcpy(&max_bias, dst->op_params + 1, sizeof(float));
|
|
14765
14259
|
|
|
14766
|
-
|
|
14767
|
-
float * src2_dd = nullptr;
|
|
14768
|
-
sycl_pool_alloc<float> src2_f;
|
|
14769
|
-
|
|
14770
|
-
const bool use_src2 = src2 != nullptr;
|
|
14771
|
-
|
|
14772
|
-
if (use_src2) {
|
|
14773
|
-
const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
|
|
14774
|
-
|
|
14775
|
-
if (src2_on_device) {
|
|
14776
|
-
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
|
|
14777
|
-
src2_dd = (float *) src2_extra->data_device[g_main_device];
|
|
14778
|
-
} else {
|
|
14779
|
-
src2_dd = src2_f.alloc(ggml_nelements(src2));
|
|
14780
|
-
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
|
|
14781
|
-
}
|
|
14782
|
-
}
|
|
14783
|
-
|
|
14784
|
-
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
|
|
14260
|
+
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
|
|
14785
14261
|
nrows_x, nrows_y, scale, max_bias, main_stream);
|
|
14786
14262
|
}
|
|
14787
14263
|
|
|
@@ -15656,26 +15132,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
|
15656
15132
|
const int64_t r2 = ne12/ne02;
|
|
15657
15133
|
const int64_t r3 = ne13/ne03;
|
|
15658
15134
|
|
|
15659
|
-
#if 0
|
|
15660
|
-
// use syclGemmEx
|
|
15661
|
-
{
|
|
15662
|
-
for (int i13 = 0; i13 < ne13; ++i13) {
|
|
15663
|
-
for (int i12 = 0; i12 < ne12; ++i12) {
|
|
15664
|
-
int i03 = i13 / r3;
|
|
15665
|
-
int i02 = i12 / r2;
|
|
15666
|
-
|
|
15667
|
-
SYCL_CHECK(
|
|
15668
|
-
syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
|
15669
|
-
ne01, ne11, ne10,
|
|
15670
|
-
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , SYCL_R_16F, nb01/sizeof(half),
|
|
15671
|
-
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
|
|
15672
|
-
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
|
|
15673
|
-
cu_compute_type,
|
|
15674
|
-
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
|
15675
|
-
}
|
|
15676
|
-
}
|
|
15677
|
-
}
|
|
15678
|
-
#else
|
|
15679
15135
|
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
|
|
15680
15136
|
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
|
15681
15137
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
|
@@ -15687,7 +15143,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
|
15687
15143
|
nb11 / nb10, nb12 / nb10, beta,
|
|
15688
15144
|
(char *)dst_t, cu_data_type, ne01, nb2 / nb0,
|
|
15689
15145
|
ne12 * ne13, cu_compute_type)));
|
|
15690
|
-
g_sycl_handles[g_main_device]->wait();
|
|
15691
15146
|
} else {
|
|
15692
15147
|
const int ne23 = ne12*ne13;
|
|
15693
15148
|
|
|
@@ -15718,7 +15173,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
|
15718
15173
|
nb02, nb03, nb12_scaled, nb13_scaled,
|
|
15719
15174
|
nbd2, nbd3, r2, r3, item_ct1);
|
|
15720
15175
|
});
|
|
15721
|
-
})
|
|
15176
|
+
});
|
|
15722
15177
|
}
|
|
15723
15178
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
|
15724
15179
|
*g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
|
|
@@ -15729,9 +15184,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
|
15729
15184
|
dpct::library_data_t::real_half, nb11 / nb10, beta,
|
|
15730
15185
|
(void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
|
|
15731
15186
|
cu_compute_type)));
|
|
15732
|
-
g_sycl_handles[g_main_device]->wait();
|
|
15733
15187
|
}
|
|
15734
|
-
#endif
|
|
15735
15188
|
|
|
15736
15189
|
if (no_mixed_dtypes) {
|
|
15737
15190
|
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
|
|
@@ -15814,6 +15267,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
|
15814
15267
|
}
|
|
15815
15268
|
} else {
|
|
15816
15269
|
bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
|
|
15270
|
+
use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
|
|
15817
15271
|
|
|
15818
15272
|
if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
|
|
15819
15273
|
use_mul_mat_q = false;
|
|
@@ -16232,10 +15686,6 @@ static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, g
|
|
|
16232
15686
|
ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
|
|
16233
15687
|
}
|
|
16234
15688
|
|
|
16235
|
-
static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
16236
|
-
ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
|
|
16237
|
-
}
|
|
16238
|
-
|
|
16239
15689
|
static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
16240
15690
|
ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d);
|
|
16241
15691
|
}
|
|
@@ -16612,9 +16062,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
|
16612
16062
|
case GGML_OP_ROPE:
|
|
16613
16063
|
func = ggml_sycl_rope;
|
|
16614
16064
|
break;
|
|
16615
|
-
case GGML_OP_ALIBI:
|
|
16616
|
-
func = ggml_sycl_alibi;
|
|
16617
|
-
break;
|
|
16618
16065
|
case GGML_OP_IM2COL:
|
|
16619
16066
|
func = ggml_sycl_im2col;
|
|
16620
16067
|
break;
|
|
@@ -17744,7 +17191,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
|
|
17744
17191
|
case GGML_OP_DIAG_MASK_INF:
|
|
17745
17192
|
case GGML_OP_SOFT_MAX:
|
|
17746
17193
|
case GGML_OP_ROPE:
|
|
17747
|
-
case GGML_OP_ALIBI:
|
|
17748
17194
|
case GGML_OP_IM2COL:
|
|
17749
17195
|
case GGML_OP_POOL_2D:
|
|
17750
17196
|
case GGML_OP_SUM_ROWS:
|