@fugood/llama.node 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +12 -12
- package/src/llama.cpp/common/arg.cpp +10 -0
- package/src/llama.cpp/common/chat.cpp +37 -20
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.h +3 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +90 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +26 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +534 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +54 -0
- package/src/llama.cpp/src/llama-arch.cpp +18 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +31 -31
- package/src/llama.cpp/src/llama-graph.h +2 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -16
- package/src/llama.cpp/src/llama-model.cpp +178 -0
- package/src/llama.cpp/src/llama-model.h +1 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp

@@ -696,24 +696,8 @@ static void ggml_compute_forward_dup_f32(
     if (ggml_is_contiguous(dst)) {
         // TODO: simplify
         if (nb00 == sizeof(float)) {
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                const size_t rs = ne00 * nb00;
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, rs);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
-                ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
+            if (ggml_get_type_traits_cpu(dst->type)->from_float) {
+                ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float;
 
                 size_t id = 0;
                 size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
@@ -724,7 +708,7 @@ static void ggml_compute_forward_dup_f32(
                     id += rs * ir0;
                     for (int i01 = ir0; i01 < ir1; i01++) {
                         const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                        quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+                        from_float(src0_ptr, dst_ptr + id, ne00);
                         id += rs;
                     }
                     id += rs * (ne01 - ir1);
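Note: these two dup hunks drop the dedicated F32-to-F32 memcpy fast path and route every destination type through the CPU type-traits callback, which is also renamed from quantize_row_q to the more accurate from_float. A minimal sketch of that trait lookup, assuming only the public ggml.h / ggml-cpu.h API (a standalone illustration, not code from this package):

    #include "ggml.h"
    #include "ggml-cpu.h"

    // Convert (or plain-copy) one row of ne floats into dst_type, the way
    // the patched dup loop now does for all destination types, F32 included.
    static void copy_row_via_traits(const float * src, void * dst,
                                    int64_t ne, enum ggml_type dst_type) {
        const struct ggml_type_traits_cpu * traits = ggml_get_type_traits_cpu(dst_type);
        if (traits->from_float) {
            traits->from_float(src, dst, ne);
        }
    }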
@@ -2300,6 +2284,12 @@ void ggml_compute_forward_repeat(
             {
                 ggml_compute_forward_repeat_f32(params, dst);
             } break;
+        // TODO: templateify the implementation and support for I64
+        // ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225
+        //case GGML_TYPE_I64:
+        //    {
+        //        ggml_compute_forward_repeat_i64(params, dst);
+        //    } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -3194,6 +3184,435 @@ void ggml_compute_forward_silu_back(
     }
 }
 
+// ggml_compute_forward_reglu
+
+static void ggml_compute_forward_reglu_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * src0_p = (float *) (src0_d + i1*src0_o);
+        float * src1_p = (float *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_reglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_reglu_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
+        ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_reglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_reglu(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_reglu_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_reglu_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_geglu
+
+static void ggml_compute_forward_geglu_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * src0_p = (float *) (src0_d + i1*src0_o);
+        float * src1_p = (float *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_geglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_geglu_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
+        ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_geglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_geglu(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_geglu_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_geglu_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_swiglu
+
+static void ggml_compute_forward_swiglu_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * src0_p = (float *) (src0_d + i1*src0_o);
+        float * src1_p = (float *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_swiglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_swiglu_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
+        ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_swiglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_swiglu(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_swiglu_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_swiglu_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_norm
 
 static void ggml_compute_forward_norm_f32(
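All three GLU kernels above share one row convention: with a separate gate tensor (src1) the activation and gate are read from src0 and src1 directly; without one, each src0 row of width 2*nc is split in half, and the swapped op param picks which half is the gate. A scalar sketch of that split for SwiGLU (an illustrative helper under those assumptions, not code from the diff):

    #include <cmath>
    #include <vector>

    static float silu(float v) { return v / (1.0f + std::exp(-v)); }

    // row holds 2*nc floats: [x | g] when swapped == 0, [g | x] when swapped == 1,
    // mirroring the src0_p/src1_p pointer adjustment in the kernels above.
    static std::vector<float> swiglu_row(const float * row, int nc, bool swapped) {
        const float * x = row + (swapped ? nc : 0);
        const float * g = row + (swapped ? 0 : nc);
        std::vector<float> y(nc);
        for (int i = 0; i < nc; ++i) {
            y[i] = silu(x[i]) * g[i];
        }
        return y;
    }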
@@ -4470,6 +4889,74 @@ void ggml_compute_forward_get_rows(
     //}
 }
 
+static void ggml_compute_forward_set_rows_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ne01;
+
+    assert(ne0 == nc);
+    assert(ne2 == ne02);
+    assert(ne3 == ne03);
+    assert(src0->type == GGML_TYPE_F32);
+    assert(ne02 % ne11 == 0);
+    assert(ne03 % ne12 == 0);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = std::min(ir0 + dr, nr);
+
+    ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float;
+
+    for (int64_t i03 = 0; i03 < ne03; ++i03) {
+        for (int64_t i02 = 0; i02 < ne02; ++i02) {
+            for (int64_t i = ir0; i < ir1; ++i) {
+                const int64_t i12 = i03%ne12;
+                const int64_t i11 = i02%ne11;
+                const int64_t i10 = i;
+
+                const int64_t i1 = *(int64_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+                GGML_ASSERT(i1 >= 0 && i1 < ne1);
+
+                from_float(
+                        (const float *) ((char *) src0->data + i*nb01 + i02*nb02 + i03*nb03),
+                        ((char *) dst->data + i1*nb1 + i02*nb2 + i03*nb3), nc);
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_set_rows(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_set_rows_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("src0->type = %d (%s) not supported", src0->type, ggml_type_name(src0->type));
+            }
+    }
+}
+
 // ggml_compute_forward_get_rows_back
 
 static void ggml_compute_forward_get_rows_back_f32_f16(
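The new set_rows kernel is a row scatter, the inverse of get_rows: src1 holds int64 destination row indices (broadcast across the outer dims via the ne02 % ne11 and ne03 % ne12 asserts), and each F32 row of src0 is written into dst through from_float, so dst may be a quantized type. A reduced sketch of the core semantics, ignoring batching and type conversion (hypothetical helper, not a ggml symbol):

    #include <cstdint>
    #include <vector>

    // dst[idx[r]] = src[r] for every source row r.
    static void set_rows_ref(const std::vector<std::vector<float>> & src,
                             const std::vector<int64_t> & idx,
                             std::vector<std::vector<float>> & dst) {
        for (size_t r = 0; r < src.size(); ++r) {
            dst[(size_t) idx[r]] = src[r];
        }
    }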
@@ -7994,6 +8481,34 @@ void ggml_compute_forward_unary(
     }
 }
 
+//ggml_compute_forward_glu
+
+void ggml_compute_forward_glu(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_glu_op op = ggml_get_glu_op(dst);
+
+    switch (op) {
+        case GGML_GLU_OP_REGLU:
+            {
+                ggml_compute_forward_reglu(params, dst);
+            } break;
+        case GGML_GLU_OP_GEGLU:
+            {
+                ggml_compute_forward_geglu(params, dst);
+            } break;
+        case GGML_GLU_OP_SWIGLU:
+            {
+                ggml_compute_forward_swiglu(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_get_rel_pos
 
 static void ggml_compute_forward_get_rel_pos_f16(
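ggml_compute_forward_glu is the GGML_OP_GLU dispatcher, parallel to ggml_compute_forward_unary above it. How a graph reaches it depends on the builder functions added by the ggml.h hunk (+90 lines, not expanded in this diff); assuming they follow upstream ggml's naming (ggml_reglu, ggml_geglu, ggml_swiglu, unverified for this package version), a fragment would look like this, with ctx, n_ff, and n_tokens supplied by the caller:

    // cur has ne[0] == 2*n_ff; the fused SwiGLU halves it to n_ff.
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2*n_ff, n_tokens);
    struct ggml_tensor * out = ggml_swiglu(ctx, cur);  // GGML_OP_GLU + GGML_GLU_OP_SWIGLU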
package/src/llama.cpp/ggml/src/ggml-cpu/ops.h

@@ -53,6 +53,7 @@ void ggml_compute_forward_permute(const struct ggml_compute_params * params, str
 void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -93,6 +94,7 @@ void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, st
 void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_glu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp

@@ -254,6 +254,30 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
     }
 }
 
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) {
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = ggml_silu_f32(x[i]) * g[i];
+    }
+}
+
 ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
     int i = 0;
     ggml_float sum = 0;
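ggml_vec_swiglu_f32 follows the usual ggml vector pattern: one SIMD main loop per ISA (AVX-512, AVX2+FMA, SSE2, NEON) plus a scalar tail that doubles as the reference formula. A quick numeric check of that formula (illustrative only):

    float x[3] = { 1.0f, -2.0f, 0.5f };
    float g[3] = { 2.0f,  3.0f, 4.0f };
    float y[3];
    ggml_vec_swiglu_f32(3, y, x, g);
    // y[0] = silu(1.0) * 2.0 ≈ 0.7311 * 2.0 ≈ 1.4621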
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h

@@ -905,6 +905,60 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, con
     }
 }
 
+inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
+    }
+}
+
+inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
+    }
+}
+
+#ifdef GGML_GELU_FP16
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        if (x[i] <= -10.0f) {
+            y[i] = 0.0f;
+        } else if (x[i] >= 10.0f) {
+            y[i] = x[i] * g[i];
+        } else {
+            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
+            memcpy(&t, &fp16, sizeof(uint16_t));
+            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
+        }
+    }
+}
+#else
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_f32(x[i]) * g[i];
+    }
+}
+#endif
+
+inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
+    }
+}
+
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
+
+inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        float w = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
+    }
+}
+
 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     ggml_float sum = 0.0;
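The geglu helpers reuse ggml's GGML_GELU_FP16 table trick: an fp16 value has only 65536 possible bit patterns, so gelu can be precomputed once into ggml_table_gelu_f16 and applied with a single indexed load; the f32 path short-circuits the saturated regions (gelu(x) ≈ 0 for x <= -10, ≈ x for x >= 10), where rounding x through fp16 would cost precision for no benefit. A schematic of the idiom with hypothetical names:

    #include <stdint.h>

    // Illustrative only: a table filled once at init with fp16(gelu(x)) for
    // every fp16 bit pattern x, the role ggml_table_gelu_f16 plays above.
    static uint16_t gelu_tbl[1 << 16];

    static inline uint16_t gelu_fp16_lookup(uint16_t x_bits) {
        return gelu_tbl[x_bits];  // one load replaces the erf/tanh math
    }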