@fugood/llama.node 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -696,24 +696,8 @@ static void ggml_compute_forward_dup_f32(
     if (ggml_is_contiguous(dst)) {
         // TODO: simplify
         if (nb00 == sizeof(float)) {
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                const size_t rs = ne00 * nb00;
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, rs);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
-                ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
+            if (ggml_get_type_traits_cpu(dst->type)->from_float) {
+                ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float;
 
                 size_t id = 0;
                 size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
@@ -724,7 +708,7 @@ static void ggml_compute_forward_dup_f32(
                         id += rs * ir0;
                         for (int i01 = ir0; i01 < ir1; i01++) {
                             const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+                            from_float(src0_ptr, dst_ptr + id, ne00);
                             id += rs;
                         }
                         id += rs * (ne01 - ir1);
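
Both hunks above belong to one change: the dedicated F32 memcpy branch is removed and every destination type, presumably including F32, now goes through the type's from_float trait (the rename from quantize_row_q to from_float reflects that the callback is no longer necessarily a quantizer). A minimal standalone sketch of that unified path, using hypothetical names (from_float_t, dup_rows) rather than the real ggml API:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Hypothetical stand-in for ggml's per-type conversion callback: writes n
    // source floats into one destination row in the destination type's layout.
    using from_float_t = void (*)(const float * src, void * dst, int64_t n);

    // For an F32 destination the conversion is just a copy, so the old memcpy
    // fast path collapses into the same callback-driven loop.
    static void from_float_f32(const float * src, void * dst, int64_t n) {
        std::memcpy(dst, src, (size_t) n * sizeof(float));
    }

    static void dup_rows(const float * src, void * dst, int64_t rows, int64_t cols,
                         size_t dst_row_size, from_float_t from_float) {
        for (int64_t r = 0; r < rows; ++r) {
            from_float(src + r * cols, (char *) dst + (size_t) r * dst_row_size, cols);
        }
    }

    int main() {
        std::vector<float> src(4 * 8, 1.5f), dst(4 * 8, 0.0f);
        dup_rows(src.data(), dst.data(), 4, 8, 8 * sizeof(float), from_float_f32);
        return dst[31] == 1.5f ? 0 : 1;   // every row copied through the callback
    }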
@@ -2300,6 +2284,12 @@ void ggml_compute_forward_repeat(
             {
                 ggml_compute_forward_repeat_f32(params, dst);
             } break;
+        // TODO: templateify the implemenation and support for I64
+        // ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225
+        //case GGML_TYPE_I64:
+        //    {
+        //        ggml_compute_forward_repeat_i64(params, dst);
+        //    } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -3194,6 +3184,435 @@ void ggml_compute_forward_silu_back(
     }
 }
 
+// ggml_compute_forward_reglu
+
+static void ggml_compute_forward_reglu_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * src0_p = (float *) (src0_d + i1*src0_o);
+        float * src1_p = (float *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_reglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_reglu_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
+        ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_reglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_reglu(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_reglu_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_reglu_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_geglu
+
+static void ggml_compute_forward_geglu_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * src0_p = (float *) (src0_d + i1*src0_o);
+        float * src1_p = (float *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_geglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_geglu_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
+        ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_geglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_geglu(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_geglu_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_geglu_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_swiglu
+
+static void ggml_compute_forward_swiglu_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * src0_p = (float *) (src0_d + i1*src0_o);
+        float * src1_p = (float *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_swiglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_swiglu_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
+        ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_swiglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_swiglu(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_swiglu_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_swiglu_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_norm
 
 static void ggml_compute_forward_norm_f32(
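
All six kernels above share the same layout logic: with two sources, src0 holds the values to activate and src1 the gate; with a single source, each row of src0 carries both halves side by side, nc is half the row width, and the swapped op parameter picks which half is activated and which half gates. A small standalone sketch of that single-source split for SwiGLU, with a hypothetical helper name (swiglu_row) rather than the ggml entry points:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // One row of 2*nc floats is split into an activation half and a gate half;
    // `swapped` selects which half goes through SiLU (hypothetical helper).
    static void swiglu_row(const float * row, float * out, int nc, bool swapped) {
        const float * a = row + (swapped ? nc : 0);   // activated half
        const float * g = row + (swapped ? 0 : nc);   // gating half
        for (int i = 0; i < nc; ++i) {
            const float silu = a[i] / (1.0f + std::exp(-a[i]));
            out[i] = silu * g[i];
        }
    }

    int main() {
        const int nc = 4;
        std::vector<float> row = {1, 2, 3, 4, 0.5f, 0.5f, 0.5f, 0.5f};  // [activation | gate]
        std::vector<float> out(nc);
        swiglu_row(row.data(), out.data(), nc, /*swapped=*/false);
        std::printf("%.3f\n", out[0]);   // silu(1) * 0.5 ~= 0.366
        return 0;
    }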
@@ -4470,6 +4889,74 @@ void ggml_compute_forward_get_rows(
     //}
 }
 
+static void ggml_compute_forward_set_rows_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ne01;
+
+    assert(ne0 == nc);
+    assert(ne2 == ne02);
+    assert(ne3 == ne03);
+    assert(src0->type == GGML_TYPE_F32);
+    assert(ne02 % ne11 == 0);
+    assert(ne03 % ne12 == 0);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = std::min(ir0 + dr, nr);
+
+    ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float;
+
+    for (int64_t i03 = 0; i03 < ne03; ++i03) {
+        for (int64_t i02 = 0; i02 < ne02; ++i02) {
+            for (int64_t i = ir0; i < ir1; ++i) {
+                const int64_t i12 = i03%ne12;
+                const int64_t i11 = i02%ne11;
+                const int64_t i10 = i;
+
+                const int64_t i1 = *(int64_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+                GGML_ASSERT(i1 >= 0 && i1 < ne1);
+
+                from_float(
+                        (const float *) ((char *) src0->data + i*nb01 + i02*nb02 + i03*nb03),
+                        ((char *) dst->data + i1*nb1 + i02*nb2 + i03*nb3), nc);
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_set_rows(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_set_rows_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("src0->type = %d (%s) not supported", src0->type, ggml_type_name(src0->type));
+            }
+    }
+}
+
 // ggml_compute_forward_get_rows_back
 
 static void ggml_compute_forward_get_rows_back_f32_f16(
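
set_rows is roughly the inverse of get_rows: each row i of src0 is written into row src1[i] of dst (the indices are int64), with the row converted to dst's type through the same from_float trait used elsewhere. A minimal sketch of that scatter for a plain float destination, using a hypothetical helper (set_rows_f32_ref) rather than the ggml function:

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Row scatter: row i of src is copied into row idx[i] of dst
    // (hypothetical reference helper, no type conversion or broadcasting).
    static void set_rows_f32_ref(const float * src, const int64_t * idx, float * dst,
                                 int64_t n_src_rows, int64_t n_dst_rows, int64_t cols) {
        for (int64_t i = 0; i < n_src_rows; ++i) {
            const int64_t r = idx[i];
            assert(r >= 0 && r < n_dst_rows);
            std::memcpy(dst + r * cols, src + i * cols, (size_t) cols * sizeof(float));
        }
    }

    int main() {
        std::vector<float>   src = {1, 1, 2, 2};       // 2 rows x 2 cols
        std::vector<int64_t> idx = {3, 0};             // scatter to dst rows 3 and 0
        std::vector<float>   dst(4 * 2, 0.0f);         // 4 rows x 2 cols
        set_rows_f32_ref(src.data(), idx.data(), dst.data(), 2, 4, 2);
        return (dst[0] == 2 && dst[6] == 1) ? 0 : 1;   // row 0 = {2,2}, row 3 = {1,1}
    }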
@@ -7994,6 +8481,34 @@ void ggml_compute_forward_unary(
     }
 }
 
+//ggml_compute_forward_glu
+
+void ggml_compute_forward_glu(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_glu_op op = ggml_get_glu_op(dst);
+
+    switch (op) {
+        case GGML_GLU_OP_REGLU:
+            {
+                ggml_compute_forward_reglu(params, dst);
+            } break;
+        case GGML_GLU_OP_GEGLU:
+            {
+                ggml_compute_forward_geglu(params, dst);
+            } break;
+        case GGML_GLU_OP_SWIGLU:
+            {
+                ggml_compute_forward_swiglu(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_get_rel_pos
 
 static void ggml_compute_forward_get_rel_pos_f16(
@@ -53,6 +53,7 @@ void ggml_compute_forward_permute(const struct ggml_compute_params * params, str
 void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -93,6 +94,7 @@ void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, st
 void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_glu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -254,6 +254,30 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
     }
 }
 
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) {
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = ggml_silu_f32(x[i]) * g[i];
+    }
+}
+
 ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
     int i = 0;
     ggml_float sum = 0;
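
Every SIMD branch above computes the same elementwise result as the scalar tail; only the vector width differs (16 floats for AVX-512, 8 for AVX2, 4 for SSE2 and NEON):

    swiglu(x, g)[i] = silu(x[i]) * g[i] = (x[i] / (1 + exp(-x[i]))) * g[i]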
@@ -905,6 +905,60 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, con
     }
 }
 
+inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
+    }
+}
+
+inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
+    }
+}
+
+#ifdef GGML_GELU_FP16
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        if (x[i] <= -10.0f) {
+            y[i] = 0.0f;
+        } else if (x[i] >= 10.0f) {
+            y[i] = x[i] * g[i];
+        } else {
+            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
+            memcpy(&t, &fp16, sizeof(uint16_t));
+            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
+        }
+    }
+}
+#else
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_f32(x[i]) * g[i];
+    }
+}
+#endif
+
+inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
+    }
+}
+
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
+
+inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        float w = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
+    }
+}
+
 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     ggml_float sum = 0.0;