@fugood/llama.node 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +12 -12
- package/src/llama.cpp/CMakeLists.txt +0 -1
- package/src/llama.cpp/common/arg.cpp +17 -0
- package/src/llama.cpp/common/chat.cpp +37 -20
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.h +4 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +181 -10
- package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +38 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1297 -211
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -9
- package/src/llama.cpp/include/llama.h +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +108 -2
- package/src/llama.cpp/src/llama-arch.h +7 -0
- package/src/llama.cpp/src/llama-batch.cpp +27 -1
- package/src/llama.cpp/src/llama-batch.h +8 -1
- package/src/llama.cpp/src/llama-chat.cpp +15 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +95 -81
- package/src/llama.cpp/src/llama-graph.h +43 -16
- package/src/llama.cpp/src/llama-hparams.cpp +2 -1
- package/src/llama.cpp/src/llama-hparams.h +1 -0
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
- package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
- package/src/llama.cpp/src/llama-kv-cells.h +62 -10
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
- package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +34 -16
- package/src/llama.cpp/src/llama-memory.cpp +17 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model.cpp +1374 -210
- package/src/llama.cpp/src/llama-model.h +3 -0
- package/src/llama.cpp/src/llama-vocab.cpp +8 -1
- package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
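The same comparison can be reproduced locally against the public registry. A minimal sketch using npm's built-in diff command (npm 7 or newer; assumes registry access):

    npm diff --diff=@fugood/llama.node@1.0.1 --diff=@fugood/llama.node@1.0.3

The bulk of the change is in the vendored llama.cpp sources under package/src/llama.cpp; the hunks below are from ggml/src/ggml-cpu/ops.cpp, the largest file in the diff.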
@@ -3,6 +3,7 @@
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
 #include "binary-ops.h"
+#include "ggml.h"
 #include "unary-ops.h"
 #include "vec.h"

@@ -696,24 +697,8 @@ static void ggml_compute_forward_dup_f32(
     if (ggml_is_contiguous(dst)) {
         // TODO: simplify
         if (nb00 == sizeof(float)) {
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                const size_t rs = ne00 * nb00;
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, rs);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
-                ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
+            if (ggml_get_type_traits_cpu(dst->type)->from_float) {
+                ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float;

                 size_t id = 0;
                 size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
@@ -724,7 +709,7 @@ static void ggml_compute_forward_dup_f32(
                         id += rs * ir0;
                         for (int i01 = ir0; i01 < ir1; i01++) {
                             const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+                            from_float(src0_ptr, dst_ptr + id, ne00);
                             id += rs;
                         }
                         id += rs * (ne01 - ir1);
@@ -2300,6 +2285,12 @@ void ggml_compute_forward_repeat(
             {
                 ggml_compute_forward_repeat_f32(params, dst);
             } break;
+        // TODO: templateify the implemenation and support for I64
+        // ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225
+        //case GGML_TYPE_I64:
+        //    {
+        //        ggml_compute_forward_repeat_i64(params, dst);
+        //    } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -3061,7 +3052,690 @@ static void ggml_compute_forward_leaky_relu_f16(
     }
 }

-void ggml_compute_forward_leaky_relu(
+void ggml_compute_forward_leaky_relu(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_leaky_relu_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_leaky_relu_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_silu_back
+
+static void ggml_compute_forward_silu_back_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * grad = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    assert(ggml_is_contiguous_1(grad));
+    assert(ggml_is_contiguous_1(src1));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src1, dst));
+    assert(ggml_are_same_shape(src1, grad));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1->ne[0];
+    const int nr = ggml_nrows(src1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_silu_backward_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src1->data + i1*(src1->nb[1])),
+                (float *) ((char *) grad->data + i1*(grad->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_silu_back_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * grad = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    assert(ggml_is_contiguous_1(grad));
+    assert(ggml_is_contiguous_1(src1));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src1, dst));
+    assert(ggml_are_same_shape(src1, grad));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1->ne[0];
+    const int nr = ggml_nrows(src1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_silu_backward_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src1->data + i1*(src1->nb[1])),
+                (ggml_fp16_t *) ((char *) grad->data + i1*(grad->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_CPU_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+void ggml_compute_forward_silu_back(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_silu_back_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_silu_back_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_reglu
+
+static void ggml_compute_forward_reglu_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * src0_p = (float *) (src0_d + i1*src0_o);
+        float * src1_p = (float *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_reglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_reglu_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
+        ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_reglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_reglu(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_reglu_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_reglu_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_geglu
+
+static void ggml_compute_forward_geglu_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * src0_p = (float *) (src0_d + i1*src0_o);
+        float * src1_p = (float *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_geglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_geglu_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
+        ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_geglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_geglu(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_geglu_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_geglu_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_swiglu
+
+static void ggml_compute_forward_swiglu_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * src0_p = (float *) (src0_d + i1*src0_o);
+        float * src1_p = (float *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_swiglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_swiglu_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
+        ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_swiglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_swiglu(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_swiglu_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_swiglu_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_geglu_erf
+
+static void ggml_compute_forward_geglu_erf_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * src0_p = (float *) (src0_d + i1*src0_o);
+        float * src1_p = (float *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_geglu_erf_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_geglu_erf_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
+        ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_geglu_erf_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_geglu_erf(
         const ggml_compute_params * params,
         ggml_tensor * dst) {

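All of the new fused GLU kernels above (reglu/geglu/swiglu/geglu_erf, and geglu_quick below) share one input convention: with a separate gate tensor (src1) the whole row is used, and with a single tensor the row of 2*nc values is split in half, with op_params[1] ("swapped") choosing which half is activated. A standalone sketch of that selection logic, with hypothetical names, not the ggml code itself:

    #include <cstdio>

    // Mimics the src0_p/src1_p setup in the kernels above for the
    // single-tensor case: a row holds [a | b]; `swapped` decides which
    // half the activation applies to and which half gates the result.
    static void split_gate_row(const float * row, int nc, bool swapped,
                               const float ** act, const float ** gate) {
        *act  = row + (swapped ? nc : 0);
        *gate = row + (swapped ? 0 : nc);
    }

    int main() {
        const float row[6] = {1, 2, 3, 10, 20, 30}; // nc = 3
        const float * a; const float * g;
        split_gate_row(row, 3, /*swapped=*/false, &a, &g);
        printf("act[0]=%g gate[0]=%g\n", a[0], g[0]); // act[0]=1 gate[0]=10
        return 0;
    }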
@@ -3070,11 +3744,11 @@ void ggml_compute_forward_leaky_relu(
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_leaky_relu_f32(params, dst);
+                ggml_compute_forward_geglu_erf_f32(params, dst);
             } break;
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_leaky_relu_f16(params, dst);
+                ggml_compute_forward_geglu_erf_f16(params, dst);
             } break;
         default:
             {
@@ -3083,26 +3757,37 @@
     }
 }

-// ggml_compute_forward_silu_back
+// ggml_compute_forward_geglu_quick

-static void ggml_compute_forward_silu_back_f32(
+static void ggml_compute_forward_geglu_quick_f32(
         const ggml_compute_params * params,
         ggml_tensor * dst) {

-    const ggml_tensor * grad = dst->src[0];
+    const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];

-    assert(ggml_is_contiguous_1(grad));
-    assert(ggml_is_contiguous_1(src1));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src1, dst));
-    assert(ggml_are_same_shape(src1, grad));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }

     const int ith = params->ith;
     const int nth = params->nth;

-    const int nc = src1->ne[0];
-    const int nr = ggml_nrows(src1);
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);

     // rows per thread
     const int dr = (nr + nth - 1)/nth;
@@ -3112,10 +3797,15 @@ static void ggml_compute_forward_silu_back_f32(
     const int ir1 = MIN(ir0 + dr, nr);

     for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_silu_backward_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src1->data + i1*(src1->nb[1])),
-                (float *) ((char *) grad->data + i1*(grad->nb[1])));
+        float * src0_p = (float *) (src0_d + i1*src0_o);
+        float * src1_p = (float *) (src1_d + i1*src1_o);
+
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_geglu_quick_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);

 #ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
@@ -3128,24 +3818,35 @@ static void ggml_compute_forward_silu_back_f32(
     }
 }

-static void ggml_compute_forward_silu_back_f16(
+static void ggml_compute_forward_geglu_quick_f16(
         const ggml_compute_params * params,
         ggml_tensor * dst) {

-    const ggml_tensor * grad = dst->src[0];
+    const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
+    char * src0_d = (char *) src0->data;
+    char * src1_d = (char *) (src1 ? src1->data : src0->data);
+    const size_t src0_o = src0->nb[1];
+    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];

-    assert(ggml_is_contiguous_1(grad));
-    assert(ggml_is_contiguous_1(src1));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src1, dst));
-    assert(ggml_are_same_shape(src1, grad));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src0->type == src1->type);
+    }

     const int ith = params->ith;
     const int nth = params->nth;

-    const int nc = src1->ne[0];
-    const int nr = ggml_nrows(src1);
+    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == nr);
+
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);

     // rows per thread
     const int dr = (nr + nth - 1)/nth;
@@ -3155,24 +3856,29 @@ static void ggml_compute_forward_silu_back_f16(
     const int ir1 = MIN(ir0 + dr, nr);

     for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_silu_backward_f16(nc,
-                (ggml_fp16_t *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (ggml_fp16_t *) ((char *) src1->data + i1*(src1->nb[1])),
-                (ggml_fp16_t *) ((char *) grad->data + i1*(grad->nb[1])));
+        ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
+        ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);

-#ifndef NDEBUG
+        if (!src1) {
+            src0_p += swapped ? nc : 0;
+            src1_p += swapped ? 0 : nc;
+        }
+
+        ggml_vec_geglu_quick_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+#ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
-            const float x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-            const float v = GGML_CPU_FP16_TO_FP32(x);
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
             GGML_UNUSED(v);
             assert(!isnan(v));
             assert(!isinf(v));
         }
-#endif
+#endif
     }
 }

-void ggml_compute_forward_silu_back(
+static void ggml_compute_forward_geglu_quick(
         const ggml_compute_params * params,
         ggml_tensor * dst) {

@@ -3181,11 +3887,11 @@ void ggml_compute_forward_silu_back(
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_silu_back_f32(params, dst);
+                ggml_compute_forward_geglu_quick_f32(params, dst);
             } break;
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_silu_back_f16(params, dst);
+                ggml_compute_forward_geglu_quick_f16(params, dst);
             } break;
         default:
             {
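The row kernels these dispatchers call (ggml_vec_reglu_f32, ggml_vec_swiglu_f32, and so on) live in vec.h/vec.cpp, which this release also touches. As a scalar reference for what the SwiGLU case computes, assuming the usual silu(x)*gate definition (the shipped kernels are vectorized; this is only a sketch):

    #include <cmath>

    // y[i] = silu(x[i]) * g[i], with silu(v) = v / (1 + exp(-v))
    static void vec_swiglu_f32_ref(int n, float * y, const float * x, const float * g) {
        for (int i = 0; i < n; ++i) {
            y[i] = (x[i] / (1.0f + expf(-x[i]))) * g[i];
        }
    }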
@@ -4470,6 +5176,74 @@ void ggml_compute_forward_get_rows(
     //}
 }

+static void ggml_compute_forward_set_rows_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ne01;
+
+    assert(ne0 == nc);
+    assert(ne2 == ne02);
+    assert(ne3 == ne03);
+    assert(src0->type == GGML_TYPE_F32);
+    assert(ne02 % ne11 == 0);
+    assert(ne03 % ne12 == 0);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = std::min(ir0 + dr, nr);
+
+    ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float;
+
+    for (int64_t i03 = 0; i03 < ne03; ++i03) {
+        for (int64_t i02 = 0; i02 < ne02; ++i02) {
+            for (int64_t i = ir0; i < ir1; ++i) {
+                const int64_t i12 = i03%ne12;
+                const int64_t i11 = i02%ne11;
+                const int64_t i10 = i;
+
+                const int64_t i1 = *(int64_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+                GGML_ASSERT(i1 >= 0 && i1 < ne1);
+
+                from_float(
+                        (const float *) ((char *) src0->data + i*nb01 + i02*nb02 + i03*nb03),
+                        ((char *) dst->data + i1*nb1 + i02*nb2 + i03*nb3), nc);
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_set_rows(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_set_rows_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("src0->type = %d (%s) not supported", src0->type, ggml_type_name(src0->type));
+            }
+    }
+}
+
 // ggml_compute_forward_get_rows_back

 static void ggml_compute_forward_get_rows_back_f32_f16(
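ggml_compute_forward_set_rows_f32 is the scatter counterpart of get_rows: row i of src0 is converted with from_float and written to row ids[i] of dst, with the i02%ne11 / i03%ne12 terms broadcasting the index tensor across the outer dims. Stripped of strides and broadcasting, the core contract looks like this (illustrative only, float-to-float case):

    #include <cassert>
    #include <cstdint>

    // dst[ids[i]] = src[i] for nr rows of nc values each.
    static void set_rows_ref(float * dst, int64_t dst_rows,
                             const float * src, const int64_t * ids,
                             int64_t nr, int64_t nc) {
        for (int64_t i = 0; i < nr; ++i) {
            const int64_t r = ids[i];
            assert(r >= 0 && r < dst_rows);
            for (int64_t c = 0; c < nc; ++c) {
                dst[r*nc + c] = src[i*nc + c];
            }
        }
    }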
@@ -4744,14 +5518,17 @@ static void ggml_compute_forward_soft_max_f32(
     memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
     memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));

-    // TODO: handle transposed/permuted matrices
-
     const int ith = params->ith;
     const int nth = params->nth;

     GGML_TENSOR_UNARY_OP_LOCALS

-
+    const int64_t nb11 = src1 ? src1->nb[1] : 1;
+    const int64_t nb12 = src1 ? src1->nb[2] : 1;
+    const int64_t nb13 = src1 ? src1->nb[3] : 1;
+
+    const int64_t ne12 = src1 ? src1->ne[2] : 1;
+    const int64_t ne13 = src1 ? src1->ne[3] : 1;

     // TODO: is this supposed to be ceil instead of floor?
     //       https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
@@ -4761,68 +5538,66 @@
     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
+    float * wp = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;

     const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);

-    for (int i1 = ir0; i1 < ir1; i1++) {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                const int64_t i11 = i01;
+                const int64_t i12 = i02%ne12;
+                const int64_t i13 = i03%ne13;
+
+                // ALiBi
+                const uint32_t h = i02; // head
+                const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
+                float * sp = (float *)((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                float * dp = (float *)((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3);
+
+                // broadcast the mask across rows
+                ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13) : NULL;
+                float       * mp_f32 = src1 ? (float       *)((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13) : NULL;
+
+                ggml_vec_cpy_f32  (ne00, wp, sp);
+                ggml_vec_scale_f32(ne00, wp, scale);
+                if (mp_f32) {
+                    if (use_f16) {
+                        for (int i = 0; i < ne00; ++i) {
+                            wp[i] += slope*GGML_CPU_FP16_TO_FP32(mp_f16[i]);
+                        }
+                    } else {
+                        for (int i = 0; i < ne00; ++i) {
+                            wp[i] += slope*mp_f32[i];
+                        }
+                    }
+                }
                 }
-            }
-        }

 #ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(wp[i]));
-        }
+                for (int i = 0; i < ne00; ++i) {
+                    //printf("p[%d] = %f\n", i, p[i]);
+                    assert(!isnan(wp[i]));
+                }
 #endif

-        float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, wp);
+                float max = -INFINITY;
+                ggml_vec_max_f32(ne00, &max, wp);

-        ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
-        assert(sum > 0.0);
+                ggml_float sum = ggml_vec_soft_max_f32(ne00, dp, wp, max);
+                assert(sum > 0.0);

-        sum = 1.0/sum;
-        ggml_vec_scale_f32(nc, dp, sum);
+                sum = 1.0/sum;
+                ggml_vec_scale_f32(ne00, dp, sum);

 #ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(dp[i]));
-            assert(!isinf(dp[i]));
-        }
+                for (int i = 0; i < ne00; ++i) {
+                    assert(!isnan(dp[i]));
+                    assert(!isinf(dp[i]));
+                }
 #endif
+            }
+        }
     }
 }

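The slope expression embedded in the rewritten soft_max loop is the usual ALiBi per-head decay, now computed from i02 directly instead of a flattened row index. Pulled out into a helper for readability (same arithmetic as the inline ternary above):

    #include <cmath>
    #include <cstdint>

    // Heads below n_head_log2 take successive powers of m0; the rest
    // interleave odd powers of m1. max_bias <= 0 disables ALiBi.
    static float alibi_slope(uint32_t h, uint32_t n_head_log2, float m0, float m1, float max_bias) {
        if (max_bias <= 0.0f) {
            return 1.0f;
        }
        return h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
    }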
@@ -6058,6 +6833,186 @@ void ggml_compute_forward_im2col_back_f32(
     }
 }

+static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
+                              void * a, void * b, float * c) {
+    const ggml_type_traits * traits = ggml_get_type_traits(type);
+    struct ggml_tensor src1 = {};
+    src1.type = type;
+    src1.ne[0] = k;
+    src1.ne[1] = m;
+    src1.ne[2] = 1;
+    src1.ne[3] = 1;
+    src1.nb[0] = traits->type_size;
+    src1.nb[1] = k * traits->type_size;
+    src1.nb[2] = src1.nb[1];
+    src1.nb[3] = src1.nb[2];
+    src1.data = a;
+
+    struct ggml_tensor src0 = {};
+    src0.type = type;
+    src0.ne[0] = k;
+    src0.ne[1] = n;
+    src0.ne[2] = 1;
+    src0.ne[3] = 1;
+    src0.nb[0] = traits->type_size;
+    src0.nb[1] = k * traits->type_size;
+    src0.nb[2] = src0.nb[1];
+    src0.nb[3] = src0.nb[2];
+    src0.data = b;
+
+    struct ggml_tensor dst = {};
+    dst.ne[0] = n;
+    dst.ne[1] = m;
+    dst.ne[2] = 1;
+    dst.ne[3] = 1;
+    dst.nb[0] = sizeof(float);
+    dst.nb[1] = n * sizeof(float);
+    dst.nb[2] = dst.nb[1];
+    dst.nb[3] = dst.nb[2];
+    dst.data = c;
+    dst.src[0] = &src0;
+    dst.src[1] = &src1;
+
+    ggml_compute_forward_mul_mat(params, &dst);
+}
+
+// ggml_compute_forward_conv_2d
+
+static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params,
+                                              const ggml_tensor * kernel,  // [KW, KH, IC, OC]
+                                              const ggml_tensor * src,     // [W, H, C, N]
+                                              ggml_tensor * dst,           // [OW, OH, OC, N]
+                                              ggml_type kernel_type) {
+
+    GGML_ASSERT(ggml_is_contiguous(kernel));
+    GGML_ASSERT(kernel_type == GGML_TYPE_F16 || kernel_type == GGML_TYPE_F32);
+    GGML_ASSERT(kernel->type == kernel_type);
+
+    const ggml_type_traits * traits = ggml_get_type_traits(kernel_type);
+
+    const int32_t stride_x = dst->op_params[0];
+    const int32_t stride_y = dst->op_params[1];
+    const int32_t pad_x = dst->op_params[2];
+    const int32_t pad_y = dst->op_params[3];
+    const int32_t dilation_x = dst->op_params[4];
+    const int32_t dilation_y = dst->op_params[5];
+
+    const int64_t c_in = src->ne[2];
+    const int64_t c_out = kernel->ne[3];
+    GGML_ASSERT(c_in == kernel->ne[2]);
+
+    const int64_t src_w = src->ne[0];
+    const int64_t src_h = src->ne[1];
+    const int64_t knl_w = kernel->ne[0];
+    const int64_t knl_h = kernel->ne[1];
+    const int64_t dst_w = dst->ne[0];
+    const int64_t dst_h = dst->ne[1];
+
+    const float * src_data = (float *) src->data;
+    void * knl_data = kernel->data;
+    float * dst_data = (float *) dst->data;
+
+    const int64_t knl_n = knl_w * knl_h * c_in;
+    const int64_t patch_total = dst->ne[3] * dst_w * dst_h;
+
+    const int64_t space_per_patch = knl_n * traits->type_size + c_out * sizeof(float);
+    const int64_t batch_size = params->wsize / space_per_patch;
+    const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size;
+    const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch;
+
+    GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1);
+
+    void * tmp = params->wdata;
+
+    for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) {
+
+        const int64_t patch_start_batch = batch_i * patches_per_batch;
+        const int64_t patch_end_batch = std::min(patch_start_batch + patches_per_batch,
+                                                 patch_total);
+        const int64_t patch_n = patch_end_batch - patch_start_batch;
+
+        const int64_t patch_per_thread = (patch_n + params->nth - 1) / params->nth;
+        const int64_t patch_start = patch_start_batch + params->ith * patch_per_thread;
+        const int64_t patch_end = std::min(patch_start + patch_per_thread, patch_end_batch);
+
+        //im2col for a patch
+        for (int64_t p = patch_start; p < patch_end; ++p) {
+            const int64_t batch_n = p / (dst_w * dst_h);
+            const int64_t src_x = (p / dst_w) % dst_h;
+            const int64_t src_y = p % dst_w;
+
+            const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]);
+            char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size;
+
+            for (int64_t ic = 0; ic < c_in; ++ic) {
+                for (int64_t ky = 0; ky < knl_h; ++ky) {
+                    for (int64_t kx = 0; kx < knl_w; ++kx) {
+                        const int64_t sy = src_x * stride_y + ky * dilation_y - pad_y;
+                        const int64_t sx = src_y * stride_x + kx * dilation_x - pad_x;
+
+                        int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx;
+
+                        float src_val;
+                        if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
+                            src_val = 0.0f;
+                        } else {
+                            const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
+                            src_val = *src_ptr;
+                        }
+
+                        char * element_ptr = dst_row + dst_idx * traits->type_size;
+                        if (kernel_type == GGML_TYPE_F32) {
+                            *(float *) element_ptr = src_val;
+                        } else if (kernel_type == GGML_TYPE_F16) {
+                            *(ggml_fp16_t *) element_ptr = GGML_CPU_FP32_TO_FP16(src_val);
+                        }
+                    }
+                }
+            }
+        } // patches handled by this thread
+
+        ggml_barrier(params->threadpool);
+
+        float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n * traits->type_size);
+
+        GGML_ASSERT(gemm_output + patch_n * c_out <= (float*)tmp + params->wsize);
+
+        // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out]
+        ggml_call_mul_mat(kernel_type, params, patch_n, c_out, knl_n, tmp, knl_data, gemm_output);
+
+        ggml_barrier(params->threadpool);
+
+
+        //permute back [OC, N, OH, OW] to [N, OC, OH, OW]
+        const int64_t permute_per_thread = (patch_n + params->nth - 1) / params->nth;
+        const int64_t permute_start = params->ith * permute_per_thread;
+        const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n);
+
+        for (int64_t i = permute_start; i < permute_end; ++i) {
+            const int64_t p = patch_start_batch + i;
+            const int64_t batch_n = p / (dst_w * dst_h);
+            const int64_t dst_y = (p / dst_w) % dst_h;
+            const int64_t dst_x = p % dst_w;
+
+            for (int64_t oc = 0; oc < c_out; ++oc) {
+                const float value = gemm_output[i * c_out + oc];
+                float * dst_ptr = (float *)((char *)dst_data + dst_x * dst->nb[0] + dst_y * dst->nb[1] + oc * dst->nb[2] + batch_n * dst->nb[3]);
+                *dst_ptr = value;
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_conv_2d(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    ggml_compute_forward_conv_2d_impl(params, src0, src1, dst, src0->type);
+}
+
 // ggml_compute_forward_conv_transpose_2d

 void ggml_compute_forward_conv_transpose_2d(
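The new conv_2d path is im2col plus GEMM: each output pixel becomes a row of knl_w*knl_h*c_in values in the scratch buffer, and ggml_call_mul_mat multiplies those rows by the [knl_n, c_out] kernel. The dst_w/dst_h extents it iterates over follow the standard convolution output formula; a small helper stating it explicitly (not part of the diff):

    #include <cstdint>

    // Output extent of a padded, strided, dilated convolution.
    static int64_t conv_out_size(int64_t in, int64_t kernel,
                                 int32_t stride, int32_t pad, int32_t dilation) {
        return (in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1;
    }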
@@ -6608,12 +7563,13 @@ static void ggml_compute_forward_upscale_f32(

     GGML_TENSOR_UNARY_OP_LOCALS

-    const float sf0 = (float)ne0/src0->ne[0];
-    const float sf1 = (float)ne1/src0->ne[1];
-    const float sf2 = (float)ne2/src0->ne[2];
-    const float sf3 = (float)ne3/src0->ne[3];
+    float sf0 = (float)ne0/src0->ne[0];
+    float sf1 = (float)ne1/src0->ne[1];
+    float sf2 = (float)ne2/src0->ne[2];
+    float sf3 = (float)ne3/src0->ne[3];

-    const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
+    const int32_t mode_flags = ggml_get_op_params_i32(dst, 0);
+    const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);

     if (mode == GGML_SCALE_MODE_NEAREST) {
         for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -6634,8 +7590,12 @@
             }
         }
     } else if (mode == GGML_SCALE_MODE_BILINEAR) {
-
-
+        float pixel_offset = 0.5f;
+        if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
+            pixel_offset = 0.0f;
+            sf0 = (float)(ne0 - 1) / (src0->ne[0] - 1);
+            sf1 = (float)(ne1 - 1) / (src0->ne[1] - 1);
+        }

         for (int64_t i3 = 0; i3 < ne3; i3++) {
             const int64_t i03 = i3 / sf3;
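With GGML_SCALE_FLAG_ALIGN_CORNERS set, the scale factors become (ne-1)/(src-1) and the half-pixel offset drops to zero, so the first and last samples map exactly onto the input corners. The destination-to-source coordinate mapping this implies, written out (a sketch of the convention, not the exact loop body):

    #include <cstdint>

    // half-pixel:    x_src = (x_dst + 0.5) / sf - 0.5
    // align-corners: x_src =  x_dst / sf          (pixel_offset == 0)
    static float dst_to_src(int64_t x_dst, float sf, float pixel_offset) {
        return ((float) x_dst + pixel_offset) / sf - pixel_offset;
    }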
@@ -7093,7 +8053,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

-    ggml_type
+    ggml_type const k_vec_dot_type = ggml_get_type_traits_cpu(k->type)->vec_dot_type;
     ggml_from_float_t const q_to_vec_dot = ggml_get_type_traits_cpu(k_vec_dot_type)->from_float;
     ggml_vec_dot_t const kq_vec_dot = ggml_get_type_traits_cpu(k->type)->vec_dot;
     ggml_to_float_t const v_to_float = ggml_get_type_traits(v->type)->to_float;
@@ -7125,7 +8085,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
             memset(VKQ32, 0, DV*sizeof(float));
         }

-        const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
+        const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1] + (iq2%mask->ne[2])*mask->nb[2] + (iq3%mask->ne[3])*mask->nb[3]) : NULL;

         // k indices
         const int ik3 = iq3 / rk3;
@@ -7663,120 +8623,210 @@ void ggml_compute_forward_ssm_conv(
 static void ggml_compute_forward_ssm_scan_f32(
         const ggml_compute_params * params,
               ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0]; // s
-    const ggml_tensor * src1 = dst->src[1]; // x
-    const ggml_tensor * src2 = dst->src[2]; // dt
-    const ggml_tensor * src3 = dst->src[3]; // A
-    const ggml_tensor * src4 = dst->src[4]; // B
-    const ggml_tensor * src5 = dst->src[5]; // C
+    const ggml_tensor * src0 = dst->src[0]; // s   {d_state, dim, n_head, n_seqs+}
+    const ggml_tensor * src1 = dst->src[1]; // x   {dim, n_head, n_seq_tokens, n_seqs}
+    const ggml_tensor * src2 = dst->src[2]; // dt  {n_head, n_seq_tokens, n_seqs}
+    const ggml_tensor * src3 = dst->src[3]; // A   {d_state, n_head} or {1, n_head}
+    const ggml_tensor * src4 = dst->src[4]; // B   {d_state, n_group, n_seq_tokens, n_seqs}
+    const ggml_tensor * src5 = dst->src[5]; // C   {d_state, n_group, n_seq_tokens, n_seqs}
+    const ggml_tensor * src6 = dst->src[6]; // ids {n_seqs}
 
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int64_t nc
-    const int64_t nr
-    const int64_t
-    const int64_t
+    const int64_t nc = src0->ne[0]; // d_state
+    const int64_t nr = src0->ne[1]; // dim
+    const int64_t nh = src1->ne[1]; // n_head
+    const int64_t ng = src4->ne[1];
+    const int64_t nt = src1->ne[2]; // number of tokens per sequence
+    const int64_t ns = src1->ne[3]; // number of sequences in the batch
+
+    // can't use ggml_nbytes because src1 is not necessarily contiguous
+    const int64_t s_off = ggml_nelements(src1) * ggml_element_size(src1);
 
-    GGML_ASSERT(ggml_nelements(src1) +
+    GGML_ASSERT(ggml_nelements(src1) + nc*nr*nh*ns == ggml_nelements(dst));
     GGML_ASSERT(src0->nb[0] == sizeof(float));
     GGML_ASSERT(src1->nb[0] == sizeof(float));
     GGML_ASSERT(src2->nb[0] == sizeof(float));
     GGML_ASSERT(src3->nb[0] == sizeof(float));
     GGML_ASSERT(src4->nb[0] == sizeof(float));
     GGML_ASSERT(src5->nb[0] == sizeof(float));
-
-
-
-    GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
-    // required to get correct offset for state destination (i.e. src1->nb[3])
-    GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float));
+    GGML_ASSERT(src6->nb[0] == sizeof(int32_t));
+    // allows optimizing the modulo since n_group should be a power of 2
+    GGML_ASSERT((ng & -ng) == ng);
 
-    //
-    const int
+    // heads per thread
+    const int dh = (nh + nth - 1)/nth;
 
-    //
-    const int
-    const int
-
+    // head range for this thread
+    const int ih0 = dh*ith;
+    const int ih1 = MIN(ih0 + dh, nh);
+
+    const int32_t * ids = (const int32_t *) src6->data;
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    //
-    for (int
-
-    float
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    for (int i3 = 0; i3 < ns; ++i3) {
+        const float * s0 = (const float *) ((const char *) src0->data + ids[i3]*(src0->nb[3])); // {d_state, dim, nh, ns}
+        float *       s  = (      float *) ((      char *)  dst->data + i3*(src0->nb[3]) + s_off); // {d_state, dim, nh, ns}
+
+        for (int i2 = 0; i2 < nt; ++i2) {
+            const float * x  = (const float *) ((const char *) src1->data + i2*(src1->nb[2]) + i3*(src1->nb[3])); // {dim, nh, nt, ns}
+            const float * dt = (const float *) ((const char *) src2->data + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {nh, nt, ns}
+            const float * A  = (const float *) ((const char *) src3->data); // {d_state, nh} or {1, nh}
+            const float * B  = (const float *) ((const char *) src4->data + i2*(src4->nb[2]) + i3*(src4->nb[3])); // {d_state, ng, nt, ns}
+            const float * C  = (const float *) ((const char *) src5->data + i2*(src5->nb[2]) + i3*(src5->nb[3])); // {d_state, ng, nt, ns}
+            float *       y  = (      float *) ((      char *)  dst->data + i2*(nh*nr*sizeof(float)) + i3*(nt*nh*nr*sizeof(float))); // {dim, nh, nt, ns}
+
+            if (src3->ne[0] == 1) {
+                // Mamba-2 has a scalar decay factor per head; dA can be outside the state-wise loop
+
+                // n_head
+                for (int h = ih0; h < ih1; ++h) {
+                    // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
+                    const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
+                    const float dA = expf(dt_soft_plus * A[h]);
+
+                    // dim
+                    for (int i1 = 0; i1 < nr; ++i1) {
+                        const int ii = i1 + h*nr;
+                        const float x_dt = x[ii] * dt_soft_plus;
+                        float sumf = 0.0f;
+#if defined(GGML_SIMD)
+#if defined(__ARM_FEATURE_SVE)
+                        const int ggml_f32_epr = svcntw();
+                        const int ggml_f32_step = 1 * ggml_f32_epr;
+
+                        const int np = (nc & ~(ggml_f32_step - 1));
+
+                        GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
+
+                        GGML_F32_VEC adA = GGML_F32_VEC_SET1(dA);
+                        GGML_F32_VEC axdt = GGML_F32_VEC_SET1(x_dt);
+
+                        for (int i = 0; i < np; i += ggml_f32_step) {
+                            // TODO: maybe unroll more?
+                            for (int j = 0; j < 1; j++) {
+                                GGML_F32_VEC t0 = GGML_F32_VEC_LOAD(s0 + i + j*ggml_f32_epr + ii*nc);
+                                GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
+                                GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
+
+                                t0 = GGML_F32_VEC_MUL(t0, adA);
+                                t1 = GGML_F32_VEC_MUL(t1, axdt);
+
+                                t0 = GGML_F32_VEC_ADD(t0, t1);
+
+                                sum = GGML_F32_VEC_FMA(sum, t0, t2);
+
+                                GGML_F32_VEC_STORE(s + i + j*ggml_f32_epr + ii*nc, t0);
+                            }
+                        }
+
+                        sumf = GGML_F32xt_REDUCE_ONE(sum);
+#else
+                        const int np = (nc & ~(GGML_F32_STEP - 1));
+
+                        GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+
+                        GGML_F32_VEC adA = GGML_F32_VEC_SET1(dA);
+                        GGML_F32_VEC axdt = GGML_F32_VEC_SET1(x_dt);
+
+                        GGML_F32_VEC ax[GGML_F32_ARR];
+                        GGML_F32_VEC ay[GGML_F32_ARR];
+                        GGML_F32_VEC az[GGML_F32_ARR];
+
+                        for (int i = 0; i < np; i += GGML_F32_STEP) {
+                            for (int j = 0; j < GGML_F32_ARR; j++) {
+                                ax[j] = GGML_F32_VEC_LOAD(s0 + i + j*GGML_F32_EPR + ii*nc);
+                                ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
+                                az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
+
+                                ax[j] = GGML_F32_VEC_MUL(ax[j], adA);
+                                ay[j] = GGML_F32_VEC_MUL(ay[j], axdt);
+
+                                ax[j] = GGML_F32_VEC_ADD(ax[j], ay[j]);
+
+                                sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], az[j]);
+
+                                GGML_F32_VEC_STORE(s + i + j*GGML_F32_EPR + ii*nc, ax[j]);
+                            }
+                        }
+
+                        // reduce sum0..sum3 to sum0
+                        GGML_F32_VEC_REDUCE(sumf, sum);
+#endif
+#else
+                        const int np = 0;
+#endif
+                        // d_state
+                        for (int i0 = np; i0 < nc; ++i0) {
+                            const int i = i0 + ii*nc;
+                            const int ig = i0 + (h & (ng - 1))*nc;
+                            // state = prev_state * dA + dB * x
+                            const float state = (s0[i] * dA) + (B[ig] * x_dt);
+                            // y = rowwise_dotprod(state, C)
+                            sumf += state * C[ig];
+                            s[i] = state;
+                        }
+                        y[ii] = sumf;
                     }
-                    y[i1] = GGML_F32xt_REDUCE_ONE(r1_vector);
                 }
-            }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            } else {
+                // Mamba-1 has an element-wise decay factor for the states
+
+                // n_head
+                for (int h = ih0; h < ih1; ++h) {
+                    // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
+                    const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
+
+                    // dim
+                    for (int i1 = 0; i1 < nr; ++i1) {
+                        const int ii = i1 + h*nr;
+                        const float x_dt = x[ii] * dt_soft_plus;
+#if defined(__ARM_FEATURE_SVE)
+                        svfloat32_t vx_dt = GGML_F32_VEC_SET1(x_dt);
+                        svfloat32_t vdt_soft_plus = GGML_F32_VEC_SET1(dt_soft_plus);
+                        svfloat32_t r1_vector = GGML_F32_VEC_ZERO;
+
+                        // d_state
+                        // TODO: what happens when (d_state % svcntw()) != 0?
+                        for (int64_t k = 0; k < nc; k += svcntw()) {
+                            svfloat32_t vA = GGML_F32_VEC_LOAD(&A[h*nc + k]);
+                            svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + (h & (ng - 1))*nc]);
+                            svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + (h & (ng - 1))*nc]);
+                            svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[ii*nc + k]);
+
+                            svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
+                            t1 = exp_ps_sve(svptrue_b32(), t1);
+                            svfloat32_t t2 = GGML_F32_VEC_MUL(vx_dt, vB);
+
+                            vs0 = GGML_F32_VEC_FMA(t2, vs0, t1);
+                            r1_vector = GGML_F32_VEC_ADD(GGML_F32_VEC_MUL(vs0, vC), r1_vector);
+
+                            GGML_F32_VEC_STORE(&s[ii*nc + k], vs0);
+                        }
+                        y[ii] = GGML_F32xt_REDUCE_ONE(r1_vector);
+#else
+                        float sumf = 0.0f;
+                        // NOTE: can't really use GGML_SIMD here because d_state is usually 16
+                        //       and also because expf is used within the loop.
+                        // d_state
+                        for (int i0 = 0; i0 < nc; ++i0) {
+                            const int i = i0 + ii*nc;
+                            const int ig = i0 + (h & (ng - 1))*nc;
+                            // state = prev_state * dA + dB * x
+                            const float state = (s0[i] * expf(dt_soft_plus * A[i0 + h*nc])) + (B[ig] * x_dt);
+                            // y = rowwise_dotprod(state, C)
+                            sumf += state * C[ig];
+                            s[i] = state;
+                        }
+                        y[ii] = sumf;
+#endif
                     }
-                    y[i1] = sumf;
                 }
             }
+            // use the output as the source when it's not the first token-wise iteration
+            s0 = s;
         }
-
+    }
 }
 
 void ggml_compute_forward_ssm_scan(
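Two small idioms recur throughout the rewritten scan: heads are split across threads by ceiling division, and the group index is computed with a bitmask, which is only equivalent to `h % ng` because the new `GGML_ASSERT((ng & -ng) == ng)` guarantees `ng` is a power of two. A standalone sketch of both (helper names are illustrative, not ggml API):

```c
// Illustrative helpers for the two idioms used in the scan above.
#include <assert.h>

// ceiling division: nh heads over nth threads; thread ith gets [ih0, ih1)
static void head_range(int nh, int nth, int ith, int * ih0, int * ih1) {
    const int dh = (nh + nth - 1) / nth;       // heads per thread
    *ih0 = dh * ith;
    *ih1 = (*ih0 + dh < nh) ? *ih0 + dh : nh;  // clamp the last thread
}

// h % ng as a bitmask; valid only when ng is a power of two
static int group_of_head(int h, int ng) {
    assert((ng & -ng) == ng);
    return h & (ng - 1);
}
```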
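All three code paths in the Mamba-2 branch (SVE, generic `GGML_SIMD`, and the scalar tail) compute the same per-element recurrence: `state = prev_state * dA + B * x_dt`, with the row output accumulated as `state * C`. A scalar reference for one (head, dim) row, reusing the diff's naming in a hypothetical free function (not the ggml entry point):

```c
// Hypothetical scalar reference: one (head, dim) row updates nc state
// elements in place and returns the corresponding output element y.
#include <math.h>

// softplus with the same large-input guard as the diff: for dt > 20,
// log1pf(expf(dt)) is numerically equal to dt and expf would overflow
static float soft_plus(float dt) {
    return dt <= 20.0f ? log1pf(expf(dt)) : dt;
}

static float ssm_scan_row(float * state, const float * B, const float * C,
                          float A_h, float x, float dt, int nc) {
    const float dt_sp = soft_plus(dt);
    const float dA    = expf(dt_sp * A_h); // Mamba-2: scalar decay per head
    const float x_dt  = x * dt_sp;
    float sumf = 0.0f;
    for (int i0 = 0; i0 < nc; ++i0) {
        // state = prev_state * dA + dB * x
        const float s = state[i0] * dA + B[i0] * x_dt;
        sumf     += s * C[i0];             // y = rowwise_dotprod(state, C)
        state[i0] = s;
    }
    return sumf;
}
```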
@@ -7994,6 +9044,42 @@ void ggml_compute_forward_unary(
     }
 }
 
+//ggml_compute_forward_glu
+
+void ggml_compute_forward_glu(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_glu_op op = ggml_get_glu_op(dst);
+
+    switch (op) {
+        case GGML_GLU_OP_REGLU:
+            {
+                ggml_compute_forward_reglu(params, dst);
+            } break;
+        case GGML_GLU_OP_GEGLU:
+            {
+                ggml_compute_forward_geglu(params, dst);
+            } break;
+        case GGML_GLU_OP_SWIGLU:
+            {
+                ggml_compute_forward_swiglu(params, dst);
+            } break;
+        case GGML_GLU_OP_GEGLU_ERF:
+            {
+                ggml_compute_forward_geglu_erf(params, dst);
+            } break;
+        case GGML_GLU_OP_GEGLU_QUICK:
+            {
+                ggml_compute_forward_geglu_quick(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_get_rel_pos
 
 static void ggml_compute_forward_get_rel_pos_f16(
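The new `ggml_compute_forward_glu` only dispatches on `ggml_get_glu_op(dst)`; each variant gates one half of the input and multiplies it by the other half. An element-wise sketch of the base variants (assumed formulas; the `_erf`/`_quick` pair differs in which GELU approximation gates `a`, and the real kernels are the `ggml_compute_forward_*` functions named in the switch):

```c
// Element-wise sketch of the gated variants dispatched above.
#include <math.h>

static float silu_f(float x) { return x / (1.0f + expf(-x)); }
static float gelu_f(float x) { return 0.5f * x * (1.0f + erff(x / sqrtf(2.0f))); }

static float reglu_f (float a, float b) { return (a > 0.0f ? a : 0.0f) * b; } // ReLU(a)*b
static float swiglu_f(float a, float b) { return silu_f(a) * b; }             // SiLU(a)*b
static float geglu_f (float a, float b) { return gelu_f(a) * b; }             // GELU(a)*b
```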