llama_cpp 0.3.3 → 0.3.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +146 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +485 -67
- data/ext/llama_cpp/src/ggml-metal.m +52 -43
- data/ext/llama_cpp/src/ggml-metal.metal +587 -470
- data/ext/llama_cpp/src/ggml.c +105 -79
- data/ext/llama_cpp/src/ggml.h +13 -1
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +123 -66
- data/ext/llama_cpp/src/llama.h +34 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +12 -1
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -31,11 +31,17 @@
 #include <unistd.h>
 #endif

+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
 #ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
+#endif

 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
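The block above maps static_assert onto the C11 _Static_assert keyword when the compiler provides it and otherwise degrades to a harmless no-op declaration. A minimal standalone illustration of the same fallback (not part of the gem):

```c
#include <stdint.h>

// Same fallback as above: a real check under C11, a no-op under plain C99.
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif

static_assert(sizeof(uint32_t) == 4, "uint32_t must be 4 bytes");

int main(void) { return 0; }
```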
@@ -112,10 +118,6 @@ typedef void * thread_ret_t;
 #endif
 #endif

-#ifdef __HAIKU__
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#endif
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -4410,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) {
         if (&g_state.contexts[i].context == ctx) {
             g_state.contexts[i].used = false;

-            GGML_PRINT_DEBUG("%s: context %d
-                    __func__, i, ctx
+            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
+                    __func__, i, ggml_used_mem(ctx));

             if (ctx->mem_buffer_owned) {
                 GGML_ALIGNED_FREE(ctx->mem_buffer);
@@ -6955,6 +6957,8 @@ struct ggml_tensor * ggml_rope_impl(
         int n_dims,
         int mode,
         int n_ctx,
+        float freq_base,
+        float freq_scale,
         bool inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
@@ -6967,12 +6971,14 @@ struct ggml_tensor * ggml_rope_impl(

     ggml_scratch_save(ctx);

-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);

     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
     ((int32_t *) b->data)[3] = n_ctx;
+    memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
+    memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));

     ggml_scratch_load(ctx);

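The extra RoPE parameters are smuggled through the op's I32 parameter tensor: the two floats go into slots 4 and 5 via memcpy rather than a cast, which sidesteps strict-aliasing problems. A small standalone sketch of the same packing trick (the helper names are assumptions, not gem code):

```c
#include <stdint.h>
#include <string.h>

// Hypothetical helpers mirroring the packing above: store a float's bit
// pattern in an int32_t slot and read it back without aliasing violations.
static void  pack_float(int32_t * slot, float v) { memcpy(slot, &v, sizeof(float)); }
static float unpack_float(const int32_t * slot)  { float v; memcpy(&v, slot, sizeof(float)); return v; }

int main(void) {
    int32_t params[6] = { 0 };         // n_past, n_dims, mode, n_ctx, freq_base, freq_scale
    pack_float(&params[4], 10000.0f);  // freq_base
    pack_float(&params[5], 1.0f);      // freq_scale
    return unpack_float(&params[4]) == 10000.0f ? 0 : 1;
}
```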
@@ -6991,7 +6997,7 @@ struct ggml_tensor * ggml_rope(
         int n_dims,
         int mode,
         int n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
 }

 struct ggml_tensor * ggml_rope_inplace(
@@ -7001,7 +7007,19 @@ struct ggml_tensor * ggml_rope_inplace(
         int n_dims,
         int mode,
         int n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
+}
+
+struct ggml_tensor * ggml_rope_custom_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_past,
+        int n_dims,
+        int mode,
+        int n_ctx,
+        float freq_base,
+        float freq_scale) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
 }

 // ggml_rope_back
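ggml_rope_custom_inplace is the new entry point for the two extra parameters; ggml_rope and ggml_rope_inplace keep their previous behaviour by passing the defaults 10000.0f and 1.0f. A hedged usage sketch (the wrapper, its argument names, and the 0.5 scale factor are illustrative assumptions, not gem code):

```c
#include "ggml.h"

// Hypothetical wrapper: linear position scaling so a model trained with a
// 2048-token window can address roughly twice that range.
static struct ggml_tensor * apply_scaled_rope(struct ggml_context * ctx,
                                              struct ggml_tensor  * cur,
                                              int n_past, int n_rot) {
    return ggml_rope_custom_inplace(
        ctx, cur,
        n_past,     // tokens already in the KV cache
        n_rot,      // number of rotary dimensions
        0,          // mode 0 = original LLaMA-style RoPE
        0,          // n_ctx (only consulted by the GLM variant)
        10000.0f,   // freq_base: the default base kept by ggml_rope()
        0.5f);      // freq_scale: 0.5 compresses positions by 2x
}
```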
@@ -7011,7 +7029,8 @@ struct ggml_tensor * ggml_rope_back(
         struct ggml_tensor * a,
         int n_past,
         int n_dims,
-        int mode
+        int mode,
+        int n_ctx) {
     GGML_ASSERT(n_past >= 0);
     GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");

@@ -7025,12 +7044,13 @@ struct ggml_tensor * ggml_rope_back(

     ggml_scratch_save(ctx);

-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
     ggml_set_name(b, "n_past, n_dims, mode");

     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
+    ((int32_t *) b->data)[3] = n_ctx;

     ggml_scratch_load(ctx);

@@ -10684,6 +10704,8 @@ static void ggml_compute_forward_mul_mat(

     const enum ggml_type type = src0->type;

+    const bool src1_cont = ggml_is_contiguous(src1);
+
     ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
     enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
@@ -10747,7 +10769,7 @@ static void ggml_compute_forward_mul_mat(
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

                 if (type != GGML_TYPE_F32) {
-
+                    float * const wdata = params->wdata;
                     ggml_to_float_t const to_float = type_traits[type].to_float;

                     size_t id = 0;
@@ -10805,7 +10827,7 @@ static void ggml_compute_forward_mul_mat(
     // src1 rows
     const int64_t nr1 = ne11*ne12*ne13;

-    void * wdata
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];

     for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
@@ -10828,7 +10850,15 @@ static void ggml_compute_forward_mul_mat(
         const int64_t i3 = i13;

         const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
-
+
+        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+        // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+        // the original src1 data pointer, so we should index using the indices directly
+        // TODO: this is a bit of a hack, we should probably have a better way to handle this
+        const char * src1_col = (const char *) wdata +
+            (src1_cont || src1->type != vec_dot_type
+             ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
+             : (i11*nb11 + i12*nb12 + i13*nb13));

         float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));

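The new src1_col computation switches between two addressing schemes: logical row indexing when the data has been repacked into wdata (or is already contiguous), and raw byte strides otherwise. A standalone sketch of the same decision (hypothetical helper, not gem code):

```c
#include <stdbool.h>
#include <stddef.h>

// Mirror of the ternary above: packed rows are addressed by row index,
// non-contiguous src1 data is addressed through its byte strides.
static size_t src1_col_offset(bool packed,
                              size_t i11, size_t i12, size_t i13,
                              size_t ne11, size_t ne12, size_t row_size,
                              size_t nb11, size_t nb12, size_t nb13) {
    return packed
        ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size  // dense row index
        :  i11*nb11 + i12*nb12 + i13*nb13;           // raw strides
}
```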
@@ -12062,16 +12092,21 @@ static void ggml_compute_forward_rope_f32(
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) ==
+    GGML_ASSERT(ggml_nelements(src1) == 6);

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }

+    float freq_base;
+    float freq_scale;
+
    const int n_past = ((int32_t *) src1->data)[0];
    const int n_dims = ((int32_t *) src1->data)[1];
    const int mode   = ((int32_t *) src1->data)[2];
    const int n_ctx  = ((int32_t *) src1->data)[3];
+    memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

    assert(n_past >= 0);

@@ -12100,7 +12135,7 @@ static void ggml_compute_forward_rope_f32(
    // row index used to determine which thread to use
    int ir = 0;

-    const float theta_scale = powf(
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);

    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
@@ -12112,7 +12147,7 @@ static void ggml_compute_forward_rope_f32(
            if (ir++ < ir0) continue;
            if (ir   > ir1) break;

-            float theta = (float)p;
+            float theta = freq_scale * (float)p;

            if (is_glm) {
                theta = MIN(p, n_ctx - 2);
@@ -12189,16 +12224,21 @@ static void ggml_compute_forward_rope_f16(
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) ==
+    GGML_ASSERT(ggml_nelements(src1) == 6);

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }

+    float freq_base;
+    float freq_scale;
+
    const int n_past = ((int32_t *) src1->data)[0];
    const int n_dims = ((int32_t *) src1->data)[1];
    const int mode   = ((int32_t *) src1->data)[2];
    const int n_ctx  = ((int32_t *) src1->data)[3];
+    memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

    assert(n_past >= 0);

@@ -12227,7 +12267,7 @@ static void ggml_compute_forward_rope_f16(
    // row index used to determine which thread to use
    int ir = 0;

-    const float theta_scale = powf(
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);

    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
@@ -12239,7 +12279,7 @@ static void ggml_compute_forward_rope_f16(
            if (ir++ < ir0) continue;
            if (ir   > ir1) break;

-            float theta = (float)p;
+            float theta = freq_scale * (float)p;

            if (is_glm) {
                theta = MIN(p, n_ctx - 2);
@@ -12300,7 +12340,7 @@ static void ggml_compute_forward_rope_f16(
                        const float x0 = GGML_FP16_TO_FP32(src[0]);
                        const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);

-                        dst_data[0]
+                        dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                        dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                    }
                }
@@ -12339,7 +12379,7 @@ static void ggml_compute_forward_rope_back_f32(
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) ==
+    assert(ggml_nelements(src1) == 4);

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
@@ -12982,12 +13022,13 @@ static void ggml_compute_forward_conv_1d(
            };
    }

-//
+// ggml_compute_forward_conv_2d

-static void
+static void ggml_compute_forward_conv_2d_f16_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
+        const struct ggml_tensor * opt0,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13007,28 +13048,37 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
    // size of the convolution row - the kernel size unrolled across all channels
    const int ew0 = nk0*nk1*ne02;

+    const int32_t s0 = ((const int32_t*)(opt0->data))[0];
+    const int32_t s1 = ((const int32_t*)(opt0->data))[1];
+    const int32_t p0 = ((const int32_t*)(opt0->data))[2];
+    const int32_t p1 = ((const int32_t*)(opt0->data))[3];
+    const int32_t d0 = ((const int32_t*)(opt0->data))[4];
+    const int32_t d1 = ((const int32_t*)(opt0->data))[5];
+
    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
    GGML_ASSERT(nb10 == sizeof(float));

    if (params->type == GGML_TASK_INIT) {
-        // TODO: fix this memset (wsize is overestimated)
        memset(params->wdata, 0, params->wsize);

        // prepare source data (src1)
        {
            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;

-            for (int
-
-
-
+            for (int i12 = 0; i12 < ne12; i12++) {
+                const float * const src = (float *)((char *) src1->data + i12*nb12);
+                ggml_fp16_t * dst_data = wdata;
+
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        for (int ik1 = 0; ik1 < nk1; ik1++) {
+                            for (int ik0 = 0; ik0 < nk0; ik0++) {
+                                const int idx0 = i0*s0 + ik0*d0 - p0;
+                                const int idx1 = i1*s1 + ik1*d1 - p1;

-
-                        for (int i0 = 0; i0 < ne0; i0++) {
-                            for (int ik1 = 0; ik1 < nk1; ik1++) {
-                                for (int ik0 = 0; ik0 < nk0; ik0++) {
+                                if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
                                    dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
-                                        GGML_FP32_TO_FP16(src[
+                                        GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
                                }
                            }
                        }
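The rewritten im2col loop now honours the stride, padding, and dilation values read from opt0: each output cell (i0, i1) samples the input at idx0 = i0*s0 + ik0*d0 - p0 and idx1 = i1*s1 + ik1*d1 - p1, and out-of-range samples stay at the zero written by the memset, which gives zero padding. A small sketch of the matching output-size formula (hypothetical helper, not gem code):

```c
// Standard convolution output size for one spatial dimension, consistent
// with the index math above.
static inline int conv_out_size(int in, int kernel, int stride, int pad, int dilation) {
    return (in + 2*pad - dilation*(kernel - 1) - 1)/stride + 1;
}
```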
@@ -13071,19 +13121,21 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
    }
 }

-static void
+static void ggml_compute_forward_conv_2d(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
-        struct ggml_tensor *
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst
+        ) {
    switch (src0->type) {
        case GGML_TYPE_F16:
            {
-
+                ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
            } break;
        case GGML_TYPE_F32:
            {
-                //
+                //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
                GGML_ASSERT(false);
            } break;
        default:
@@ -13093,32 +13145,6 @@ static void ggml_compute_forward_conv_2d_sk_p0(
    }
 }

-// ggml_compute_forward_conv_2d
-
-static void ggml_compute_forward_conv_2d(
-        const struct ggml_compute_params* params,
-        const struct ggml_tensor* src0,
-        const struct ggml_tensor* src1,
-        const struct ggml_tensor* opt0,
-        struct ggml_tensor* dst) {
-    const int32_t s0 = ((const int32_t*)(opt0->data))[0];
-    const int32_t s1 = ((const int32_t*)(opt0->data))[1];
-    const int32_t p0 = ((const int32_t*)(opt0->data))[2];
-    const int32_t p1 = ((const int32_t*)(opt0->data))[3];
-    const int32_t d0 = ((const int32_t*)(opt0->data))[4];
-    const int32_t d1 = ((const int32_t*)(opt0->data))[5];
-    GGML_ASSERT(d0 == 1); // dilation not supported
-    GGML_ASSERT(d1 == 1);
-    GGML_ASSERT(p0 == 0); // padding not supported
-    GGML_ASSERT(p1 == 0);
-
-    if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
-        ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
-    } else {
-        GGML_ASSERT(false); // only stride equal to kernel size is supported
-    }
-}
-
 // ggml_compute_forward_pool_1d_sk_p0

 static void ggml_compute_forward_pool_1d_sk_p0(
@@ -15712,17 +15738,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            // necessary for llama
            if (src0->grad) {
                assert(src1->type == GGML_TYPE_I32);
-                assert(ggml_nelements(src1) ==
+                assert(ggml_nelements(src1) == 6);
                const int n_past = ((int32_t *) src1->data)[0];
                const int n_dims = ((int32_t *) src1->data)[1];
                const int mode   = ((int32_t *) src1->data)[2];
+                const int n_ctx  = ((int32_t *) src1->data)[3];
                src0->grad = ggml_add_impl(ctx,
                        src0->grad,
                        ggml_rope_back(ctx,
                            tensor->grad,
                            n_past,
                            n_dims,
-                            mode
+                            mode,
+                            n_ctx),
                        inplace);
            }
            if (src1->grad) {
@@ -16293,8 +16321,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            if (GGML_OP_HAS_FINALIZE[node->op]) {
                params.nth = n_tasks_arr[node_n];
                ggml_compute_forward(&params, node);
-                ggml_graph_compute_perf_stats_node(node, state->shared);
            }
+            ggml_graph_compute_perf_stats_node(node, state->shared);
        }

        // distribute new work or execute it direct if 1T
@@ -16324,8 +16352,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            if (GGML_OP_HAS_FINALIZE[node->op]) {
                params.type = GGML_TASK_FINALIZE;
                ggml_compute_forward(&params, node);
-                ggml_graph_compute_perf_stats_node(node, state->shared);
            }
+
+            ggml_graph_compute_perf_stats_node(node, state->shared);
        } else {
            break;
        }
@@ -16575,19 +16604,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                    const int64_t ne11 = node->src[1]->ne[1]; // H
                    const int64_t ne12 = node->src[1]->ne[2]; // C

+                    const int64_t ne0 = node->ne[0];
+                    const int64_t ne1 = node->ne[1];
+                    const int64_t ne2 = node->ne[2];
                    const int64_t nk = ne00*ne01;
+                    const int64_t ew0 = nk * ne02;

-                    UNUSED(ne02);
                    UNUSED(ne03);
-                    UNUSED(
+                    UNUSED(ne2);

                    size_t cur = 0;

                    if (node->src[0]->type == GGML_TYPE_F16 &&
-
-                        cur = sizeof(ggml_fp16_t)*(
+                        node->src[1]->type == GGML_TYPE_F32) {
+                        cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
-
+                               node->src[1]->type == GGML_TYPE_F32) {
                        cur = sizeof(float)* (ne10*ne11*ne12);
                    } else {
                        GGML_ASSERT(false);
@@ -16864,9 +16896,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
 }

 void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-    //assert(cgraph->work == NULL);
-    //assert(cgraph->work_size == 0);
-
    uint64_t size_eval = 0;

    // compute size of intermediate results
@@ -17305,9 +17334,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {

    GGML_PRINT("=== GRAPH ===\n");

-    GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
-    GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
-
    GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -1121,6 +1121,17 @@ extern "C" {
            int mode,
            int n_ctx);

+    // custom RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode,
+            int n_ctx,
+            float freq_base,
+            float freq_scale);
+
    // rotary position embedding backward, i.e compute dx from dy
    // a - dy
    GGML_API struct ggml_tensor * ggml_rope_back(
@@ -1128,7 +1139,8 @@ extern "C" {
            struct ggml_tensor * a,
            int n_past,
            int n_dims,
-            int mode
+            int mode,
+            int n_ctx);

    // alibi position embedding
    // in-place, returns view(a)
data/ext/llama_cpp/src/k_quants.h
CHANGED
@@ -15,6 +15,14 @@
 #define K_SCALE_SIZE 12
 #endif

+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
 //
 // Super-block quantization structures
 //