llama_cpp 0.3.3 → 0.3.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +146 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +485 -67
- data/ext/llama_cpp/src/ggml-metal.m +52 -43
- data/ext/llama_cpp/src/ggml-metal.metal +587 -470
- data/ext/llama_cpp/src/ggml.c +105 -79
- data/ext/llama_cpp/src/ggml.h +13 -1
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +123 -66
- data/ext/llama_cpp/src/llama.h +34 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +12 -1
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -31,11 +31,17 @@
 #include <unistd.h>
 #endif
 
+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
 #ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
+#endif
 
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
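Note: the block above backports `static_assert` for pre-C11 compilers (which also makes the Haiku special case removed below redundant). A minimal self-contained sketch of how the fallback behaves; only the macro itself comes from the patch, the rest is illustrative:

    #include <stdint.h>

    // With a C11 compiler the macro maps to _Static_assert and the condition
    // is checked at compile time; with an older compiler it expands to a
    // harmless no-op struct declaration, so the assertion is silently skipped.
    #ifndef static_assert
    #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
    #define static_assert(cond, msg) _Static_assert(cond, msg)
    #else
    #define static_assert(cond, msg) struct global_scope_noop_trick
    #endif
    #endif

    static_assert(sizeof(uint32_t) == 4, "uint32_t must be 4 bytes");

    int main(void) { return 0; }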
@@ -112,10 +118,6 @@ typedef void * thread_ret_t;
 #endif
 #endif
 
-#ifdef __HAIKU__
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#endif
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -4410,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) {
         if (&g_state.contexts[i].context == ctx) {
             g_state.contexts[i].used = false;
 
-            GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
-                    __func__, i, ctx->n_objects, ggml_used_mem(ctx));
+            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
+                    __func__, i, ggml_used_mem(ctx));
 
             if (ctx->mem_buffer_owned) {
                 GGML_ALIGNED_FREE(ctx->mem_buffer);
@@ -6955,6 +6957,8 @@ struct ggml_tensor * ggml_rope_impl(
         int                   n_dims,
         int                   mode,
         int                   n_ctx,
+        float                 freq_base,
+        float                 freq_scale,
         bool                  inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
@@ -6967,12 +6971,14 @@ struct ggml_tensor * ggml_rope_impl(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
     ((int32_t *) b->data)[3] = n_ctx;
+    memcpy((int32_t *) b->data + 4, &freq_base,  sizeof(float));
+    memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
 
     ggml_scratch_load(ctx);
 
@@ -6991,7 +6997,7 @@ struct ggml_tensor * ggml_rope(
         int                   n_dims,
         int                   mode,
         int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -7001,7 +7007,19 @@ struct ggml_tensor * ggml_rope_inplace(
         int                   n_dims,
         int                   mode,
         int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
+}
+
+struct ggml_tensor * ggml_rope_custom_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx,
+        float                 freq_base,
+        float                 freq_scale) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
 }
 
 // ggml_rope_back
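Together with the change to `ggml_rope`/`ggml_rope_inplace` above (both now forward the defaults `freq_base = 10000.0f`, `freq_scale = 1.0f`), the new `ggml_rope_custom_inplace` exposes RoPE frequency scaling for extended-context inference. A hedged usage sketch; the wrapper and the chosen values are illustrative, only the `ggml_rope_custom_inplace` signature comes from this diff:

    #include "ggml.h"

    // Illustrative: apply linearly-scaled RoPE to a query/key tensor, e.g. to
    // run a model trained on 2048-token contexts at 4096 tokens.
    static struct ggml_tensor * rope_scaled(struct ggml_context * ctx,
                                            struct ggml_tensor  * cur,
                                            int n_past, int n_rot, int mode) {
        const float freq_base  = 10000.0f; // keep the default base
        const float freq_scale = 0.5f;     // 2x linear position interpolation

        // with freq_base = 10000.0f and freq_scale = 1.0f this is equivalent
        // to plain ggml_rope_inplace(ctx, cur, n_past, n_rot, mode, 0)
        return ggml_rope_custom_inplace(ctx, cur, n_past, n_rot, mode,
                                        /*n_ctx=*/0, freq_base, freq_scale);
    }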
@@ -7011,7 +7029,8 @@ struct ggml_tensor * ggml_rope_back(
         struct ggml_tensor  * a,
         int                   n_past,
         int                   n_dims,
-        int                   mode) {
+        int                   mode,
+        int                   n_ctx) {
     GGML_ASSERT(n_past >= 0);
     GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
 
@@ -7025,12 +7044,13 @@ struct ggml_tensor * ggml_rope_back(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
     ggml_set_name(b, "n_past, n_dims, mode");
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
+    ((int32_t *) b->data)[3] = n_ctx;
 
     ggml_scratch_load(ctx);
 
@@ -10684,6 +10704,8 @@ static void ggml_compute_forward_mul_mat(
 
     const enum ggml_type type = src0->type;
 
+    const bool src1_cont = ggml_is_contiguous(src1);
+
     ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
     enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
@@ -10747,7 +10769,7 @@ static void ggml_compute_forward_mul_mat(
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
             if (type != GGML_TYPE_F32) {
-
+                float * const wdata = params->wdata;
                 ggml_to_float_t const to_float = type_traits[type].to_float;
 
                 size_t id = 0;
@@ -10805,7 +10827,7 @@ static void ggml_compute_forward_mul_mat(
     // src1 rows
     const int64_t nr1 = ne11*ne12*ne13;
 
-    void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
 
     for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
@@ -10828,7 +10850,15 @@ static void ggml_compute_forward_mul_mat(
         const int64_t i3 = i13;
 
         const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
-        const char * src1_col = (const char *) wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size;
+
+        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+        //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+        //       the original src1 data pointer, so we should index using the indices directly
+        // TODO: this is a bit of a hack, we should probably have a better way to handle this
+        const char * src1_col = (const char *) wdata +
+            (src1_cont || src1->type != vec_dot_type
+             ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
+             : (i11*nb11 + i12*nb12 + i13*nb13));
 
         float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
 
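A standalone illustration of the two offset formulas selected above (`ne*` are element counts and `nb*` byte strides, following ggml's naming; the sizes are made up):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t ne11 = 4, ne12 = 2;        // rows and batch of src1 (illustrative)
        const size_t  row_size = 32;             // bytes per converted row in wdata
        const size_t  nb11 = 48, nb12 = 192, nb13 = 384; // strides of a non-contiguous src1

        const int64_t i11 = 3, i12 = 1, i13 = 0; // indices of the row being consumed

        // contiguous (or converted-to-wdata) case: rows are densely packed,
        // so a flat row index times the row size is enough
        const size_t off_cont    = (size_t)(i11 + i12*ne11 + i13*ne12*ne11)*row_size;

        // non-contiguous case: walk the original tensor's byte strides
        const size_t off_strided = (size_t)i11*nb11 + (size_t)i12*nb12 + (size_t)i13*nb13;

        printf("contiguous: %zu bytes, strided: %zu bytes\n", off_cont, off_strided);
        return 0;
    }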
@@ -12062,16 +12092,21 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 4);
+    GGML_ASSERT(ggml_nelements(src1) == 6);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
+    float freq_base;
+    float freq_scale;
+
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
     const int n_ctx  = ((int32_t *) src1->data)[3];
+    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
 
     assert(n_past >= 0);
 
@@ -12100,7 +12135,7 @@ static void ggml_compute_forward_rope_f32(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
@@ -12112,7 +12147,7 @@ static void ggml_compute_forward_rope_f32(
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
 
-                float theta = (float)p;
+                float theta = freq_scale * (float)p;
 
                 if (is_glm) {
                     theta = MIN(p, n_ctx - 2);
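For reference on how the two new parameters enter the math: `freq_scale` rescales the token position once, while `freq_base` controls the geometric decay of the per-pair rotation frequency, theta_k(p) = freq_scale * p * freq_base^(-2k/n_dims). A standalone sketch mirroring the kernel's loop structure (values illustrative; compile with -lm):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const int   n_dims     = 8;        // rotated head dimensions (illustrative)
        const float freq_base  = 10000.0f; // default RoPE base
        const float freq_scale = 0.5f;     // e.g. 2x linear context extension
        const int   p          = 100;      // token position

        const float theta_scale = powf(freq_base, -2.0f/n_dims);

        float theta = freq_scale * (float)p;
        for (int i0 = 0; i0 < n_dims; i0 += 2) {
            // each pair of dimensions is rotated by angle theta
            printf("dims (%d,%d): cos=%.4f sin=%.4f\n", i0, i0 + 1, cosf(theta), sinf(theta));
            theta *= theta_scale;
        }
        return 0;
    }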
@@ -12189,16 +12224,21 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 4);
+    GGML_ASSERT(ggml_nelements(src1) == 6);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
+    float freq_base;
+    float freq_scale;
+
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
     const int n_ctx  = ((int32_t *) src1->data)[3];
+    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
 
     assert(n_past >= 0);
 
@@ -12227,7 +12267,7 @@ static void ggml_compute_forward_rope_f16(
     // row index used to determine which thread to use
    int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
@@ -12239,7 +12279,7 @@ static void ggml_compute_forward_rope_f16(
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
 
-                float theta = (float)p;
+                float theta = freq_scale * (float)p;
 
                 if (is_glm) {
                     theta = MIN(p, n_ctx - 2);
@@ -12300,7 +12340,7 @@ static void ggml_compute_forward_rope_f16(
                         const float x0 = GGML_FP16_TO_FP32(src[0]);
                         const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
 
-                        dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                         dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                     }
                 }
@@ -12339,7 +12379,7 @@ static void ggml_compute_forward_rope_back_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 3);
+    assert(ggml_nelements(src1) == 4);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -12982,12 +13022,13 @@ static void ggml_compute_forward_conv_1d(
     };
 }
 
-// ggml_compute_forward_conv_2d_sk_p0
+// ggml_compute_forward_conv_2d
 
-static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
+static void ggml_compute_forward_conv_2d_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
+        const struct ggml_tensor * opt0,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13007,28 +13048,37 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
     // size of the convolution row - the kernel size unrolled across all channels
     const int ew0 = nk0*nk1*ne02;
 
+    const int32_t s0 = ((const int32_t*)(opt0->data))[0];
+    const int32_t s1 = ((const int32_t*)(opt0->data))[1];
+    const int32_t p0 = ((const int32_t*)(opt0->data))[2];
+    const int32_t p1 = ((const int32_t*)(opt0->data))[3];
+    const int32_t d0 = ((const int32_t*)(opt0->data))[4];
+    const int32_t d1 = ((const int32_t*)(opt0->data))[5];
+
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
-        // TODO: fix this memset (wsize is overestimated)
         memset(params->wdata, 0, params->wsize);
 
         // prepare source data (src1)
         {
             ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
 
-            for (int i13 = 0; i13 < ne13; i13++) {
-                for (int i12 = 0; i12 < ne12; i12++) {
-                    const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
-                    ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
+            for (int i12 = 0; i12 < ne12; i12++) {
+                const float * const src = (float *)((char *) src1->data + i12*nb12);
+                ggml_fp16_t * dst_data = wdata;
+
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        for (int ik1 = 0; ik1 < nk1; ik1++) {
+                            for (int ik0 = 0; ik0 < nk0; ik0++) {
+                                const int idx0 = i0*s0 + ik0*d0 - p0;
+                                const int idx1 = i1*s1 + ik1*d1 - p1;
 
-                    for (int i1 = 0; i1 < ne1; i1++) {
-                        for (int i0 = 0; i0 < ne0; i0++) {
-                            for (int ik1 = 0; ik1 < nk1; ik1++) {
-                                for (int ik0 = 0; ik0 < nk0; ik0++) {
+                                if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
                                     dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
-                                        GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
+                                        GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
                                 }
                             }
                         }
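The `idx0`/`idx1` formulas above are standard im2col sampling with stride, padding and dilation; taps that land outside the input stay at the zeros written by the earlier `memset`. A 1-D standalone sketch of the same indexing (sizes illustrative):

    #include <stdio.h>

    int main(void) {
        const int ne10 = 8; // input width (illustrative)
        const int nk0  = 3; // kernel width
        const int s0   = 1; // stride
        const int p0   = 1; // padding
        const int d0   = 1; // dilation

        // standard convolution output-size formula
        const int ne0 = (ne10 + 2*p0 - d0*(nk0 - 1) - 1)/s0 + 1;
        printf("output width: %d\n", ne0);

        for (int i0 = 0; i0 < ne0; i0++) {
            for (int ik0 = 0; ik0 < nk0; ik0++) {
                const int idx0 = i0*s0 + ik0*d0 - p0; // same formula as the patch
                if (idx0 < 0 || idx0 >= ne10) {
                    printf("out %d, tap %d -> zero padding\n", i0, ik0);
                } else {
                    printf("out %d, tap %d -> input %d\n", i0, ik0, idx0);
                }
            }
        }
        return 0;
    }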
@@ -13071,19 +13121,21 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
         }
     }
 
-static void ggml_compute_forward_conv_2d_sk_p0(
+static void ggml_compute_forward_conv_2d(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst
+        ) {
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
+                ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
             } break;
         case GGML_TYPE_F32:
             {
-                //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
+                //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
                 GGML_ASSERT(false);
             } break;
         default:
@@ -13093,32 +13145,6 @@ static void ggml_compute_forward_conv_2d_sk_p0(
     }
 }
 
-// ggml_compute_forward_conv_2d
-
-static void ggml_compute_forward_conv_2d(
-    const struct ggml_compute_params* params,
-    const struct ggml_tensor* src0,
-    const struct ggml_tensor* src1,
-    const struct ggml_tensor* opt0,
-    struct ggml_tensor* dst) {
-    const int32_t s0 = ((const int32_t*)(opt0->data))[0];
-    const int32_t s1 = ((const int32_t*)(opt0->data))[1];
-    const int32_t p0 = ((const int32_t*)(opt0->data))[2];
-    const int32_t p1 = ((const int32_t*)(opt0->data))[3];
-    const int32_t d0 = ((const int32_t*)(opt0->data))[4];
-    const int32_t d1 = ((const int32_t*)(opt0->data))[5];
-    GGML_ASSERT(d0 == 1); // dilation not supported
-    GGML_ASSERT(d1 == 1);
-    GGML_ASSERT(p0 == 0); // padding not supported
-    GGML_ASSERT(p1 == 0);
-
-    if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
-        ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
-    } else {
-        GGML_ASSERT(false); // only stride equal to kernel size is supported
-    }
-}
-
 // ggml_compute_forward_pool_1d_sk_p0
 
 static void ggml_compute_forward_pool_1d_sk_p0(
@@ -15712,17 +15738,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 // necessary for llama
                 if (src0->grad) {
                     assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) == 4);
+                    assert(ggml_nelements(src1) == 6);
                     const int n_past = ((int32_t *) src1->data)[0];
                     const int n_dims = ((int32_t *) src1->data)[1];
                     const int mode   = ((int32_t *) src1->data)[2];
+                    const int n_ctx  = ((int32_t *) src1->data)[3];
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
                             ggml_rope_back(ctx,
                                 tensor->grad,
                                 n_past,
                                 n_dims,
-                                mode),
+                                mode,
+                                n_ctx),
                             inplace);
                 }
                 if (src1->grad) {
@@ -16293,8 +16321,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
                     params.nth = n_tasks_arr[node_n];
                     ggml_compute_forward(&params, node);
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
                 }
+                ggml_graph_compute_perf_stats_node(node, state->shared);
             }
 
             // distribute new work or execute it direct if 1T
@@ -16324,8 +16352,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             if (GGML_OP_HAS_FINALIZE[node->op]) {
                 params.type = GGML_TASK_FINALIZE;
                 ggml_compute_forward(&params, node);
-                ggml_graph_compute_perf_stats_node(node, state->shared);
             }
+
+            ggml_graph_compute_perf_stats_node(node, state->shared);
         } else {
             break;
         }
@@ -16575,19 +16604,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         const int64_t ne11 = node->src[1]->ne[1]; // H
                         const int64_t ne12 = node->src[1]->ne[2]; // C
 
+                        const int64_t ne0 = node->ne[0];
+                        const int64_t ne1 = node->ne[1];
+                        const int64_t ne2 = node->ne[2];
                         const int64_t nk = ne00*ne01;
+                        const int64_t ew0 = nk * ne02;
 
-                        UNUSED(ne02);
                         UNUSED(ne03);
-                        UNUSED(nk);
+                        UNUSED(ne2);
 
                         size_t cur = 0;
 
                         if (node->src[0]->type == GGML_TYPE_F16 &&
-                                node->src[1]->type == GGML_TYPE_F32) {
-                            cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
+                            node->src[1]->type == GGML_TYPE_F32) {
+                            cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
                         } else if (node->src[0]->type == GGML_TYPE_F32 &&
-                                node->src[1]->type == GGML_TYPE_F32) {
+                                   node->src[1]->type == GGML_TYPE_F32) {
                             cur = sizeof(float)* (ne10*ne11*ne12);
                         } else {
                             GGML_ASSERT(false);
@@ -16864,9 +16896,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
 }
 
 void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-    //assert(cgraph->work == NULL);
-    //assert(cgraph->work_size == 0);
-
     uint64_t size_eval = 0;
 
     // compute size of intermediate results
@@ -17305,9 +17334,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
 
     GGML_PRINT("=== GRAPH ===\n");
 
-    GGML_PRINT_DEBUG("n_threads       = %d\n",        cgraph->n_threads);
-    GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
-
     GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -1121,6 +1121,17 @@ extern "C" {
             int                   mode,
             int                   n_ctx);
 
+    // custom RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale);
+
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(
@@ -1128,7 +1139,8 @@ extern "C" {
             struct ggml_tensor  * a,
             int                   n_past,
             int                   n_dims,
-            int                   mode);
+            int                   mode,
+            int                   n_ctx);
 
     // alibi position embedding
     // in-place, returns view(a)
data/ext/llama_cpp/src/k_quants.h
CHANGED
@@ -15,6 +15,14 @@
 #define K_SCALE_SIZE 12
 #endif
 
+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
 //
 // Super-block quantization structures
 //