llama_cpp 0.10.1 → 0.10.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +16 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +284 -162
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +190 -44
- data/ext/llama_cpp/src/ggml-metal.metal +11 -2
- data/ext/llama_cpp/src/ggml.c +262 -89
- data/ext/llama_cpp/src/ggml.h +24 -10
- data/ext/llama_cpp/src/llama.cpp +926 -780
- data/ext/llama_cpp/src/llama.h +8 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -2383,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
 size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
     size_t max_size = 0;
 
-    struct ggml_object * obj = ctx->objects_begin;
-
-    while (obj != NULL) {
-        if (obj->type == GGML_OBJECT_TENSOR) {
-            struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
-
-            const size_t size = ggml_nbytes(tensor);
-
-            if (max_size < size) {
-                max_size = size;
-            }
-        }
-
-        obj = obj->next;
+    for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
+        max_size = MAX(max_size, ggml_nbytes(tensor));
     }
 
     return max_size;
@@ -3093,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }
 
-struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
     struct ggml_object * obj = ctx->objects_begin;
 
     char * const mem_buffer = ctx->mem_buffer;
@@ -3109,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
     return NULL;
 }
 
-struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
     struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
     obj = obj->next;
 
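Both iterator helpers are now const-qualified on the context, so read-only inspection code can walk a context without casting. A minimal sketch of that pattern (the context and its tensors are assumed to be created elsewhere):

```c
#include <stdio.h>
#include "ggml.h"

// Print the name and byte size of every tensor allocated in a context.
// Works on a const context now that the iterators accept one.
static void print_tensors(const struct ggml_context * ctx) {
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx);
         t != NULL;
         t = ggml_get_next_tensor(ctx, t)) {
        printf("%-32s %zu bytes\n", t->name, ggml_nbytes(t));
    }
}
```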
@@ -4098,6 +4086,14 @@ struct ggml_tensor * ggml_mul_mat(
     return result;
 }
 
+void ggml_mul_mat_set_prec(
+        struct ggml_tensor * a,
+        enum ggml_prec       prec) {
+    const int32_t prec_i32 = (int32_t) prec;
+
+    ggml_set_op_params_i32(a, 0, prec_i32);
+}
+
 // ggml_mul_mat_id
 
 struct ggml_tensor * ggml_mul_mat_id(
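The new ggml_mul_mat_set_prec stores the requested precision in the destination tensor's op params; backends (notably CUDA) can use it to force FP32 accumulation for a specific mat-mul node. A hedged usage sketch (the context and input tensors are assumed to exist; GGML_PREC_F32 comes from the ggml.h updated in this release):

```c
#include "ggml.h"

// Build a mat-mul node and request full FP32 accumulation for it.
static struct ggml_tensor * mul_mat_f32_prec(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    struct ggml_tensor * out = ggml_mul_mat(ctx, a, b);
    ggml_mul_mat_set_prec(out, GGML_PREC_F32); // default is GGML_PREC_DEFAULT
    return out;
}
```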
@@ -4175,23 +4171,23 @@ struct ggml_tensor * ggml_out_prod(
 static struct ggml_tensor * ggml_scale_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b,
+        float s,
         bool inplace) {
-    GGML_ASSERT(ggml_is_scalar(b));
     GGML_ASSERT(ggml_is_padded_1d(a));
 
     bool is_node = false;
 
-    if (a->grad || b->grad) {
+    if (a->grad) {
         is_node = true;
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    ggml_set_op_params(result, &s, sizeof(s));
+
     result->op   = GGML_OP_SCALE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = b;
 
     return result;
 }
@@ -4199,15 +4195,15 @@ static struct ggml_tensor * ggml_scale_impl(
 struct ggml_tensor * ggml_scale(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_scale_impl(ctx, a, b, false);
+        float s) {
+    return ggml_scale_impl(ctx, a, s, false);
 }
 
 struct ggml_tensor * ggml_scale_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_scale_impl(ctx, a, b, true);
+        float s) {
+    return ggml_scale_impl(ctx, a, s, true);
 }
 
 // ggml_set
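With this change ggml_scale and ggml_scale_inplace take the factor as a plain float instead of a 1-element tensor, so callers no longer allocate a scalar tensor just to scale. A sketch of the call-site difference (the tensor x is assumed to exist in ctx):

```c
#include <math.h>
#include "ggml.h"

// 0.10.1 and earlier: the factor had to be a scalar tensor, e.g.
//   ggml_scale(ctx, x, ggml_new_f32(ctx, 1.0f/sqrtf(d)));
// 0.10.2: the factor is passed directly as a float.
static struct ggml_tensor * scale_by_inv_sqrt_d(
        struct ggml_context * ctx,
        struct ggml_tensor  * x,
        float d) {
    return ggml_scale(ctx, x, 1.0f/sqrtf(d));
}
```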
@@ -9168,6 +9164,8 @@ static void ggml_compute_forward_norm_f32(
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
+    GGML_ASSERT(eps > 0.0f);
+
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9237,6 +9235,8 @@ static void ggml_compute_forward_rms_norm_f32(
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
+    GGML_ASSERT(eps > 0.0f);
+
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9580,16 +9580,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif
 
-// off1 = offset in i11 and i1
-// cne1 = ne11 and ne1
-// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
-// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
 static void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-              struct ggml_tensor * dst,
-              int64_t off1, int64_t cne1) {
+              struct ggml_tensor * dst) {
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
@@ -9657,9 +9652,9 @@ static void ggml_compute_forward_mul_mat(
             const int64_t i03 = i13/r3;
             const int64_t i02 = i12/r2;
 
-            const void  * x = (char *)            src0->data + i02*nb02          + i03*nb03;
-            const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
-                  float * d = (float *) ((char *)  dst->data + off1*nb1  + i12*nb2  + i13*nb3);
+            const void  * x = (char *)            src0->data + i02*nb02 + i03*nb03;
+            const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+                  float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);
 
             if (type != GGML_TYPE_F32) {
                 float * const wdata = params->wdata;
@@ -9676,7 +9671,7 @@ static void ggml_compute_forward_mul_mat(
             }
 
             cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                    cne1, ne01, ne10,
+                    ne1, ne01, ne10,
                     1.0f,    y, ne10,
                              x, ne00,
                     0.0f,    d, ne01);
@@ -9717,8 +9712,8 @@ static void ggml_compute_forward_mul_mat(
     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
-    const int64_t nr0 = ne01;
-    const int64_t nr1 = cne1*ne12*ne13;
+    const int64_t nr0 = ne01;          // src0 rows
+    const int64_t nr1 = ne1*ne12*ne13; // src1 rows
 
     //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
@@ -9760,9 +9755,9 @@ static void ggml_compute_forward_mul_mat(
     for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
         for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
             for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                const int64_t i13 = (ir1/(ne12*cne1));
-                const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
-                const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
+                const int64_t i13 = (ir1/(ne12*ne1));
+                const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
+                const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
 
                 // broadcast src0 into src1
                 const int64_t i03 = i13/r3;
@@ -9802,28 +9797,191 @@ static void ggml_compute_forward_mul_mat(
 
 static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
+        const struct ggml_tensor * ids,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
 
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
-        ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
-        return;
-    }
+    const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+
+    const bool src1_cont = ggml_is_contiguous(src1);
 
-    const struct ggml_tensor * ids = src0;
+    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
+    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
+    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    // row groups
     const int id   = ggml_get_op_params_i32(dst, 0);
     const int n_as = ggml_get_op_params_i32(dst, 1);
 
-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-        const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+    char * wdata_src1_end = (src1->type == vec_dot_type) ?
+            (char *) params->wdata :
+            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    int64_t * matrix_rows       = matrix_row_counts + n_as;     // [n_as][ne11]
+
+    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+
+    if (params->type == GGML_TASK_INIT) {
+        char * wdata = params->wdata;
+        if (src1->type != vec_dot_type) {
+            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+            assert(params->wsize >= ne11*ne12*ne13*row_size);
+            assert(src1->type == GGML_TYPE_F32);
+
+            for (int64_t i13 = 0; i13 < ne13; ++i13) {
+                for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                    for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                        from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
+                        wdata += row_size;
+                    }
+                }
+            }
+        }
+
+        // initialize matrix_row_counts
+        GGML_ASSERT(wdata == wdata_src1_end);
+        memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
+
+        // group rows by src0 matrix
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+
+            GGML_ASSERT(row_id >= 0 && row_id < n_as);
+            MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
+            matrix_row_counts[row_id] += 1;
+        }
+
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    // compute each matrix multiplication in sequence
+    for (int cur_a = 0; cur_a < n_as; ++cur_a) {
+        const int64_t cne1 = matrix_row_counts[cur_a];
+
+        if (cne1 == 0) {
+            continue;
+        }
+
+        const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+
+        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+        const int64_t nr0 = ne01;           // src0 rows
+        const int64_t nr1 = cne1*ne12*ne13; // src1 rows
+
+        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+
+        // distribute the thread work across the inner or outer loop based on which one is larger
+
+        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+
+        const int64_t ith0 = ith % nth0;
+        const int64_t ith1 = ith / nth0;
+
+        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+        const int64_t ir010 = dr0*ith0;
+        const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+        const int64_t ir110 = dr1*ith1;
+        const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+        // threads with no work simply yield (not sure if it helps)
+        if (ir010 >= ir011 || ir110 >= ir111) {
+            sched_yield();
+            continue;
+        }
+
+        assert(ne12 % ne02 == 0);
+        assert(ne13 % ne03 == 0);
+
+        // block-tiling attempt
+        const int64_t blck_0 = 16;
+        const int64_t blck_1 = 16;
+
+        // attempt to reduce false-sharing (does not seem to make a difference)
+        float tmp[16];
+
+        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                    const int64_t  i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
+                    const int64_t  i12 = (ir1 - i13*ne12*cne1)/cne1;
+                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
+                    const int64_t  i11 = MMID_MATRIX_ROW(cur_a, _i11);
 
-        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+                    // broadcast src0 into src1
+                    const int64_t i03 = i13/r3;
+                    const int64_t i02 = i12/r2;
+
+                    const int64_t i1 = i11;
+                    const int64_t i2 = i12;
+                    const int64_t i3 = i13;
 
-        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+                    const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
 
-        ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
-    }
+                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                    //       the original src1 data pointer, so we should index using the indices directly
+                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                    const char * src1_col = (const char *) wdata +
+                        (src1_cont || src1->type != vec_dot_type
+                         ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
+                         : (i11*nb11 + i12*nb12 + i13*nb13));
+
+                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                    //}
+
+                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                        vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                    }
+                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+                }
+            }
+        }
     }
+
+    #undef MMID_MATRIX_ROW
 }
 
 // ggml_compute_forward_out_prod
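The rewritten ggml_compute_forward_mul_mat_id no longer forwards each src1 row to ggml_compute_forward_mul_mat individually: during GGML_TASK_INIT it buckets the rows by the expert matrix selected in ids (matrix_row_counts / MMID_MATRIX_ROW), then runs one threaded matrix multiplication per expert over its group of rows. The row-grouping step in isolation, as a standalone sketch (expert count, row count, and ids below are made-up example values, not library code):

```c
#include <stdint.h>
#include <stdio.h>

#define N_AS   4  // number of expert matrices (example value)
#define N_ROWS 8  // number of src1 rows (example value)

int main(void) {
    // expert chosen for each src1 row, analogous to reading ids in the kernel
    const int32_t row_ids[N_ROWS] = {2, 0, 2, 1, 0, 3, 2, 1};

    int64_t matrix_row_counts[N_AS] = {0};
    int64_t matrix_rows[N_AS][N_ROWS]; // matrix_rows[a] lists the rows routed to expert a

    for (int64_t i = 0; i < N_ROWS; ++i) {
        const int32_t a = row_ids[i];
        matrix_rows[a][matrix_row_counts[a]++] = i;
    }

    // each expert would now be multiplied once against its own rows
    for (int a = 0; a < N_AS; ++a) {
        printf("expert %d: %lld row(s)\n", a, (long long) matrix_row_counts[a]);
    }
    return 0;
}
```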
@@ -10167,19 +10325,17 @@ static void ggml_compute_forward_out_prod(
 static void ggml_compute_forward_scale_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
     // scale factor
-    const float v = *(float *) src1->data;
+    const float v = *(float *) dst->op_params;
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -10210,12 +10366,11 @@ static void ggml_compute_forward_scale_f32(
 static void ggml_compute_forward_scale(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_scale_f32(params, src0, src1, dst);
+                ggml_compute_forward_scale_f32(params, src0, dst);
             } break;
         default:
             {
@@ -11404,10 +11559,13 @@ static void ggml_compute_forward_rope_f32(
             }
         } else {
             // TODO: this might be wrong for ne0 != n_dims - need double check
-            //
+            // it seems we have to rope just the first n_dims elements and do nothing with the rest
+            // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
             theta_base *= freq_scale;
-            for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 0; ic < ne0; ic += 2) {
+                if (ic < n_dims) {
+                    const int64_t ib = 0;
+
                     // simplified from `(ib * n_dims + ic) * inv_ndims`
                     float cur_rot = inv_ndims * ic - ib;
 
@@ -11430,6 +11588,14 @@ static void ggml_compute_forward_rope_f32(
 
                     dst_data[0]        = x0*cos_theta - x1*sin_theta;
                     dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                } else {
+                    const int64_t i0 = ic;
+
+                    const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                    float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                    dst_data[0] = src[0];
+                    dst_data[1] = src[1];
                 }
             }
         }
@@ -11557,10 +11723,13 @@ static void ggml_compute_forward_rope_f16(
             }
         } else {
             // TODO: this might be wrong for ne0 != n_dims - need double check
-            //
+            // it seems we have to rope just the first n_dims elements and do nothing with the rest
+            // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
             theta_base *= freq_scale;
-            for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 0; ic < ne0; ic += 2) {
+                if (ic < n_dims) {
+                    const int64_t ib = 0;
+
                     // simplified from `(ib * n_dims + ic) * inv_ndims`
                     float cur_rot = inv_ndims * ic - ib;
 
@@ -11583,6 +11752,14 @@ static void ggml_compute_forward_rope_f16(
 
                     dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                     dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                } else {
+                    const int64_t i0 = ic;
+
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                    ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                    dst_data[0] = src[0];
+                    dst_data[1] = src[1];
                 }
             }
         }
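In the neox-style branch, positions at or beyond n_dims are now copied through to dst instead of being left untouched, so rows with ne0 != n_dims are fully written. A simplified standalone sketch of that behaviour on a single row of floats (the rotation pairing and angle computation are deliberately simplified here; only the pass-through for ic >= n_dims mirrors the new kernel):

```c
#include <math.h>
#include <stdint.h>

// Rotate only the first n_dims elements of a row, copy the rest unchanged.
static void rope_row_partial(const float * src, float * dst,
                             int64_t ne0, int64_t n_dims, float theta) {
    for (int64_t ic = 0; ic < ne0; ic += 2) {
        if (ic < n_dims) {
            const float c  = cosf(theta * (float) ic);
            const float s  = sinf(theta * (float) ic);
            const float x0 = src[ic + 0];
            const float x1 = src[ic + 1];
            dst[ic + 0] = x0*c - x1*s;
            dst[ic + 1] = x0*s + x1*c;
        } else {
            dst[ic + 0] = src[ic + 0]; // dims beyond n_dims pass through
            dst[ic + 1] = src[ic + 1];
        }
    }
}
```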
@@ -14191,7 +14368,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
         } break;
         case GGML_OP_MUL_MAT:
             {
-                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
+                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -14203,7 +14380,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
         } break;
         case GGML_OP_SCALE:
             {
-                ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
+                ggml_compute_forward_scale(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_SET:
             {
@@ -14659,7 +14836,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
 
 static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
     if (ggml_hash_contains(zero_table, a)) {
-        struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
+        struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
         return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
     } else {
         return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
@@ -14795,7 +14972,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
                         src0->grad,
                         ggml_scale(ctx,
                             ggml_mul(ctx, src0, tensor->grad),
-                            ggml_new_f32(ctx, 2.0f)),
+                            2.0f),
                         zero_table);
             }
         } break;
@@ -14809,7 +14986,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
                         ggml_div(ctx,
                             tensor->grad,
                             tensor),
-                        ggml_new_f32(ctx, 0.5f)),
+                        0.5f),
                     zero_table);
             }
         } break;
@@ -14975,17 +15152,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
             {
                 // necessary for llama
                 if (src0->grad) {
+                    const float s = ((float *) tensor->op_params)[0];
+
                     src0->grad =
                         ggml_add_or_set(ctx,
                             src0->grad,
-                            ggml_scale_impl(ctx, tensor->grad, src1, false),
-                            zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                            src1->grad,
-                            ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)),
+                            ggml_scale_impl(ctx, tensor->grad, s, false),
                             zero_table);
                 }
             } break;
@@ -15163,6 +15335,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
                 const int n_past = ((int32_t *) tensor->op_params)[0];
                 src0->grad =
                     ggml_add_or_set(ctx, src0->grad,
+                        /* ggml_diag_mask_inf_impl() shouldn't be here */
+                        /* ref:  https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
                         ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
                         zero_table);
             }
@@ -15991,7 +16165,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
-                // FIXME: blas
                 n_tasks = n_threads;
             } break;
         case GGML_OP_OUT_PROD:
@@ -16325,20 +16498,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
-                const struct ggml_tensor * a = node->src[2];
-                const struct ggml_tensor * b = node->src[1];
-                const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
-                    if (a->type != GGML_TYPE_F32) {
-                        // here we need memory just for single 2D matrix from src0
-                        cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
-                    }
-                } else
-#endif
-                if (b->type != vec_dot_type) {
-                    cur = ggml_row_size(vec_dot_type, ggml_nelements(b));
+                const struct ggml_tensor * src0 = node->src[2];
+                const struct ggml_tensor * src1 = node->src[1];
+                const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
+                if (src1->type != vec_dot_type) {
+                    cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
                 }
+                const int n_as = ggml_get_op_params_i32(node, 1);
+                cur = GGML_PAD(cur, sizeof(int64_t));        // align
+                cur += n_as * sizeof(int64_t);               // matrix_row_counts
+                cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
             } break;
         case GGML_OP_OUT_PROD:
             {
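The MUL_MAT_ID work-buffer estimate now also reserves room for the row-group bookkeeping: the running size is aligned to int64_t, then n_as counters plus n_as * ne11 row indices are added. A worked example with made-up sizes (not taken from a real model):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t n_as = 8;  // number of expert matrices (example value)
    const int64_t ne11 = 32; // number of src1 rows (example value)

    size_t cur = 0;                                  // bytes already reserved for src1 conversion
    cur += (size_t)  n_as         * sizeof(int64_t); // matrix_row_counts: 8 * 8      =   64 bytes
    cur += (size_t) (n_as * ne11) * sizeof(int64_t); // matrix_rows:       8 * 32 * 8 = 2048 bytes

    printf("extra MUL_MAT_ID work buffer: %zu bytes\n", cur); // 2112 bytes
    return 0;
}
```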
@@ -19026,6 +19195,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }
 
+enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
+    return ctx->infos[i].type;
+}
+
 // returns the index
 static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     const int idx = gguf_find_key(ctx, key);