llama_cpp 0.10.1 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +16 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +284 -162
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +190 -44
- data/ext/llama_cpp/src/ggml-metal.metal +11 -2
- data/ext/llama_cpp/src/ggml.c +262 -89
- data/ext/llama_cpp/src/ggml.h +24 -10
- data/ext/llama_cpp/src/llama.cpp +926 -780
- data/ext/llama_cpp/src/llama.h +8 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -2383,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
 size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
     size_t max_size = 0;
 
-    struct ggml_object * obj = ctx->objects_begin;
-
-    while (obj != NULL) {
-        if (obj->type == GGML_OBJECT_TENSOR) {
-            struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
-
-            const size_t size = ggml_nbytes(tensor);
-
-            if (max_size < size) {
-                max_size = size;
-            }
-        }
-
-        obj = obj->next;
+    for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
+        max_size = MAX(max_size, ggml_nbytes(tensor));
     }
 
     return max_size;
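The rewritten loop leans on the tensor-iteration helpers that the next two hunks make const-correct. A minimal sketch of the same traversal pattern (the helper name `print_tensor_sizes` is illustrative, not part of the package):

```c
#include <stdio.h>
#include "ggml.h"

// Walk every tensor allocated in a context and report its size, using the same
// ggml_get_first_tensor()/ggml_get_next_tensor() iteration as the new loop above.
static void print_tensor_sizes(const struct ggml_context * ctx) {
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        printf("%-32s %zu bytes\n", t->name, ggml_nbytes(t));
    }
}
```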
@@ -3093,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }
 
-struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
     struct ggml_object * obj = ctx->objects_begin;
 
     char * const mem_buffer = ctx->mem_buffer;
@@ -3109,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
     return NULL;
 }
 
-struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
     struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
     obj = obj->next;
 
@@ -4098,6 +4086,14 @@ struct ggml_tensor * ggml_mul_mat(
     return result;
 }
 
+void ggml_mul_mat_set_prec(
+        struct ggml_tensor * a,
+        enum ggml_prec       prec) {
+    const int32_t prec_i32 = (int32_t) prec;
+
+    ggml_set_op_params_i32(a, 0, prec_i32);
+}
+
 // ggml_mul_mat_id
 
 struct ggml_tensor * ggml_mul_mat_id(
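`ggml_mul_mat_set_prec` stores the requested precision in the result tensor's op_params so that backends can pick it up. A hedged usage sketch (it assumes the `GGML_PREC_F32` enumerator added to ggml.h in this same release; `ctx`, `k`, and `q` are placeholders):

```c
#include "ggml.h"

// Ask for FP32 accumulation on a single matmul, e.g. attention scores that are
// sensitive to FP16 overflow. The request is recorded via ggml_set_op_params_i32().
static struct ggml_tensor * attn_scores_f32(struct ggml_context * ctx,
                                            struct ggml_tensor * k,
                                            struct ggml_tensor * q) {
    struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
    ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
    return kq;
}
```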
@@ -4175,23 +4171,23 @@ struct ggml_tensor * ggml_out_prod(
 static struct ggml_tensor * ggml_scale_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b,
+        float s,
         bool inplace) {
-    GGML_ASSERT(ggml_is_scalar(b));
     GGML_ASSERT(ggml_is_padded_1d(a));
 
     bool is_node = false;
 
-    if (a->grad || b->grad) {
+    if (a->grad) {
         is_node = true;
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    ggml_set_op_params(result, &s, sizeof(s));
+
     result->op = GGML_OP_SCALE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
-    result->src[1] = b;
 
     return result;
 }
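Instead of a second source tensor, the scalar now rides along in the node's `op_params`, which is what the CPU kernel reads back later in this diff (`ggml_compute_forward_scale_f32`). A small illustration of that round trip (not a public API; `ggml_set_op_params` is an internal helper in ggml.c):

```c
#include <string.h>
#include "ggml.h"

// Sketch: the scale factor is copied into the op_params scratch area of the result
// node and read back by the compute kernel, so no extra tensor is allocated.
static void stash_and_read_scale(struct ggml_tensor * result) {
    const float s = 0.125f;
    memcpy(result->op_params, &s, sizeof(s));   // what ggml_set_op_params(result, &s, sizeof(s)) boils down to

    float v;
    memcpy(&v, result->op_params, sizeof(v));   // what ggml_compute_forward_scale_f32() does with dst->op_params
    (void) v;                                   // v == 0.125f
}
```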
@@ -4199,15 +4195,15 @@ static struct ggml_tensor * ggml_scale_impl(
 struct ggml_tensor * ggml_scale(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_scale_impl(ctx, a, b, false);
+        float s) {
+    return ggml_scale_impl(ctx, a, s, false);
 }
 
 struct ggml_tensor * ggml_scale_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_scale_impl(ctx, a, b, true);
+        float s) {
+    return ggml_scale_impl(ctx, a, s, true);
 }
 
 // ggml_set
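For callers, the visible change is that the scale factor is now a plain `float` rather than a one-element tensor. A call-site sketch under that assumption (`ctx`, `kq`, and the head size are placeholders):

```c
#include <math.h>
#include "ggml.h"

static struct ggml_tensor * scale_attn_scores(struct ggml_context * ctx,
                                              struct ggml_tensor * kq,
                                              int n_embd_head) {
    // 0.10.1 style: ggml_scale(ctx, kq, ggml_new_f32(ctx, 1.0f/sqrtf((float) n_embd_head)));
    // 0.10.2 style: pass the float directly
    return ggml_scale(ctx, kq, 1.0f/sqrtf((float) n_embd_head));
}
```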
@@ -9168,6 +9164,8 @@ static void ggml_compute_forward_norm_f32(
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
+    GGML_ASSERT(eps > 0.0f);
+
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9237,6 +9235,8 @@ static void ggml_compute_forward_rms_norm_f32(
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
+    GGML_ASSERT(eps > 0.0f);
+
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
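Both normalization kernels now assert a strictly positive epsilon, so graphs that pass `0.0f` will trip the assert. A minimal valid call under that constraint (`ctx` and `cur` are placeholders):

```c
#include "ggml.h"

static struct ggml_tensor * rms_norm_example(struct ggml_context * ctx, struct ggml_tensor * cur) {
    return ggml_rms_norm(ctx, cur, 1e-5f); // eps must be > 0.0f as of 0.10.2
}
```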
@@ -9580,16 +9580,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif
 
-// off1 = offset in i11 and i1
-// cne1 = ne11 and ne1
-// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
-// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
 static void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-              struct ggml_tensor * dst,
-              int64_t off1, int64_t cne1) {
+              struct ggml_tensor * dst) {
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
@@ -9657,9 +9652,9 @@ static void ggml_compute_forward_mul_mat(
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
 
-                const void * x = (char *) src0->data +
-                const float * y = (float *) ((char *) src1->data +
-                float * d = (float *) ((char *) dst->data +
+                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
                     float * const wdata = params->wdata;
@@ -9676,7 +9671,7 @@ static void ggml_compute_forward_mul_mat(
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-
+                        ne1, ne01, ne10,
                         1.0f, y, ne10,
                         x, ne00,
                         0.0f, d, ne01);
@@ -9717,8 +9712,8 @@ static void ggml_compute_forward_mul_mat(
     const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
-    const int64_t nr0 = ne01;
-    const int64_t nr1 =
+    const int64_t nr0 = ne01; // src0 rows
+    const int64_t nr1 = ne1*ne12*ne13; // src1 rows
 
     //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
@@ -9760,9 +9755,9 @@ static void ggml_compute_forward_mul_mat(
     for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
         for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
             for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                const int64_t i13 = (ir1/(ne12*
-                const int64_t i12 = (ir1 - i13*ne12*
-                const int64_t i11 = (ir1 - i13*ne12*
+                const int64_t i13 = (ir1/(ne12*ne1));
+                const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
+                const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
 
                 // broadcast src0 into src1
                 const int64_t i03 = i13/r3;
@@ -9802,28 +9797,191 @@ static void ggml_compute_forward_mul_mat(
 
 static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor *
+        const struct ggml_tensor * ids,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
 
-
-
-
-
-
+    const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+
+    const bool src1_cont = ggml_is_contiguous(src1);
 
-    const
+    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
+    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
+    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    // row groups
     const int id = ggml_get_op_params_i32(dst, 0);
     const int n_as = ggml_get_op_params_i32(dst, 1);
 
-
-
+    char * wdata_src1_end = (src1->type == vec_dot_type) ?
+            (char *) params->wdata :
+            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    int64_t * matrix_rows       = matrix_row_counts + n_as;     // [n_as][ne11]
+
+    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+
+    if (params->type == GGML_TASK_INIT) {
+        char * wdata = params->wdata;
+        if (src1->type != vec_dot_type) {
+            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+            assert(params->wsize >= ne11*ne12*ne13*row_size);
+            assert(src1->type == GGML_TYPE_F32);
+
+            for (int64_t i13 = 0; i13 < ne13; ++i13) {
+                for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                    for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                        from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
+                        wdata += row_size;
+                    }
+                }
+            }
+        }
+
+        // initialize matrix_row_counts
+        GGML_ASSERT(wdata == wdata_src1_end);
+        memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
+
+        // group rows by src0 matrix
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+
+            GGML_ASSERT(row_id >= 0 && row_id < n_as);
+            MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
+            matrix_row_counts[row_id] += 1;
+        }
+
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    // compute each matrix multiplication in sequence
+    for (int cur_a = 0; cur_a < n_as; ++cur_a) {
+        const int64_t cne1 = matrix_row_counts[cur_a];
+
+        if (cne1 == 0) {
+            continue;
+        }
+
+        const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+
+        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+        const int64_t nr0 = ne01;           // src0 rows
+        const int64_t nr1 = cne1*ne12*ne13; // src1 rows
+
+        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+
+        // distribute the thread work across the inner or outer loop based on which one is larger
+
+        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+
+        const int64_t ith0 = ith % nth0;
+        const int64_t ith1 = ith / nth0;
+
+        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+        const int64_t ir010 = dr0*ith0;
+        const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+        const int64_t ir110 = dr1*ith1;
+        const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+        // threads with no work simply yield (not sure if it helps)
+        if (ir010 >= ir011 || ir110 >= ir111) {
+            sched_yield();
+            continue;
+        }
+
+        assert(ne12 % ne02 == 0);
+        assert(ne13 % ne03 == 0);
+
+        // block-tiling attempt
+        const int64_t blck_0 = 16;
+        const int64_t blck_1 = 16;
+
+        // attempt to reduce false-sharing (does not seem to make a difference)
+        float tmp[16];
+
+        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                    const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
+                    const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
+                    const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
+
+                    // broadcast src0 into src1
+                    const int64_t i03 = i13/r3;
+                    const int64_t i02 = i12/r2;
+
+                    const int64_t i1 = i11;
+                    const int64_t i2 = i12;
+                    const int64_t i3 = i13;
 
-
+                    const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
 
-
-
+                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                    //       the original src1 data pointer, so we should index using the indices directly
+                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                    const char * src1_col = (const char *) wdata +
+                        (src1_cont || src1->type != vec_dot_type
+                        ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
+                        : (i11*nb11 + i12*nb12 + i13*nb13));
+
+                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                    //}
+
+                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                        vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                    }
+                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+                }
+            }
+        }
     }
+
+    #undef MMID_MATRIX_ROW
 }
 
 // ggml_compute_forward_out_prod
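To summarize the new CPU path: during `GGML_TASK_INIT` the rows of `src1` are bucketed per expert, and the compute phase then multiplies each expert matrix against only its own rows. A hedged sketch of the scratch-buffer layout set up above (inferred from the code, not an API contract; the helper name is illustrative):

```c
#include <stdint.h>
#include <stddef.h>

// Informal sketch of the params->wdata layout built by the new mul_mat_id path:
// converted src1 rows come first, then the per-expert row bookkeeping.
static void mmid_scratch_layout(char * wdata, size_t src1_conv_size, int n_as, int64_t ne11) {
    // src1 converted to vec_dot_type occupies [0, src1_conv_size), padded to 8 bytes
    const size_t pad = (src1_conv_size + sizeof(int64_t) - 1) & ~(sizeof(int64_t) - 1);

    int64_t * matrix_row_counts = (int64_t *)(wdata + pad); // [n_as]       rows routed to each expert
    int64_t * matrix_rows       = matrix_row_counts + n_as; // [n_as][ne11] src1 row indices, grouped per expert

    // MMID_MATRIX_ROW(row_id, i1) == matrix_rows[row_id*ne11 + i1]
    (void) matrix_rows;
    (void) ne11;
}
```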
@@ -10167,19 +10325,17 @@ static void ggml_compute_forward_out_prod(
 static void ggml_compute_forward_scale_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
     // scale factor
-    const float v = *(float *) src1->data;
+    const float v = *(float *) dst->op_params;
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -10210,12 +10366,11 @@ static void ggml_compute_forward_scale_f32(
 static void ggml_compute_forward_scale(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_scale_f32(params, src0, src1, dst);
+                ggml_compute_forward_scale_f32(params, src0, dst);
             } break;
         default:
             {
@@ -11404,10 +11559,13 @@ static void ggml_compute_forward_rope_f32(
                 }
             } else {
                 // TODO: this might be wrong for ne0 != n_dims - need double check
-                //
+                // it seems we have to rope just the first n_dims elements and do nothing with the rest
+                // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
                 theta_base *= freq_scale;
-                for (int64_t
-
+                for (int64_t ic = 0; ic < ne0; ic += 2) {
+                    if (ic < n_dims) {
+                        const int64_t ib = 0;
+
                         // simplified from `(ib * n_dims + ic) * inv_ndims`
                         float cur_rot = inv_ndims * ic - ib;
 
@@ -11430,6 +11588,14 @@ static void ggml_compute_forward_rope_f32(
 
                         dst_data[0] = x0*cos_theta - x1*sin_theta;
                         dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                    } else {
+                        const int64_t i0 = ic;
+
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+                        dst_data[0] = src[0];
+                        dst_data[1] = src[1];
                     }
                 }
             }
@@ -11557,10 +11723,13 @@ static void ggml_compute_forward_rope_f16(
                 }
             } else {
                 // TODO: this might be wrong for ne0 != n_dims - need double check
-                //
+                // it seems we have to rope just the first n_dims elements and do nothing with the rest
+                // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
                 theta_base *= freq_scale;
-                for (int64_t
-
+                for (int64_t ic = 0; ic < ne0; ic += 2) {
+                    if (ic < n_dims) {
+                        const int64_t ib = 0;
+
                         // simplified from `(ib * n_dims + ic) * inv_ndims`
                         float cur_rot = inv_ndims * ic - ib;
 
@@ -11583,6 +11752,14 @@ static void ggml_compute_forward_rope_f16(
 
                         dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                         dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                    } else {
+                        const int64_t i0 = ic;
+
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+                        dst_data[0] = src[0];
+                        dst_data[1] = src[1];
                     }
                 }
             }
@@ -14191,7 +14368,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_MUL_MAT:
             {
-                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
+                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -14203,7 +14380,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_SCALE:
             {
-                ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
+                ggml_compute_forward_scale(params, tensor->src[0], tensor);
             } break;
         case GGML_OP_SET:
             {
@@ -14659,7 +14836,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg
 
 static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
     if (ggml_hash_contains(zero_table, a)) {
-        struct ggml_tensor * a_zero = ggml_scale(ctx, a,
+        struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
         return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
     } else {
         return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
@@ -14795,7 +14972,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         src0->grad,
                         ggml_scale(ctx,
                             ggml_mul(ctx, src0, tensor->grad),
-                            ggml_new_f32(ctx, 2.0f)),
+                            2.0f),
                         zero_table);
                 }
             } break;
@@ -14809,7 +14986,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             ggml_div(ctx,
                                 tensor->grad,
                                 tensor),
-                            ggml_new_f32(ctx, 0.5f)),
+                            0.5f),
                         zero_table);
                 }
             } break;
@@ -14975,17 +15152,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 // necessary for llama
                 if (src0->grad) {
+                    const float s = ((float *) tensor->op_params)[0];
+
                     src0->grad =
                         ggml_add_or_set(ctx,
                             src0->grad,
-                            ggml_scale_impl(ctx, tensor->grad, src1, false),
-                            zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                            src1->grad,
-                            ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)),
+                            ggml_scale_impl(ctx, tensor->grad, s, false),
                             zero_table);
                 }
             } break;
@@ -15163,6 +15335,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 const int n_past = ((int32_t *) tensor->op_params)[0];
                 src0->grad =
                     ggml_add_or_set(ctx, src0->grad,
+                        /* ggml_diag_mask_inf_impl() shouldn't be here */
+                        /* ref:  https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
                         ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
                     zero_table);
                 }
@@ -15991,7 +16165,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
-                // FIXME: blas
                 n_tasks = n_threads;
             } break;
         case GGML_OP_OUT_PROD:
@@ -16325,20 +16498,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_MUL_MAT_ID:
                 {
-                    const struct ggml_tensor * a = node->src[2];
-                    const struct ggml_tensor * b = node->src[1];
-                    const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
-
-
-                    if (a->type != GGML_TYPE_F32) {
-                        // here we need memory just for single 2D matrix from src0
-                        cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
-                    }
-                    } else
-#endif
-                    if (b->type != vec_dot_type) {
-                        cur = ggml_row_size(vec_dot_type, ggml_nelements(b));
+                    const struct ggml_tensor * src0 = node->src[2];
+                    const struct ggml_tensor * src1 = node->src[1];
+                    const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
+                    if (src1->type != vec_dot_type) {
+                        cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
                     }
+                    const int n_as = ggml_get_op_params_i32(node, 1);
+                    cur = GGML_PAD(cur, sizeof(int64_t)); // align
+                    cur += n_as * sizeof(int64_t); // matrix_row_counts
+                    cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
                 } break;
             case GGML_OP_OUT_PROD:
                 {
@@ -19026,6 +19195,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }
 
+enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
+    return ctx->infos[i].type;
+}
+
 // returns the index
 static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     const int idx = gguf_find_key(ctx, key);