llama_cpp 0.3.5 → 0.3.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +549 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2526 -430
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +56 -34
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +445 -176
- data/ext/llama_cpp/src/ggml.h +125 -33
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama-util.h +41 -1
- data/ext/llama_cpp/src/llama.cpp +409 -210
- data/ext/llama_cpp/src/llama.h +19 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -195,8 +195,8 @@ typedef void * thread_ret_t;
|
|
195
195
|
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
|
196
196
|
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
197
197
|
#else
|
198
|
-
inline static void* ggml_aligned_malloc(size_t size) {
|
199
|
-
void* aligned_memory = NULL;
|
198
|
+
inline static void * ggml_aligned_malloc(size_t size) {
|
199
|
+
void * aligned_memory = NULL;
|
200
200
|
#ifdef GGML_USE_METAL
|
201
201
|
int result = posix_memalign(&aligned_memory, getpagesize(), size);
|
202
202
|
#else
|
@@ -3811,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
3811
3811
|
"CROSS_ENTROPY_LOSS_BACK",
|
3812
3812
|
};
|
3813
3813
|
|
3814
|
-
static_assert(GGML_OP_COUNT ==
|
3814
|
+
static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
|
3815
3815
|
|
3816
3816
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
3817
3817
|
"none",
|
@@ -3883,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
3883
3883
|
"cross_entropy_loss_back(x,y)",
|
3884
3884
|
};
|
3885
3885
|
|
3886
|
-
static_assert(GGML_OP_COUNT ==
|
3886
|
+
static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
|
3887
3887
|
|
3888
3888
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
3889
3889
|
|
@@ -4110,7 +4110,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
|
4110
4110
|
//
|
4111
4111
|
// is enough, but just in case, adding the second part
|
4112
4112
|
|
4113
|
-
return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
|
4113
|
+
return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
|
4114
4114
|
}
|
4115
4115
|
|
4116
4116
|
size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
|
@@ -4253,7 +4253,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
|
|
4253
4253
|
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
4254
4254
|
}
|
4255
4255
|
|
4256
|
-
|
4256
|
+
bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
4257
4257
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
4258
4258
|
|
4259
4259
|
return
|
@@ -4557,10 +4557,12 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
|
|
4557
4557
|
|
4558
4558
|
static struct ggml_tensor * ggml_new_tensor_impl(
|
4559
4559
|
struct ggml_context * ctx,
|
4560
|
-
enum ggml_type
|
4561
|
-
int
|
4562
|
-
const int64_t* ne,
|
4563
|
-
void*
|
4560
|
+
enum ggml_type type,
|
4561
|
+
int n_dims,
|
4562
|
+
const int64_t * ne,
|
4563
|
+
void * data) {
|
4564
|
+
|
4565
|
+
assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
|
4564
4566
|
|
4565
4567
|
size_t data_size = 0;
|
4566
4568
|
|
@@ -4600,7 +4602,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
4600
4602
|
/*.ne =*/ { 1, 1, 1, 1 },
|
4601
4603
|
/*.nb =*/ { 0, 0, 0, 0 },
|
4602
4604
|
/*.op =*/ GGML_OP_NONE,
|
4603
|
-
/*.op_params =*/ {0},
|
4605
|
+
/*.op_params =*/ { 0 },
|
4604
4606
|
/*.is_param =*/ false,
|
4605
4607
|
/*.grad =*/ NULL,
|
4606
4608
|
/*.src =*/ { NULL },
|
@@ -4632,6 +4634,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
4632
4634
|
}
|
4633
4635
|
|
4634
4636
|
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
|
4637
|
+
GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
|
4635
4638
|
assert(params_size <= GGML_MAX_OP_PARAMS);
|
4636
4639
|
memcpy(tensor->op_params, params, params_size);
|
4637
4640
|
}
|
@@ -4648,22 +4651,22 @@ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int3
|
|
4648
4651
|
|
4649
4652
|
struct ggml_tensor * ggml_new_tensor(
|
4650
4653
|
struct ggml_context * ctx,
|
4651
|
-
enum ggml_type
|
4652
|
-
int
|
4653
|
-
const int64_t
|
4654
|
+
enum ggml_type type,
|
4655
|
+
int n_dims,
|
4656
|
+
const int64_t * ne) {
|
4654
4657
|
return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
|
4655
4658
|
}
|
4656
4659
|
|
4657
4660
|
struct ggml_tensor * ggml_new_tensor_1d(
|
4658
4661
|
struct ggml_context * ctx,
|
4659
|
-
enum ggml_type
|
4662
|
+
enum ggml_type type,
|
4660
4663
|
int64_t ne0) {
|
4661
4664
|
return ggml_new_tensor(ctx, type, 1, &ne0);
|
4662
4665
|
}
|
4663
4666
|
|
4664
4667
|
struct ggml_tensor * ggml_new_tensor_2d(
|
4665
4668
|
struct ggml_context * ctx,
|
4666
|
-
enum ggml_type
|
4669
|
+
enum ggml_type type,
|
4667
4670
|
int64_t ne0,
|
4668
4671
|
int64_t ne1) {
|
4669
4672
|
const int64_t ne[2] = { ne0, ne1 };
|
@@ -4672,7 +4675,7 @@ struct ggml_tensor * ggml_new_tensor_2d(
|
|
4672
4675
|
|
4673
4676
|
struct ggml_tensor * ggml_new_tensor_3d(
|
4674
4677
|
struct ggml_context * ctx,
|
4675
|
-
enum ggml_type
|
4678
|
+
enum ggml_type type,
|
4676
4679
|
int64_t ne0,
|
4677
4680
|
int64_t ne1,
|
4678
4681
|
int64_t ne2) {
|
@@ -6238,6 +6241,27 @@ struct ggml_tensor * ggml_reshape_4d(
|
|
6238
6241
|
|
6239
6242
|
// ggml_view_1d
|
6240
6243
|
|
6244
|
+
static struct ggml_tensor * ggml_view_tensor_offset(
|
6245
|
+
struct ggml_context * ctx,
|
6246
|
+
struct ggml_tensor * a,
|
6247
|
+
int n_dims,
|
6248
|
+
const int64_t * ne,
|
6249
|
+
size_t offset) {
|
6250
|
+
// don't calculate an offset from an unallocated tensor
|
6251
|
+
void * data = NULL;
|
6252
|
+
if (a->data != NULL) {
|
6253
|
+
data = (char *) a->data + offset;
|
6254
|
+
}
|
6255
|
+
|
6256
|
+
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
|
6257
|
+
|
6258
|
+
ggml_format_name(result, "%s (view)", a->name);
|
6259
|
+
|
6260
|
+
ggml_set_op_params(result, &offset, sizeof(offset));
|
6261
|
+
|
6262
|
+
return result;
|
6263
|
+
}
|
6264
|
+
|
6241
6265
|
struct ggml_tensor * ggml_view_1d(
|
6242
6266
|
struct ggml_context * ctx,
|
6243
6267
|
struct ggml_tensor * a,
|
@@ -6250,10 +6274,7 @@ struct ggml_tensor * ggml_view_1d(
|
|
6250
6274
|
is_node = true;
|
6251
6275
|
}
|
6252
6276
|
|
6253
|
-
struct ggml_tensor * result =
|
6254
|
-
ggml_format_name(result, "%s (view)", a->name);
|
6255
|
-
|
6256
|
-
ggml_set_op_params(result, &offset, sizeof(offset));
|
6277
|
+
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
|
6257
6278
|
|
6258
6279
|
result->op = GGML_OP_VIEW;
|
6259
6280
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -6280,10 +6301,7 @@ struct ggml_tensor * ggml_view_2d(
|
|
6280
6301
|
|
6281
6302
|
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
|
6282
6303
|
|
6283
|
-
struct ggml_tensor * result =
|
6284
|
-
ggml_format_name(result, "%s (view)", a->name);
|
6285
|
-
|
6286
|
-
ggml_set_op_params(result, &offset, sizeof(offset));
|
6304
|
+
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
|
6287
6305
|
|
6288
6306
|
result->nb[1] = nb1;
|
6289
6307
|
result->nb[2] = result->nb[1]*ne1;
|
@@ -6316,10 +6334,7 @@ struct ggml_tensor * ggml_view_3d(
|
|
6316
6334
|
|
6317
6335
|
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
|
6318
6336
|
|
6319
|
-
struct ggml_tensor * result =
|
6320
|
-
ggml_format_name(result, "%s (view)", a->name);
|
6321
|
-
|
6322
|
-
ggml_set_op_params(result, &offset, sizeof(offset));
|
6337
|
+
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
|
6323
6338
|
|
6324
6339
|
result->nb[1] = nb1;
|
6325
6340
|
result->nb[2] = nb2;
|
@@ -6354,10 +6369,7 @@ struct ggml_tensor * ggml_view_4d(
|
|
6354
6369
|
|
6355
6370
|
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
|
6356
6371
|
|
6357
|
-
struct ggml_tensor * result =
|
6358
|
-
ggml_format_name(result, "%s (view)", a->name);
|
6359
|
-
|
6360
|
-
ggml_set_op_params(result, &offset, sizeof(offset));
|
6372
|
+
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
|
6361
6373
|
|
6362
6374
|
result->nb[1] = nb1;
|
6363
6375
|
result->nb[2] = nb2;
|
@@ -6428,7 +6440,7 @@ struct ggml_tensor * ggml_permute(
|
|
6428
6440
|
result->src[0] = a;
|
6429
6441
|
|
6430
6442
|
int32_t params[] = { axis0, axis1, axis2, axis3 };
|
6431
|
-
ggml_set_op_params(result,
|
6443
|
+
ggml_set_op_params(result, params, sizeof(params));
|
6432
6444
|
|
6433
6445
|
return result;
|
6434
6446
|
}
|
@@ -6554,7 +6566,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
|
|
6554
6566
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
6555
6567
|
|
6556
6568
|
int32_t params[] = { n_past, inplace ? 1 : 0 };
|
6557
|
-
ggml_set_op_params(result,
|
6569
|
+
ggml_set_op_params(result, params, sizeof(params));
|
6558
6570
|
|
6559
6571
|
result->op = GGML_OP_DIAG_MASK_INF;
|
6560
6572
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -6594,7 +6606,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
|
|
6594
6606
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
6595
6607
|
|
6596
6608
|
int32_t params[] = { n_past, inplace ? 1 : 0 };
|
6597
|
-
ggml_set_op_params(result,
|
6609
|
+
ggml_set_op_params(result, params, sizeof(params));
|
6598
6610
|
|
6599
6611
|
result->op = GGML_OP_DIAG_MASK_ZERO;
|
6600
6612
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -6710,9 +6722,9 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
6710
6722
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
6711
6723
|
|
6712
6724
|
int32_t params[6] = { n_past, n_dims, mode, n_ctx };
|
6713
|
-
memcpy(params + 4, &freq_base,
|
6725
|
+
memcpy(params + 4, &freq_base, sizeof(float));
|
6714
6726
|
memcpy(params + 5, &freq_scale, sizeof(float));
|
6715
|
-
ggml_set_op_params(result,
|
6727
|
+
ggml_set_op_params(result, params, sizeof(params));
|
6716
6728
|
|
6717
6729
|
result->op = GGML_OP_ROPE;
|
6718
6730
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -6741,6 +6753,18 @@ struct ggml_tensor * ggml_rope_inplace(
|
|
6741
6753
|
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
|
6742
6754
|
}
|
6743
6755
|
|
6756
|
+
struct ggml_tensor * ggml_rope_custom(
|
6757
|
+
struct ggml_context * ctx,
|
6758
|
+
struct ggml_tensor * a,
|
6759
|
+
int n_past,
|
6760
|
+
int n_dims,
|
6761
|
+
int mode,
|
6762
|
+
int n_ctx,
|
6763
|
+
float freq_base,
|
6764
|
+
float freq_scale) {
|
6765
|
+
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
|
6766
|
+
}
|
6767
|
+
|
6744
6768
|
struct ggml_tensor * ggml_rope_custom_inplace(
|
6745
6769
|
struct ggml_context * ctx,
|
6746
6770
|
struct ggml_tensor * a,
|
@@ -6774,7 +6798,7 @@ struct ggml_tensor * ggml_rope_back(
|
|
6774
6798
|
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
6775
6799
|
|
6776
6800
|
int32_t params[] = { n_past, n_dims, mode, n_ctx };
|
6777
|
-
ggml_set_op_params(result,
|
6801
|
+
ggml_set_op_params(result, params, sizeof(params));
|
6778
6802
|
|
6779
6803
|
result->op = GGML_OP_ROPE_BACK;
|
6780
6804
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -6805,7 +6829,7 @@ struct ggml_tensor * ggml_alibi(
|
|
6805
6829
|
|
6806
6830
|
int32_t op_params[3] = { n_past, n_head };
|
6807
6831
|
memcpy(op_params + 2, &bias_max, sizeof(float));
|
6808
|
-
ggml_set_op_params(result,
|
6832
|
+
ggml_set_op_params(result, op_params, sizeof(op_params));
|
6809
6833
|
|
6810
6834
|
result->op = GGML_OP_ALIBI;
|
6811
6835
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -6832,7 +6856,7 @@ struct ggml_tensor * ggml_clamp(
|
|
6832
6856
|
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
6833
6857
|
|
6834
6858
|
float params[] = { min, max };
|
6835
|
-
ggml_set_op_params(result,
|
6859
|
+
ggml_set_op_params(result, params, sizeof(params));
|
6836
6860
|
|
6837
6861
|
result->op = GGML_OP_CLAMP;
|
6838
6862
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -6867,10 +6891,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
|
|
6867
6891
|
ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
|
6868
6892
|
a->ne[2], 1, 1,
|
6869
6893
|
};
|
6870
|
-
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
6894
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
6871
6895
|
|
6872
6896
|
int32_t params[] = { s0, p0, d0 };
|
6873
|
-
ggml_set_op_params(result,
|
6897
|
+
ggml_set_op_params(result, params, sizeof(params));
|
6874
6898
|
|
6875
6899
|
result->op = GGML_OP_CONV_1D;
|
6876
6900
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -6882,10 +6906,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
|
|
6882
6906
|
|
6883
6907
|
// ggml_conv_2d
|
6884
6908
|
|
6885
|
-
struct ggml_tensor* ggml_conv_2d(
|
6886
|
-
struct ggml_context* ctx,
|
6887
|
-
struct ggml_tensor
|
6888
|
-
struct ggml_tensor
|
6909
|
+
struct ggml_tensor * ggml_conv_2d(
|
6910
|
+
struct ggml_context * ctx,
|
6911
|
+
struct ggml_tensor * a,
|
6912
|
+
struct ggml_tensor * b,
|
6889
6913
|
int s0,
|
6890
6914
|
int s1,
|
6891
6915
|
int p0,
|
@@ -6906,10 +6930,10 @@ struct ggml_tensor* ggml_conv_2d(
|
|
6906
6930
|
ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
|
6907
6931
|
a->ne[3], b->ne[3],
|
6908
6932
|
};
|
6909
|
-
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
6933
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
6910
6934
|
|
6911
6935
|
int32_t params[] = { s0, s1, p0, p1, d0, d1 };
|
6912
|
-
ggml_set_op_params(result,
|
6936
|
+
ggml_set_op_params(result, params, sizeof(params));
|
6913
6937
|
|
6914
6938
|
result->op = GGML_OP_CONV_2D;
|
6915
6939
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -6922,7 +6946,7 @@ struct ggml_tensor* ggml_conv_2d(
|
|
6922
6946
|
|
6923
6947
|
// ggml_conv_1d_ph
|
6924
6948
|
|
6925
|
-
struct ggml_tensor* ggml_conv_1d_ph(
|
6949
|
+
struct ggml_tensor * ggml_conv_1d_ph(
|
6926
6950
|
struct ggml_context * ctx,
|
6927
6951
|
struct ggml_tensor * a,
|
6928
6952
|
struct ggml_tensor * b,
|
@@ -6940,7 +6964,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
|
|
6940
6964
|
|
6941
6965
|
// ggml_pool_1d
|
6942
6966
|
|
6943
|
-
struct ggml_tensor* ggml_pool_1d(
|
6967
|
+
struct ggml_tensor * ggml_pool_1d(
|
6944
6968
|
struct ggml_context * ctx,
|
6945
6969
|
struct ggml_tensor * a,
|
6946
6970
|
enum ggml_op_pool op,
|
@@ -6959,10 +6983,10 @@ struct ggml_tensor* ggml_pool_1d(
|
|
6959
6983
|
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
|
6960
6984
|
a->ne[1],
|
6961
6985
|
};
|
6962
|
-
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
6986
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
6963
6987
|
|
6964
6988
|
int32_t params[] = { op, k0, s0, p0 };
|
6965
|
-
ggml_set_op_params(result,
|
6989
|
+
ggml_set_op_params(result, params, sizeof(params));
|
6966
6990
|
|
6967
6991
|
result->op = GGML_OP_POOL_1D;
|
6968
6992
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -6973,7 +6997,7 @@ struct ggml_tensor* ggml_pool_1d(
|
|
6973
6997
|
|
6974
6998
|
// ggml_pool_2d
|
6975
6999
|
|
6976
|
-
struct ggml_tensor* ggml_pool_2d(
|
7000
|
+
struct ggml_tensor * ggml_pool_2d(
|
6977
7001
|
struct ggml_context * ctx,
|
6978
7002
|
struct ggml_tensor * a,
|
6979
7003
|
enum ggml_op_pool op,
|
@@ -6996,10 +7020,10 @@ struct ggml_tensor* ggml_pool_2d(
|
|
6996
7020
|
ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
|
6997
7021
|
a->ne[2],
|
6998
7022
|
};
|
6999
|
-
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
7023
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
7000
7024
|
|
7001
7025
|
int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
|
7002
|
-
ggml_set_op_params(result,
|
7026
|
+
ggml_set_op_params(result, params, sizeof(params));
|
7003
7027
|
|
7004
7028
|
result->op = GGML_OP_POOL_2D;
|
7005
7029
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -7167,7 +7191,7 @@ struct ggml_tensor * ggml_win_part(
|
|
7167
7191
|
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
7168
7192
|
|
7169
7193
|
int32_t params[] = { npx, npy, w };
|
7170
|
-
ggml_set_op_params(result,
|
7194
|
+
ggml_set_op_params(result, params, sizeof(params));
|
7171
7195
|
|
7172
7196
|
result->op = GGML_OP_WIN_PART;
|
7173
7197
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -7197,7 +7221,7 @@ struct ggml_tensor * ggml_win_unpart(
|
|
7197
7221
|
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
7198
7222
|
|
7199
7223
|
int32_t params[] = { w };
|
7200
|
-
ggml_set_op_params(result,
|
7224
|
+
ggml_set_op_params(result, params, sizeof(params));
|
7201
7225
|
|
7202
7226
|
result->op = GGML_OP_WIN_UNPART;
|
7203
7227
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -7326,7 +7350,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
|
|
7326
7350
|
return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
|
7327
7351
|
}
|
7328
7352
|
|
7329
|
-
//
|
7353
|
+
// ggml_map_custom1_f32
|
7330
7354
|
|
7331
7355
|
static struct ggml_tensor * ggml_map_custom1_impl_f32(
|
7332
7356
|
struct ggml_context * ctx,
|
@@ -7343,7 +7367,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32(
|
|
7343
7367
|
|
7344
7368
|
ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
|
7345
7369
|
|
7346
|
-
result->op =
|
7370
|
+
result->op = GGML_OP_MAP_CUSTOM1_F32;
|
7347
7371
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7348
7372
|
result->src[0] = a;
|
7349
7373
|
|
@@ -7364,7 +7388,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
|
|
7364
7388
|
return ggml_map_custom1_impl_f32(ctx, a, fun, true);
|
7365
7389
|
}
|
7366
7390
|
|
7367
|
-
//
|
7391
|
+
// ggml_map_custom2_f32
|
7368
7392
|
|
7369
7393
|
static struct ggml_tensor * ggml_map_custom2_impl_f32(
|
7370
7394
|
struct ggml_context * ctx,
|
@@ -7382,7 +7406,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32(
|
|
7382
7406
|
|
7383
7407
|
ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
|
7384
7408
|
|
7385
|
-
result->op =
|
7409
|
+
result->op = GGML_OP_MAP_CUSTOM2_F32;
|
7386
7410
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7387
7411
|
result->src[0] = a;
|
7388
7412
|
result->src[1] = b;
|
@@ -7406,7 +7430,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
|
|
7406
7430
|
return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
|
7407
7431
|
}
|
7408
7432
|
|
7409
|
-
//
|
7433
|
+
// ggml_map_custom3_f32
|
7410
7434
|
|
7411
7435
|
static struct ggml_tensor * ggml_map_custom3_impl_f32(
|
7412
7436
|
struct ggml_context * ctx,
|
@@ -7425,7 +7449,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32(
|
|
7425
7449
|
|
7426
7450
|
ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
|
7427
7451
|
|
7428
|
-
result->op =
|
7452
|
+
result->op = GGML_OP_MAP_CUSTOM3_F32;
|
7429
7453
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7430
7454
|
result->src[0] = a;
|
7431
7455
|
result->src[1] = b;
|
@@ -7452,6 +7476,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32(
|
|
7452
7476
|
return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
|
7453
7477
|
}
|
7454
7478
|
|
7479
|
+
// ggml_map_custom1
|
7480
|
+
struct ggml_map_custom1_op_params {
|
7481
|
+
ggml_custom1_op_t fun;
|
7482
|
+
int n_tasks;
|
7483
|
+
void * userdata;
|
7484
|
+
};
|
7485
|
+
|
7486
|
+
static struct ggml_tensor * ggml_map_custom1_impl(
|
7487
|
+
struct ggml_context * ctx,
|
7488
|
+
struct ggml_tensor * a,
|
7489
|
+
const ggml_custom1_op_t fun,
|
7490
|
+
int n_tasks,
|
7491
|
+
void * userdata,
|
7492
|
+
bool inplace) {
|
7493
|
+
GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
|
7494
|
+
|
7495
|
+
bool is_node = false;
|
7496
|
+
|
7497
|
+
if (!inplace && a->grad) {
|
7498
|
+
is_node = true;
|
7499
|
+
}
|
7500
|
+
|
7501
|
+
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
7502
|
+
|
7503
|
+
struct ggml_map_custom1_op_params params = {
|
7504
|
+
/*.fun =*/ fun,
|
7505
|
+
/*.n_tasks =*/ n_tasks,
|
7506
|
+
/*.userdata =*/ userdata
|
7507
|
+
};
|
7508
|
+
ggml_set_op_params(result, (const void *) &params, sizeof(params));
|
7509
|
+
|
7510
|
+
result->op = GGML_OP_MAP_CUSTOM1;
|
7511
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7512
|
+
result->src[0] = a;
|
7513
|
+
|
7514
|
+
return result;
|
7515
|
+
}
|
7516
|
+
|
7517
|
+
struct ggml_tensor * ggml_map_custom1(
|
7518
|
+
struct ggml_context * ctx,
|
7519
|
+
struct ggml_tensor * a,
|
7520
|
+
const ggml_custom1_op_t fun,
|
7521
|
+
int n_tasks,
|
7522
|
+
void * userdata) {
|
7523
|
+
return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
|
7524
|
+
}
|
7525
|
+
|
7526
|
+
struct ggml_tensor * ggml_map_custom1_inplace(
|
7527
|
+
struct ggml_context * ctx,
|
7528
|
+
struct ggml_tensor * a,
|
7529
|
+
const ggml_custom1_op_t fun,
|
7530
|
+
int n_tasks,
|
7531
|
+
void * userdata) {
|
7532
|
+
return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
|
7533
|
+
}
|
7534
|
+
|
7535
|
+
// ggml_map_custom2
|
7536
|
+
|
7537
|
+
struct ggml_map_custom2_op_params {
|
7538
|
+
ggml_custom2_op_t fun;
|
7539
|
+
int n_tasks;
|
7540
|
+
void * userdata;
|
7541
|
+
};
|
7542
|
+
|
7543
|
+
static struct ggml_tensor * ggml_map_custom2_impl(
|
7544
|
+
struct ggml_context * ctx,
|
7545
|
+
struct ggml_tensor * a,
|
7546
|
+
struct ggml_tensor * b,
|
7547
|
+
const ggml_custom2_op_t fun,
|
7548
|
+
int n_tasks,
|
7549
|
+
void * userdata,
|
7550
|
+
bool inplace) {
|
7551
|
+
GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
|
7552
|
+
|
7553
|
+
bool is_node = false;
|
7554
|
+
|
7555
|
+
if (!inplace && (a->grad || b->grad)) {
|
7556
|
+
is_node = true;
|
7557
|
+
}
|
7558
|
+
|
7559
|
+
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
7560
|
+
|
7561
|
+
struct ggml_map_custom2_op_params params = {
|
7562
|
+
/*.fun =*/ fun,
|
7563
|
+
/*.n_tasks =*/ n_tasks,
|
7564
|
+
/*.userdata =*/ userdata
|
7565
|
+
};
|
7566
|
+
ggml_set_op_params(result, (const void *) &params, sizeof(params));
|
7567
|
+
|
7568
|
+
result->op = GGML_OP_MAP_CUSTOM2;
|
7569
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7570
|
+
result->src[0] = a;
|
7571
|
+
result->src[1] = b;
|
7572
|
+
|
7573
|
+
return result;
|
7574
|
+
}
|
7575
|
+
|
7576
|
+
struct ggml_tensor * ggml_map_custom2(
|
7577
|
+
struct ggml_context * ctx,
|
7578
|
+
struct ggml_tensor * a,
|
7579
|
+
struct ggml_tensor * b,
|
7580
|
+
const ggml_custom2_op_t fun,
|
7581
|
+
int n_tasks,
|
7582
|
+
void * userdata) {
|
7583
|
+
return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
|
7584
|
+
}
|
7585
|
+
|
7586
|
+
struct ggml_tensor * ggml_map_custom2_inplace(
|
7587
|
+
struct ggml_context * ctx,
|
7588
|
+
struct ggml_tensor * a,
|
7589
|
+
struct ggml_tensor * b,
|
7590
|
+
const ggml_custom2_op_t fun,
|
7591
|
+
int n_tasks,
|
7592
|
+
void * userdata) {
|
7593
|
+
return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
|
7594
|
+
}
|
7595
|
+
|
7596
|
+
// ggml_map_custom3
|
7597
|
+
|
7598
|
+
struct ggml_map_custom3_op_params {
|
7599
|
+
ggml_custom3_op_t fun;
|
7600
|
+
int n_tasks;
|
7601
|
+
void * userdata;
|
7602
|
+
};
|
7603
|
+
|
7604
|
+
static struct ggml_tensor * ggml_map_custom3_impl(
|
7605
|
+
struct ggml_context * ctx,
|
7606
|
+
struct ggml_tensor * a,
|
7607
|
+
struct ggml_tensor * b,
|
7608
|
+
struct ggml_tensor * c,
|
7609
|
+
const ggml_custom3_op_t fun,
|
7610
|
+
int n_tasks,
|
7611
|
+
void * userdata,
|
7612
|
+
bool inplace) {
|
7613
|
+
GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
|
7614
|
+
|
7615
|
+
bool is_node = false;
|
7616
|
+
|
7617
|
+
if (!inplace && (a->grad || b->grad || c->grad)) {
|
7618
|
+
is_node = true;
|
7619
|
+
}
|
7620
|
+
|
7621
|
+
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
7622
|
+
|
7623
|
+
struct ggml_map_custom3_op_params params = {
|
7624
|
+
/*.fun =*/ fun,
|
7625
|
+
/*.n_tasks =*/ n_tasks,
|
7626
|
+
/*.userdata =*/ userdata
|
7627
|
+
};
|
7628
|
+
ggml_set_op_params(result, (const void *) &params, sizeof(params));
|
7629
|
+
|
7630
|
+
result->op = GGML_OP_MAP_CUSTOM3;
|
7631
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7632
|
+
result->src[0] = a;
|
7633
|
+
result->src[1] = b;
|
7634
|
+
result->src[2] = c;
|
7635
|
+
|
7636
|
+
return result;
|
7637
|
+
}
|
7638
|
+
|
7639
|
+
struct ggml_tensor * ggml_map_custom3(
|
7640
|
+
struct ggml_context * ctx,
|
7641
|
+
struct ggml_tensor * a,
|
7642
|
+
struct ggml_tensor * b,
|
7643
|
+
struct ggml_tensor * c,
|
7644
|
+
const ggml_custom3_op_t fun,
|
7645
|
+
int n_tasks,
|
7646
|
+
void * userdata) {
|
7647
|
+
return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
|
7648
|
+
}
|
7649
|
+
|
7650
|
+
struct ggml_tensor * ggml_map_custom3_inplace(
|
7651
|
+
struct ggml_context * ctx,
|
7652
|
+
struct ggml_tensor * a,
|
7653
|
+
struct ggml_tensor * b,
|
7654
|
+
struct ggml_tensor * c,
|
7655
|
+
const ggml_custom3_op_t fun,
|
7656
|
+
int n_tasks,
|
7657
|
+
void * userdata) {
|
7658
|
+
return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
|
7659
|
+
}
|
7660
|
+
|
7661
|
+
|
7662
|
+
|
7455
7663
|
// ggml_cross_entropy_loss
|
7456
7664
|
|
7457
7665
|
struct ggml_tensor * ggml_cross_entropy_loss(
|
@@ -9260,8 +9468,8 @@ static void ggml_compute_forward_sum_rows_f32(
|
|
9260
9468
|
for (int64_t i3 = 0; i3 < ne03; i3++) {
|
9261
9469
|
for (int64_t i2 = 0; i2 < ne02; i2++) {
|
9262
9470
|
for (int64_t i1 = 0; i1 < ne01; i1++) {
|
9263
|
-
float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
|
9264
|
-
float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
|
9471
|
+
float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
|
9472
|
+
float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
|
9265
9473
|
float row_sum = 0;
|
9266
9474
|
ggml_vec_sum_f32(ne00, &row_sum, src_row);
|
9267
9475
|
dst_row[0] = row_sum;
|
@@ -10523,71 +10731,95 @@ static void ggml_compute_forward_mul_mat(
|
|
10523
10731
|
return;
|
10524
10732
|
}
|
10525
10733
|
|
10526
|
-
|
10527
|
-
const
|
10734
|
+
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
10735
|
+
const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
|
10528
10736
|
|
10529
|
-
const int64_t
|
10530
|
-
const int64_t
|
10737
|
+
const int64_t nr0 = ne01; // src0 rows
|
10738
|
+
const int64_t nr1 = ne11*ne12*ne13; // src1 rows
|
10531
10739
|
|
10532
|
-
//
|
10533
|
-
const int64_t nr1 = ne11*ne12*ne13;
|
10740
|
+
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
|
10534
10741
|
|
10535
|
-
|
10536
|
-
const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
|
10742
|
+
// distribute the thread work across the inner or outer loop based on which one is larger
|
10537
10743
|
|
10538
|
-
|
10539
|
-
|
10540
|
-
|
10541
|
-
|
10542
|
-
|
10543
|
-
|
10544
|
-
|
10545
|
-
|
10546
|
-
|
10547
|
-
|
10548
|
-
|
10549
|
-
|
10550
|
-
|
10551
|
-
|
10552
|
-
|
10553
|
-
|
10554
|
-
|
10555
|
-
|
10556
|
-
|
10557
|
-
|
10558
|
-
|
10559
|
-
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
|
10560
|
-
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
|
10561
|
-
// the original src1 data pointer, so we should index using the indices directly
|
10562
|
-
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
10563
|
-
const char * src1_col = (const char *) wdata +
|
10564
|
-
(src1_cont || src1->type != vec_dot_type
|
10565
|
-
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
10566
|
-
: (i11*nb11 + i12*nb12 + i13*nb13));
|
10567
|
-
|
10568
|
-
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
10569
|
-
|
10570
|
-
for (int64_t ir = ir10; ir < ir11; ++ir) {
|
10571
|
-
vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
|
10572
|
-
}
|
10744
|
+
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
|
10745
|
+
const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
|
10746
|
+
|
10747
|
+
const int64_t ith0 = ith % nth0;
|
10748
|
+
const int64_t ith1 = ith / nth0;
|
10749
|
+
|
10750
|
+
const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
|
10751
|
+
const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
|
10752
|
+
|
10753
|
+
const int64_t ir010 = dr0*ith0;
|
10754
|
+
const int64_t ir011 = MIN(ir010 + dr0, nr0);
|
10755
|
+
|
10756
|
+
const int64_t ir110 = dr1*ith1;
|
10757
|
+
const int64_t ir111 = MIN(ir110 + dr1, nr1);
|
10758
|
+
|
10759
|
+
//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
|
10760
|
+
|
10761
|
+
// threads with no work simply yield (not sure if it helps)
|
10762
|
+
if (ir010 >= ir011 || ir110 >= ir111) {
|
10763
|
+
sched_yield();
|
10764
|
+
return;
|
10573
10765
|
}
|
10574
10766
|
|
10575
|
-
|
10576
|
-
|
10577
|
-
//acc += t1 - t0;
|
10578
|
-
//if (t1 - t0 > 10) {
|
10579
|
-
// printf("\n");
|
10580
|
-
// printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
|
10581
|
-
// printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
|
10582
|
-
// printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
|
10767
|
+
assert(ne12 % ne02 == 0);
|
10768
|
+
assert(ne13 % ne03 == 0);
|
10583
10769
|
|
10584
|
-
//
|
10585
|
-
|
10586
|
-
|
10770
|
+
// broadcast factors
|
10771
|
+
const int64_t r2 = ne12/ne02;
|
10772
|
+
const int64_t r3 = ne13/ne03;
|
10587
10773
|
|
10774
|
+
// block-tiling attempt
|
10775
|
+
const int64_t blck_0 = 16;
|
10776
|
+
const int64_t blck_1 = 16;
|
10588
10777
|
|
10589
|
-
//
|
10778
|
+
// attempt to reduce false-sharing (does not seem to make a difference)
|
10779
|
+
float tmp[16];
|
10780
|
+
|
10781
|
+
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
|
10782
|
+
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
|
10783
|
+
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
|
10784
|
+
const int64_t i13 = (ir1/(ne12*ne11));
|
10785
|
+
const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
|
10786
|
+
const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
|
10590
10787
|
|
10788
|
+
// broadcast src0 into src1
|
10789
|
+
const int64_t i03 = i13/r3;
|
10790
|
+
const int64_t i02 = i12/r2;
|
10791
|
+
|
10792
|
+
const int64_t i1 = i11;
|
10793
|
+
const int64_t i2 = i12;
|
10794
|
+
const int64_t i3 = i13;
|
10795
|
+
|
10796
|
+
const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
|
10797
|
+
|
10798
|
+
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
|
10799
|
+
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
|
10800
|
+
// the original src1 data pointer, so we should index using the indices directly
|
10801
|
+
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
10802
|
+
const char * src1_col = (const char *) wdata +
|
10803
|
+
(src1_cont || src1->type != vec_dot_type
|
10804
|
+
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
10805
|
+
: (i11*nb11 + i12*nb12 + i13*nb13));
|
10806
|
+
|
10807
|
+
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
10808
|
+
|
10809
|
+
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
10810
|
+
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
10811
|
+
//}
|
10812
|
+
|
10813
|
+
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
10814
|
+
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
|
10815
|
+
}
|
10816
|
+
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
10817
|
+
}
|
10818
|
+
}
|
10819
|
+
}
|
10820
|
+
}
|
10821
|
+
|
10822
|
+
// ggml_compute_forward_out_prod
|
10591
10823
|
|
10592
10824
|
static void ggml_compute_forward_out_prod_f32(
|
10593
10825
|
const struct ggml_compute_params * params,
|
@@ -12871,7 +13103,7 @@ static void ggml_compute_forward_pool_1d(
|
|
12871
13103
|
const struct ggml_tensor * src0,
|
12872
13104
|
struct ggml_tensor * dst) {
|
12873
13105
|
|
12874
|
-
const int32_t* opts = (const int32_t*)dst->op_params;
|
13106
|
+
const int32_t * opts = (const int32_t *)dst->op_params;
|
12875
13107
|
enum ggml_op_pool op = opts[0];
|
12876
13108
|
const int k0 = opts[1];
|
12877
13109
|
const int s0 = opts[2];
|
@@ -14204,24 +14436,6 @@ static void ggml_compute_forward_map_custom1_f32(
|
|
14204
14436
|
fun(dst, a);
|
14205
14437
|
}
|
14206
14438
|
|
14207
|
-
|
14208
|
-
static void ggml_compute_forward_map_custom1(
|
14209
|
-
const struct ggml_compute_params * params,
|
14210
|
-
const struct ggml_tensor * a,
|
14211
|
-
struct ggml_tensor * dst,
|
14212
|
-
const ggml_custom1_op_f32_t fun) {
|
14213
|
-
switch (a->type) {
|
14214
|
-
case GGML_TYPE_F32:
|
14215
|
-
{
|
14216
|
-
ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
|
14217
|
-
} break;
|
14218
|
-
default:
|
14219
|
-
{
|
14220
|
-
GGML_ASSERT(false);
|
14221
|
-
} break;
|
14222
|
-
}
|
14223
|
-
}
|
14224
|
-
|
14225
14439
|
// ggml_compute_forward_map_custom2
|
14226
14440
|
|
14227
14441
|
static void ggml_compute_forward_map_custom2_f32(
|
@@ -14240,24 +14454,6 @@ static void ggml_compute_forward_map_custom2_f32(
|
|
14240
14454
|
}
|
14241
14455
|
|
14242
14456
|
|
14243
|
-
static void ggml_compute_forward_map_custom2(
|
14244
|
-
const struct ggml_compute_params * params,
|
14245
|
-
const struct ggml_tensor * a,
|
14246
|
-
const struct ggml_tensor * b,
|
14247
|
-
struct ggml_tensor * dst,
|
14248
|
-
const ggml_custom2_op_f32_t fun) {
|
14249
|
-
switch (a->type) {
|
14250
|
-
case GGML_TYPE_F32:
|
14251
|
-
{
|
14252
|
-
ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
|
14253
|
-
} break;
|
14254
|
-
default:
|
14255
|
-
{
|
14256
|
-
GGML_ASSERT(false);
|
14257
|
-
} break;
|
14258
|
-
}
|
14259
|
-
}
|
14260
|
-
|
14261
14457
|
// ggml_compute_forward_map_custom3
|
14262
14458
|
|
14263
14459
|
static void ggml_compute_forward_map_custom3_f32(
|
@@ -14276,24 +14472,52 @@ static void ggml_compute_forward_map_custom3_f32(
|
|
14276
14472
|
fun(dst, a, b, c);
|
14277
14473
|
}
|
14278
14474
|
|
14475
|
+
// ggml_compute_forward_map_custom1
|
14476
|
+
|
14477
|
+
static void ggml_compute_forward_map_custom1(
|
14478
|
+
const struct ggml_compute_params * params,
|
14479
|
+
const struct ggml_tensor * a,
|
14480
|
+
struct ggml_tensor * dst) {
|
14481
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
14482
|
+
return;
|
14483
|
+
}
|
14484
|
+
|
14485
|
+
struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
|
14486
|
+
|
14487
|
+
p->fun(dst, a, params->ith, params->nth, p->userdata);
|
14488
|
+
}
|
14489
|
+
|
14490
|
+
// ggml_compute_forward_map_custom2
|
14491
|
+
|
14492
|
+
static void ggml_compute_forward_map_custom2(
|
14493
|
+
const struct ggml_compute_params * params,
|
14494
|
+
const struct ggml_tensor * a,
|
14495
|
+
const struct ggml_tensor * b,
|
14496
|
+
struct ggml_tensor * dst) {
|
14497
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
14498
|
+
return;
|
14499
|
+
}
|
14500
|
+
|
14501
|
+
struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
|
14502
|
+
|
14503
|
+
p->fun(dst, a, b, params->ith, params->nth, p->userdata);
|
14504
|
+
}
|
14505
|
+
|
14506
|
+
// ggml_compute_forward_map_custom3
|
14279
14507
|
|
14280
14508
|
static void ggml_compute_forward_map_custom3(
|
14281
14509
|
const struct ggml_compute_params * params,
|
14282
14510
|
const struct ggml_tensor * a,
|
14283
14511
|
const struct ggml_tensor * b,
|
14284
14512
|
const struct ggml_tensor * c,
|
14285
|
-
|
14286
|
-
|
14287
|
-
|
14288
|
-
case GGML_TYPE_F32:
|
14289
|
-
{
|
14290
|
-
ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
|
14291
|
-
} break;
|
14292
|
-
default:
|
14293
|
-
{
|
14294
|
-
GGML_ASSERT(false);
|
14295
|
-
} break;
|
14513
|
+
struct ggml_tensor * dst) {
|
14514
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
14515
|
+
return;
|
14296
14516
|
}
|
14517
|
+
|
14518
|
+
struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
|
14519
|
+
|
14520
|
+
p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
|
14297
14521
|
}
|
14298
14522
|
|
14299
14523
|
// ggml_compute_forward_cross_entropy_loss
|
@@ -14815,25 +15039,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14815
15039
|
ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
|
14816
15040
|
}
|
14817
15041
|
break;
|
14818
|
-
case GGML_OP_MAP_CUSTOM1:
|
15042
|
+
case GGML_OP_MAP_CUSTOM1_F32:
|
14819
15043
|
{
|
14820
15044
|
ggml_custom1_op_f32_t fun;
|
14821
15045
|
memcpy(&fun, tensor->op_params, sizeof(fun));
|
14822
|
-
|
15046
|
+
ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
|
14823
15047
|
}
|
14824
15048
|
break;
|
14825
|
-
case GGML_OP_MAP_CUSTOM2:
|
15049
|
+
case GGML_OP_MAP_CUSTOM2_F32:
|
14826
15050
|
{
|
14827
15051
|
ggml_custom2_op_f32_t fun;
|
14828
15052
|
memcpy(&fun, tensor->op_params, sizeof(fun));
|
14829
|
-
|
15053
|
+
ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
|
14830
15054
|
}
|
14831
15055
|
break;
|
14832
|
-
case GGML_OP_MAP_CUSTOM3:
|
15056
|
+
case GGML_OP_MAP_CUSTOM3_F32:
|
14833
15057
|
{
|
14834
15058
|
ggml_custom3_op_f32_t fun;
|
14835
15059
|
memcpy(&fun, tensor->op_params, sizeof(fun));
|
14836
|
-
|
15060
|
+
ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
|
15061
|
+
}
|
15062
|
+
break;
|
15063
|
+
case GGML_OP_MAP_CUSTOM1:
|
15064
|
+
{
|
15065
|
+
ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
|
15066
|
+
}
|
15067
|
+
break;
|
15068
|
+
case GGML_OP_MAP_CUSTOM2:
|
15069
|
+
{
|
15070
|
+
ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
|
15071
|
+
}
|
15072
|
+
break;
|
15073
|
+
case GGML_OP_MAP_CUSTOM3:
|
15074
|
+
{
|
15075
|
+
ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
|
14837
15076
|
}
|
14838
15077
|
break;
|
14839
15078
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
@@ -15641,6 +15880,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15641
15880
|
} break;
|
15642
15881
|
case GGML_OP_MAP_UNARY:
|
15643
15882
|
case GGML_OP_MAP_BINARY:
|
15883
|
+
case GGML_OP_MAP_CUSTOM1_F32:
|
15884
|
+
case GGML_OP_MAP_CUSTOM2_F32:
|
15885
|
+
case GGML_OP_MAP_CUSTOM3_F32:
|
15644
15886
|
case GGML_OP_MAP_CUSTOM1:
|
15645
15887
|
case GGML_OP_MAP_CUSTOM2:
|
15646
15888
|
case GGML_OP_MAP_CUSTOM3:
|
@@ -16426,11 +16668,38 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
16426
16668
|
case GGML_OP_WIN_UNPART:
|
16427
16669
|
case GGML_OP_MAP_UNARY:
|
16428
16670
|
case GGML_OP_MAP_BINARY:
|
16671
|
+
case GGML_OP_MAP_CUSTOM1_F32:
|
16672
|
+
case GGML_OP_MAP_CUSTOM2_F32:
|
16673
|
+
case GGML_OP_MAP_CUSTOM3_F32:
|
16674
|
+
{
|
16675
|
+
n_tasks = 1;
|
16676
|
+
} break;
|
16429
16677
|
case GGML_OP_MAP_CUSTOM1:
|
16678
|
+
{
|
16679
|
+
struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
|
16680
|
+
if (p->n_tasks == GGML_N_TASKS_MAX) {
|
16681
|
+
n_tasks = n_threads;
|
16682
|
+
} else {
|
16683
|
+
n_tasks = MIN(p->n_tasks, n_threads);
|
16684
|
+
}
|
16685
|
+
} break;
|
16430
16686
|
case GGML_OP_MAP_CUSTOM2:
|
16687
|
+
{
|
16688
|
+
struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
|
16689
|
+
if (p->n_tasks == GGML_N_TASKS_MAX) {
|
16690
|
+
n_tasks = n_threads;
|
16691
|
+
} else {
|
16692
|
+
n_tasks = MIN(p->n_tasks, n_threads);
|
16693
|
+
}
|
16694
|
+
} break;
|
16431
16695
|
case GGML_OP_MAP_CUSTOM3:
|
16432
16696
|
{
|
16433
|
-
|
16697
|
+
struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
|
16698
|
+
if (p->n_tasks == GGML_N_TASKS_MAX) {
|
16699
|
+
n_tasks = n_threads;
|
16700
|
+
} else {
|
16701
|
+
n_tasks = MIN(p->n_tasks, n_threads);
|
16702
|
+
}
|
16434
16703
|
} break;
|
16435
16704
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
16436
16705
|
{
|