llama_cpp 0.3.5 → 0.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +549 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2526 -430
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +56 -34
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +445 -176
- data/ext/llama_cpp/src/ggml.h +125 -33
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama-util.h +41 -1
- data/ext/llama_cpp/src/llama.cpp +409 -210
- data/ext/llama_cpp/src/llama.h +19 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -195,8 +195,8 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 #else
-inline static void* ggml_aligned_malloc(size_t size) {
-    void* aligned_memory = NULL;
+inline static void * ggml_aligned_malloc(size_t size) {
+    void * aligned_memory = NULL;
 #ifdef GGML_USE_METAL
     int result = posix_memalign(&aligned_memory, getpagesize(), size);
 #else

@@ -3811,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",

@@ -3883,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 

@@ -4110,7 +4110,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     //
     // is enough, but just in case, adding the second part
 
-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+    return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
 }
 
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
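For context: ggml_nbytes now rounds the tensor byte size up to a multiple of GGML_MEM_ALIGN, which the new ggml-alloc.c graph allocator relies on when packing tensors back to back. Assuming the GGML_PAD macro that ggml.h defines in this version range, the rounding behaves like this (a sketch, not part of the diff):

    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
    // rounds x up to the next multiple of n, e.g. GGML_PAD(17, 16) == 32,
    // so ggml_nbytes() always returns a GGML_MEM_ALIGN-aligned size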
@@ -4253,7 +4253,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-
+bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return

@@ -4557,10 +4557,12 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
 
 static struct ggml_tensor * ggml_new_tensor_impl(
         struct ggml_context * ctx,
-        enum ggml_type
-        int
-        const int64_t* ne,
-        void*
+        enum ggml_type type,
+        int n_dims,
+        const int64_t * ne,
+        void * data) {
+
+    assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
 
     size_t data_size = 0;
 

@@ -4600,7 +4602,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },
         /*.op           =*/ GGML_OP_NONE,
-        /*.op_params    =*/ {0},
+        /*.op_params    =*/ { 0 },
         /*.is_param     =*/ false,
         /*.grad         =*/ NULL,
         /*.src          =*/ { NULL },

@@ -4632,6 +4634,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 }
 
 static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
     assert(params_size <= GGML_MAX_OP_PARAMS);
     memcpy(tensor->op_params, params, params_size);
 }

@@ -4648,22 +4651,22 @@ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int3
 
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
-        enum ggml_type
-        int
-        const int64_t
+        enum ggml_type type,
+        int n_dims,
+        const int64_t * ne) {
     return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
 }
 
 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
-        enum ggml_type
+        enum ggml_type type,
         int64_t ne0) {
     return ggml_new_tensor(ctx, type, 1, &ne0);
 }
 
 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
-        enum ggml_type
+        enum ggml_type type,
         int64_t ne0,
         int64_t ne1) {
     const int64_t ne[2] = { ne0, ne1 };

@@ -4672,7 +4675,7 @@ struct ggml_tensor * ggml_new_tensor_2d(
 
 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
-        enum ggml_type
+        enum ggml_type type,
         int64_t ne0,
         int64_t ne1,
         int64_t ne2) {

@@ -6238,6 +6241,27 @@ struct ggml_tensor * ggml_reshape_4d(
 
 // ggml_view_1d
 
+static struct ggml_tensor * ggml_view_tensor_offset(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_dims,
+        const int64_t * ne,
+        size_t offset) {
+    // don't calculate an offset from an unallocated tensor
+    void * data = NULL;
+    if (a->data != NULL) {
+        data = (char *) a->data + offset;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
+
+    ggml_format_name(result, "%s (view)", a->name);
+
+    ggml_set_op_params(result, &offset, sizeof(offset));
+
+    return result;
+}
+
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,

@@ -6250,10 +6274,7 @@ struct ggml_tensor * ggml_view_1d(
         is_node = true;
     }
 
-    struct ggml_tensor * result =
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
+    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
 
     result->op   = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -6280,10 +6301,7 @@ struct ggml_tensor * ggml_view_2d(
 
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
 
-    struct ggml_tensor * result =
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
+    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
 
     result->nb[1] = nb1;
     result->nb[2] = result->nb[1]*ne1;

@@ -6316,10 +6334,7 @@ struct ggml_tensor * ggml_view_3d(
 
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
 
-    struct ggml_tensor * result =
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
+    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
 
     result->nb[1] = nb1;
     result->nb[2] = nb2;

@@ -6354,10 +6369,7 @@ struct ggml_tensor * ggml_view_4d(
 
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
 
-    struct ggml_tensor * result =
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
+    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
 
     result->nb[1] = nb1;
     result->nb[2] = nb2;
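All four ggml_view_*d constructors now funnel through the shared ggml_view_tensor_offset helper, which skips the data-pointer arithmetic when the source tensor has not been allocated yet, the case produced by the new deferred allocator in ggml-alloc.c. A hedged usage sketch (tensor names and sizes here are illustrative):

    // view of elements 16..47 of a 64-element f32 tensor; the offset is in bytes
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    struct ggml_tensor * v = ggml_view_1d(ctx, a, 32, 16*ggml_type_size(GGML_TYPE_F32));
    // v shares a's buffer; under deferred allocation a->data may still be NULL here,
    // which ggml_view_tensor_offset now tolerates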
@@ -6428,7 +6440,7 @@ struct ggml_tensor * ggml_permute(
     result->src[0] = a;
 
     int32_t params[] = { axis0, axis1, axis2, axis3 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     return result;
 }

@@ -6554,7 +6566,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_DIAG_MASK_INF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -6594,7 +6606,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_DIAG_MASK_ZERO;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -6710,9 +6722,9 @@ static struct ggml_tensor * ggml_rope_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[6] = { n_past, n_dims, mode, n_ctx };
-    memcpy(params + 4, &freq_base,
+    memcpy(params + 4, &freq_base, sizeof(float));
     memcpy(params + 5, &freq_scale, sizeof(float));
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -6741,6 +6753,18 @@ struct ggml_tensor * ggml_rope_inplace(
     return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
 }
 
+struct ggml_tensor * ggml_rope_custom(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_past,
+        int n_dims,
+        int mode,
+        int n_ctx,
+        float freq_base,
+        float freq_scale) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+}
+
 struct ggml_tensor * ggml_rope_custom_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
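ggml_rope_custom is the out-of-place counterpart of the existing ggml_rope_custom_inplace: the same freq_base/freq_scale knobs used for RoPE context-length scaling, but without mutating a. A usage sketch (tensor names and parameter values are illustrative only):

    // standard RoPE corresponds to freq_base = 10000.0f, freq_scale = 1.0f;
    // halving freq_scale compresses positions to cover a 2x longer context
    struct ggml_tensor * q_rot = ggml_rope_custom(ctx, q, n_past, n_rot, 0, n_ctx, 10000.0f, 0.5f);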
@@ -6774,7 +6798,7 @@ struct ggml_tensor * ggml_rope_back(
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, n_dims, mode, n_ctx };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_ROPE_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -6805,7 +6829,7 @@ struct ggml_tensor * ggml_alibi(
 
     int32_t op_params[3] = { n_past, n_head };
     memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, op_params, sizeof(op_params));
 
     result->op   = GGML_OP_ALIBI;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -6832,7 +6856,7 @@ struct ggml_tensor * ggml_clamp(
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
 
     float params[] = { min, max };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_CLAMP;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -6867,10 +6891,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
         a->ne[2], 1, 1,
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_CONV_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -6882,10 +6906,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
 
 // ggml_conv_2d
 
-struct ggml_tensor* ggml_conv_2d(
-    struct ggml_context* ctx,
-    struct ggml_tensor
-    struct ggml_tensor
+struct ggml_tensor * ggml_conv_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
         int s0,
         int s1,
         int p0,

@@ -6906,10 +6930,10 @@ struct ggml_tensor* ggml_conv_2d(
         ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
         a->ne[3], b->ne[3],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { s0, s1, p0, p1, d0, d1 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_CONV_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -6922,7 +6946,7 @@ struct ggml_tensor* ggml_conv_2d(
 
 // ggml_conv_1d_ph
 
-struct ggml_tensor* ggml_conv_1d_ph(
+struct ggml_tensor * ggml_conv_1d_ph(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b,

@@ -6940,7 +6964,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
 
 // ggml_pool_1d
 
-struct ggml_tensor* ggml_pool_1d(
+struct ggml_tensor * ggml_pool_1d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         enum ggml_op_pool op,

@@ -6959,10 +6983,10 @@ struct ggml_tensor* ggml_pool_1d(
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         a->ne[1],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     int32_t params[] = { op, k0, s0, p0 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_POOL_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -6973,7 +6997,7 @@ struct ggml_tensor* ggml_pool_1d(
 
 // ggml_pool_2d
 
-struct ggml_tensor* ggml_pool_2d(
+struct ggml_tensor * ggml_pool_2d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         enum ggml_op_pool op,

@@ -6996,10 +7020,10 @@ struct ggml_tensor* ggml_pool_2d(
         ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
         a->ne[2],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_POOL_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -7167,7 +7191,7 @@ struct ggml_tensor * ggml_win_part(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { npx, npy, w };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_WIN_PART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -7197,7 +7221,7 @@ struct ggml_tensor * ggml_win_unpart(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { w };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_WIN_UNPART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -7326,7 +7350,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
     return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
 }
 
-//
+// ggml_map_custom1_f32
 
 static struct ggml_tensor * ggml_map_custom1_impl_f32(
         struct ggml_context * ctx,

@@ -7343,7 +7367,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op =
+    result->op = GGML_OP_MAP_CUSTOM1_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
 

@@ -7364,7 +7388,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
     return ggml_map_custom1_impl_f32(ctx, a, fun, true);
 }
 
-//
+// ggml_map_custom2_f32
 
 static struct ggml_tensor * ggml_map_custom2_impl_f32(
         struct ggml_context * ctx,

@@ -7382,7 +7406,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op =
+    result->op = GGML_OP_MAP_CUSTOM2_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;

@@ -7406,7 +7430,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
     return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
 }
 
-//
+// ggml_map_custom3_f32
 
 static struct ggml_tensor * ggml_map_custom3_impl_f32(
         struct ggml_context * ctx,

@@ -7425,7 +7449,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op =
+    result->op = GGML_OP_MAP_CUSTOM3_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;

@@ -7452,6 +7476,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32(
     return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
 }
 
+// ggml_map_custom1
+struct ggml_map_custom1_op_params {
+    ggml_custom1_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom1_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom1_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM1;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom2
+
+struct ggml_map_custom2_op_params {
+    ggml_custom2_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom2_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom2_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM2;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom3
+
+struct ggml_map_custom3_op_params {
+    ggml_custom3_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom3_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad || c->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom3_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM3;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
+}
+
+
+
 // ggml_cross_entropy_loss
 
 struct ggml_tensor * ggml_cross_entropy_loss(
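Unlike the older *_f32 map ops, the new ggml_map_custom1/2/3 variants store the op in a ggml_map_custom*_op_params block and hand the callback a thread slot (ith of nth) plus a userdata pointer, so a custom op can run multithreaded. A minimal sketch of a callback, assuming contiguous f32 tensors (the function and variable names here are hypothetical, not from the diff):

    static void scale_op(struct ggml_tensor * dst, const struct ggml_tensor * src,
                         int ith, int nth, void * userdata) {
        const float factor = *(const float *) userdata;
        const int64_t n = ggml_nelements(dst);
        // each thread handles its own contiguous slice of dst
        const int64_t per = (n + nth - 1)/nth;
        const int64_t i0  = per*ith;
        const int64_t i1  = i0 + per < n ? i0 + per : n;
        const float * x = (const float *) src->data;
        float       * y = (float *)       dst->data;
        for (int64_t i = i0; i < i1; ++i) {
            y[i] = factor*x[i];
        }
    }

    // let the scheduler use every available thread for this op
    float factor = 2.0f;
    struct ggml_tensor * out = ggml_map_custom1(ctx, inp, scale_op, GGML_N_TASKS_MAX, &factor);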
@@ -9260,8 +9468,8 @@ static void ggml_compute_forward_sum_rows_f32(
     for (int64_t i3 = 0; i3 < ne03; i3++) {
         for (int64_t i2 = 0; i2 < ne02; i2++) {
             for (int64_t i1 = 0; i1 < ne01; i1++) {
-                float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
-                float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
+                float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
+                float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
                 float row_sum = 0;
                 ggml_vec_sum_f32(ne00, &row_sum, src_row);
                 dst_row[0] = row_sum;

@@ -10523,71 +10731,95 @@ static void ggml_compute_forward_mul_mat(
         return;
     }
 
-
-    const
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
 
-    const int64_t
-    const int64_t
+    const int64_t nr0 = ne01;           // src0 rows
+    const int64_t nr1 = ne11*ne12*ne13; // src1 rows
 
-    //
-    const int64_t nr1 = ne11*ne12*ne13;
+    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
-
-    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+    // distribute the thread work across the inner or outer loop based on which one is larger
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-    // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-    // the original src1 data pointer, so we should index using the indices directly
-    // TODO: this is a bit of a hack, we should probably have a better way to handle this
-    const char * src1_col = (const char *) wdata +
-        (src1_cont || src1->type != vec_dot_type
-         ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
-         : (i11*nb11 + i12*nb12 + i13*nb13));
-
-    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
-
-    for (int64_t ir = ir10; ir < ir11; ++ir) {
-        vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
-    }
+    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+
+    const int64_t ith0 = ith % nth0;
+    const int64_t ith1 = ith / nth0;
+
+    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+    const int64_t ir010 = dr0*ith0;
+    const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+    const int64_t ir110 = dr1*ith1;
+    const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+    // threads with no work simply yield (not sure if it helps)
+    if (ir010 >= ir011 || ir110 >= ir111) {
+        sched_yield();
+        return;
     }
 
-
-
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
 
-    //
-
-
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
 
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
 
-    //
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    float tmp[16];
+
+    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                const int64_t i13 = (ir1/(ne12*ne11));
+                const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
+                const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
 
+                // broadcast src0 into src1
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                // the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                     ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
+                     : (i11*nb11 + i12*nb12 + i13*nb13));
+
+                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                }
+                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_out_prod
 
 static void ggml_compute_forward_out_prod_f32(
         const struct ggml_compute_params * params,
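The reworked ggml_compute_forward_mul_mat replaces the old "parallelize over src0 rows only" scheme: threads are now arranged as an nth0 x nth1 grid over (src0 rows, src1 rows), whichever dimension is larger gets all the threads, and results are staged in a small tmp buffer to reduce false sharing. A worked sketch of the index math (the numbers are illustrative):

    // nth = 8 threads, nr0 = 4096 src0 rows, nr1 = 1 src1 row (mat-vec case)
    // nr0 > nr1         -> nth0 = 8, nth1 = 1
    // ith0 = ith % nth0 -> threads 0..7 split the 4096 src0 rows
    // ith1 = ith / nth0 -> always 0, the single src1 row is shared
    // dr0 = (4096 + 8 - 1)/8 = 512 -> thread i covers rows [512*i, 512*(i+1))
    // for a large batch (nr1 > nr0) the same code flips and splits src1 rows instead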
@@ -12871,7 +13103,7 @@ static void ggml_compute_forward_pool_1d(
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
 
-    const int32_t* opts = (const int32_t*)dst->op_params;
+    const int32_t * opts = (const int32_t *)dst->op_params;
     enum ggml_op_pool op = opts[0];
     const int k0 = opts[1];
     const int s0 = opts[2];

@@ -14204,24 +14436,6 @@ static void ggml_compute_forward_map_custom1_f32(
     fun(dst, a);
 }
 
-
-static void ggml_compute_forward_map_custom1(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        struct ggml_tensor * dst,
-        const ggml_custom1_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_map_custom2
 
 static void ggml_compute_forward_map_custom2_f32(

@@ -14240,24 +14454,6 @@ static void ggml_compute_forward_map_custom2_f32(
 }
 
 
-static void ggml_compute_forward_map_custom2(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        const struct ggml_tensor * b,
-        struct ggml_tensor * dst,
-        const ggml_custom2_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_map_custom3
 
 static void ggml_compute_forward_map_custom3_f32(

@@ -14276,24 +14472,52 @@ static void ggml_compute_forward_map_custom3_f32(
     fun(dst, a, b, c);
 }
 
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+              struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
+
+    p->fun(dst, a, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+              struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom3
 
 static void ggml_compute_forward_map_custom3(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * a,
         const struct ggml_tensor * b,
         const struct ggml_tensor * c,
-
-
-
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
+              struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
     }
+
+    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
 }
 
 // ggml_compute_forward_cross_entropy_loss

@@ -14815,25 +15039,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
             }
             break;
-        case
+        case GGML_OP_MAP_CUSTOM1_F32:
             {
                 ggml_custom1_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-
+                ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
             }
            break;
-        case
+        case GGML_OP_MAP_CUSTOM2_F32:
            {
                ggml_custom2_op_f32_t fun;
                memcpy(&fun, tensor->op_params, sizeof(fun));
-
+                ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
            }
            break;
-        case
+        case GGML_OP_MAP_CUSTOM3_F32:
            {
                ggml_custom3_op_f32_t fun;
                memcpy(&fun, tensor->op_params, sizeof(fun));
-
+                ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
            }
            break;
        case GGML_OP_CROSS_ENTROPY_LOSS:

@@ -15641,6 +15880,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
         } break;
     case GGML_OP_MAP_UNARY:
     case GGML_OP_MAP_BINARY:
+    case GGML_OP_MAP_CUSTOM1_F32:
+    case GGML_OP_MAP_CUSTOM2_F32:
+    case GGML_OP_MAP_CUSTOM3_F32:
     case GGML_OP_MAP_CUSTOM1:
     case GGML_OP_MAP_CUSTOM2:
     case GGML_OP_MAP_CUSTOM3:

@@ -16426,11 +16668,38 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_WIN_UNPART:
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
+            case GGML_OP_MAP_CUSTOM1_F32:
+            case GGML_OP_MAP_CUSTOM2_F32:
+            case GGML_OP_MAP_CUSTOM3_F32:
+                {
+                    n_tasks = 1;
+                } break;
             case GGML_OP_MAP_CUSTOM1:
+                {
+                    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
             case GGML_OP_MAP_CUSTOM2:
+                {
+                    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
             case GGML_OP_MAP_CUSTOM3:
                 {
-
+                    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
                 } break;
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {
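ggml_graph_plan now reads the per-op n_tasks back out of op_params: the legacy f32 map ops stay single-threaded, while the new custom ops run on min(n_tasks, n_threads) threads, or on all of them when GGML_N_TASKS_MAX was requested at build time. For example (my_op is a hypothetical ggml_custom2_op_t callback):

    t = ggml_map_custom2(ctx, a, b, my_op, 4, NULL);                // capped at 4 threads
    t = ggml_map_custom2(ctx, a, b, my_op, GGML_N_TASKS_MAX, NULL); // use the plan's full n_threads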