llama_cpp 0.3.6 → 0.3.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +8 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1165 -721
- data/ext/llama_cpp/src/ggml-metal.m +39 -18
- data/ext/llama_cpp/src/ggml.c +396 -150
- data/ext/llama_cpp/src/ggml.h +113 -32
- data/ext/llama_cpp/src/llama-util.h +41 -1
- data/ext/llama_cpp/src/llama.cpp +214 -146
- data/ext/llama_cpp/src/llama.h +18 -1
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -195,8 +195,8 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 #else
-inline static void* ggml_aligned_malloc(size_t size) {
-    void* aligned_memory = NULL;
+inline static void * ggml_aligned_malloc(size_t size) {
+    void * aligned_memory = NULL;
 #ifdef GGML_USE_METAL
     int result = posix_memalign(&aligned_memory, getpagesize(), size);
 #else
@@ -3811,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3883,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4110,7 +4110,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     //
     // is enough, but just in case, adding the second part
 
-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+    return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
 }
 
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
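The ggml_nbytes change above wraps the old size computation in GGML_PAD(..., GGML_MEM_ALIGN), so the reported tensor size is rounded up to the alignment boundary. A small standalone sketch of that round-up (pad_to is an illustrative stand-in, not the real GGML_PAD macro):

#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-in for GGML_PAD(x, n): round x up to the next multiple of n.
 * n is assumed to be a power of two, as the alignment constant is. */
static size_t pad_to(size_t x, size_t n) {
    return (x + n - 1) & ~(n - 1);
}

int main(void) {
    const size_t mem_align  = 16;                 /* GGML_MEM_ALIGN is typically 16 */
    const size_t raw_nbytes = 6 * sizeof(float);  /* e.g. a 6-element F32 tensor: 24 bytes */
    printf("raw = %zu, padded = %zu\n", raw_nbytes, pad_to(raw_nbytes, mem_align)); /* 24 -> 32 */
    return 0;
}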
@@ -4253,7 +4253,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-
+bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -4602,7 +4602,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.ne =*/ { 1, 1, 1, 1 },
         /*.nb =*/ { 0, 0, 0, 0 },
         /*.op =*/ GGML_OP_NONE,
-        /*.op_params =*/ {0},
+        /*.op_params =*/ { 0 },
         /*.is_param =*/ false,
         /*.grad =*/ NULL,
         /*.src =*/ { NULL },
@@ -4634,6 +4634,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 }
 
 static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
     assert(params_size <= GGML_MAX_OP_PARAMS);
     memcpy(tensor->op_params, params, params_size);
 }
@@ -6439,7 +6440,7 @@ struct ggml_tensor * ggml_permute(
     result->src[0] = a;
 
     int32_t params[] = { axis0, axis1, axis2, axis3 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     return result;
 }
@@ -6565,7 +6566,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_DIAG_MASK_INF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6605,7 +6606,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_DIAG_MASK_ZERO;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6721,9 +6722,9 @@ static struct ggml_tensor * ggml_rope_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[6] = { n_past, n_dims, mode, n_ctx };
-    memcpy(params + 4, &freq_base,
+    memcpy(params + 4, &freq_base, sizeof(float));
     memcpy(params + 5, &freq_scale, sizeof(float));
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6797,7 +6798,7 @@ struct ggml_tensor * ggml_rope_back(
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, n_dims, mode, n_ctx };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6828,7 +6829,7 @@ struct ggml_tensor * ggml_alibi(
 
     int32_t op_params[3] = { n_past, n_head };
     memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, op_params, sizeof(op_params));
 
     result->op = GGML_OP_ALIBI;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6855,7 +6856,7 @@ struct ggml_tensor * ggml_clamp(
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
 
     float params[] = { min, max };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CLAMP;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6890,10 +6891,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
         a->ne[2], 1, 1,
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CONV_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6905,10 +6906,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
 
 // ggml_conv_2d
 
-struct ggml_tensor* ggml_conv_2d(
-    struct ggml_context* ctx,
-    struct ggml_tensor
-    struct ggml_tensor
+struct ggml_tensor * ggml_conv_2d(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b,
     int s0,
     int s1,
     int p0,
@@ -6929,10 +6930,10 @@ struct ggml_tensor* ggml_conv_2d(
         ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
         a->ne[3], b->ne[3],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { s0, s1, p0, p1, d0, d1 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CONV_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6945,7 +6946,7 @@ struct ggml_tensor* ggml_conv_2d(
 
 // ggml_conv_1d_ph
 
-struct ggml_tensor* ggml_conv_1d_ph(
+struct ggml_tensor * ggml_conv_1d_ph(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     struct ggml_tensor * b,
@@ -6963,7 +6964,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
 
 // ggml_pool_1d
 
-struct ggml_tensor* ggml_pool_1d(
+struct ggml_tensor * ggml_pool_1d(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     enum ggml_op_pool op,
@@ -6982,10 +6983,10 @@ struct ggml_tensor* ggml_pool_1d(
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         a->ne[1],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     int32_t params[] = { op, k0, s0, p0 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6996,7 +6997,7 @@ struct ggml_tensor* ggml_pool_1d(
 
 // ggml_pool_2d
 
-struct ggml_tensor* ggml_pool_2d(
+struct ggml_tensor * ggml_pool_2d(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
    enum ggml_op_pool op,
@@ -7019,10 +7020,10 @@ struct ggml_tensor* ggml_pool_2d(
         ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
         a->ne[2],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7190,7 +7191,7 @@ struct ggml_tensor * ggml_win_part(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { npx, npy, w };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_WIN_PART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7220,7 +7221,7 @@ struct ggml_tensor * ggml_win_unpart(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { w };
-    ggml_set_op_params(result,
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_WIN_UNPART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7349,7 +7350,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
     return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
 }
 
-//
+// ggml_map_custom1_f32
 
 static struct ggml_tensor * ggml_map_custom1_impl_f32(
         struct ggml_context * ctx,
@@ -7366,7 +7367,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op =
+    result->op = GGML_OP_MAP_CUSTOM1_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
 
@@ -7387,7 +7388,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
     return ggml_map_custom1_impl_f32(ctx, a, fun, true);
 }
 
-//
+// ggml_map_custom2_f32
 
 static struct ggml_tensor * ggml_map_custom2_impl_f32(
         struct ggml_context * ctx,
@@ -7405,7 +7406,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op =
+    result->op = GGML_OP_MAP_CUSTOM2_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
@@ -7429,7 +7430,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
     return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
 }
 
-//
+// ggml_map_custom3_f32
 
 static struct ggml_tensor * ggml_map_custom3_impl_f32(
         struct ggml_context * ctx,
@@ -7448,7 +7449,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op =
+    result->op = GGML_OP_MAP_CUSTOM3_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
@@ -7475,6 +7476,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32(
     return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
 }
 
+// ggml_map_custom1
+struct ggml_map_custom1_op_params {
+    ggml_custom1_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom1_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom1_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM1;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom2
+
+struct ggml_map_custom2_op_params {
+    ggml_custom2_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom2_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom2_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM2;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom3
+
+struct ggml_map_custom3_op_params {
+    ggml_custom3_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom3_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad || c->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom3_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM3;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
+}
+
+
+
 // ggml_cross_entropy_loss
 
 struct ggml_tensor * ggml_cross_entropy_loss(
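The ggml_map_custom1/2/3 family added above takes a callback plus an n_tasks count and an opaque userdata pointer; the forward pass later in this diff invokes the callback as fun(dst, a, ith, nth, userdata). A minimal usage sketch built on what this diff shows; the scale factor and the assumption of a contiguous F32 tensor are illustrative, not part of the gem:

/* Hedged sketch of the new ggml_map_custom1 API defined above. The callback
 * signature mirrors how ggml_compute_forward_map_custom1 invokes it
 * (dst, a, ith, nth, userdata). */
#include "ggml.h"

static void scale_op(struct ggml_tensor * dst, const struct ggml_tensor * a,
                     int ith, int nth, void * userdata) {
    const float scale = *(const float *) userdata;

    /* split the flat element range across the nth threads; this thread is ith */
    const int64_t n   = ggml_nelements(dst);
    const int64_t per = (n + nth - 1) / nth;
    const int64_t i0  = per * ith;
    const int64_t i1  = i0 + per < n ? i0 + per : n;

    const float * src = (const float *) a->data;
    float       * out = (float *) dst->data;
    for (int64_t i = i0; i < i1; ++i) {
        out[i] = scale * src[i];
    }
}

/* Building the graph node (assumes an existing ggml_context * ctx and tensor * x):
 *
 *     static float scale = 2.0f;
 *     struct ggml_tensor * y = ggml_map_custom1(ctx, x, scale_op, GGML_N_TASKS_MAX, &scale);
 *
 * GGML_N_TASKS_MAX requests every available thread; a positive n_tasks is
 * clamped to the thread count in the ggml_graph_plan hunk further down. */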
@@ -9283,8 +9468,8 @@ static void ggml_compute_forward_sum_rows_f32(
     for (int64_t i3 = 0; i3 < ne03; i3++) {
         for (int64_t i2 = 0; i2 < ne02; i2++) {
             for (int64_t i1 = 0; i1 < ne01; i1++) {
-                float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
-                float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
+                float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
+                float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
                 float row_sum = 0;
                 ggml_vec_sum_f32(ne00, &row_sum, src_row);
                 dst_row[0] = row_sum;
@@ -10546,71 +10731,95 @@ static void ggml_compute_forward_mul_mat(
         return;
     }
 
-
-    const
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+
+    const int64_t nr0 = ne01;           // src0 rows
+    const int64_t nr1 = ne11*ne12*ne13; // src1 rows
 
-
-    const int64_t ir11 = MIN(ir10 + dr, ne01);
+    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
-    //
-    const int64_t nr1 = ne11*ne12*ne13;
+    // distribute the thread work across the inner or outer loop based on which one is larger
 
-    const
-    const
+    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
-
-                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-                // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-                // the original src1 data pointer, so we should index using the indices directly
-                // TODO: this is a bit of a hack, we should probably have a better way to handle this
-                const char * src1_col = (const char *) wdata +
-                    (src1_cont || src1->type != vec_dot_type
-                    ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
-                    : (i11*nb11 + i12*nb12 + i13*nb13));
-
-                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
-
-                for (int64_t ir = ir10; ir < ir11; ++ir) {
-                    vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
-                }
+    const int64_t ith0 = ith % nth0;
+    const int64_t ith1 = ith / nth0;
+
+    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+    const int64_t ir010 = dr0*ith0;
+    const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+    const int64_t ir110 = dr1*ith1;
+    const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+    // threads with no work simply yield (not sure if it helps)
+    if (ir010 >= ir011 || ir110 >= ir111) {
+        sched_yield();
+        return;
     }
 
-
-
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
 
-    //
-
-
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
 
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
 
-    //
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    float tmp[16];
 
+    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                const int64_t i13 = (ir1/(ne12*ne11));
+                const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
+                const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                // the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                    ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
+                    : (i11*nb11 + i12*nb12 + i13*nb13));
+
+                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                }
+                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_out_prod
 
 static void ggml_compute_forward_out_prod_f32(
         const struct ggml_compute_params * params,
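The rewritten ggml_compute_forward_mul_mat above partitions work over a small 2-D grid of threads: whichever of the two row counts (src0 rows nr0 vs. src1 rows nr1 = ne11*ne12*ne13) is larger gets all nth threads, and each thread then owns a [ir010, ir011) x [ir110, ir111) tile of rows. A standalone sketch of just that index arithmetic, with made-up sizes:

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Standalone sketch of the thread-partitioning arithmetic in the rewritten
 * ggml_compute_forward_mul_mat: the dimension with more rows gets all the
 * threads, and each thread receives one tile of src0 rows x src1 rows.
 * The sizes below are illustrative only. */
int main(void) {
    const int64_t nr0 = 4096; /* src0 rows (ne01)           */
    const int64_t nr1 = 8;    /* src1 rows (ne11*ne12*ne13) */
    const int     nth = 4;    /* total threads              */

    const int64_t nth0 = nr0 > nr1 ? nth : 1; /* threads along src0 rows */
    const int64_t nth1 = nr0 > nr1 ? 1 : nth; /* threads along src1 rows */

    for (int ith = 0; ith < nth; ++ith) {
        const int64_t ith0 = ith % nth0;
        const int64_t ith1 = ith / nth0;

        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;

        const int64_t ir010 = dr0*ith0, ir011 = MIN(ir010 + dr0, nr0);
        const int64_t ir110 = dr1*ith1, ir111 = MIN(ir110 + dr1, nr1);

        printf("thread %d: src0 rows [%lld, %lld), src1 rows [%lld, %lld)\n",
               ith, (long long) ir010, (long long) ir011,
               (long long) ir110, (long long) ir111);
    }
    return 0;
}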
@@ -12894,7 +13103,7 @@ static void ggml_compute_forward_pool_1d(
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
 
-    const int32_t* opts = (const int32_t*)dst->op_params;
+    const int32_t * opts = (const int32_t *)dst->op_params;
     enum ggml_op_pool op = opts[0];
     const int k0 = opts[1];
     const int s0 = opts[2];
@@ -14227,24 +14436,6 @@ static void ggml_compute_forward_map_custom1_f32(
     fun(dst, a);
 }
 
-
-static void ggml_compute_forward_map_custom1(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        struct ggml_tensor * dst,
-        const ggml_custom1_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_map_custom2
 
 static void ggml_compute_forward_map_custom2_f32(
@@ -14263,24 +14454,6 @@ static void ggml_compute_forward_map_custom2_f32(
 }
 
 
-static void ggml_compute_forward_map_custom2(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        const struct ggml_tensor * b,
-        struct ggml_tensor * dst,
-        const ggml_custom2_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_map_custom3
 
 static void ggml_compute_forward_map_custom3_f32(
@@ -14299,24 +14472,52 @@ static void ggml_compute_forward_map_custom3_f32(
     fun(dst, a, b, c);
 }
 
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
+
+    p->fun(dst, a, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom3
 
 static void ggml_compute_forward_map_custom3(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * a,
         const struct ggml_tensor * b,
         const struct ggml_tensor * c,
-
-
-
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
     }
+
+    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
 }
 
 // ggml_compute_forward_cross_entropy_loss
@@ -14838,25 +15039,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
             }
             break;
-        case
+        case GGML_OP_MAP_CUSTOM1_F32:
             {
                 ggml_custom1_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-
+                ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
             }
             break;
-        case
+        case GGML_OP_MAP_CUSTOM2_F32:
            {
                 ggml_custom2_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-
+                ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
             }
             break;
-        case
+        case GGML_OP_MAP_CUSTOM3_F32:
            {
                 ggml_custom3_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-
+                ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
            }
            break;
        case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -15664,6 +15880,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1_F32:
+        case GGML_OP_MAP_CUSTOM2_F32:
+        case GGML_OP_MAP_CUSTOM3_F32:
         case GGML_OP_MAP_CUSTOM1:
         case GGML_OP_MAP_CUSTOM2:
         case GGML_OP_MAP_CUSTOM3:
@@ -16449,11 +16668,38 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_WIN_UNPART:
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
+            case GGML_OP_MAP_CUSTOM1_F32:
+            case GGML_OP_MAP_CUSTOM2_F32:
+            case GGML_OP_MAP_CUSTOM3_F32:
+                {
+                    n_tasks = 1;
+                } break;
             case GGML_OP_MAP_CUSTOM1:
+                {
+                    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
             case GGML_OP_MAP_CUSTOM2:
+                {
+                    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
             case GGML_OP_MAP_CUSTOM3:
                 {
-
+                    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
                 } break;
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {