llama_cpp 0.3.6 → 0.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +8 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1165 -721
- data/ext/llama_cpp/src/ggml-metal.m +39 -18
- data/ext/llama_cpp/src/ggml.c +396 -150
- data/ext/llama_cpp/src/ggml.h +113 -32
- data/ext/llama_cpp/src/llama-util.h +41 -1
- data/ext/llama_cpp/src/llama.cpp +214 -146
- data/ext/llama_cpp/src/llama.h +18 -1
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
```diff
@@ -195,8 +195,8 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 #else
-inline static void* ggml_aligned_malloc(size_t size) {
-    void* aligned_memory = NULL;
+inline static void * ggml_aligned_malloc(size_t size) {
+    void * aligned_memory = NULL;
 #ifdef GGML_USE_METAL
     int result = posix_memalign(&aligned_memory, getpagesize(), size);
 #else
```
```diff
@@ -3811,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3883,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
```
```diff
@@ -4110,7 +4110,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     //
     // is enough, but just in case, adding the second part
 
-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+    return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
 }
 
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
```
```diff
@@ -4253,7 +4253,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
```
```diff
@@ -4602,7 +4602,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.ne =*/ { 1, 1, 1, 1 },
         /*.nb =*/ { 0, 0, 0, 0 },
         /*.op =*/ GGML_OP_NONE,
-        /*.op_params =*/ {0},
+        /*.op_params =*/ { 0 },
         /*.is_param =*/ false,
         /*.grad =*/ NULL,
         /*.src =*/ { NULL },
@@ -4634,6 +4634,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 }
 
 static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
     assert(params_size <= GGML_MAX_OP_PARAMS);
     memcpy(tensor->op_params, params, params_size);
 }
```
```diff
@@ -6439,7 +6440,7 @@ struct ggml_tensor * ggml_permute(
     result->src[0] = a;
 
     int32_t params[] = { axis0, axis1, axis2, axis3 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     return result;
 }
@@ -6565,7 +6566,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_DIAG_MASK_INF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6605,7 +6606,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_DIAG_MASK_ZERO;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
```
```diff
@@ -6721,9 +6722,9 @@ static struct ggml_tensor * ggml_rope_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[6] = { n_past, n_dims, mode, n_ctx };
-    memcpy(params + 4, &freq_base,  sizeof(float));
+    memcpy(params + 4, &freq_base, sizeof(float));
     memcpy(params + 5, &freq_scale, sizeof(float));
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6797,7 +6798,7 @@ struct ggml_tensor * ggml_rope_back(
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, n_dims, mode, n_ctx };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
```
```diff
@@ -6828,7 +6829,7 @@ struct ggml_tensor * ggml_alibi(
 
     int32_t op_params[3] = { n_past, n_head };
     memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result, &op_params, sizeof(op_params));
+    ggml_set_op_params(result, op_params, sizeof(op_params));
 
     result->op = GGML_OP_ALIBI;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6855,7 +6856,7 @@ struct ggml_tensor * ggml_clamp(
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
 
     float params[] = { min, max };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CLAMP;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
```
```diff
@@ -6890,10 +6891,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
         a->ne[2], 1, 1,
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CONV_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6905,10 +6906,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
 
 // ggml_conv_2d
 
-struct ggml_tensor* ggml_conv_2d(
-    struct ggml_context* ctx,
-    struct ggml_tensor* a,
-    struct ggml_tensor* b,
+struct ggml_tensor * ggml_conv_2d(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b,
     int s0,
     int s1,
     int p0,
@@ -6929,10 +6930,10 @@ struct ggml_tensor* ggml_conv_2d(
         ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
         a->ne[3], b->ne[3],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { s0, s1, p0, p1, d0, d1 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CONV_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
```
```diff
@@ -6945,7 +6946,7 @@ struct ggml_tensor* ggml_conv_2d(
 
 // ggml_conv_1d_ph
 
-struct ggml_tensor* ggml_conv_1d_ph(
+struct ggml_tensor * ggml_conv_1d_ph(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     struct ggml_tensor * b,
@@ -6963,7 +6964,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
 
 // ggml_pool_1d
 
-struct ggml_tensor* ggml_pool_1d(
+struct ggml_tensor * ggml_pool_1d(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     enum ggml_op_pool op,
@@ -6982,10 +6983,10 @@ struct ggml_tensor* ggml_pool_1d(
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         a->ne[1],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     int32_t params[] = { op, k0, s0, p0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6996,7 +6997,7 @@ struct ggml_tensor* ggml_pool_1d(
 
 // ggml_pool_2d
 
-struct ggml_tensor* ggml_pool_2d(
+struct ggml_tensor * ggml_pool_2d(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     enum ggml_op_pool op,
@@ -7019,10 +7020,10 @@ struct ggml_tensor* ggml_pool_2d(
         ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
         a->ne[2],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
```
```diff
@@ -7190,7 +7191,7 @@ struct ggml_tensor * ggml_win_part(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { npx, npy, w };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_WIN_PART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7220,7 +7221,7 @@ struct ggml_tensor * ggml_win_unpart(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { w };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_WIN_UNPART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
```
```diff
@@ -7349,7 +7350,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
     return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
 }
 
-// ggml_map_custom1
+// ggml_map_custom1_f32
 
 static struct ggml_tensor * ggml_map_custom1_impl_f32(
         struct ggml_context * ctx,
@@ -7366,7 +7367,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM1;
+    result->op = GGML_OP_MAP_CUSTOM1_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
 
@@ -7387,7 +7388,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
     return ggml_map_custom1_impl_f32(ctx, a, fun, true);
 }
 
-// ggml_map_custom2
+// ggml_map_custom2_f32
 
 static struct ggml_tensor * ggml_map_custom2_impl_f32(
         struct ggml_context * ctx,
@@ -7405,7 +7406,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM2;
+    result->op = GGML_OP_MAP_CUSTOM2_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
@@ -7429,7 +7430,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
     return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
 }
 
-// ggml_map_custom3
+// ggml_map_custom3_f32
 
 static struct ggml_tensor * ggml_map_custom3_impl_f32(
         struct ggml_context * ctx,
@@ -7448,7 +7449,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM3;
+    result->op = GGML_OP_MAP_CUSTOM3_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
```
```diff
@@ -7475,6 +7476,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32(
     return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
 }
 
+// ggml_map_custom1
+struct ggml_map_custom1_op_params {
+    ggml_custom1_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom1_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom1_op_params params = {
+        /*.fun =*/ fun,
+        /*.n_tasks =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM1;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom2
+
+struct ggml_map_custom2_op_params {
+    ggml_custom2_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom2_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom2_op_params params = {
+        /*.fun =*/ fun,
+        /*.n_tasks =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM2;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom3
+
+struct ggml_map_custom3_op_params {
+    ggml_custom3_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom3_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad || c->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom3_op_params params = {
+        /*.fun =*/ fun,
+        /*.n_tasks =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM3;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
+}
+
+
 // ggml_cross_entropy_loss
 
 struct ggml_tensor * ggml_cross_entropy_loss(
```
```diff
@@ -9283,8 +9468,8 @@ static void ggml_compute_forward_sum_rows_f32(
     for (int64_t i3 = 0; i3 < ne03; i3++) {
         for (int64_t i2 = 0; i2 < ne02; i2++) {
             for (int64_t i1 = 0; i1 < ne01; i1++) {
-                float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
-                float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
+                float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
+                float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
                 float row_sum = 0;
                 ggml_vec_sum_f32(ne00, &row_sum, src_row);
                 dst_row[0] = row_sum;
```
```diff
@@ -10546,71 +10731,95 @@ static void ggml_compute_forward_mul_mat(
         return;
     }
 
-    // parallelize by src0 rows
-    const int64_t dr = (ne01 + nth - 1)/nth;
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+
+    const int64_t nr0 = ne01; // src0 rows
+    const int64_t nr1 = ne11*ne12*ne13; // src1 rows
 
-    const int64_t ir10 = dr*ith;
-    const int64_t ir11 = MIN(ir10 + dr, ne01);
+    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
-    // src1 rows
-    const int64_t nr1 = ne11*ne12*ne13;
+    // distribute the thread work across the inner or outer loop based on which one is larger
 
-    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
 
-    for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
-        const int64_t i13 = (ir1/(ne12*ne11));
-        const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
-        const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
-        ...
-        const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
-
-        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-        // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-        // the original src1 data pointer, so we should index using the indices directly
-        // TODO: this is a bit of a hack, we should probably have a better way to handle this
-        const char * src1_col = (const char *) wdata +
-            (src1_cont || src1->type != vec_dot_type
-            ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
-            : (i11*nb11 + i12*nb12 + i13*nb13));
-
-        float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
-
-        for (int64_t ir = ir10; ir < ir11; ++ir) {
-            vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
-        }
+    const int64_t ith0 = ith % nth0;
+    const int64_t ith1 = ith / nth0;
+
+    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+    const int64_t ir010 = dr0*ith0;
+    const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+    const int64_t ir110 = dr1*ith1;
+    const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+    // threads with no work simply yield (not sure if it helps)
+    if (ir010 >= ir011 || ir110 >= ir111) {
+        sched_yield();
+        return;
     }
 
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
-    //}
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
 
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
 
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
 
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    float tmp[16];
 
+    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                const int64_t i13 = (ir1/(ne12*ne11));
+                const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
+                const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                // the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                    ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
+                    : (i11*nb11 + i12*nb12 + i13*nb13));
+
+                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                }
+                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+            }
+        }
+    }
+
+// ggml_compute_forward_out_prod
 
 static void ggml_compute_forward_out_prod_f32(
     const struct ggml_compute_params * params,
```
```diff
@@ -12894,7 +13103,7 @@ static void ggml_compute_forward_pool_1d(
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
 
-    const int32_t* opts = (const int32_t*)dst->op_params;
+    const int32_t * opts = (const int32_t *)dst->op_params;
     enum ggml_op_pool op = opts[0];
     const int k0 = opts[1];
     const int s0 = opts[2];
```
```diff
@@ -14227,24 +14436,6 @@ static void ggml_compute_forward_map_custom1_f32(
     fun(dst, a);
 }
 
-
-static void ggml_compute_forward_map_custom1(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        struct ggml_tensor * dst,
-        const ggml_custom1_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_map_custom2
 
 static void ggml_compute_forward_map_custom2_f32(
@@ -14263,24 +14454,6 @@ static void ggml_compute_forward_map_custom2_f32(
 }
 
 
-static void ggml_compute_forward_map_custom2(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        const struct ggml_tensor * b,
-        struct ggml_tensor * dst,
-        const ggml_custom2_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_map_custom3
 
 static void ggml_compute_forward_map_custom3_f32(
```
```diff
@@ -14299,24 +14472,52 @@ static void ggml_compute_forward_map_custom3_f32(
     fun(dst, a, b, c);
 }
 
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
+
+    p->fun(dst, a, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom3
 
 
 static void ggml_compute_forward_map_custom3(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * a,
         const struct ggml_tensor * b,
         const struct ggml_tensor * c,
-        struct ggml_tensor * dst,
-        const ggml_custom3_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
     }
+
+    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
 }
 
 // ggml_compute_forward_cross_entropy_loss
```
```diff
@@ -14838,25 +15039,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
                 ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
             }
             break;
-        case GGML_OP_MAP_CUSTOM1:
+        case GGML_OP_MAP_CUSTOM1_F32:
             {
                 ggml_custom1_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
+                ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
             }
             break;
-        case GGML_OP_MAP_CUSTOM2:
+        case GGML_OP_MAP_CUSTOM2_F32:
             {
                 ggml_custom2_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
+                ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
             }
             break;
-        case GGML_OP_MAP_CUSTOM3:
+        case GGML_OP_MAP_CUSTOM3_F32:
             {
                 ggml_custom3_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+                ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
             }
             break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
```
```diff
@@ -15664,6 +15880,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
         } break;
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1_F32:
+        case GGML_OP_MAP_CUSTOM2_F32:
+        case GGML_OP_MAP_CUSTOM3_F32:
         case GGML_OP_MAP_CUSTOM1:
         case GGML_OP_MAP_CUSTOM2:
         case GGML_OP_MAP_CUSTOM3:
```
```diff
@@ -16449,11 +16668,38 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_WIN_UNPART:
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
+            case GGML_OP_MAP_CUSTOM1_F32:
+            case GGML_OP_MAP_CUSTOM2_F32:
+            case GGML_OP_MAP_CUSTOM3_F32:
+                {
+                    n_tasks = 1;
+                } break;
             case GGML_OP_MAP_CUSTOM1:
+                {
+                    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
             case GGML_OP_MAP_CUSTOM2:
+                {
+                    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
             case GGML_OP_MAP_CUSTOM3:
                 {
-                    n_tasks = 1;
+                    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
                 } break;
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {
```