llama_cpp 0.10.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -0
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +691 -93
- data/ext/llama_cpp/src/ggml-metal.m +535 -54
- data/ext/llama_cpp/src/ggml-metal.metal +1497 -169
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +325 -159
- data/ext/llama_cpp/src/ggml.h +34 -13
- data/ext/llama_cpp/src/llama.cpp +195 -35
- data/ext/llama_cpp/src/llama.h +1 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -1,4 +1,4 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe"
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #define _USE_MATH_DEFINES // For M_PI on MSVC

 #include "ggml-impl.h"
@@ -33,7 +33,7 @@
 // we should just be careful :)
 #pragma warning(disable: 4244 4267)

-// disable POSIX deprecation
+// disable POSIX deprecation warnings
 // these functions are never going away, anyway
 #pragma warning(disable: 4996)
 #endif
@@ -1395,7 +1395,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
-inline static void
+inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }

 static const float GELU_COEF_A = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@@ -1623,7 +1623,9 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
 "POOL_1D",
 "POOL_2D",
 "UPSCALE",
+"PAD",
 "ARGSORT",
+"LEAKY_RELU",

 "FLASH_ATTN",
 "FLASH_FF",
@@ -1650,7 +1652,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
 "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 "none",
@@ -1707,7 +1709,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 "pool_1d(x)",
 "pool_2d(x)",
 "upscale(x)",
+"pad(x)",
 "argsort(x)",
+"leaky_relu(x)",

 "flash_attn(x)",
 "flash_ff(x)",
@@ -1734,7 +1738,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -1750,17 +1754,16 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
 "GELU",
 "GELU_QUICK",
 "SILU",
-"LEAKY",
 };

-static_assert(GGML_UNARY_OP_COUNT ==
+static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");


 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");

 // WARN:
-// Mis-
+// Mis-configuration can lead to problem that's hard to reason about:
 // * At best it crash or talks nosense.
 // * At worst it talks slightly difference but hard to perceive.
 //
@@ -1994,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
 return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }

-size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
-static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
-}
-
 int ggml_blck_size(enum ggml_type type) {
 return type_traits[type].blck_size;
 }
@@ -2008,8 +2005,13 @@ size_t ggml_type_size(enum ggml_type type) {
 return type_traits[type].type_size;
 }

-
-
+size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+assert(ne % ggml_blck_size(type) == 0);
+return ggml_type_size(type)*ne/ggml_blck_size(type);
+}
+
+double ggml_type_sizef(enum ggml_type type) {
+return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }

 const char * ggml_type_name(enum ggml_type type) {
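
The new `ggml_row_size(type, ne)` helper added above replaces the hand-rolled `ggml_type_size(type)*ne/ggml_blck_size(type)` arithmetic used in the rest of this diff. A minimal sketch of the intended call pattern, assuming a caller that sizes a scratch buffer for block-quantized rows (the function name and shapes are illustrative, not taken from the gem):

```c
#include "ggml.h"

// Hedged sketch: bytes needed for n_rows rows of n_per_row elements of `type`.
// ggml_row_size asserts that n_per_row is a multiple of the type's block size,
// which is why the call sites below pass the row length (ne10, ne[0], ...) directly.
static size_t rows_nbytes(enum ggml_type type, int64_t n_per_row, int64_t n_rows) {
    return (size_t) n_rows * ggml_row_size(type, n_per_row);
}
```
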
@@ -2046,24 +2048,37 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) {
 return ggml_type_size(tensor->type);
 }

-
+bool ggml_is_scalar(const struct ggml_tensor * tensor) {
 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

 return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }

-
+bool ggml_is_vector(const struct ggml_tensor * tensor) {
 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

 return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }

-
+bool ggml_is_matrix(const struct ggml_tensor * tensor) {
 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

 return tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }

+bool ggml_is_3d(const struct ggml_tensor * tensor) {
+return tensor->ne[3] == 1;
+}
+
+int ggml_n_dims(const struct ggml_tensor * tensor) {
+for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
+if (tensor->ne[i] > 1) {
+return i + 1;
+}
+}
+return 1;
+}
+
 static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

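
Because `struct ggml_tensor` loses its `n_dims` field later in this diff, dimensionality is now derived from `ne[]` through the helpers above. A hedged migration sketch (the `describe` function and its output format are made up for illustration):

```c
#include <stdio.h>
#include "ggml.h"

// Hypothetical replacement for code that used to read tensor->n_dims.
static void describe(const struct ggml_tensor * t) {
    const int n_dims = ggml_n_dims(t);   // trailing ne[i] == 1 dimensions are ignored
    if (ggml_is_matrix(t)) {             // ne[2] == 1 && ne[3] == 1
        printf("%s: %d-d, %lld x %lld\n", t->name, n_dims,
               (long long) t->ne[0], (long long) t->ne[1]);
    } else {
        printf("%s: %d-d\n", t->name, n_dims);
    }
}
```
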
@@ -2470,7 +2485,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 view_src = view_src->view_src;
 }

-size_t data_size =
+size_t data_size = ggml_row_size(type, ne[0]);
 for (int i = 1; i < n_dims; i++) {
 data_size *= ne[i];
 }
@@ -2513,7 +2528,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 /*.type =*/ type,
 /*.backend =*/ GGML_BACKEND_CPU,
 /*.buffer =*/ NULL,
-/*.n_dims =*/ n_dims,
 /*.ne =*/ { 1, 1, 1, 1 },
 /*.nb =*/ { 0, 0, 0, 0 },
 /*.op =*/ GGML_OP_NONE,
@@ -2620,7 +2634,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
 }

 struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
-return ggml_new_tensor(ctx, src->type,
+return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
 }

 static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
@@ -3069,7 +3083,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
 struct ggml_tensor * ggml_view_tensor(
 struct ggml_context * ctx,
 struct ggml_tensor * src) {
-struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type,
+struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
 ggml_format_name(result, "%s (view)", src->name);

 for (int i = 0; i < GGML_MAX_DIMS; i++) {
@@ -3227,10 +3241,10 @@ static struct ggml_tensor * ggml_add_cast_impl(
 is_node = true;
 }

-struct ggml_tensor * result = ggml_new_tensor(ctx, type,
+struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);

 result->op = GGML_OP_ADD;
-result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32,
+result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
 result->src[0] = a;
 result->src[1] = b;

@@ -3599,12 +3613,12 @@ struct ggml_tensor * ggml_sum_rows(
 is_node = true;
 }

-int64_t ne[
-for (int i=1; i<
+int64_t ne[GGML_MAX_DIMS] = { 1 };
+for (int i = 1; i < GGML_MAX_DIMS; ++i) {
 ne[i] = a->ne[i];
 }

-struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);

 result->op = GGML_OP_SUM_ROWS;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3625,8 +3639,8 @@ struct ggml_tensor * ggml_mean(
 is_node = true;
 }

-int64_t ne[
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

 result->op = GGML_OP_MEAN;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3648,8 +3662,7 @@ struct ggml_tensor * ggml_argmax(
 is_node = true;
 }

-
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
+struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);

 result->op = GGML_OP_ARGMAX;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3672,7 +3685,7 @@ struct ggml_tensor * ggml_repeat(
 is_node = true;
 }

-struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);

 result->op = GGML_OP_REPEAT;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3699,7 +3712,7 @@ struct ggml_tensor * ggml_repeat_back(
 return a;
 }

-struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);

 result->op = GGML_OP_REPEAT_BACK;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3830,12 +3843,25 @@ struct ggml_tensor * ggml_relu_inplace(
 return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
 }

-//
+// ggml_leaky_relu

-struct ggml_tensor *
+struct ggml_tensor * ggml_leaky_relu(
 struct ggml_context * ctx,
-struct ggml_tensor * a) {
-
+struct ggml_tensor * a, float negative_slope, bool inplace) {
+bool is_node = false;
+
+if (!inplace && (a->grad)) {
+is_node = true;
+}
+
+struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
+
+result->op = GGML_OP_LEAKY_RELU;
+result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+result->src[0] = a;
+
+return result;
 }

 // ggml_gelu
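
Leaky ReLU is no longer a `GGML_UNARY_OP_LEAKY` unary; it becomes a dedicated `GGML_OP_LEAKY_RELU` op whose negative slope is stored in `op_params`. A minimal sketch of the new call site, assuming an existing context `ctx` and input tensor `x`:

```c
#include "ggml.h"

// Hedged sketch: build a leaky-relu node with slope 0.1, out of place.
// The slope travels with the node via ggml_set_op_params (see the hunk above).
static struct ggml_tensor * leaky_example(struct ggml_context * ctx, struct ggml_tensor * x) {
    return ggml_leaky_relu(ctx, x, 0.1f, /*inplace=*/false);
}
```
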
@@ -4022,8 +4048,9 @@ static struct ggml_tensor * ggml_group_norm_impl(

 struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

-result->op = GGML_OP_GROUP_NORM;
 result->op_params[0] = n_groups;
+
+result->op = GGML_OP_GROUP_NORM;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 result->src[0] = a;
 result->src[1] = NULL; // TODO: maybe store epsilon here?
@@ -4061,7 +4088,7 @@ struct ggml_tensor * ggml_mul_mat(
 }

 const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

 result->op = GGML_OP_MUL_MAT;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4075,17 +4102,18 @@ struct ggml_tensor * ggml_mul_mat(

 struct ggml_tensor * ggml_mul_mat_id(
 struct ggml_context * ctx,
-struct ggml_tensor * as[],
+struct ggml_tensor * const as[],
+int n_as,
 struct ggml_tensor * ids,
 int id,
 struct ggml_tensor * b) {

-int64_t n_as = ids->ne[0];
-
 GGML_ASSERT(ids->type == GGML_TYPE_I32);
-GGML_ASSERT(
+GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
+GGML_ASSERT(ids->ne[1] == b->ne[1]);
+GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
 GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
-GGML_ASSERT(id >= 0 && id <
+GGML_ASSERT(id >= 0 && id < ids->ne[0]);

 bool is_node = false;

@@ -4094,16 +4122,17 @@ struct ggml_tensor * ggml_mul_mat_id(
 }

 const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

 ggml_set_op_params_i32(result, 0, id);
+ggml_set_op_params_i32(result, 1, n_as);

 result->op = GGML_OP_MUL_MAT_ID;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 result->src[0] = ids;
 result->src[1] = b;

-for (
+for (int i = 0; i < n_as; i++) {
 struct ggml_tensor * a = as[i];
 GGML_ASSERT(ggml_are_same_shape(as[0], a));
 GGML_ASSERT(ggml_can_mul_mat(a, b));
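
`ggml_mul_mat_id` now takes the number of expert matrices explicitly (`n_as`) instead of deriving it from `ids->ne[0]`, and `ids` is expected to provide one row id per column of `b`. A hedged sketch of the updated call shape (the wrapper and its names are illustrative):

```c
#include "ggml.h"

// Hedged sketch of the new signature: experts[] holds n_experts same-shape
// matrices, ids is GGML_TYPE_I32 with ids->ne[1] == b->ne[1], and `id`
// selects which entry along ids->ne[0] to use for each column of b.
static struct ggml_tensor * moe_matmul(struct ggml_context * ctx,
                                       struct ggml_tensor * const experts[], int n_experts,
                                       struct ggml_tensor * ids,
                                       struct ggml_tensor * b) {
    return ggml_mul_mat_id(ctx, experts, n_experts, ids, /*id=*/0, b);
}
```
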
@@ -4131,7 +4160,7 @@ struct ggml_tensor * ggml_out_prod(

 // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
 const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

 result->op = GGML_OP_OUT_PROD;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4416,7 +4445,7 @@ struct ggml_tensor * ggml_reshape(
 //GGML_ASSERT(false);
 }

-struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type,
+struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
 ggml_format_name(result, "%s (reshaped)", a->name);

 result->op = GGML_OP_RESHAPE;
@@ -4731,7 +4760,9 @@ struct ggml_tensor * ggml_get_rows(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
 struct ggml_tensor * b) {
-GGML_ASSERT(
+GGML_ASSERT(a->ne[2] == b->ne[1]);
+GGML_ASSERT(b->ne[3] == 1);
+GGML_ASSERT(b->type == GGML_TYPE_I32);

 bool is_node = false;

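
The relaxed asserts above let `ggml_get_rows` take a batched index tensor: `b` can now be 2-D or 3-D `GGML_TYPE_I32` as long as `a->ne[2] == b->ne[1]`, and the next hunk makes the result 4-D. A hedged usage sketch (tensor shapes are illustrative only):

```c
#include "ggml.h"

// Hedged sketch: gather rows per batch. a is [n_embd, n_rows, n_batch, 1],
// idx is I32 [n_tokens, n_batch, 1, 1]; per the ggml_new_tensor_4d call in the
// next hunk, the result is [n_embd, n_tokens, n_batch, 1].
static struct ggml_tensor * gather_rows(struct ggml_context * ctx,
                                        struct ggml_tensor * a,
                                        struct ggml_tensor * idx) {
    return ggml_get_rows(ctx, a, idx);
}
```
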
@@ -4741,7 +4772,7 @@ struct ggml_tensor * ggml_get_rows(

 // TODO: implement non F32 return
 //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
-struct ggml_tensor * result =
+struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);

 result->op = GGML_OP_GET_ROWS;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4792,7 +4823,7 @@ struct ggml_tensor * ggml_diag(
 }

 const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
-struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);

 result->op = GGML_OP_DIAG;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5439,7 +5470,7 @@ struct ggml_tensor * ggml_pool_1d(
 is_node = true;
 }

-const int64_t ne[
+const int64_t ne[2] = {
 ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
 a->ne[1],
 };
@@ -5519,6 +5550,30 @@ static struct ggml_tensor * ggml_upscale_impl(
 return result;
 }

+struct ggml_tensor * ggml_pad(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int p0, int p1, int p2, int p3) {
+bool is_node = false;
+
+if (a->grad) {
+GGML_ASSERT(false); // TODO: implement backward
+is_node = true;
+}
+
+struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+a->ne[0] + p0,
+a->ne[1] + p1,
+a->ne[2] + p2,
+a->ne[3] + p3);
+
+result->op = GGML_OP_PAD;
+result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+result->src[0] = a;
+
+return result;
+}
+
 struct ggml_tensor * ggml_upscale(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
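
The new `GGML_OP_PAD` op grows a tensor by `p0..p3` elements along each dimension; the CPU forward pass added later in this diff zero-fills the padded region. A minimal usage sketch (the wrapper is illustrative):

```c
#include "ggml.h"

// Hedged sketch: pad x so the result has ne = { x->ne[0] + 2, x->ne[1] + 1,
// x->ne[2], x->ne[3] }; the new elements are written as 0 by
// ggml_compute_forward_pad_f32 (added further down in this diff).
static struct ggml_tensor * pad_example(struct ggml_context * ctx, struct ggml_tensor * x) {
    return ggml_pad(ctx, x, /*p0=*/2, /*p1=*/1, /*p2=*/0, /*p3=*/0);
}
```
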
@@ -5534,7 +5589,7 @@ struct ggml_tensor * ggml_argsort(
 enum ggml_sort_order order) {
 bool is_node = false;

-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32,
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);

 ggml_set_op_params_i32(result, 0, (int32_t) order);

@@ -5581,7 +5636,7 @@ struct ggml_tensor * ggml_flash_attn(
 }

 //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);

 int32_t t = masked ? 1 : 0;
 ggml_set_op_params(result, &t, sizeof(t));
@@ -5614,7 +5669,7 @@ struct ggml_tensor * ggml_flash_ff(
 }

 //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);

 result->op = GGML_OP_FLASH_FF;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5730,7 +5785,6 @@ struct ggml_tensor * ggml_win_part(
 const int np = npx*npy;

 const int64_t ne[4] = { a->ne[0], w, w, np, };
-
 struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

 int32_t params[] = { npx, npy, w };
@@ -7520,7 +7574,7 @@ static void ggml_compute_forward_acc_f32(
 GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));

 // view src0 and dst with these strides and data offset inbytes during acc
-// nb0 is
+// nb0 is implicitly element_size because src0 and dst are contiguous
 size_t nb1 = ((int32_t *) dst->op_params)[0];
 size_t nb2 = ((int32_t *) dst->op_params)[1];
 size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -7716,6 +7770,8 @@ static void ggml_compute_forward_mul_f32(

 #ifdef GGML_USE_CLBLAST
 if (src1->backend == GGML_BACKEND_GPU) {
+// TODO: OpenCL kernel support full broadcast
+GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
 if (ith == 0) {
 ggml_cl_mul(src0, src1, dst);
 }
@@ -8981,10 +9037,9 @@ static void ggml_compute_forward_silu(
 } break;
 }
 }
+// ggml_compute_forward_leaky_relu

-
-
-static void ggml_compute_forward_leaky_f32(
+static void ggml_compute_forward_leaky_relu_f32(
 const struct ggml_compute_params * params,
 const struct ggml_tensor * src0,
 struct ggml_tensor * dst) {
@@ -8998,24 +9053,27 @@ static void ggml_compute_forward_leaky_f32(
 const int n = ggml_nrows(src0);
 const int nc = src0->ne[0];

+float negative_slope;
+memcpy(&negative_slope, dst->op_params, sizeof(float));
+
 assert(dst->nb[0] == sizeof(float));
 assert(src0->nb[0] == sizeof(float));

 for (int i = 0; i < n; i++) {
-
+ggml_vec_leaky_relu_f32(nc,
 (float *) ((char *) dst->data + i*( dst->nb[1])),
-(float *) ((char *) src0->data + i*(src0->nb[1])));
+(float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
 }
 }

-static void
+static void ggml_compute_forward_leaky_relu(
 const struct ggml_compute_params * params,
 const struct ggml_tensor * src0,
 struct ggml_tensor * dst) {
 switch (src0->type) {
 case GGML_TYPE_F32:
 {
-
+ggml_compute_forward_leaky_relu_f32(params, src0, dst);
 } break;
 default:
 {
@@ -9504,8 +9562,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 const int64_t ne0 = dst->ne[0];
 const int64_t ne1 = dst->ne[1];

+// NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
+// all the experts for each batch element and the processing would become incredibly slow
 // TODO: find the optimal values for these
-if (
+if (dst->op != GGML_OP_MUL_MAT_ID &&
+ggml_is_contiguous(src0) &&
 ggml_is_contiguous(src1) &&
 //src0->type == GGML_TYPE_F32 &&
 src1->type == GGML_TYPE_F32 &&
@@ -9519,11 +9580,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif

+// off1 = offset in i11 and i1
+// cne1 = ne11 and ne1
+// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
+// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
 static void ggml_compute_forward_mul_mat(
 const struct ggml_compute_params * params,
 const struct ggml_tensor * src0,
 const struct ggml_tensor * src1,
-struct ggml_tensor * dst
+struct ggml_tensor * dst,
+int64_t off1, int64_t cne1) {
 int64_t t0 = ggml_perf_time_us();
 UNUSED(t0);

@@ -9591,10 +9657,9 @@ static void ggml_compute_forward_mul_mat(
 const int64_t i03 = i13/r3;
 const int64_t i02 = i12/r2;

-const void * x = (char *) src0->data +
-const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-
-float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
+float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);

 if (type != GGML_TYPE_F32) {
 float * const wdata = params->wdata;
@@ -9611,10 +9676,10 @@ static void ggml_compute_forward_mul_mat(
 }

 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-
-
-
-
+cne1, ne01, ne10,
+1.0f, y, ne10,
+x, ne00,
+0.0f, d, ne01);
 }
 }

@@ -9627,9 +9692,10 @@ static void ggml_compute_forward_mul_mat(
 if (params->type == GGML_TASK_INIT) {
 if (src1->type != vec_dot_type) {
 char * wdata = params->wdata;
-const size_t row_size =
+const size_t row_size = ggml_row_size(vec_dot_type, ne10);

 assert(params->wsize >= ne11*ne12*ne13*row_size);
+assert(src1->type == GGML_TYPE_F32);

 for (int64_t i13 = 0; i13 < ne13; ++i13) {
 for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -9649,10 +9715,10 @@ static void ggml_compute_forward_mul_mat(
 }

 const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-const size_t row_size =
+const size_t row_size = ggml_row_size(vec_dot_type, ne10);

 const int64_t nr0 = ne01; // src0 rows
-const int64_t nr1 =
+const int64_t nr1 = cne1*ne12*ne13; // src1 rows

 //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);

@@ -9694,9 +9760,9 @@ static void ggml_compute_forward_mul_mat(
 for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
 for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
 for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-const int64_t i13 = (ir1/(ne12*
-const int64_t i12 = (ir1 - i13*ne12*
-const int64_t i11 = (ir1 - i13*ne12*
+const int64_t i13 = (ir1/(ne12*cne1));
+const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;

 // broadcast src0 into src1
 const int64_t i03 = i13/r3;
@@ -9736,20 +9802,28 @@ static void ggml_compute_forward_mul_mat(

 static void ggml_compute_forward_mul_mat_id(
 const struct ggml_compute_params * params,
+const struct ggml_tensor * src0,
+const struct ggml_tensor * src1,
 struct ggml_tensor * dst) {

-
-
-
-
+if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+// during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
+ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
+return;
+}

-const
+const struct ggml_tensor * ids = src0;
+const int id = ggml_get_op_params_i32(dst, 0);
+const int n_as = ggml_get_op_params_i32(dst, 1);

-
+for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);

-
+GGML_ASSERT(row_id >= 0 && row_id < n_as);

-
+const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
+}
 }

 // ggml_compute_forward_out_prod
@@ -10161,7 +10235,7 @@ static void ggml_compute_forward_set_f32(
 GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));

 // view src0 and dst with these strides and data offset inbytes during set
-// nb0 is
+// nb0 is implicitly element_size because src0 and dst are contiguous
 size_t nb1 = ((int32_t *) dst->op_params)[0];
 size_t nb2 = ((int32_t *) dst->op_params)[1];
 size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -10325,21 +10399,30 @@ static void ggml_compute_forward_get_rows_q(
 return;
 }

-
-
+GGML_TENSOR_BINARY_OP_LOCALS
+
+const int64_t nc = ne00;
+const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+
 const enum ggml_type type = src0->type;
 ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;

-assert(
-assert(
-assert(
+assert(ne0 == nc);
+assert(ne02 == ne11);
+assert(nb00 == ggml_type_size(type));
+assert(ggml_nrows(dst) == nr);

-
-
+// TODO: multi-thread
+for (int64_t i12 = 0; i12 < ne12; ++i12) {
+for (int64_t i11 = 0; i11 < ne11; ++i11) {
+for (int64_t i10 = 0; i10 < ne10; ++i10) {
+const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

-
-
-
+dequantize_row_q(
+(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
+}
+}
 }
 }

@@ -10354,19 +10437,26 @@ static void ggml_compute_forward_get_rows_f16(
 return;
 }

-
-const int nr = ggml_nelements(src1);
+GGML_TENSOR_BINARY_OP_LOCALS

-
-
-assert(src0->nb[0] == sizeof(ggml_fp16_t));
+const int64_t nc = ne00;
+const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);

-
-
+assert(ne0 == nc);
+assert(ne02 == ne11);
+assert(nb00 == sizeof(ggml_fp16_t));
+assert(ggml_nrows(dst) == nr);

-
-
-
+// TODO: multi-thread
+for (int64_t i12 = 0; i12 < ne12; ++i12) {
+for (int64_t i11 = 0; i11 < ne11; ++i11) {
+for (int64_t i10 = 0; i10 < ne10; ++i10) {
+const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+ggml_fp16_to_fp32_row(
+(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
+}
 }
 }
 }
@@ -10382,19 +10472,27 @@ static void ggml_compute_forward_get_rows_f32(
 return;
 }

-
-const int nr = ggml_nelements(src1);
+GGML_TENSOR_BINARY_OP_LOCALS

-
-
-assert(src0->nb[0] == sizeof(float));
+const int64_t nc = ne00;
+const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);

-
-
+assert(ne0 == nc);
+assert(ne02 == ne11);
+assert(nb00 == sizeof(float));
+assert(ggml_nrows(dst) == nr);

-
-
-
+// TODO: multi-thread
+for (int64_t i12 = 0; i12 < ne12; ++i12) {
+for (int64_t i11 = 0; i11 < ne11; ++i11) {
+for (int64_t i10 = 0; i10 < ne10; ++i10) {
+const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+ggml_vec_cpy_f32(nc,
+(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
+(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
+}
+}
 }
 }

@@ -12114,6 +12212,7 @@ static void ggml_compute_forward_upscale_f32(
 GGML_ASSERT(src0->nb[0] == sizeof(float));

 const int ith = params->ith;
+const int nth = params->nth;

 GGML_TENSOR_UNARY_OP_LOCALS

@@ -12121,16 +12220,17 @@ static void ggml_compute_forward_upscale_f32(

 // TODO: optimize

-for (
-
-
-
-
-
-
-const
+for (int64_t i3 = 0; i3 < ne3; i3++) {
+const int64_t i03 = i3;
+for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+const int64_t i02 = i2;
+for (int64_t i1 = 0; i1 < ne1; i1++) {
+const int64_t i01 = i1 / scale_factor;
+for (int64_t i0 = 0; i0 < ne0; i0++) {
+const int64_t i00 = i0 / scale_factor;

-float *
+const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);

 *y = *x;
 }
@@ -12155,6 +12255,64 @@ static void ggml_compute_forward_upscale(
 }
 }

+// ggml_compute_forward_pad
+
+static void ggml_compute_forward_pad_f32(
+const struct ggml_compute_params * params,
+const struct ggml_tensor * src0,
+struct ggml_tensor * dst) {
+
+if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+return;
+}
+
+GGML_ASSERT(src0->nb[0] == sizeof(float));
+GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+const int ith = params->ith;
+const int nth = params->nth;
+
+GGML_TENSOR_UNARY_OP_LOCALS
+
+float * dst_ptr = (float *) dst->data;
+
+// TODO: optimize
+
+for (int64_t i2 = 0; i2 < ne2; ++i2) {
+for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+for (int64_t i0 = 0; i0 < ne0; ++i0) {
+for (int64_t i3 = 0; i3 < ne3; ++i3) {
+const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+dst_ptr[dst_idx] = *src_ptr;
+} else {
+dst_ptr[dst_idx] = 0;
+}
+}
+}
+}
+}
+}
+
+static void ggml_compute_forward_pad(
+const struct ggml_compute_params * params,
+const struct ggml_tensor * src0,
+struct ggml_tensor * dst) {
+switch (src0->type) {
+case GGML_TYPE_F32:
+{
+ggml_compute_forward_pad_f32(params, src0, dst);
+} break;
+default:
+{
+GGML_ASSERT(false);
+} break;
+}
+}
+
 // ggml_compute_forward_argsort

 static void ggml_compute_forward_argsort_f32(
@@ -13362,10 +13520,6 @@ static void ggml_compute_forward_unary(
 {
 ggml_compute_forward_silu(params, src0, dst);
 } break;
-case GGML_UNARY_OP_LEAKY:
-{
-ggml_compute_forward_leaky(params, src0, dst);
-} break;
 default:
 {
 GGML_ASSERT(false);
@@ -14037,11 +14191,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 } break;
 case GGML_OP_MUL_MAT:
 {
-ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
+ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
 } break;
 case GGML_OP_MUL_MAT_ID:
 {
-ggml_compute_forward_mul_mat_id(params, tensor);
+ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
 } break;
 case GGML_OP_OUT_PROD:
 {
@@ -14147,10 +14301,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 {
 ggml_compute_forward_upscale(params, tensor->src[0], tensor);
 } break;
+case GGML_OP_PAD:
+{
+ggml_compute_forward_pad(params, tensor->src[0], tensor);
+} break;
 case GGML_OP_ARGSORT:
 {
 ggml_compute_forward_argsort(params, tensor->src[0], tensor);
 } break;
+case GGML_OP_LEAKY_RELU:
+{
+ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
+} break;
 case GGML_OP_FLASH_ATTN:
 {
 const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -14405,7 +14567,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
 return replacements->vals[i];
 }

-struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type,
+struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);

 // insert clone into replacements
 GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
@@ -14475,7 +14637,7 @@ void ggml_build_backward_gradient_checkpointing(
 // insert new tensors recomputing src, reusing already made replacements,
 // remember replacements: remember new tensors with mapping from corresponding gf nodes
 // recurse for input tensors,
-// unless (i.e. terminating when) input tensors are
+// unless (i.e. terminating when) input tensors are replacements (like checkpoints)
 node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
 }
 // insert rewritten backward node with replacements made into resulting backward graph gb
@@ -15143,10 +15305,18 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
 {
 GGML_ASSERT(false); // TODO: not implemented
 } break;
+case GGML_OP_PAD:
+{
+GGML_ASSERT(false); // TODO: not implemented
+} break;
 case GGML_OP_ARGSORT:
 {
 GGML_ASSERT(false); // TODO: not implemented
 } break;
+case GGML_OP_LEAKY_RELU:
+{
+GGML_ASSERT(false); // TODO: not implemented
+} break;
 case GGML_OP_FLASH_ATTN:
 {
 struct ggml_tensor * flash_grad = NULL;
@@ -15752,6 +15922,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 case GGML_OP_ARGMAX:
 case GGML_OP_REPEAT:
 case GGML_OP_REPEAT_BACK:
+case GGML_OP_LEAKY_RELU:
 {
 n_tasks = 1;
 } break;
@@ -15764,7 +15935,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 case GGML_UNARY_OP_TANH:
 case GGML_UNARY_OP_ELU:
 case GGML_UNARY_OP_RELU:
-case GGML_UNARY_OP_LEAKY:
 {
 n_tasks = 1;
 } break;
@@ -15883,6 +16053,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 {
 n_tasks = n_threads;
 } break;
+case GGML_OP_PAD:
+{
+n_tasks = n_threads;
+} break;
 case GGML_OP_ARGSORT:
 {
 n_tasks = n_threads;
@@ -16146,7 +16320,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 } else
 #endif
 if (node->src[1]->type != vec_dot_type) {
-cur =
+cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
 }
 } break;
 case GGML_OP_MUL_MAT_ID:
@@ -16163,7 +16337,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 } else
 #endif
 if (b->type != vec_dot_type) {
-cur =
+cur = ggml_row_size(vec_dot_type, ggml_nelements(b));
 }
 } break;
 case GGML_OP_OUT_PROD:
@@ -16394,7 +16568,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
 fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
 ggml_type_name(tensor->type),
 ggml_op_name (tensor->op),
-tensor
+ggml_n_dims(tensor),
 ne[0], ne[1], ne[2], ne[3],
 nb[0], nb[1], nb[2], nb[3],
 tensor->data,
@@ -16409,7 +16583,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
 arg,
 ggml_type_name(tensor->type),
 ggml_op_name (tensor->op),
-tensor
+ggml_n_dims(tensor),
 ne[0], ne[1], ne[2], ne[3],
 nb[0], nb[1], nb[2], nb[3],
 tensor->data,
@@ -16499,11 +16673,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {

 const uint32_t type = tensor->type;
 const uint32_t op = tensor->op;
-const uint32_t n_dims = tensor->n_dims;

 fwrite(&type, sizeof(uint32_t), 1, fout);
 fwrite(&op, sizeof(uint32_t), 1, fout);
-fwrite(&n_dims, sizeof(uint32_t), 1, fout);

 for (int j = 0; j < GGML_MAX_DIMS; ++j) {
 const uint64_t ne = tensor->ne[j];
@@ -16533,11 +16705,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {

 const uint32_t type = tensor->type;
 const uint32_t op = tensor->op;
-const uint32_t n_dims = tensor->n_dims;

 fwrite(&type, sizeof(uint32_t), 1, fout);
 fwrite(&op, sizeof(uint32_t), 1, fout);
-fwrite(&n_dims, sizeof(uint32_t), 1, fout);

 for (int j = 0; j < GGML_MAX_DIMS; ++j) {
 const uint64_t ne = tensor->ne[j];
@@ -16709,12 +16879,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
 {
 uint32_t type;
 uint32_t op;
-uint32_t n_dims;

 for (uint32_t i = 0; i < n_leafs; ++i) {
 type = *(const uint32_t *) ptr; ptr += sizeof(type);
 op = *(const uint32_t *) ptr; ptr += sizeof(op);
-n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);

 int64_t ne[GGML_MAX_DIMS];
 size_t nb[GGML_MAX_DIMS];
@@ -16730,7 +16898,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
 nb[j] = nb_cur;
 }

-struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type,
+struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);

 tensor->op = (enum ggml_op) op;

@@ -16747,7 +16915,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

 ptr += ggml_nbytes(tensor);

-fprintf(stderr, "%s: loaded leaf %d: '%16s', %
+fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
 }
 }

@@ -16757,12 +16925,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
 {
 uint32_t type;
 uint32_t op;
-uint32_t n_dims;

 for (uint32_t i = 0; i < n_nodes; ++i) {
 type = *(const uint32_t *) ptr; ptr += sizeof(type);
 op = *(const uint32_t *) ptr; ptr += sizeof(op);
-n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);

 enum ggml_op eop = (enum ggml_op) op;

@@ -16833,7 +16999,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
 } break;
 default:
 {
-tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type,
+tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);

 tensor->op = eop;
 } break;
@@ -16852,7 +17018,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

 result->nodes[i] = tensor;

-fprintf(stderr, "%s: loaded node %d: '%16s', %
+fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
 }
 }
 }
@@ -16990,7 +17156,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
 fprintf(fp, "(%s)|", ggml_type_name(node->type));
 }

-if (node
+if (ggml_is_matrix(node)) {
 fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
 } else {
 fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
@@ -17257,7 +17423,7 @@ static enum ggml_opt_result ggml_opt_adam(
 int64_t i = 0;
 for (int p = 0; p < np; ++p) {
 const int64_t ne = ggml_nelements(ps[p]);
-const float p_decay = ((ps[p]
+const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
 for (int64_t j = 0; j < ne; ++j) {
 float x = ggml_get_f32_1d(ps[p], j);
 float g_ = g[i]*gnorm;
@@ -18531,7 +18697,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 return NULL;
 }

-const size_t size_cur = (
+const size_t size_cur = ggml_row_size(info->type, ne);

 ctx->size += GGML_PAD(size_cur, ctx->alignment);
 }
@@ -19035,8 +19201,8 @@ void gguf_add_tensor(
 ctx->infos[idx].ne[i] = 1;
 }

-ctx->infos[idx].n_dims = tensor
-for (
+ctx->infos[idx].n_dims = ggml_n_dims(tensor);
+for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
 ctx->infos[idx].ne[i] = tensor->ne[i];
 }
