llama_cpp 0.10.0 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +18 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +952 -232
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +725 -98
- data/ext/llama_cpp/src/ggml-metal.metal +1508 -171
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +554 -215
- data/ext/llama_cpp/src/ggml.h +58 -23
- data/ext/llama_cpp/src/llama.cpp +1157 -851
- data/ext/llama_cpp/src/llama.h +9 -4
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -1,4 +1,4 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe"
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #define _USE_MATH_DEFINES // For M_PI on MSVC
 
 #include "ggml-impl.h"
@@ -33,7 +33,7 @@
 // we should just be careful :)
 #pragma warning(disable: 4244 4267)
 
-// disable POSIX deprecation
+// disable POSIX deprecation warnings
 // these functions are never going away, anyway
 #pragma warning(disable: 4996)
 #endif
@@ -1395,7 +1395,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
-inline static void
+inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 
 static const float GELU_COEF_A = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@@ -1623,7 +1623,9 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
 "POOL_1D",
 "POOL_2D",
 "UPSCALE",
+"PAD",
 "ARGSORT",
+"LEAKY_RELU",
 
 "FLASH_ATTN",
 "FLASH_FF",
@@ -1650,7 +1652,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
 "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 "none",
@@ -1707,7 +1709,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 "pool_1d(x)",
 "pool_2d(x)",
 "upscale(x)",
+"pad(x)",
 "argsort(x)",
+"leaky_relu(x)",
 
 "flash_attn(x)",
 "flash_ff(x)",
@@ -1734,7 +1738,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -1750,17 +1754,16 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
 "GELU",
 "GELU_QUICK",
 "SILU",
-"LEAKY",
 };
 
-static_assert(GGML_UNARY_OP_COUNT ==
+static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
 
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
 // WARN:
-// Mis-
+// Mis-configuration can lead to problem that's hard to reason about:
 // * At best it crash or talks nosense.
 // * At worst it talks slightly difference but hard to perceive.
 //
@@ -1994,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
 return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }
 
-size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
-static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
-}
-
 int ggml_blck_size(enum ggml_type type) {
 return type_traits[type].blck_size;
 }
@@ -2008,8 +2005,13 @@ size_t ggml_type_size(enum ggml_type type) {
 return type_traits[type].type_size;
 }
 
-
-
+size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+assert(ne % ggml_blck_size(type) == 0);
+return ggml_type_size(type)*ne/ggml_blck_size(type);
+}
+
+double ggml_type_sizef(enum ggml_type type) {
+return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
 
 const char * ggml_type_name(enum ggml_type type) {
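
Note on the new ggml_row_size helper above: it returns the exact byte count of ne elements of a (possibly block-quantized) type, replacing ad-hoc ggml_type_size()*ne/ggml_blck_size() arithmetic at call sites. A minimal, self-contained sketch of the same calculation; the block parameters below are illustrative placeholders, not values taken from this diff (the real ones come from ggml's type_traits table):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical block layout, for illustration only: 32 elements packed into an 18-byte block. */
    #define BLCK_SIZE 32
    #define TYPE_SIZE 18

    /* Mirrors ggml_row_size(type, ne): bytes needed for one row of ne elements. */
    static size_t row_size(int64_t ne) {
        assert(ne % BLCK_SIZE == 0);            /* ne must be a whole number of blocks */
        return (size_t) (TYPE_SIZE * ne / BLCK_SIZE);
    }

    int main(void) {
        printf("%zu\n", row_size(4096));        /* 4096/32 blocks * 18 bytes = 2304 */
        return 0;
    }
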
@@ -2046,24 +2048,37 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) {
 return ggml_type_size(tensor->type);
 }
 
-
+bool ggml_is_scalar(const struct ggml_tensor * tensor) {
 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
 return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-
+bool ggml_is_vector(const struct ggml_tensor * tensor) {
 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
 return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-
+bool ggml_is_matrix(const struct ggml_tensor * tensor) {
 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
 return tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
+bool ggml_is_3d(const struct ggml_tensor * tensor) {
+return tensor->ne[3] == 1;
+}
+
+int ggml_n_dims(const struct ggml_tensor * tensor) {
+for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
+if (tensor->ne[i] > 1) {
+return i + 1;
+}
+}
+return 1;
+}
+
 static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
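
The ggml_is_3d and ggml_n_dims helpers added above exist because the n_dims field is dropped from struct ggml_tensor later in this diff; dimensionality is now derived from the ne array on demand. A standalone sketch of the same logic (plain arrays stand in for tensor->ne; the constant name below is a placeholder, not the diff's):

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_DIMS 4  /* stands in for GGML_MAX_DIMS */

    /* Same walk as ggml_n_dims: the highest axis with more than one element decides the rank. */
    static int n_dims(const int64_t ne[MAX_DIMS]) {
        for (int i = MAX_DIMS - 1; i >= 1; --i) {
            if (ne[i] > 1) {
                return i + 1;
            }
        }
        return 1;
    }

    int main(void) {
        const int64_t mat[MAX_DIMS] = { 2, 3, 1, 1 };  /* 2x3 matrix -> 2 */
        const int64_t vec[MAX_DIMS] = { 5, 1, 1, 1 };  /* 5-vector   -> 1 */
        printf("%d %d\n", n_dims(mat), n_dims(vec));
        return 0;
    }
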
@@ -2368,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
 size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
 size_t max_size = 0;
 
-struct
-
-while (obj != NULL) {
-if (obj->type == GGML_OBJECT_TENSOR) {
-struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
-
-const size_t size = ggml_nbytes(tensor);
-
-if (max_size < size) {
-max_size = size;
-}
-}
-
-obj = obj->next;
+for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
+max_size = MAX(max_size, ggml_nbytes(tensor));
 }
 
 return max_size;
@@ -2470,7 +2473,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 view_src = view_src->view_src;
 }
 
-size_t data_size =
+size_t data_size = ggml_row_size(type, ne[0]);
 for (int i = 1; i < n_dims; i++) {
 data_size *= ne[i];
 }
@@ -2513,7 +2516,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 /*.type =*/ type,
 /*.backend =*/ GGML_BACKEND_CPU,
 /*.buffer =*/ NULL,
-/*.n_dims =*/ n_dims,
 /*.ne =*/ { 1, 1, 1, 1 },
 /*.nb =*/ { 0, 0, 0, 0 },
 /*.op =*/ GGML_OP_NONE,
@@ -2620,7 +2622,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
 }
 
 struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
-return ggml_new_tensor(ctx, src->type,
+return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
 }
 
 static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
@@ -3069,7 +3071,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
 struct ggml_tensor * ggml_view_tensor(
 struct ggml_context * ctx,
 struct ggml_tensor * src) {
-struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type,
+struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
 ggml_format_name(result, "%s (view)", src->name);
 
 for (int i = 0; i < GGML_MAX_DIMS; i++) {
@@ -3079,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
 return result;
 }
 
-struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
 struct ggml_object * obj = ctx->objects_begin;
 
 char * const mem_buffer = ctx->mem_buffer;
@@ -3095,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
 return NULL;
 }
 
-struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
 struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
 obj = obj->next;
 
@@ -3227,10 +3229,10 @@ static struct ggml_tensor * ggml_add_cast_impl(
 is_node = true;
 }
 
-struct ggml_tensor * result = ggml_new_tensor(ctx, type,
+struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
 
 result->op = GGML_OP_ADD;
-result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32,
+result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
 result->src[0] = a;
 result->src[1] = b;
 
@@ -3599,12 +3601,12 @@ struct ggml_tensor * ggml_sum_rows(
 is_node = true;
 }
 
-int64_t ne[
-for (int i=1; i<
+int64_t ne[GGML_MAX_DIMS] = { 1 };
+for (int i = 1; i < GGML_MAX_DIMS; ++i) {
 ne[i] = a->ne[i];
 }
 
-struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
 
 result->op = GGML_OP_SUM_ROWS;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3625,8 +3627,8 @@ struct ggml_tensor * ggml_mean(
 is_node = true;
 }
 
-int64_t ne[
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
 result->op = GGML_OP_MEAN;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3648,8 +3650,7 @@ struct ggml_tensor * ggml_argmax(
 is_node = true;
 }
 
-
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
+struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
 
 result->op = GGML_OP_ARGMAX;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3672,7 +3673,7 @@ struct ggml_tensor * ggml_repeat(
 is_node = true;
 }
 
-struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
 
 result->op = GGML_OP_REPEAT;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3699,7 +3700,7 @@ struct ggml_tensor * ggml_repeat_back(
 return a;
 }
 
-struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
 
 result->op = GGML_OP_REPEAT_BACK;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3830,12 +3831,25 @@ struct ggml_tensor * ggml_relu_inplace(
 return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
 }
 
-//
+// ggml_leaky_relu
 
-struct ggml_tensor *
+struct ggml_tensor * ggml_leaky_relu(
 struct ggml_context * ctx,
-struct ggml_tensor * a) {
-
+struct ggml_tensor * a, float negative_slope, bool inplace) {
+bool is_node = false;
+
+if (!inplace && (a->grad)) {
+is_node = true;
+}
+
+struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
+
+result->op = GGML_OP_LEAKY_RELU;
+result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+result->src[0] = a;
+
+return result;
 }
 
 // ggml_gelu
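
The new ggml_leaky_relu above replaces the old GGML_UNARY_OP_LEAKY unary op and makes the negative slope an explicit parameter stored in the op params. A hedged usage sketch; ctx and the F32 tensor x are assumed to exist and are not taken from this diff:

    /* y[i] = x[i] > 0 ? x[i] : 0.1f * x[i]  (see ggml_vec_leaky_relu_f32 earlier in the diff) */
    struct ggml_tensor * y = ggml_leaky_relu(ctx, x, /*negative_slope=*/0.1f, /*inplace=*/false);
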
@@ -4022,8 +4036,9 @@ static struct ggml_tensor * ggml_group_norm_impl(
 
 struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-result->op = GGML_OP_GROUP_NORM;
 result->op_params[0] = n_groups;
+
+result->op = GGML_OP_GROUP_NORM;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 result->src[0] = a;
 result->src[1] = NULL; // TODO: maybe store epsilon here?
@@ -4061,7 +4076,7 @@ struct ggml_tensor * ggml_mul_mat(
 }
 
 const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
 result->op = GGML_OP_MUL_MAT;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4071,21 +4086,30 @@ struct ggml_tensor * ggml_mul_mat(
 return result;
 }
 
+void ggml_mul_mat_set_prec(
+struct ggml_tensor * a,
+enum ggml_prec prec) {
+const int32_t prec_i32 = (int32_t) prec;
+
+ggml_set_op_params_i32(a, 0, prec_i32);
+}
+
 // ggml_mul_mat_id
 
 struct ggml_tensor * ggml_mul_mat_id(
 struct ggml_context * ctx,
-struct ggml_tensor * as[],
+struct ggml_tensor * const as[],
+int n_as,
 struct ggml_tensor * ids,
 int id,
 struct ggml_tensor * b) {
 
-int64_t n_as = ids->ne[0];
-
 GGML_ASSERT(ids->type == GGML_TYPE_I32);
-GGML_ASSERT(
+GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
+GGML_ASSERT(ids->ne[1] == b->ne[1]);
+GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
 GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
-GGML_ASSERT(id >= 0 && id <
+GGML_ASSERT(id >= 0 && id < ids->ne[0]);
 
 bool is_node = false;
 
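
Two graph-building changes appear above: ggml_mul_mat_set_prec stashes a requested accumulation precision in slot 0 of an existing mul_mat node's op params, and ggml_mul_mat_id now receives the number of expert matrices (n_as) explicitly instead of inferring it from ids. A usage sketch under stated assumptions: ctx, w, cur, the experts array, n_experts and ids are placeholders, and the GGML_PREC_F32 enum value is assumed from ggml.h of this release rather than shown in the hunk here:

    struct ggml_tensor * kq = ggml_mul_mat(ctx, w, cur);
    ggml_mul_mat_set_prec(kq, GGML_PREC_F32);   /* request full F32 accumulation for this node */

    /* route each token (column of `ids`) to one of n_experts weight matrices */
    struct ggml_tensor * moe = ggml_mul_mat_id(ctx, experts, n_experts, ids, /*id=*/0, cur);
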
@@ -4094,16 +4118,17 @@ struct ggml_tensor * ggml_mul_mat_id(
 }
 
 const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
 ggml_set_op_params_i32(result, 0, id);
+ggml_set_op_params_i32(result, 1, n_as);
 
 result->op = GGML_OP_MUL_MAT_ID;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 result->src[0] = ids;
 result->src[1] = b;
 
-for (
+for (int i = 0; i < n_as; i++) {
 struct ggml_tensor * a = as[i];
 GGML_ASSERT(ggml_are_same_shape(as[0], a));
 GGML_ASSERT(ggml_can_mul_mat(a, b));
@@ -4131,7 +4156,7 @@ struct ggml_tensor * ggml_out_prod(
 
 // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
 const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
 result->op = GGML_OP_OUT_PROD;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4146,23 +4171,23 @@ struct ggml_tensor * ggml_out_prod(
 static struct ggml_tensor * ggml_scale_impl(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
-
+float s,
 bool inplace) {
-GGML_ASSERT(ggml_is_scalar(b));
 GGML_ASSERT(ggml_is_padded_1d(a));
 
 bool is_node = false;
 
-if (a->grad
+if (a->grad) {
 is_node = true;
 }
 
 struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+ggml_set_op_params(result, &s, sizeof(s));
+
 result->op = GGML_OP_SCALE;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 result->src[0] = a;
-result->src[1] = b;
 
 return result;
 }
@@ -4170,15 +4195,15 @@ static struct ggml_tensor * ggml_scale_impl(
 struct ggml_tensor * ggml_scale(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
-
-return ggml_scale_impl(ctx, a,
+float s) {
+return ggml_scale_impl(ctx, a, s, false);
 }
 
 struct ggml_tensor * ggml_scale_inplace(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
-
-return ggml_scale_impl(ctx, a,
+float s) {
+return ggml_scale_impl(ctx, a, s, true);
 }
 
 // ggml_set
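
The ggml_scale family above now takes the scale factor as a plain float carried in op_params instead of a one-element F32 tensor in src[1]. A before/after sketch; the surrounding objects (ctx, cur, n_embd_head) are assumed for illustration:

    /* 0.10.0 vendored ggml: the factor had to be wrapped in a scalar tensor */
    /* cur = ggml_scale(ctx, cur, ggml_new_f32(ctx, 1.0f/sqrtf((float) n_embd_head))); */

    /* 0.10.2 vendored ggml: pass the float directly */
    cur = ggml_scale(ctx, cur, 1.0f/sqrtf((float) n_embd_head));
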
@@ -4416,7 +4441,7 @@ struct ggml_tensor * ggml_reshape(
 //GGML_ASSERT(false);
 }
 
-struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type,
+struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
 ggml_format_name(result, "%s (reshaped)", a->name);
 
 result->op = GGML_OP_RESHAPE;
@@ -4731,7 +4756,9 @@ struct ggml_tensor * ggml_get_rows(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
 struct ggml_tensor * b) {
-GGML_ASSERT(
+GGML_ASSERT(a->ne[2] == b->ne[1]);
+GGML_ASSERT(b->ne[3] == 1);
+GGML_ASSERT(b->type == GGML_TYPE_I32);
 
 bool is_node = false;
 
@@ -4741,7 +4768,7 @@ struct ggml_tensor * ggml_get_rows(
 
 // TODO: implement non F32 return
 //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
-struct ggml_tensor * result =
+struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
 
 result->op = GGML_OP_GET_ROWS;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4792,7 +4819,7 @@ struct ggml_tensor * ggml_diag(
 }
 
 const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
-struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
 
 result->op = GGML_OP_DIAG;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5439,7 +5466,7 @@ struct ggml_tensor * ggml_pool_1d(
 is_node = true;
 }
 
-const int64_t ne[
+const int64_t ne[2] = {
 ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
 a->ne[1],
 };
@@ -5519,6 +5546,30 @@ static struct ggml_tensor * ggml_upscale_impl(
 return result;
 }
 
+struct ggml_tensor * ggml_pad(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int p0, int p1, int p2, int p3) {
+bool is_node = false;
+
+if (a->grad) {
+GGML_ASSERT(false); // TODO: implement backward
+is_node = true;
+}
+
+struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+a->ne[0] + p0,
+a->ne[1] + p1,
+a->ne[2] + p2,
+a->ne[3] + p3);
+
+result->op = GGML_OP_PAD;
+result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+result->src[0] = a;
+
+return result;
+}
+
 struct ggml_tensor * ggml_upscale(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
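
ggml_pad, added above, grows a tensor by p0..p3 trailing elements along each dimension; the forward implementation later in this diff fills the new region with zeros. A usage sketch, assuming ctx and an F32 tensor x already exist (padding the first dimension up to a multiple of 32 is just an example, not something this diff does):

    const int64_t ne0  = x->ne[0];
    const int     pad0 = (int) ((32 - ne0 % 32) % 32);

    /* result shape: { ne0 + pad0, x->ne[1], x->ne[2], x->ne[3] }, padded entries are 0.0f */
    struct ggml_tensor * xp = ggml_pad(ctx, x, pad0, 0, 0, 0);
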
@@ -5534,7 +5585,7 @@ struct ggml_tensor * ggml_argsort(
 enum ggml_sort_order order) {
 bool is_node = false;
 
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32,
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
 
 ggml_set_op_params_i32(result, 0, (int32_t) order);
 
@@ -5581,7 +5632,7 @@ struct ggml_tensor * ggml_flash_attn(
 }
 
 //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
 
 int32_t t = masked ? 1 : 0;
 ggml_set_op_params(result, &t, sizeof(t));
@@ -5614,7 +5665,7 @@ struct ggml_tensor * ggml_flash_ff(
 }
 
 //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
 
 result->op = GGML_OP_FLASH_FF;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5730,7 +5781,6 @@ struct ggml_tensor * ggml_win_part(
 const int np = npx*npy;
 
 const int64_t ne[4] = { a->ne[0], w, w, np, };
-
 struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
 int32_t params[] = { npx, npy, w };
@@ -7520,7 +7570,7 @@ static void ggml_compute_forward_acc_f32(
 GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
 
 // view src0 and dst with these strides and data offset inbytes during acc
-// nb0 is
+// nb0 is implicitly element_size because src0 and dst are contiguous
 size_t nb1 = ((int32_t *) dst->op_params)[0];
 size_t nb2 = ((int32_t *) dst->op_params)[1];
 size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -7716,6 +7766,8 @@ static void ggml_compute_forward_mul_f32(
 
 #ifdef GGML_USE_CLBLAST
 if (src1->backend == GGML_BACKEND_GPU) {
+// TODO: OpenCL kernel support full broadcast
+GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
 if (ith == 0) {
 ggml_cl_mul(src0, src1, dst);
 }
@@ -8981,10 +9033,9 @@ static void ggml_compute_forward_silu(
 } break;
 }
 }
+// ggml_compute_forward_leaky_relu
 
-
-
-static void ggml_compute_forward_leaky_f32(
+static void ggml_compute_forward_leaky_relu_f32(
 const struct ggml_compute_params * params,
 const struct ggml_tensor * src0,
 struct ggml_tensor * dst) {
@@ -8998,24 +9049,27 @@ static void ggml_compute_forward_leaky_f32(
 const int n = ggml_nrows(src0);
 const int nc = src0->ne[0];
 
+float negative_slope;
+memcpy(&negative_slope, dst->op_params, sizeof(float));
+
 assert(dst->nb[0] == sizeof(float));
 assert(src0->nb[0] == sizeof(float));
 
 for (int i = 0; i < n; i++) {
-
+ggml_vec_leaky_relu_f32(nc,
 (float *) ((char *) dst->data + i*( dst->nb[1])),
-(float *) ((char *) src0->data + i*(src0->nb[1])));
+(float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
 }
 }
 
-static void
+static void ggml_compute_forward_leaky_relu(
 const struct ggml_compute_params * params,
 const struct ggml_tensor * src0,
 struct ggml_tensor * dst) {
 switch (src0->type) {
 case GGML_TYPE_F32:
 {
-
+ggml_compute_forward_leaky_relu_f32(params, src0, dst);
 } break;
 default:
 {
@@ -9110,6 +9164,8 @@ static void ggml_compute_forward_norm_f32(
 float eps;
 memcpy(&eps, dst->op_params, sizeof(float));
 
+GGML_ASSERT(eps > 0.0f);
+
 // TODO: optimize
 for (int64_t i03 = 0; i03 < ne03; i03++) {
 for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9179,6 +9235,8 @@ static void ggml_compute_forward_rms_norm_f32(
 float eps;
 memcpy(&eps, dst->op_params, sizeof(float));
 
+GGML_ASSERT(eps > 0.0f);
+
 // TODO: optimize
 for (int64_t i03 = 0; i03 < ne03; i03++) {
 for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -9504,8 +9562,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 const int64_t ne0 = dst->ne[0];
 const int64_t ne1 = dst->ne[1];
 
+// NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
+// all the experts for each batch element and the processing would become incredibly slow
 // TODO: find the optimal values for these
-if (
+if (dst->op != GGML_OP_MUL_MAT_ID &&
+ggml_is_contiguous(src0) &&
 ggml_is_contiguous(src1) &&
 //src0->type == GGML_TYPE_F32 &&
 src1->type == GGML_TYPE_F32 &&
@@ -9593,8 +9654,7 @@ static void ggml_compute_forward_mul_mat(
 
 const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
 const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-
-float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
 if (type != GGML_TYPE_F32) {
 float * const wdata = params->wdata;
@@ -9611,10 +9671,10 @@ static void ggml_compute_forward_mul_mat(
 }
 
 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-
-
-
-
+ne1, ne01, ne10,
+1.0f, y, ne10,
+x, ne00,
+0.0f, d, ne01);
 }
 }
 
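
The rewritten cblas_sgemm call above spells out the row-major argument order: for each (i12, i13) slice it computes d = y · xᵀ, where y is ne1 × ne10, x is ne01 × ne00 (ne00 equals ne10 for a valid mul_mat), and d is ne1 × ne01. A BLAS-free reference of the same contraction; the function name and loop structure are illustrative only:

    #include <stdint.h>

    /* d[i*ne01 + j] = sum_k y[i*ne10 + k] * x[j*ne10 + k], all row-major,
       matching cblas_sgemm(..., ne1, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01). */
    static void sgemm_nt_ref(int64_t ne1, int64_t ne01, int64_t ne10,
                             const float * y, const float * x, float * d) {
        for (int64_t i = 0; i < ne1; ++i) {
            for (int64_t j = 0; j < ne01; ++j) {
                float sum = 0.0f;
                for (int64_t k = 0; k < ne10; ++k) {
                    sum += y[i*ne10 + k] * x[j*ne10 + k];
                }
                d[i*ne01 + j] = sum;
            }
        }
    }
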
@@ -9627,9 +9687,10 @@ static void ggml_compute_forward_mul_mat(
 if (params->type == GGML_TASK_INIT) {
 if (src1->type != vec_dot_type) {
 char * wdata = params->wdata;
-const size_t row_size =
+const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
 assert(params->wsize >= ne11*ne12*ne13*row_size);
+assert(src1->type == GGML_TYPE_F32);
 
 for (int64_t i13 = 0; i13 < ne13; ++i13) {
 for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -9649,10 +9710,10 @@ static void ggml_compute_forward_mul_mat(
 }
 
 const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-const size_t row_size =
+const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
-const int64_t nr0 = ne01;
-const int64_t nr1 =
+const int64_t nr0 = ne01; // src0 rows
+const int64_t nr1 = ne1*ne12*ne13; // src1 rows
 
 //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
@@ -9694,9 +9755,9 @@ static void ggml_compute_forward_mul_mat(
 for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
 for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
 for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-const int64_t i13 = (ir1/(ne12*
-const int64_t i12 = (ir1 - i13*ne12*
-const int64_t i11 = (ir1 - i13*ne12*
+const int64_t i13 = (ir1/(ne12*ne1));
+const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
+const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
 
 // broadcast src0 into src1
 const int64_t i03 = i13/r3;
@@ -9736,20 +9797,191 @@ static void ggml_compute_forward_mul_mat(
 
 static void ggml_compute_forward_mul_mat_id(
 const struct ggml_compute_params * params,
+const struct ggml_tensor * ids,
+const struct ggml_tensor * src1,
 struct ggml_tensor * dst) {
 
-const struct ggml_tensor *
-
+const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+
+GGML_TENSOR_BINARY_OP_LOCALS
+
+const int ith = params->ith;
+const int nth = params->nth;
+
+const enum ggml_type type = src0->type;
+
+const bool src1_cont = ggml_is_contiguous(src1);
+
+ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
+enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
+ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+
+GGML_ASSERT(ne0 == ne01);
+GGML_ASSERT(ne1 == ne11);
+GGML_ASSERT(ne2 == ne12);
+GGML_ASSERT(ne3 == ne13);
+
+// we don't support permuted src0 or src1
+GGML_ASSERT(nb00 == ggml_type_size(type));
+GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+// dst cannot be transposed or permuted
+GGML_ASSERT(nb0 == sizeof(float));
+GGML_ASSERT(nb0 <= nb1);
+GGML_ASSERT(nb1 <= nb2);
+GGML_ASSERT(nb2 <= nb3);
+
+// broadcast factors
+const int64_t r2 = ne12/ne02;
+const int64_t r3 = ne13/ne03;
+
+// row groups
+const int id = ggml_get_op_params_i32(dst, 0);
+const int n_as = ggml_get_op_params_i32(dst, 1);
+
+char * wdata_src1_end = (src1->type == vec_dot_type) ?
+(char *) params->wdata :
+(char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+
+int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11]
+
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+
+if (params->type == GGML_TASK_INIT) {
+char * wdata = params->wdata;
+if (src1->type != vec_dot_type) {
+const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+assert(params->wsize >= ne11*ne12*ne13*row_size);
+assert(src1->type == GGML_TYPE_F32);
+
+for (int64_t i13 = 0; i13 < ne13; ++i13) {
+for (int64_t i12 = 0; i12 < ne12; ++i12) {
+for (int64_t i11 = 0; i11 < ne11; ++i11) {
+from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
+wdata += row_size;
+}
+}
+}
+}
+
+// initialize matrix_row_counts
+GGML_ASSERT(wdata == wdata_src1_end);
+memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
+
+// group rows by src0 matrix
+for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+
+GGML_ASSERT(row_id >= 0 && row_id < n_as);
+MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
+matrix_row_counts[row_id] += 1;
+}
+
+return;
+}
+
+if (params->type == GGML_TASK_FINALIZE) {
+return;
+}
+
+// compute each matrix multiplication in sequence
+for (int cur_a = 0; cur_a < n_as; ++cur_a) {
+const int64_t cne1 = matrix_row_counts[cur_a];
+
+if (cne1 == 0) {
+continue;
+}
 
-
+const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
 
-
+const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
-
+const int64_t nr0 = ne01; // src0 rows
+const int64_t nr1 = cne1*ne12*ne13; // src1 rows
 
-
+//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
-
+// distribute the thread work across the inner or outer loop based on which one is larger
+
+const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+
+const int64_t ith0 = ith % nth0;
+const int64_t ith1 = ith / nth0;
+
+const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+const int64_t ir010 = dr0*ith0;
+const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+const int64_t ir110 = dr1*ith1;
+const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+// threads with no work simply yield (not sure if it helps)
+if (ir010 >= ir011 || ir110 >= ir111) {
+sched_yield();
+continue;
+}
+
+assert(ne12 % ne02 == 0);
+assert(ne13 % ne03 == 0);
+
+// block-tiling attempt
+const int64_t blck_0 = 16;
+const int64_t blck_1 = 16;
+
+// attempt to reduce false-sharing (does not seem to make a difference)
+float tmp[16];
+
+for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
+const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
+const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
+
+// broadcast src0 into src1
+const int64_t i03 = i13/r3;
+const int64_t i02 = i12/r2;
+
+const int64_t i1 = i11;
+const int64_t i2 = i12;
+const int64_t i3 = i13;
+
+const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
+
+// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+// the original src1 data pointer, so we should index using the indices directly
+// TODO: this is a bit of a hack, we should probably have a better way to handle this
+const char * src1_col = (const char *) wdata +
+(src1_cont || src1->type != vec_dot_type
+? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
+: (i11*nb11 + i12*nb12 + i13*nb13));
+
+float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+//}
+
+for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+}
+memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+}
+}
+}
+}
+
+#undef MMID_MATRIX_ROW
 }
 
 // ggml_compute_forward_out_prod
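
The rewritten ggml_compute_forward_mul_mat_id above no longer dispatches one full mul_mat per expert over all rows; during GGML_TASK_INIT it buckets the src1 rows by the expert index stored in ids, and the compute phase then runs one multiplication per expert over only the rows routed to it (the MMID_MATRIX_ROW bookkeeping). A small, self-contained sketch of that grouping step; the sizes and names below are illustrative, not the diff's:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define N_AS 4   /* number of experts (illustrative) */
    #define NE11 8   /* number of src1 rows (illustrative) */

    int main(void) {
        /* ids[i] = expert chosen for row i */
        const int32_t ids[NE11] = { 0, 2, 2, 1, 0, 3, 2, 1 };

        int64_t matrix_row_counts[N_AS];
        int64_t matrix_rows[N_AS][NE11];   /* matrix_rows[e] lists the rows routed to expert e */
        memset(matrix_row_counts, 0, sizeof(matrix_row_counts));

        for (int64_t i = 0; i < NE11; ++i) {
            const int32_t e = ids[i];
            matrix_rows[e][matrix_row_counts[e]++] = i;
        }

        /* each expert now multiplies only its own rows, e.g. expert 2 handles rows 1, 2, 6 */
        for (int e = 0; e < N_AS; ++e) {
            printf("expert %d: %lld rows\n", e, (long long) matrix_row_counts[e]);
        }
        return 0;
    }
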
@@ -10093,19 +10325,17 @@ static void ggml_compute_forward_out_prod(
|
|
10093
10325
|
static void ggml_compute_forward_scale_f32(
|
10094
10326
|
const struct ggml_compute_params * params,
|
10095
10327
|
const struct ggml_tensor * src0,
|
10096
|
-
const struct ggml_tensor * src1,
|
10097
10328
|
struct ggml_tensor * dst) {
|
10098
10329
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
10099
10330
|
GGML_ASSERT(ggml_is_contiguous(dst));
|
10100
10331
|
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
10101
|
-
GGML_ASSERT(ggml_is_scalar(src1));
|
10102
10332
|
|
10103
10333
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
10104
10334
|
return;
|
10105
10335
|
}
|
10106
10336
|
|
10107
10337
|
// scale factor
|
10108
|
-
const float v = *(float *)
|
10338
|
+
const float v = *(float *) dst->op_params;
|
10109
10339
|
|
10110
10340
|
const int ith = params->ith;
|
10111
10341
|
const int nth = params->nth;
|
@@ -10136,12 +10366,11 @@ static void ggml_compute_forward_scale_f32(
|
|
10136
10366
|
static void ggml_compute_forward_scale(
|
10137
10367
|
const struct ggml_compute_params * params,
|
10138
10368
|
const struct ggml_tensor * src0,
|
10139
|
-
const struct ggml_tensor * src1,
|
10140
10369
|
struct ggml_tensor * dst) {
|
10141
10370
|
switch (src0->type) {
|
10142
10371
|
case GGML_TYPE_F32:
|
10143
10372
|
{
|
10144
|
-
ggml_compute_forward_scale_f32(params, src0,
|
10373
|
+
ggml_compute_forward_scale_f32(params, src0, dst);
|
10145
10374
|
} break;
|
10146
10375
|
default:
|
10147
10376
|
{
|
@@ -10161,7 +10390,7 @@ static void ggml_compute_forward_set_f32(
|
|
10161
10390
|
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
|
10162
10391
|
|
10163
10392
|
// view src0 and dst with these strides and data offset inbytes during set
|
10164
|
-
// nb0 is
|
10393
|
+
// nb0 is implicitly element_size because src0 and dst are contiguous
|
10165
10394
|
size_t nb1 = ((int32_t *) dst->op_params)[0];
|
10166
10395
|
size_t nb2 = ((int32_t *) dst->op_params)[1];
|
10167
10396
|
size_t nb3 = ((int32_t *) dst->op_params)[2];
|
@@ -10325,21 +10554,30 @@ static void ggml_compute_forward_get_rows_q(
|
|
10325
10554
|
return;
|
10326
10555
|
}
|
10327
10556
|
|
10328
|
-
|
10329
|
-
|
10557
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
10558
|
+
|
10559
|
+
const int64_t nc = ne00;
|
10560
|
+
const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
|
10561
|
+
|
10330
10562
|
const enum ggml_type type = src0->type;
|
10331
10563
|
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
|
10332
10564
|
|
10333
|
-
assert(
|
10334
|
-
assert(
|
10335
|
-
assert(
|
10565
|
+
assert(ne0 == nc);
|
10566
|
+
assert(ne02 == ne11);
|
10567
|
+
assert(nb00 == ggml_type_size(type));
|
10568
|
+
assert(ggml_nrows(dst) == nr);
|
10336
10569
|
|
10337
|
-
|
10338
|
-
|
10570
|
+
// TODO: multi-thread
|
10571
|
+
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
10572
|
+
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
10573
|
+
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
10574
|
+
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
10339
10575
|
|
10340
|
-
|
10341
|
-
|
10342
|
-
|
10576
|
+
dequantize_row_q(
|
10577
|
+
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
10578
|
+
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
10579
|
+
}
|
10580
|
+
}
|
10343
10581
|
}
|
10344
10582
|
}
|
10345
10583
|
|
@@ -10354,19 +10592,26 @@ static void ggml_compute_forward_get_rows_f16(
|
|
10354
10592
|
return;
|
10355
10593
|
}
|
10356
10594
|
|
10357
|
-
|
10358
|
-
const int nr = ggml_nelements(src1);
|
10595
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
10359
10596
|
|
10360
|
-
|
10361
|
-
|
10362
|
-
assert(src0->nb[0] == sizeof(ggml_fp16_t));
|
10597
|
+
const int64_t nc = ne00;
|
10598
|
+
const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
|
10363
10599
|
|
10364
|
-
|
10365
|
-
|
10600
|
+
assert(ne0 == nc);
|
10601
|
+
assert(ne02 == ne11);
|
10602
|
+
assert(nb00 == sizeof(ggml_fp16_t));
|
10603
|
+
assert(ggml_nrows(dst) == nr);
|
10366
10604
|
|
10367
|
-
|
10368
|
-
|
10369
|
-
|
10605
|
+
// TODO: multi-thread
|
10606
|
+
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
10607
|
+
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
10608
|
+
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
10609
|
+
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
10610
|
+
|
10611
|
+
ggml_fp16_to_fp32_row(
|
10612
|
+
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
10613
|
+
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
10614
|
+
}
|
10370
10615
|
}
|
10371
10616
|
}
|
10372
10617
|
}
|
@@ -10382,19 +10627,27 @@ static void ggml_compute_forward_get_rows_f32(
|
|
10382
10627
|
return;
|
10383
10628
|
}
|
10384
10629
|
|
10385
|
-
|
10386
|
-
const int nr = ggml_nelements(src1);
|
10630
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
10387
10631
|
|
10388
|
-
|
10389
|
-
|
10390
|
-
assert(src0->nb[0] == sizeof(float));
|
10632
|
+
const int64_t nc = ne00;
|
10633
|
+
const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
|
10391
10634
|
|
10392
|
-
|
10393
|
-
|
10635
|
+
assert(ne0 == nc);
|
10636
|
+
assert(ne02 == ne11);
|
10637
|
+
assert(nb00 == sizeof(float));
|
10638
|
+
assert(ggml_nrows(dst) == nr);
|
10394
10639
|
|
10395
|
-
|
10396
|
-
|
10397
|
-
|
10640
|
+
// TODO: multi-thread
|
10641
|
+
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
10642
|
+
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
10643
|
+
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
10644
|
+
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
10645
|
+
|
10646
|
+
ggml_vec_cpy_f32(nc,
|
10647
|
+
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
|
10648
|
+
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
|
10649
|
+
}
|
10650
|
+
}
|
10398
10651
|
}
|
10399
10652
|
}
|
10400
10653
|
|
@@ -11306,10 +11559,13 @@ static void ggml_compute_forward_rope_f32(
|
|
11306
11559
|
}
|
11307
11560
|
} else {
|
11308
11561
|
// TODO: this might be wrong for ne0 != n_dims - need double check
|
11309
|
-
//
|
11562
|
+
// it seems we have to rope just the first n_dims elements and do nothing with the rest
|
11563
|
+
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
|
11310
11564
|
theta_base *= freq_scale;
|
11311
|
-
for (int64_t
|
11312
|
-
|
11565
|
+
for (int64_t ic = 0; ic < ne0; ic += 2) {
|
11566
|
+
if (ic < n_dims) {
|
11567
|
+
const int64_t ib = 0;
|
11568
|
+
|
11313
11569
|
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
11314
11570
|
float cur_rot = inv_ndims * ic - ib;
|
11315
11571
|
|
@@ -11332,6 +11588,14 @@ static void ggml_compute_forward_rope_f32(
|
|
11332
11588
|
|
11333
11589
|
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
11334
11590
|
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
11591
|
+
} else {
|
11592
|
+
const int64_t i0 = ic;
|
11593
|
+
|
11594
|
+
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
11595
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
11596
|
+
|
11597
|
+
dst_data[0] = src[0];
|
11598
|
+
dst_data[1] = src[1];
|
11335
11599
|
}
|
11336
11600
|
}
|
11337
11601
|
}
|
@@ -11459,10 +11723,13 @@ static void ggml_compute_forward_rope_f16(
|
|
11459
11723
|
}
|
11460
11724
|
} else {
|
11461
11725
|
// TODO: this might be wrong for ne0 != n_dims - need double check
|
11462
|
-
//
|
11726
|
+
// it seems we have to rope just the first n_dims elements and do nothing with the rest
|
11727
|
+
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
|
11463
11728
|
theta_base *= freq_scale;
|
11464
|
-
for (int64_t
|
11465
|
-
|
11729
|
+
for (int64_t ic = 0; ic < ne0; ic += 2) {
|
11730
|
+
if (ic < n_dims) {
|
11731
|
+
const int64_t ib = 0;
|
11732
|
+
|
11466
11733
|
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
11467
11734
|
float cur_rot = inv_ndims * ic - ib;
|
11468
11735
|
|
@@ -11485,6 +11752,14 @@ static void ggml_compute_forward_rope_f16(
|
|
11485
11752
|
|
11486
11753
|
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
11487
11754
|
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
11755
|
+
} else {
|
11756
|
+
const int64_t i0 = ic;
|
11757
|
+
|
11758
|
+
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
11759
|
+
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
11760
|
+
|
11761
|
+
dst_data[0] = src[0];
|
11762
|
+
dst_data[1] = src[1];
|
11488
11763
|
}
|
11489
11764
|
}
|
11490
11765
|
}
|
@@ -12114,6 +12389,7 @@ static void ggml_compute_forward_upscale_f32(
|
|
12114
12389
|
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
12115
12390
|
|
12116
12391
|
const int ith = params->ith;
|
12392
|
+
const int nth = params->nth;
|
12117
12393
|
|
12118
12394
|
GGML_TENSOR_UNARY_OP_LOCALS
|
12119
12395
|
|
@@ -12121,16 +12397,17 @@ static void ggml_compute_forward_upscale_f32(
|
|
12121
12397
|
|
12122
12398
|
// TODO: optimize
|
12123
12399
|
|
12124
|
-
for (
|
12125
|
-
|
12126
|
-
|
12127
|
-
|
12128
|
-
|
12129
|
-
|
12130
|
-
|
12131
|
-
const
|
12400
|
+
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
12401
|
+
const int64_t i03 = i3;
|
12402
|
+
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
|
12403
|
+
const int64_t i02 = i2;
|
12404
|
+
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
12405
|
+
const int64_t i01 = i1 / scale_factor;
|
12406
|
+
for (int64_t i0 = 0; i0 < ne0; i0++) {
|
12407
|
+
const int64_t i00 = i0 / scale_factor;
|
12132
12408
|
|
12133
|
-
float *
|
12409
|
+
const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
12410
|
+
float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
12134
12411
|
|
12135
12412
|
*y = *x;
|
12136
12413
|
}
|
@@ -12155,6 +12432,64 @@ static void ggml_compute_forward_upscale(
|
|
12155
12432
|
}
|
12156
12433
|
}
|
12157
12434
|
|
12435
|
+
// ggml_compute_forward_pad
|
12436
|
+
|
12437
|
+
static void ggml_compute_forward_pad_f32(
|
12438
|
+
const struct ggml_compute_params * params,
|
12439
|
+
const struct ggml_tensor * src0,
|
12440
|
+
struct ggml_tensor * dst) {
|
12441
|
+
|
12442
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
12443
|
+
return;
|
12444
|
+
}
|
12445
|
+
|
12446
|
+
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
12447
|
+
GGML_ASSERT( dst->nb[0] == sizeof(float));
|
12448
|
+
|
12449
|
+
const int ith = params->ith;
|
12450
|
+
const int nth = params->nth;
|
12451
|
+
|
12452
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
12453
|
+
|
12454
|
+
float * dst_ptr = (float *) dst->data;
|
12455
|
+
|
12456
|
+
// TODO: optimize
|
12457
|
+
|
12458
|
+
for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
12459
|
+
for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
|
12460
|
+
for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
12461
|
+
for (int64_t i3 = 0; i3 < ne3; ++i3) {
|
12462
|
+
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
|
12463
|
+
|
12464
|
+
const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
12465
|
+
|
12466
|
+
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
12467
|
+
dst_ptr[dst_idx] = *src_ptr;
|
12468
|
+
} else {
|
12469
|
+
dst_ptr[dst_idx] = 0;
|
12470
|
+
}
|
12471
|
+
}
|
12472
|
+
}
|
12473
|
+
}
|
12474
|
+
}
|
12475
|
+
}
|
12476
|
+
|
12477
|
+
static void ggml_compute_forward_pad(
|
12478
|
+
const struct ggml_compute_params * params,
|
12479
|
+
const struct ggml_tensor * src0,
|
12480
|
+
struct ggml_tensor * dst) {
|
12481
|
+
switch (src0->type) {
|
12482
|
+
case GGML_TYPE_F32:
|
12483
|
+
{
|
12484
|
+
ggml_compute_forward_pad_f32(params, src0, dst);
|
12485
|
+
} break;
|
12486
|
+
default:
|
12487
|
+
{
|
12488
|
+
GGML_ASSERT(false);
|
12489
|
+
} break;
|
12490
|
+
}
|
12491
|
+
}
|
12492
|
+
|
12158
12493
|
// ggml_compute_forward_argsort
|
12159
12494
|
|
12160
12495
|
static void ggml_compute_forward_argsort_f32(
|
@@ -13362,10 +13697,6 @@ static void ggml_compute_forward_unary(
|
|
13362
13697
|
{
|
13363
13698
|
ggml_compute_forward_silu(params, src0, dst);
|
13364
13699
|
} break;
|
13365
|
-
case GGML_UNARY_OP_LEAKY:
|
13366
|
-
{
|
13367
|
-
ggml_compute_forward_leaky(params, src0, dst);
|
13368
|
-
} break;
|
13369
13700
|
default:
|
13370
13701
|
{
|
13371
13702
|
GGML_ASSERT(false);
|
@@ -14041,7 +14372,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14041
14372
|
} break;
|
14042
14373
|
case GGML_OP_MUL_MAT_ID:
|
14043
14374
|
{
|
14044
|
-
ggml_compute_forward_mul_mat_id(params, tensor);
|
14375
|
+
ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
|
14045
14376
|
} break;
|
14046
14377
|
case GGML_OP_OUT_PROD:
|
14047
14378
|
{
|
@@ -14049,7 +14380,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14049
14380
|
} break;
|
14050
14381
|
case GGML_OP_SCALE:
|
14051
14382
|
{
|
14052
|
-
ggml_compute_forward_scale(params, tensor->src[0], tensor
|
14383
|
+
ggml_compute_forward_scale(params, tensor->src[0], tensor);
|
14053
14384
|
} break;
|
14054
14385
|
case GGML_OP_SET:
|
14055
14386
|
{
|
@@ -14147,10 +14478,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14147
14478
|
{
|
14148
14479
|
ggml_compute_forward_upscale(params, tensor->src[0], tensor);
|
14149
14480
|
} break;
|
14481
|
+
case GGML_OP_PAD:
|
14482
|
+
{
|
14483
|
+
ggml_compute_forward_pad(params, tensor->src[0], tensor);
|
14484
|
+
} break;
|
14150
14485
|
case GGML_OP_ARGSORT:
|
14151
14486
|
{
|
14152
14487
|
ggml_compute_forward_argsort(params, tensor->src[0], tensor);
|
14153
14488
|
} break;
|
14489
|
+
case GGML_OP_LEAKY_RELU:
|
14490
|
+
{
|
14491
|
+
ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
|
14492
|
+
} break;
|
14154
14493
|
case GGML_OP_FLASH_ATTN:
|
14155
14494
|
{
|
14156
14495
|
const int32_t t = ggml_get_op_params_i32(tensor, 0);
|
@@ -14405,7 +14744,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
|
|
14405
14744
|
return replacements->vals[i];
|
14406
14745
|
}
|
14407
14746
|
|
14408
|
-
struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type,
|
14747
|
+
struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
|
14409
14748
|
|
14410
14749
|
// insert clone into replacements
|
14411
14750
|
GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
|
@@ -14475,7 +14814,7 @@ void ggml_build_backward_gradient_checkpointing(
|
|
14475
14814
|
// insert new tensors recomputing src, reusing already made replacements,
|
14476
14815
|
// remember replacements: remember new tensors with mapping from corresponding gf nodes
|
14477
14816
|
// recurse for input tensors,
|
14478
|
-
// unless (i.e. terminating when) input tensors are
|
14817
|
+
// unless (i.e. terminating when) input tensors are replacements (like checkpoints)
|
14479
14818
|
node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
|
14480
14819
|
}
|
14481
14820
|
// insert rewritten backward node with replacements made into resulting backward graph gb
|
@@ -14497,7 +14836,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg

 static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
     if (ggml_hash_contains(zero_table, a)) {
-        struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
+        struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
         return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
     } else {
         return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
@@ -14633,7 +14972,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         src0->grad,
                         ggml_scale(ctx,
                             ggml_mul(ctx, src0, tensor->grad),
-                            ggml_new_f32(ctx, 2.0f)),
+                            2.0f),
                         zero_table);
             }
         } break;
@@ -14647,7 +14986,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         ggml_div(ctx,
                             tensor->grad,
                             tensor),
-                        ggml_new_f32(ctx, 0.5f)),
+                        0.5f),
                         zero_table);
             }
         } break;
@@ -14813,17 +15152,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 // necessary for llama
                 if (src0->grad) {
+                    const float s = ((float *) tensor->op_params)[0];
+
                     src0->grad =
                         ggml_add_or_set(ctx,
                             src0->grad,
-                            ggml_scale_impl(ctx, tensor->grad, src1, false),
-                            zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                            src1->grad,
-                            ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)),
+                            ggml_scale_impl(ctx, tensor->grad, s, false),
                             zero_table);
                 }
             } break;
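Why the old `src1->grad` branch could be dropped: the scale factor is now a constant scalar parameter rather than a graph input, so SCALE's backward only has to propagate into the scaled tensor. In the notation of this hunk, with $s$ read back from op_params,

    y = s \cdot x \quad\Rightarrow\quad \frac{\partial L}{\partial x_i} = s \, \frac{\partial L}{\partial y_i},

and the former accumulation $\sum_i x_i \, \partial L / \partial y_i$ (the gradient with respect to a tensor-valued scale) no longer has a destination.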
@@ -15001,6 +15335,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 const int n_past = ((int32_t *) tensor->op_params)[0];
                 src0->grad =
                     ggml_add_or_set(ctx, src0->grad,
+                        /* ggml_diag_mask_inf_impl() shouldn't be here */
+                        /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
                         ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
                         zero_table);
             }
@@ -15143,10 +15479,18 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_PAD:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_ARGSORT:
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_LEAKY_RELU:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 struct ggml_tensor * flash_grad = NULL;
@@ -15752,6 +16096,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_ARGMAX:
         case GGML_OP_REPEAT:
         case GGML_OP_REPEAT_BACK:
+        case GGML_OP_LEAKY_RELU:
             {
                 n_tasks = 1;
             } break;
@@ -15764,7 +16109,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_ELU:
                 case GGML_UNARY_OP_RELU:
-                case GGML_UNARY_OP_LEAKY:
                     {
                         n_tasks = 1;
                     } break;
@@ -15821,7 +16165,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
-                // FIXME: blas
                 n_tasks = n_threads;
             } break;
         case GGML_OP_OUT_PROD:
@@ -15883,6 +16226,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = n_threads;
             } break;
+        case GGML_OP_PAD:
+            {
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_ARGSORT:
             {
                 n_tasks = n_threads;
@@ -16146,25 +16493,21 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } else
 #endif
                 if (node->src[1]->type != vec_dot_type) {
-                    cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
+                    cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                 }
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
-                const struct ggml_tensor * a = node->src[2];
-                const struct ggml_tensor * b = node->src[1];
-                const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
-                    if (a->type != GGML_TYPE_F32) {
-                        // here we need memory just for single 2D matrix from src0
-                        cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
-                    }
-                } else
-#endif
-                if (b->type != vec_dot_type) {
-                    cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
+                const struct ggml_tensor * src0 = node->src[2];
+                const struct ggml_tensor * src1 = node->src[1];
+                const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
+                if (src1->type != vec_dot_type) {
+                    cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
                 }
+                const int n_as = ggml_get_op_params_i32(node, 1);
+                cur = GGML_PAD(cur, sizeof(int64_t)); // align
+                cur += n_as * sizeof(int64_t); // matrix_row_counts
+                cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
             } break;
         case GGML_OP_OUT_PROD:
             {
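The new GGML_OP_MUL_MAT_ID branch sizes the shared work buffer for three things: an optional copy of src1 converted to vec_dot_type, plus two int64 bookkeeping arrays (per-expert row counts and the row map), with an alignment pad in between. A rough, hedged restatement of that arithmetic as a standalone helper; the helper name and parameter list are invented for illustration, while `ggml_row_size` and `GGML_PAD` are the same calls used in the hunk above.

    // Sketch only: the work-size arithmetic of the new GGML_OP_MUL_MAT_ID branch.
    // n_as = number of experts (op_params[1]), n_rows = src1->ne[1];
    // needs_quantize corresponds to src1->type != vec_dot_type.
    #include <stdint.h>
    #include "ggml.h"

    static size_t mul_mat_id_work_size_sketch(enum ggml_type vec_dot_type,
                                              int64_t src1_nelements, bool needs_quantize,
                                              int n_as, int64_t n_rows) {
        size_t cur = 0;
        if (needs_quantize) {
            cur = ggml_row_size(vec_dot_type, src1_nelements);    // converted copy of src1
        }
        cur  = GGML_PAD(cur, sizeof(int64_t));                    // align the int64 arrays
        cur += (size_t) n_as * sizeof(int64_t);                   // matrix_row_counts
        cur += (size_t) n_as * (size_t) n_rows * sizeof(int64_t); // matrix_rows
        return cur;
    }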
@@ -16394,7 +16737,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
     fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
-            tensor->n_dims,
+            ggml_n_dims(tensor),
             ne[0], ne[1], ne[2], ne[3],
             nb[0], nb[1], nb[2], nb[3],
             tensor->data,
@@ -16409,7 +16752,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
             arg,
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
-            tensor->n_dims,
+            ggml_n_dims(tensor),
             ne[0], ne[1], ne[2], ne[3],
             nb[0], nb[1], nb[2], nb[3],
             tensor->data,
@@ -16499,11 +16842,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {

                 const uint32_t type = tensor->type;
                 const uint32_t op = tensor->op;
-                const uint32_t n_dims = tensor->n_dims;

                 fwrite(&type, sizeof(uint32_t), 1, fout);
                 fwrite(&op, sizeof(uint32_t), 1, fout);
-                fwrite(&n_dims, sizeof(uint32_t), 1, fout);

                 for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                     const uint64_t ne = tensor->ne[j];
@@ -16533,11 +16874,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {

                 const uint32_t type = tensor->type;
                 const uint32_t op = tensor->op;
-                const uint32_t n_dims = tensor->n_dims;

                 fwrite(&type, sizeof(uint32_t), 1, fout);
                 fwrite(&op, sizeof(uint32_t), 1, fout);
-                fwrite(&n_dims, sizeof(uint32_t), 1, fout);

                 for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                     const uint64_t ne = tensor->ne[j];
@@ -16709,12 +17048,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
     {
         uint32_t type;
         uint32_t op;
-        uint32_t n_dims;

         for (uint32_t i = 0; i < n_leafs; ++i) {
             type = *(const uint32_t *) ptr; ptr += sizeof(type);
             op = *(const uint32_t *) ptr; ptr += sizeof(op);
-            n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);

             int64_t ne[GGML_MAX_DIMS];
             size_t nb[GGML_MAX_DIMS];
@@ -16730,7 +17067,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
                 nb[j] = nb_cur;
             }

-            struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+            struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);

             tensor->op = (enum ggml_op) op;

@@ -16747,7 +17084,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

             ptr += ggml_nbytes(tensor);

-            fprintf(stderr, "%s: loaded leaf %d: '%16s', %
+            fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
         }
     }

@@ -16757,12 +17094,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
     {
         uint32_t type;
         uint32_t op;
-        uint32_t n_dims;

         for (uint32_t i = 0; i < n_nodes; ++i) {
             type = *(const uint32_t *) ptr; ptr += sizeof(type);
             op = *(const uint32_t *) ptr; ptr += sizeof(op);
-            n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);

             enum ggml_op eop = (enum ggml_op) op;

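With `n_dims` dropped from both the writer and the reader, the serialized per-tensor header in ggml_graph_export/import shrinks by one uint32; the dimension count is now implied by the fixed GGML_MAX_DIMS ne/nb arrays plus `ggml_n_dims()` at load time. A hedged sketch of the read order the new code expects, restricted to the fields visible in these hunks; the helper name is invented for illustration.

    // Sketch only: start of the per-tensor record as the new import loop reads it.
    // The old format carried an extra "uint32_t n_dims" between op and the
    // dimension data; the new one does not. Fields beyond this are out of scope.
    #include <stdint.h>
    #include <string.h>

    static const char * read_tensor_header_sketch(const char * ptr, uint32_t * type, uint32_t * op) {
        memcpy(type, ptr, sizeof(*type)); ptr += sizeof(*type);
        memcpy(op,   ptr, sizeof(*op));   ptr += sizeof(*op);
        // GGML_MAX_DIMS ne/nb entries follow, exactly as before
        return ptr;
    }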
@@ -16833,7 +17168,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
                 } break;
                 default:
                     {
-                        tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+                        tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);

                         tensor->op = eop;
                     } break;
@@ -16852,7 +17187,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

             result->nodes[i] = tensor;

-            fprintf(stderr, "%s: loaded node %d: '%16s', %
+            fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
         }
     }
 }
@@ -16990,7 +17325,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             fprintf(fp, "(%s)|", ggml_type_name(node->type));
         }

-        if (node->n_dims == 2) {
+        if (ggml_is_matrix(node)) {
             fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
         } else {
             fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
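`ggml_is_matrix(node)` replaces the old rank-2 check in the DOT dump. Its definition is not part of this hunk; the following is only a plausible sketch, consistent with the always-4D representation described above.

    // Sketch only: a rank-2 test that works when tensors always carry
    // GGML_MAX_DIMS extents with unused ones set to 1.
    #include <stdbool.h>
    #include "ggml.h"

    static bool is_matrix_sketch(const struct ggml_tensor * t) {
        return t->ne[2] == 1 && t->ne[3] == 1;   // no extent beyond the first two axes
    }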
@@ -17257,7 +17592,7 @@ static enum ggml_opt_result ggml_opt_adam(
         int64_t i = 0;
         for (int p = 0; p < np; ++p) {
             const int64_t ne = ggml_nelements(ps[p]);
-            const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
+            const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
             for (int64_t j = 0; j < ne; ++j) {
                 float x = ggml_get_f32_1d(ps[p], j);
                 float g_ = g[i]*gnorm;
@@ -18531,7 +18866,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             return NULL;
         }

-        const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
+        const size_t size_cur = ggml_row_size(info->type, ne);

         ctx->size += GGML_PAD(size_cur, ctx->alignment);
     }
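`ggml_row_size(type, n)` is used here and in the graph-plan hunk above in place of the manual `ggml_type_size(type)*n/ggml_blck_size(type)` expression, so the helper presumably packages exactly that computation along with the block-size bookkeeping. A hedged sketch; the real helper may add assertions (for example, that n is a multiple of the block size).

    // Sketch only: the computation ggml_row_size appears to replace, inferred
    // from the expressions removed in these hunks.
    #include <stdint.h>
    #include "ggml.h"

    static size_t row_size_sketch(enum ggml_type type, int64_t n) {
        return ggml_type_size(type) * (size_t) n / ggml_blck_size(type);
    }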
@@ -18860,6 +19195,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }

+enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
+    return ctx->infos[i].type;
+}
+
 // returns the index
 static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     const int idx = gguf_find_key(ctx, key);
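The new `gguf_get_tensor_type()` accessor rounds out the per-tensor getters next to `gguf_get_tensor_name()`. A small hedged usage sketch; the file name is a placeholder, and `gguf_init_from_file`, `gguf_get_n_tensors`, `gguf_free`, and `ggml_type_name` are existing gguf/ggml API used here only for illustration.

    // Sketch only: listing tensor names and types from a GGUF file with the new accessor.
    // "model.gguf" is a placeholder path.
    #include <stdio.h>
    #include "ggml.h"

    static void dump_tensor_types(void) {
        struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ NULL };
        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
        if (ctx == NULL) {
            return;
        }
        for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
            printf("%s: %s\n", gguf_get_tensor_name(ctx, i), ggml_type_name(gguf_get_tensor_type(ctx, i)));
        }
        gguf_free(ctx);
    }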
@@ -19035,8 +19374,8 @@ void gguf_add_tensor(
         ctx->infos[idx].ne[i] = 1;
     }

-    ctx->infos[idx].n_dims = tensor->n_dims;
-    for (int i = 0; i < tensor->n_dims; i++) {
+    ctx->infos[idx].n_dims = ggml_n_dims(tensor);
+    for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
         ctx->infos[idx].ne[i] = tensor->ne[i];
     }
