llama_cpp 0.9.5 → 0.10.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/llama_cpp.cpp +123 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +8 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1796 -413
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +998 -169
- data/ext/llama_cpp/src/ggml-metal.metal +2253 -274
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +634 -248
- data/ext/llama_cpp/src/ggml.h +81 -15
- data/ext/llama_cpp/src/llama.cpp +932 -352
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -1,4 +1,4 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe"
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #define _USE_MATH_DEFINES // For M_PI on MSVC
 
 #include "ggml-impl.h"
@@ -33,7 +33,7 @@
 // we should just be careful :)
 #pragma warning(disable: 4244 4267)
 
-// disable POSIX deprecation
+// disable POSIX deprecation warnings
 // these functions are never going away, anyway
 #pragma warning(disable: 4996)
 #endif
@@ -233,24 +233,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
 
-//
-// tensor access macros
-//
-
-#define GGML_TENSOR_UNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
-
-#define GGML_TENSOR_BINARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
-
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
@@ -1413,7 +1395,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
-inline static void
+inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 
 static const float GELU_COEF_A = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@@ -1613,6 +1595,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GROUP_NORM",
 
     "MUL_MAT",
+    "MUL_MAT_ID",
     "OUT_PROD",
 
     "SCALE",
@@ -1640,6 +1623,9 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_1D",
     "POOL_2D",
     "UPSCALE",
+    "PAD",
+    "ARGSORT",
+    "LEAKY_RELU",
 
     "FLASH_ATTN",
     "FLASH_FF",
@@ -1666,7 +1652,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1695,6 +1681,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "group_norm(x)",
 
     "X*Y",
+    "X[i]*Y",
     "X*Y",
 
     "x*v",
@@ -1722,6 +1709,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_1d(x)",
     "pool_2d(x)",
     "upscale(x)",
+    "pad(x)",
+    "argsort(x)",
+    "leaky_relu(x)",
 
     "flash_attn(x)",
     "flash_ff(x)",
@@ -1748,15 +1738,32 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
+
+static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
+    "ABS",
+    "SGN",
+    "NEG",
+    "STEP",
+    "TANH",
+    "ELU",
+    "RELU",
+    "GELU",
+    "GELU_QUICK",
+    "SILU",
+};
+
+static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
+
+
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
 // WARN:
-// Mis-
+// Mis-configuration can lead to problem that's hard to reason about:
 // * At best it crash or talks nosense.
 // * At worst it talks slightly difference but hard to perceive.
 //
@@ -1771,6 +1778,7 @@ static void ggml_setup_op_has_task_pass(void) {
 
         p[GGML_OP_ACC ] = true;
         p[GGML_OP_MUL_MAT ] = true;
+        p[GGML_OP_MUL_MAT_ID ] = true;
         p[GGML_OP_OUT_PROD ] = true;
         p[GGML_OP_SET ] = true;
         p[GGML_OP_GET_ROWS_BACK ] = true;
@@ -1989,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
     return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }
 
-size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
-}
-
 int ggml_blck_size(enum ggml_type type) {
     return type_traits[type].blck_size;
 }
@@ -2003,8 +2005,13 @@ size_t ggml_type_size(enum ggml_type type) {
     return type_traits[type].type_size;
 }
 
-
-
+size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+    assert(ne % ggml_blck_size(type) == 0);
+    return ggml_type_size(type)*ne/ggml_blck_size(type);
+}
+
+double ggml_type_sizef(enum ggml_type type) {
+    return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
 
 const char * ggml_type_name(enum ggml_type type) {
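Aside (not part of the diff): the new ggml_row_size helper added in this hunk returns the byte size of one row of ne elements for any tensor type, handling quantized block types correctly. A minimal usage sketch; the 4096 row width is an arbitrary example value:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    const int64_t ne = 4096; // hypothetical row width; must be a multiple of the type's block size

    // F32 has block size 1, so this is just ne * sizeof(float)
    printf("f32 row:  %zu bytes\n", ggml_row_size(GGML_TYPE_F32, ne));

    // quantized types pack ne elements into blocks, so the same row is much smaller
    printf("q4_0 row: %zu bytes\n", ggml_row_size(GGML_TYPE_Q4_0, ne));

    return 0;
}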
@@ -2023,28 +2030,55 @@ const char * ggml_op_symbol(enum ggml_op op) {
     return GGML_OP_SYMBOL[op];
 }
 
+const char * ggml_unary_op_name(enum ggml_unary_op op) {
+    return GGML_UNARY_OP_NAME[op];
+}
+
+const char * ggml_op_desc(const struct ggml_tensor * t) {
+    if (t->op == GGML_OP_UNARY) {
+        enum ggml_unary_op uop = ggml_get_unary_op(t);
+        return ggml_unary_op_name(uop);
+    }
+    else {
+        return ggml_op_name(t->op);
+    }
+}
+
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return ggml_type_size(tensor->type);
 }
 
-
+bool ggml_is_scalar(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-
+bool ggml_is_vector(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-
+bool ggml_is_matrix(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
+bool ggml_is_3d(const struct ggml_tensor * tensor) {
+    return tensor->ne[3] == 1;
+}
+
+int ggml_n_dims(const struct ggml_tensor * tensor) {
+    for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
+        if (tensor->ne[i] > 1) {
+            return i + 1;
+        }
+    }
+    return 1;
+}
+
 static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
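Aside (not part of the diff): ggml_n_dims and ggml_is_3d replace the per-tensor n_dims field that this release removes from struct ggml_tensor (see the /*.n_dims =*/ deletion below). A small sketch of the semantics, assuming the usual context setup:

#include <assert.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(ip);

    // trailing dimensions of size 1 do not count toward ggml_n_dims
    struct ggml_tensor * m = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    assert(ggml_n_dims(m) == 2);
    assert(ggml_is_matrix(m)); // ne[2] == 1 && ne[3] == 1
    assert(ggml_is_3d(m));     // only requires ne[3] == 1, so a matrix qualifies too

    ggml_free(ctx);
    return 0;
}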
@@ -2451,7 +2485,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         view_src = view_src->view_src;
     }
 
-    size_t data_size =
+    size_t data_size = ggml_row_size(type, ne[0]);
     for (int i = 1; i < n_dims; i++) {
         data_size *= ne[i];
     }
@@ -2494,7 +2528,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.type =*/ type,
         /*.backend =*/ GGML_BACKEND_CPU,
         /*.buffer =*/ NULL,
-        /*.n_dims =*/ n_dims,
         /*.ne =*/ { 1, 1, 1, 1 },
         /*.nb =*/ { 0, 0, 0, 0 },
         /*.op =*/ GGML_OP_NONE,
@@ -2601,7 +2634,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
 }
 
 struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
-    return ggml_new_tensor(ctx, src->type,
+    return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
 }
 
 static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
@@ -3050,7 +3083,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         struct ggml_tensor * src) {
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type,
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
     ggml_format_name(result, "%s (view)", src->name);
 
     for (int i = 0; i < GGML_MAX_DIMS; i++) {
@@ -3154,9 +3187,7 @@ static struct ggml_tensor * ggml_add_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-
-    // GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+    GGML_ASSERT(ggml_can_repeat(b, a));
 
     bool is_node = false;
 
@@ -3210,10 +3241,10 @@ static struct ggml_tensor * ggml_add_cast_impl(
         is_node = true;
     }
 
-    struct ggml_tensor * result = ggml_new_tensor(ctx, type,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
 
     result->op = GGML_OP_ADD;
-    result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32,
+    result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -3371,9 +3402,7 @@ static struct ggml_tensor * ggml_mul_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-
-    // GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+    GGML_ASSERT(ggml_can_repeat(b, a));
 
     bool is_node = false;
 
@@ -3418,7 +3447,7 @@ static struct ggml_tensor * ggml_div_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_can_repeat(b, a));
 
     bool is_node = false;
 
@@ -3584,12 +3613,12 @@ struct ggml_tensor * ggml_sum_rows(
         is_node = true;
     }
 
-    int64_t ne[
-    for (int i=1; i<
+    int64_t ne[GGML_MAX_DIMS] = { 1 };
+    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
         ne[i] = a->ne[i];
     }
 
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
 
     result->op = GGML_OP_SUM_ROWS;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3610,8 +3639,8 @@ struct ggml_tensor * ggml_mean(
         is_node = true;
     }
 
-    int64_t ne[
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+    int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     result->op = GGML_OP_MEAN;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3633,8 +3662,7 @@ struct ggml_tensor * ggml_argmax(
         is_node = true;
     }
 
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
 
     result->op = GGML_OP_ARGMAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3657,7 +3685,7 @@ struct ggml_tensor * ggml_repeat(
         is_node = true;
     }
 
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
 
     result->op = GGML_OP_REPEAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3684,7 +3712,7 @@ struct ggml_tensor * ggml_repeat_back(
         return a;
     }
 
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
 
     result->op = GGML_OP_REPEAT_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3815,12 +3843,25 @@ struct ggml_tensor * ggml_relu_inplace(
     return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
 }
 
-//
+// ggml_leaky_relu
 
-struct ggml_tensor *
+struct ggml_tensor * ggml_leaky_relu(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-
+        struct ggml_tensor * a, float negative_slope, bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
+
+    result->op = GGML_OP_LEAKY_RELU;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
 }
 
 // ggml_gelu
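Aside (not part of the diff): leaky ReLU is now a dedicated op with an explicit slope parameter instead of the old fixed-slope GGML_UNARY_OP_LEAKY unary (removed later in this diff). A sketch of a call; the 0.1f slope is an arbitrary example value:

#include "ggml.h"

// build a graph node: y = x for x > 0, y = 0.1 * x otherwise
struct ggml_tensor * leaky(struct ggml_context * ctx, struct ggml_tensor * x) {
    return ggml_leaky_relu(ctx, x, 0.1f, /*inplace =*/ false);
}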
@@ -4007,8 +4048,9 @@ static struct ggml_tensor * ggml_group_norm_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op = GGML_OP_GROUP_NORM;
     result->op_params[0] = n_groups;
+
+    result->op = GGML_OP_GROUP_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = NULL; // TODO: maybe store epsilon here?
@@ -4046,7 +4088,7 @@ struct ggml_tensor * ggml_mul_mat(
     }
 
     const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     result->op = GGML_OP_MUL_MAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4056,6 +4098,51 @@ struct ggml_tensor * ggml_mul_mat(
     return result;
 }
 
+// ggml_mul_mat_id
+
+struct ggml_tensor * ggml_mul_mat_id(
+        struct ggml_context * ctx,
+        struct ggml_tensor * const as[],
+        int n_as,
+        struct ggml_tensor * ids,
+        int id,
+        struct ggml_tensor * b) {
+
+    GGML_ASSERT(ids->type == GGML_TYPE_I32);
+    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
+    GGML_ASSERT(ids->ne[1] == b->ne[1]);
+    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
+    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
+    GGML_ASSERT(id >= 0 && id < ids->ne[0]);
+
+    bool is_node = false;
+
+    if (as[0]->grad || b->grad) {
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    ggml_set_op_params_i32(result, 0, id);
+    ggml_set_op_params_i32(result, 1, n_as);
+
+    result->op = GGML_OP_MUL_MAT_ID;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = ids;
+    result->src[1] = b;
+
+    for (int i = 0; i < n_as; i++) {
+        struct ggml_tensor * a = as[i];
+        GGML_ASSERT(ggml_are_same_shape(as[0], a));
+        GGML_ASSERT(ggml_can_mul_mat(a, b));
+        GGML_ASSERT(!ggml_is_transposed(a));
+        result->src[i + 2] = a;
+    }
+
+    return result;
+}
+
 // ggml_out_prod
 
 struct ggml_tensor * ggml_out_prod(
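Aside (not part of the diff): ggml_mul_mat_id is the mixture-of-experts building block added in this release. src[0] holds the I32 routing tensor, src[1] the activations, and src[2..] the expert matrices; row id of ids selects one expert per column of b. A hedged sketch of how a caller might wire it up; the function name and expert count are illustrative:

#include "ggml.h"

#define N_EXPERTS 4 // hypothetical expert count

// for each column t of b: dst[:, t] = as[ids[id, t]] * b[:, t]
struct ggml_tensor * route_experts(
        struct ggml_context * ctx,
        struct ggml_tensor * as[N_EXPERTS], // expert weights, all the same shape
        struct ggml_tensor * ids,           // GGML_TYPE_I32, ne[1] == b->ne[1]
        struct ggml_tensor * b) {           // activations
    // id = 0 reads the chosen expert index from row 0 of ids
    return ggml_mul_mat_id(ctx, as, N_EXPERTS, ids, 0, b);
}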
@@ -4073,7 +4160,7 @@ struct ggml_tensor * ggml_out_prod(
 
     // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
     const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     result->op = GGML_OP_OUT_PROD;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4209,7 +4296,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
         struct ggml_tensor * b,
         size_t nb1,
         size_t offset) {
-    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset,
+    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
 }
 
 // ggml_cpy
@@ -4358,7 +4445,7 @@ struct ggml_tensor * ggml_reshape(
         //GGML_ASSERT(false);
     }
 
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type,
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
@@ -4673,7 +4760,9 @@ struct ggml_tensor * ggml_get_rows(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
-    GGML_ASSERT(
+    GGML_ASSERT(a->ne[2] == b->ne[1]);
+    GGML_ASSERT(b->ne[3] == 1);
+    GGML_ASSERT(b->type == GGML_TYPE_I32);
 
     bool is_node = false;
 
@@ -4683,7 +4772,7 @@ struct ggml_tensor * ggml_get_rows(
 
     // TODO: implement non F32 return
     //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
-    struct ggml_tensor * result =
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
 
     result->op = GGML_OP_GET_ROWS;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4734,7 +4823,7 @@ struct ggml_tensor * ggml_diag(
     }
 
     const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
 
     result->op = GGML_OP_DIAG;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5381,7 +5470,7 @@ struct ggml_tensor * ggml_pool_1d(
         is_node = true;
     }
 
-    const int64_t ne[
+    const int64_t ne[2] = {
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         a->ne[1],
     };
@@ -5461,6 +5550,30 @@ static struct ggml_tensor * ggml_upscale_impl(
     return result;
 }
 
+struct ggml_tensor * ggml_pad(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int p0, int p1, int p2, int p3) {
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] + p0,
+            a->ne[1] + p1,
+            a->ne[2] + p2,
+            a->ne[3] + p3);
+
+    result->op = GGML_OP_PAD;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
 struct ggml_tensor * ggml_upscale(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
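Aside (not part of the diff): ggml_pad appends p0..p3 elements at the end of each dimension, and the CPU kernel added later in this diff fills the new region with zeros. A short sketch; the sizes are illustrative:

#include "ggml.h"

// grow a tensor from [30, 7] to [32, 8] with trailing zeros,
// e.g. to round sizes up to a granularity a backend prefers
struct ggml_tensor * pad_up(struct ggml_context * ctx, struct ggml_tensor * x) {
    return ggml_pad(ctx, x, 2, 1, 0, 0); // p0, p1, p2, p3
}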
@@ -5468,6 +5581,43 @@ struct ggml_tensor * ggml_upscale(
     return ggml_upscale_impl(ctx, a, scale_factor);
 }
 
+// ggml_argsort
+
+struct ggml_tensor * ggml_argsort(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        enum ggml_sort_order order) {
+    bool is_node = false;
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
+
+    ggml_set_op_params_i32(result, 0, (int32_t) order);
+
+    result->op = GGML_OP_ARGSORT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_top_k
+
+struct ggml_tensor * ggml_top_k(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int k) {
+    GGML_ASSERT(a->ne[0] >= k);
+
+    struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
+
+    result = ggml_view_4d(ctx, result,
+                k, result->ne[1], result->ne[2], result->ne[3],
+                result->nb[1], result->nb[2], result->nb[3],
+                0);
+
+    return result;
+}
+
 // ggml_flash_attn
 
 struct ggml_tensor * ggml_flash_attn(
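Aside (not part of the diff): ggml_argsort returns I32 indices that sort each row, and ggml_top_k is simply a k-wide view of a descending argsort, which pairs naturally with ggml_mul_mat_id above for expert selection. A sketch:

#include "ggml.h"

// I32 indices of the 2 largest values in each row of logits
struct ggml_tensor * top2(struct ggml_context * ctx, struct ggml_tensor * logits) {
    return ggml_top_k(ctx, logits, 2);
}

// full per-row permutation, ascending
struct ggml_tensor * ranks(struct ggml_context * ctx, struct ggml_tensor * x) {
    return ggml_argsort(ctx, x, GGML_SORT_ASC);
}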
@@ -5486,7 +5636,7 @@ struct ggml_tensor * ggml_flash_attn(
     }
 
     //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
 
     int32_t t = masked ? 1 : 0;
     ggml_set_op_params(result, &t, sizeof(t));
@@ -5519,7 +5669,7 @@ struct ggml_tensor * ggml_flash_ff(
     }
 
     //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
 
     result->op = GGML_OP_FLASH_FF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5635,7 +5785,6 @@ struct ggml_tensor * ggml_win_part(
     const int np = npx*npy;
 
     const int64_t ne[4] = { a->ne[0], w, w, np, };
-
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { npx, npy, w };
@@ -6827,7 +6976,7 @@ static void ggml_compute_forward_add_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -6860,16 +7009,19 @@ static void ggml_compute_forward_add_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
 
             float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
+            for (int64_t r = 0; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-
+                vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-
+                ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
+            }
         }
     } else {
         // src1 is not contiguous
@@ -6886,8 +7038,9 @@ static void ggml_compute_forward_add_f32(
             float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
-            for (
-
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
             }
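Aside (not part of the diff): with ggml_can_repeat replacing ggml_can_repeat_rows in add/mul/div, src1 may now also be smaller than src0 within a row; the new inner r loop above repeats it every ne10 elements. A sketch of what this appears to enable; the shapes are illustrative:

#include "ggml.h"

// multiply every 64-wide row of x by a per-row scalar s without an
// explicit ggml_repeat: s broadcasts in dimension 0 (1 -> 64)
struct ggml_tensor * scale_rows(struct ggml_context * ctx) {
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 16);
    struct ggml_tensor * s = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 16);
    return ggml_mul(ctx, x, s); // valid now that ggml_can_repeat(s, x) suffices
}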
@@ -7421,7 +7574,7 @@ static void ggml_compute_forward_acc_f32(
     GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
 
     // view src0 and dst with these strides and data offset inbytes during acc
-    // nb0 is
+    // nb0 is implicitly element_size because src0 and dst are contiguous
     size_t nb1 = ((int32_t *) dst->op_params)[0];
     size_t nb2 = ((int32_t *) dst->op_params)[1];
     size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -7607,7 +7760,7 @@ static void ggml_compute_forward_mul_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -7617,6 +7770,8 @@ static void ggml_compute_forward_mul_f32(
 
 #ifdef GGML_USE_CLBLAST
     if (src1->backend == GGML_BACKEND_GPU) {
+        // TODO: OpenCL kernel support full broadcast
+        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
         if (ith == 0) {
             ggml_cl_mul(src0, src1, dst);
         }
@@ -7630,7 +7785,6 @@ static void ggml_compute_forward_mul_f32(
 
     GGML_ASSERT( nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(ne00 == ne10);
 
     if (nb10 == sizeof(float)) {
         for (int64_t ir = ith; ir < nr; ir += nth) {
@@ -7642,20 +7796,21 @@ static void ggml_compute_forward_mul_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
 
             float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
+            for (int64_t r = 0 ; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-
+                UNUSED(ggml_vec_mul_f32);
 
-
+                vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-
+                ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-
-            // }
+            }
         }
     } else {
         // src1 is not contiguous
@@ -7673,8 +7828,9 @@ static void ggml_compute_forward_mul_f32(
             float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
-            for (int64_t i0 = 0; i0 < ne00; i0
-
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
             }
@@ -7708,14 +7864,16 @@ static void ggml_compute_forward_div_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const int
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -7723,41 +7881,50 @@ static void ggml_compute_forward_div_f32(
     GGML_ASSERT(nb00 == sizeof(float));
 
     if (nb10 == sizeof(float)) {
-        for (
-            // src0
-            const
-            const
-            const
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
+            for (int64_t r = 0; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-
+                UNUSED(ggml_vec_div_f32);
 
-
-                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
-                ne0);
+                vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-
-                (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+                ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-
-            // }
+            }
         }
     } else {
         // src1 is not contiguous
-        for (
-            // src0
-
-            const
-            const
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
 
-
-
-
-
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
             }
@@ -8203,7 +8370,7 @@ static void ggml_compute_forward_repeat_f16(
         return;
     }
 
-    GGML_TENSOR_UNARY_OP_LOCALS
+    GGML_TENSOR_UNARY_OP_LOCALS
 
     // guaranteed to be an integer due to the check in ggml_can_repeat
     const int nr0 = (int)(ne0/ne00);
@@ -8348,6 +8515,7 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(src0->nb[0] == sizeof(float));
 
     const int ith = params->ith;
+    const int nth = params->nth;
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -8357,7 +8525,7 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = ith; i2 < ne2; i2
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
             if (i2 < ne02) { // src0
                 for (int i1 = 0; i1 < ne1; i1++) {
                     for (int i0 = 0; i0 < ne0; i0++) {
@@ -8869,10 +9037,9 @@ static void ggml_compute_forward_silu(
         } break;
     }
 }
+// ggml_compute_forward_leaky_relu
 
-
-
-static void ggml_compute_forward_leaky_f32(
+static void ggml_compute_forward_leaky_relu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -8886,24 +9053,27 @@ static void ggml_compute_forward_leaky_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
     assert(dst->nb[0] == sizeof(float));
     assert(src0->nb[0] == sizeof(float));
 
     for (int i = 0; i < n; i++) {
-
+        ggml_vec_leaky_relu_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
+                (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
     }
 }
 
-static void
+static void ggml_compute_forward_leaky_relu(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-
+                ggml_compute_forward_leaky_relu_f32(params, src0, dst);
             } break;
         default:
             {
@@ -9392,8 +9562,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     const int64_t ne0 = dst->ne[0];
    const int64_t ne1 = dst->ne[1];
 
+    // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
+    // all the experts for each batch element and the processing would become incredibly slow
     // TODO: find the optimal values for these
-    if (
+    if (dst->op != GGML_OP_MUL_MAT_ID &&
+        ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
         //src0->type == GGML_TYPE_F32 &&
         src1->type == GGML_TYPE_F32 &&
@@ -9407,11 +9580,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     }
 #endif
 
+// off1 = offset in i11 and i1
+// cne1 = ne11 and ne1
+// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
+// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
 static void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor * dst
+        struct ggml_tensor * dst,
+        int64_t off1, int64_t cne1) {
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
@@ -9479,10 +9657,9 @@ static void ggml_compute_forward_mul_mat(
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
 
-                const void * x = (char *) src0->data +
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-
-                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
+                float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
                     float * const wdata = params->wdata;
@@ -9499,10 +9676,10 @@ static void ggml_compute_forward_mul_mat(
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-
-
-
-
+                        cne1, ne01, ne10,
+                        1.0f, y, ne10,
+                        x, ne00,
+                        0.0f, d, ne01);
             }
         }
 
@@ -9515,7 +9692,10 @@ static void ggml_compute_forward_mul_mat(
     if (params->type == GGML_TASK_INIT) {
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
-            const size_t row_size =
+            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+            assert(params->wsize >= ne11*ne12*ne13*row_size);
+            assert(src1->type == GGML_TYPE_F32);
 
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -9535,10 +9715,10 @@ static void ggml_compute_forward_mul_mat(
     }
 
     const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size =
+    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
    const int64_t nr0 = ne01; // src0 rows
-    const int64_t nr1 =
+    const int64_t nr1 = cne1*ne12*ne13; // src1 rows
 
     //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
@@ -9580,9 +9760,9 @@ static void ggml_compute_forward_mul_mat(
     for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
         for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
             for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                const int64_t i13 = (ir1/(ne12*
-                const int64_t i12 = (ir1 - i13*ne12*
-                const int64_t i11 = (ir1 - i13*ne12*
+                const int64_t i13 = (ir1/(ne12*cne1));
+                const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+                const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
 
                 // broadcast src0 into src1
                 const int64_t i03 = i13/r3;
@@ -9618,6 +9798,34 @@ static void ggml_compute_forward_mul_mat(
     }
 }
 
+// ggml_compute_forward_mul_mat_id
+
+static void ggml_compute_forward_mul_mat_id(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
+        ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
+        return;
+    }
+
+    const struct ggml_tensor * ids = src0;
+    const int id = ggml_get_op_params_i32(dst, 0);
+    const int n_as = ggml_get_op_params_i32(dst, 1);
+
+    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+        const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+
+        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+        ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
+    }
+}
+
 // ggml_compute_forward_out_prod
 
 static void ggml_compute_forward_out_prod_f32(
@@ -10027,7 +10235,7 @@ static void ggml_compute_forward_set_f32(
     GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
 
     // view src0 and dst with these strides and data offset inbytes during set
-    // nb0 is
+    // nb0 is implicitly element_size because src0 and dst are contiguous
     size_t nb1 = ((int32_t *) dst->op_params)[0];
     size_t nb2 = ((int32_t *) dst->op_params)[1];
     size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -10191,21 +10399,30 @@ static void ggml_compute_forward_get_rows_q(
         return;
     }
 
-
-
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+
     const enum ggml_type type = src0->type;
     ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
 
-    assert(
-    assert(
-    assert(
+    assert(ne0 == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == ggml_type_size(type));
+    assert(ggml_nrows(dst) == nr);
 
-
-
+    // TODO: multi-thread
+    for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        for (int64_t i11 = 0; i11 < ne11; ++i11) {
+            for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
-
-
-
+                dequantize_row_q(
+                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                        (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
+            }
+        }
     }
 }
 
@@ -10220,19 +10437,26 @@ static void ggml_compute_forward_get_rows_f16(
         return;
     }
 
-
-    const int nr = ggml_nelements(src1);
+    GGML_TENSOR_BINARY_OP_LOCALS
 
-
-
-    assert(src0->nb[0] == sizeof(ggml_fp16_t));
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
 
-
-
+    assert(ne0 == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(ggml_fp16_t));
+    assert(ggml_nrows(dst) == nr);
 
-
-
-
+    // TODO: multi-thread
+    for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        for (int64_t i11 = 0; i11 < ne11; ++i11) {
+            for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+                ggml_fp16_to_fp32_row(
+                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                        (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
+            }
         }
     }
 }
@@ -10248,19 +10472,27 @@ static void ggml_compute_forward_get_rows_f32(
         return;
     }
 
-
-    const int nr = ggml_nelements(src1);
+    GGML_TENSOR_BINARY_OP_LOCALS
 
-
-
-    assert(src0->nb[0] == sizeof(float));
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
 
-
-
+    assert(ne0 == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(float));
+    assert(ggml_nrows(dst) == nr);
 
-
-
-
+    // TODO: multi-thread
+    for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        for (int64_t i11 = 0; i11 < ne11; ++i11) {
+            for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+                ggml_vec_cpy_f32(nc,
+                        (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
+                        (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
+            }
+        }
     }
 }
 
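Aside (not part of the diff): the rewritten get_rows kernels above iterate a batched index tensor, matching the relaxed shape checks added to ggml_get_rows earlier in this diff. The common embedding-lookup pattern still looks like this sketch:

#include "ggml.h"

// select rows of emb by I32 token ids; result is F32 [n_embd, n_tokens]
struct ggml_tensor * embed(
        struct ggml_context * ctx,
        struct ggml_tensor * emb,      // [n_embd, n_vocab]
        struct ggml_tensor * tokens) { // GGML_TYPE_I32, [n_tokens]
    return ggml_get_rows(ctx, emb, tokens);
}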
@@ -11980,6 +12212,7 @@ static void ggml_compute_forward_upscale_f32(
|
|
11980
12212
|
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
11981
12213
|
|
11982
12214
|
const int ith = params->ith;
|
12215
|
+
const int nth = params->nth;
|
11983
12216
|
|
11984
12217
|
GGML_TENSOR_UNARY_OP_LOCALS
|
11985
12218
|
|
@@ -11987,16 +12220,17 @@ static void ggml_compute_forward_upscale_f32(
|
|
11987
12220
|
|
11988
12221
|
// TODO: optimize
|
11989
12222
|
|
11990
|
-
for (
|
11991
|
-
|
11992
|
-
|
11993
|
-
|
11994
|
-
|
11995
|
-
|
11996
|
-
|
11997
|
-
const
|
12223
|
+
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
12224
|
+
const int64_t i03 = i3;
|
12225
|
+
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
|
12226
|
+
const int64_t i02 = i2;
|
12227
|
+
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
12228
|
+
const int64_t i01 = i1 / scale_factor;
|
12229
|
+
for (int64_t i0 = 0; i0 < ne0; i0++) {
|
12230
|
+
const int64_t i00 = i0 / scale_factor;
|
11998
12231
|
|
11999
|
-
float *
|
12232
|
+
const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
12233
|
+
float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
12000
12234
|
|
12001
12235
|
*y = *x;
|
12002
12236
|
}
|
@@ -12021,6 +12255,125 @@ static void ggml_compute_forward_upscale(
|
|
12021
12255
|
}
|
12022
12256
|
}
|
12023
12257
|
|
12258
|
+
// ggml_compute_forward_pad
|
12259
|
+
|
12260
|
+
static void ggml_compute_forward_pad_f32(
|
12261
|
+
const struct ggml_compute_params * params,
|
12262
|
+
const struct ggml_tensor * src0,
|
12263
|
+
struct ggml_tensor * dst) {
|
12264
|
+
|
12265
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
12266
|
+
return;
|
12267
|
+
}
|
12268
|
+
|
12269
|
+
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
12270
|
+
GGML_ASSERT( dst->nb[0] == sizeof(float));
|
12271
|
+
|
12272
|
+
const int ith = params->ith;
|
12273
|
+
const int nth = params->nth;
|
12274
|
+
|
12275
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
12276
|
+
|
12277
|
+
float * dst_ptr = (float *) dst->data;
|
12278
|
+
|
12279
|
+
// TODO: optimize
|
12280
|
+
|
12281
|
+
for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
12282
|
+
for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
|
12283
|
+
for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
12284
|
+
for (int64_t i3 = 0; i3 < ne3; ++i3) {
|
12285
|
+
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
|
12286
|
+
|
12287
|
+
const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
12288
|
+
|
12289
|
+
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
12290
|
+
dst_ptr[dst_idx] = *src_ptr;
|
12291
|
+
} else {
|
12292
|
+
dst_ptr[dst_idx] = 0;
|
12293
|
+
}
|
12294
|
+
}
|
12295
|
+
}
|
12296
|
+
}
|
12297
|
+
}
|
12298
|
+
}
|
12299
|
+
|
12300
|
+
static void ggml_compute_forward_pad(
|
12301
|
+
const struct ggml_compute_params * params,
|
12302
|
+
const struct ggml_tensor * src0,
|
12303
|
+
struct ggml_tensor * dst) {
|
12304
|
+
switch (src0->type) {
|
12305
|
+
case GGML_TYPE_F32:
|
12306
|
+
{
|
12307
|
+
ggml_compute_forward_pad_f32(params, src0, dst);
|
12308
|
+
} break;
|
12309
|
+
default:
|
12310
|
+
{
|
12311
|
+
GGML_ASSERT(false);
|
12312
|
+
} break;
|
12313
|
+
}
|
12314
|
+
}
|
12315
|
+
|
12316
|
+
// ggml_compute_forward_argsort
|
12317
|
+
|
12318
|
+
static void ggml_compute_forward_argsort_f32(
|
12319
|
+
const struct ggml_compute_params * params,
|
12320
|
+
const struct ggml_tensor * src0,
|
12321
|
+
struct ggml_tensor * dst) {
|
12322
|
+
|
12323
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
12324
|
+
return;
|
12325
|
+
}
|
12326
|
+
|
12327
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
12328
|
+
|
12329
|
+
GGML_ASSERT(nb0 == sizeof(float));
|
12330
|
+
|
12331
|
+
const int ith = params->ith;
|
12332
|
+
const int nth = params->nth;
|
12333
|
+
|
12334
|
+
const int64_t nr = ggml_nrows(src0);
|
12335
|
+
|
12336
|
+
enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
|
12337
|
+
|
12338
|
+
for (int64_t i = ith; i < nr; i += nth) {
|
12339
|
+
int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
|
12340
|
+
const float * src_data = (float *)((char *) src0->data + i*nb01);
|
12341
|
+
|
12342
|
+
for (int64_t j = 0; j < ne0; j++) {
|
12343
|
+
dst_data[j] = j;
|
12344
|
+
}
|
12345
|
+
|
12346
|
+
// C doesn't have a functional sort, so we do a bubble sort instead
|
12347
|
+
for (int64_t j = 0; j < ne0; j++) {
|
12348
|
+
for (int64_t k = j + 1; k < ne0; k++) {
|
12349
|
+
if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
|
12350
|
+
(order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
|
12351
|
+
int32_t tmp = dst_data[j];
|
12352
|
+
dst_data[j] = dst_data[k];
|
12353
|
+
dst_data[k] = tmp;
|
12354
|
+
}
|
12355
|
+
}
|
12356
|
+
}
|
12357
|
+
}
|
12358
|
+
}
|
12359
|
+
|
12360
|
+
static void ggml_compute_forward_argsort(
|
12361
|
+
const struct ggml_compute_params * params,
|
12362
|
+
const struct ggml_tensor * src0,
|
12363
|
+
struct ggml_tensor * dst) {
|
12364
|
+
|
12365
|
+
switch (src0->type) {
|
12366
|
+
case GGML_TYPE_F32:
|
12367
|
+
{
|
12368
|
+
ggml_compute_forward_argsort_f32(params, src0, dst);
|
12369
|
+
} break;
|
12370
|
+
default:
|
12371
|
+
{
|
12372
|
+
GGML_ASSERT(false);
|
12373
|
+
} break;
|
12374
|
+
}
|
12375
|
+
}
|
12376
|
+
|
12024
12377
|
// ggml_compute_forward_flash_attn
|
12025
12378
|
|
12026
12379
|
static void ggml_compute_forward_flash_attn_f32(
|
@@ -13167,10 +13520,6 @@ static void ggml_compute_forward_unary(
|
|
13167
13520
|
{
|
13168
13521
|
ggml_compute_forward_silu(params, src0, dst);
|
13169
13522
|
} break;
|
13170
|
-
case GGML_UNARY_OP_LEAKY:
|
13171
|
-
{
|
13172
|
-
ggml_compute_forward_leaky(params, src0, dst);
|
13173
|
-
} break;
|
13174
13523
|
default:
|
13175
13524
|
{
|
13176
13525
|
GGML_ASSERT(false);
|
@@ -13842,7 +14191,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
13842
14191
|
} break;
|
13843
14192
|
case GGML_OP_MUL_MAT:
|
13844
14193
|
{
|
13845
|
-
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
|
14194
|
+
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
|
14195
|
+
} break;
|
14196
|
+
case GGML_OP_MUL_MAT_ID:
|
14197
|
+
{
|
14198
|
+
ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
|
13846
14199
|
} break;
|
13847
14200
|
case GGML_OP_OUT_PROD:
|
13848
14201
|
{
|
@@ -13948,6 +14301,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
13948
14301
|
{
|
13949
14302
|
ggml_compute_forward_upscale(params, tensor->src[0], tensor);
|
13950
14303
|
} break;
|
14304
|
+
case GGML_OP_PAD:
|
14305
|
+
{
|
14306
|
+
ggml_compute_forward_pad(params, tensor->src[0], tensor);
|
14307
|
+
} break;
|
14308
|
+
case GGML_OP_ARGSORT:
|
14309
|
+
{
|
14310
|
+
ggml_compute_forward_argsort(params, tensor->src[0], tensor);
|
14311
|
+
} break;
|
14312
|
+
case GGML_OP_LEAKY_RELU:
|
14313
|
+
{
|
14314
|
+
ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
|
14315
|
+
} break;
|
13951
14316
|
case GGML_OP_FLASH_ATTN:
|
13952
14317
|
{
|
13953
14318
|
const int32_t t = ggml_get_op_params_i32(tensor, 0);
|
@@ -14202,7 +14567,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
         return replacements->vals[i];
     }
 
-    struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
+    struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
 
     // insert clone into replacements
     GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
@@ -14272,7 +14637,7 @@ void ggml_build_backward_gradient_checkpointing(
             // insert new tensors recomputing src, reusing already made replacements,
             // remember replacements: remember new tensors with mapping from corresponding gf nodes
             // recurse for input tensors,
-            // unless (i.e. terminating when) input tensors are replacments (like checkpoints)
+            // unless (i.e. terminating when) input tensors are replacements (like checkpoints)
             node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
         }
         // insert rewritten backward node with replacements made into resulting backward graph gb
@@ -14598,6 +14963,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set * zero_table) {
                         zero_table);
             }
         } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_OUT_PROD:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -14936,6 +15305,18 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set * zero_table) {
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_PAD:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
+        case GGML_OP_ARGSORT:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
+        case GGML_OP_LEAKY_RELU:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 struct ggml_tensor * flash_grad = NULL;
@@ -15296,12 +15677,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
     return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
 }
 
-struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
-
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
-    struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
-
-    *cgraph = (struct ggml_cgraph) {
+struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
+    struct ggml_cgraph cgraph = {
         /*.size         =*/ 0,
         /*.n_nodes      =*/ i1 - i0,
         /*.n_leafs      =*/ 0,
|
|
15536
15913
|
n_tasks = n_threads;
|
15537
15914
|
} break;
|
15538
15915
|
case GGML_OP_SUB:
|
15539
|
-
case GGML_OP_DIV:
|
15540
15916
|
case GGML_OP_SQR:
|
15541
15917
|
case GGML_OP_SQRT:
|
15542
15918
|
case GGML_OP_LOG:
|
@@ -15546,6 +15922,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_ARGMAX:
         case GGML_OP_REPEAT:
         case GGML_OP_REPEAT_BACK:
+        case GGML_OP_LEAKY_RELU:
             {
                 n_tasks = 1;
             } break;
@@ -15558,7 +15935,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_ELU:
                 case GGML_UNARY_OP_RELU:
-                case GGML_UNARY_OP_LEAKY:
                     {
                         n_tasks = 1;
                     } break;
@@ -15569,10 +15945,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                     {
                         n_tasks = n_threads;
                     } break;
+                default:
+                    GGML_ASSERT(false);
             }
             break;
         case GGML_OP_SILU_BACK:
         case GGML_OP_MUL:
+        case GGML_OP_DIV:
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
         case GGML_OP_RMS_NORM_BACK:
@@ -15610,6 +15989,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 }
 #endif
             } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                // FIXME: blas
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_OUT_PROD:
             {
                 n_tasks = n_threads;
@@ -15629,7 +16013,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_DIAG_MASK_ZERO:
         case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
         case GGML_OP_SOFT_MAX_BACK:
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK:
@@ -15645,6 +16028,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1; //TODO
             } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 n_tasks = n_threads;
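Taken together with the removal of GGML_OP_SOFT_MAX from the shared op list above, soft max now gets its own task count, capped at both four threads and the number of rows: with n_threads = 8 and a 4096-row src0 this gives MIN(MIN(4, 8), 4096) = 4 tasks, while a single-row tensor runs on one thread no matter how large the thread pool is.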
@@ -15666,6 +16053,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = n_threads;
             } break;
+        case GGML_OP_PAD:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_ARGSORT:
+            {
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 n_tasks = n_threads;
@@ -15728,6 +16123,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1;
             } break;
+        case GGML_OP_COUNT:
+            {
+                GGML_ASSERT(false);
+            } break;
         default:
             {
                 fprintf(stderr, "%s: op not implemented: ", __func__);
@@ -15876,18 +16275,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
     // thread scheduling for the different operations + work buffer size estimation
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        int n_tasks = 1;
-
         struct ggml_tensor * node = cgraph->nodes[i];
 
+        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+
         size_t cur = 0;
 
         switch (node->op) {
             case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }
@@ -15895,16 +16292,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_ADD:
             case GGML_OP_ADD1:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
             case GGML_OP_ACC:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                     }
@@ -15927,21 +16320,34 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     } else
 #endif
                     if (node->src[1]->type != vec_dot_type) {
-                        cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
+                        cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                    }
+                } break;
+            case GGML_OP_MUL_MAT_ID:
+                {
+                    const struct ggml_tensor * a = node->src[2];
+                    const struct ggml_tensor * b = node->src[1];
+                    const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                    if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
+                        if (a->type != GGML_TYPE_F32) {
+                            // here we need memory just for single 2D matrix from src0
+                            cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
+                        }
+                    } else
+#endif
+                    if (b->type != vec_dot_type) {
+                        cur = ggml_row_size(vec_dot_type, ggml_nelements(b));
                     }
                 } break;
             case GGML_OP_OUT_PROD:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
             case GGML_OP_SOFT_MAX:
                 {
-                    n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
-
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 } break;
             case GGML_OP_CONV_TRANSPOSE_1D:
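The work-buffer estimates above now go through ggml_row_size, which handles block-quantized types where ggml_type_size is the byte size of a whole block rather than of one element. A worked check of the arithmetic (the Q4_0 figures, 32 elements per block stored in 18 bytes, are the standard ggml constants):

    // bytes per row = (n elements / block size) * bytes per block
    // Q4_0: 4096/32 blocks * 18 bytes = 2304 bytes,
    // versus 4096*4 = 16384 bytes for the same row in F32.
    size_t row_bytes = ggml_row_size(GGML_TYPE_Q4_0, 4096); // 2304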
@@ -15969,10 +16375,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         GGML_ASSERT(false);
                     }
                 } break;
-            case GGML_OP_IM2COL:
-                {
-                    n_tasks = n_threads;
-                } break;
             case GGML_OP_CONV_TRANSPOSE_2D:
                 {
                     const int64_t ne00 = node->src[0]->ne[0]; // W
@@ -15989,8 +16391,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_ATTN:
                 {
-                    n_tasks = n_threads;
-
                     const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
 
                     if (node->src[1]->type == GGML_TYPE_F32) {
@@ -16003,8 +16403,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_FF:
                 {
-                    n_tasks = n_threads;
-
                     if (node->src[1]->type == GGML_TYPE_F32) {
                         cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
@@ -16015,8 +16413,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_ATTN_BACK:
                 {
-                    n_tasks = n_threads;
-
                    const int64_t    D = node->src[0]->ne[0];
                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
                    const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
@@ -16031,8 +16427,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {
-                    n_tasks = n_threads;
-
                     cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
                 } break;
             case GGML_OP_COUNT:
@@ -16174,7 +16568,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
     fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
-            tensor->n_dims,
+            ggml_n_dims(tensor),
             ne[0], ne[1], ne[2], ne[3],
             nb[0], nb[1], nb[2], nb[3],
             tensor->data,
@@ -16189,7 +16583,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
             arg,
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
-            tensor->n_dims,
+            ggml_n_dims(tensor),
             ne[0], ne[1], ne[2], ne[3],
             nb[0], nb[1], nb[2], nb[3],
             tensor->data,
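The two export hunks above are part of removing the n_dims field from ggml_tensor: tensors now always carry GGML_MAX_DIMS extents, and the effective rank is recovered from trailing dimensions of size one. A sketch of what ggml_n_dims is expected to compute (an illustrative reimplementation, not the library source):

    #include <stdint.h>

    // Effective rank: index of the highest dimension with extent > 1, plus one.
    // ne = {4096, 1, 1, 1} reports 1; ne = {4096, 32, 1, 1} reports 2.
    static int n_dims_sketch(const int64_t ne[4 /* GGML_MAX_DIMS */]) {
        for (int i = 4 - 1; i >= 1; --i) {
            if (ne[i] > 1) {
                return i + 1;
            }
        }
        return 1;
    }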
@@ -16279,11 +16673,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
 
         const uint32_t type   = tensor->type;
         const uint32_t op     = tensor->op;
-        const uint32_t n_dims = tensor->n_dims;
 
         fwrite(&type,   sizeof(uint32_t), 1, fout);
         fwrite(&op,     sizeof(uint32_t), 1, fout);
-        fwrite(&n_dims, sizeof(uint32_t), 1, fout);
 
         for (int j = 0; j < GGML_MAX_DIMS; ++j) {
             const uint64_t ne = tensor->ne[j];
@@ -16313,11 +16705,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
 
         const uint32_t type   = tensor->type;
        const uint32_t op     = tensor->op;
-        const uint32_t n_dims = tensor->n_dims;
 
         fwrite(&type,   sizeof(uint32_t), 1, fout);
         fwrite(&op,     sizeof(uint32_t), 1, fout);
-        fwrite(&n_dims, sizeof(uint32_t), 1, fout);
 
         for (int j = 0; j < GGML_MAX_DIMS; ++j) {
             const uint64_t ne = tensor->ne[j];
@@ -16489,12 +16879,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
     {
         uint32_t type;
         uint32_t op;
-        uint32_t n_dims;
 
         for (uint32_t i = 0; i < n_leafs; ++i) {
             type   = *(const uint32_t *) ptr; ptr += sizeof(type);
             op     = *(const uint32_t *) ptr; ptr += sizeof(op);
-            n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
 
             int64_t ne[GGML_MAX_DIMS];
             size_t  nb[GGML_MAX_DIMS];
@@ -16510,7 +16898,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
                 nb[j] = nb_cur;
             }
 
-            struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+            struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
 
             tensor->op = (enum ggml_op) op;
 
@@ -16527,7 +16915,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
 
             ptr += ggml_nbytes(tensor);
 
-            fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
         }
     }
 
@@ -16537,12 +16925,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
     {
         uint32_t type;
         uint32_t op;
-        uint32_t n_dims;
 
         for (uint32_t i = 0; i < n_nodes; ++i) {
             type   = *(const uint32_t *) ptr; ptr += sizeof(type);
             op     = *(const uint32_t *) ptr; ptr += sizeof(op);
-            n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
 
             enum ggml_op eop = (enum ggml_op) op;
 
@@ -16613,7 +16999,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
                 } break;
                 default:
                     {
-                        tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+                        tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
 
                         tensor->op = eop;
                     } break;
@@ -16632,7 +17018,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
 
             result->nodes[i] = tensor;
 
-            fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
         }
     }
 }
@@ -16770,7 +17156,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
             fprintf(fp, "(%s)|", ggml_type_name(node->type));
         }
 
-        if (node->n_dims == 2) {
+        if (ggml_is_matrix(node)) {
             fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
         } else {
             fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
@@ -17037,7 +17423,7 @@ static enum ggml_opt_result ggml_opt_adam(
         int64_t i = 0;
         for (int p = 0; p < np; ++p) {
             const int64_t ne = ggml_nelements(ps[p]);
-            const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
+            const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
             for (int64_t j = 0; j < ne; ++j) {
                 float x  = ggml_get_f32_1d(ps[p], j);
                 float g_ = g[i]*gnorm;
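The decay gate keeps its old behavior under the new accessor: weight decay is applied only to parameters whose rank is at least decay_min_ndim, so with the usual value of 2 the weight matrices decay while 1-D biases and norm weights do not. A worked check of the gate (comment-only sketch; variable names follow the line above):

    // a bias (rank 1), decay_min_ndim = 2:
    //   p_decay = (1 >= 2 ? decay : 0.0f) * sched = 0.0f  -> no shrinkage
    // a weight matrix (rank 2):
    //   p_decay = decay * sched  -> the AdamW-style update multiplies x by
    //   (1 - p_decay) before applying the gradient step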
@@ -17819,8 +18205,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
         memcpy(&qh, &y[i].qh, sizeof(qh));
 
         for (int j = 0; j < QK5_0; j += 2) {
-            const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
+            const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
+            const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
 
             // cast to 16 bins
             const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@@ -17849,8 +18235,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
         memcpy(&qh, &y[i].qh, sizeof(qh));
 
         for (int j = 0; j < QK5_1; j += 2) {
-            const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
+            const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
+            const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
 
             // cast to 16 bins
             const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
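Both hunks fix the same off-by-one in the high-bit bookkeeping: qh packs one high bit per element, qs[j/2] stores elements j/2 (low nibble) and j/2 + 16 (high nibble), and the loop advances j by 2. A worked check: at j = 2 the corrected code reads qh bits 1 (j/2) and 17 (j/2 + 16), which belong to elements 1 and 17 of the block, whereas the old j-based form read bits 2 and 18 and drifted further off for every later pair. This indexing feeds the quantization histogram gathered in these wrappers, since the packed bytes themselves are written by the quantize_row_q5_0/quantize_row_q5_1 reference routines with the correct j/2-based layout.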
@@ -18040,6 +18426,7 @@ struct gguf_kv {
 
 struct gguf_header {
     char magic[4];
+
     uint32_t version;
     uint64_t n_tensors; // GGUFv2
     uint64_t n_kv;      // GGUFv2
@@ -18129,7 +18516,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
 
     for (uint32_t i = 0; i < sizeof(magic); i++) {
         if (magic[i] != GGUF_MAGIC[i]) {
-            fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
+            fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
             fclose(file);
             return NULL;
         }
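Worth noting on the fix above: magic is a plain char[4] with no terminating NUL, so a %s-style print can read past the buffer; printing the four bytes individually is safe. A precision-limited format would be an equivalent fix:

    // %.4s prints at most 4 bytes and never looks for a terminator
    fprintf(stderr, "%s: invalid magic characters '%.4s'\n", __func__, magic);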
@@ -18144,7 +18531,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
     {
         strncpy(ctx->header.magic, magic, 4);
 
-
         ctx->kv    = NULL;
         ctx->infos = NULL;
         ctx->data  = NULL;
@@ -18311,7 +18697,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
             return NULL;
         }
 
-        const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
+        const size_t size_cur = ggml_row_size(info->type, ne);
 
         ctx->size += GGML_PAD(size_cur, ctx->alignment);
     }
@@ -18815,8 +19201,8 @@ void gguf_add_tensor(
         ctx->infos[idx].ne[i] = 1;
     }
 
-    ctx->infos[idx].n_dims = tensor->n_dims;
-    for (int i = 0; i < tensor->n_dims; i++) {
+    ctx->infos[idx].n_dims = ggml_n_dims(tensor);
+    for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
         ctx->infos[idx].ne[i] = tensor->ne[i];
     }
 