llama_cpp 0.9.5 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/llama_cpp.cpp +123 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +8 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1796 -413
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +998 -169
- data/ext/llama_cpp/src/ggml-metal.metal +2253 -274
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +634 -248
- data/ext/llama_cpp/src/ggml.h +81 -15
- data/ext/llama_cpp/src/llama.cpp +932 -352
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -1,4 +1,4 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe"
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #define _USE_MATH_DEFINES // For M_PI on MSVC
 
 #include "ggml-impl.h"
@@ -33,7 +33,7 @@
 // we should just be careful :)
 #pragma warning(disable: 4244 4267)
 
-// disable POSIX deprecation
+// disable POSIX deprecation warnings
 // these functions are never going away, anyway
 #pragma warning(disable: 4996)
 #endif
@@ -233,24 +233,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
 
-//
-// tensor access macros
-//
-
-#define GGML_TENSOR_UNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
-
-#define GGML_TENSOR_BINARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
-
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
@@ -1413,7 +1395,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
-inline static void
+inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 
 static const float GELU_COEF_A     = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@@ -1613,6 +1595,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GROUP_NORM",
 
     "MUL_MAT",
+    "MUL_MAT_ID",
     "OUT_PROD",
 
     "SCALE",
@@ -1640,6 +1623,9 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_1D",
     "POOL_2D",
     "UPSCALE",
+    "PAD",
+    "ARGSORT",
+    "LEAKY_RELU",
 
     "FLASH_ATTN",
     "FLASH_FF",
@@ -1666,7 +1652,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1695,6 +1681,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "group_norm(x)",
 
     "X*Y",
+    "X[i]*Y",
     "X*Y",
 
     "x*v",
@@ -1722,6 +1709,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_1d(x)",
     "pool_2d(x)",
     "upscale(x)",
+    "pad(x)",
+    "argsort(x)",
+    "leaky_relu(x)",
 
     "flash_attn(x)",
     "flash_ff(x)",
@@ -1748,15 +1738,32 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
+
+static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
+    "ABS",
+    "SGN",
+    "NEG",
+    "STEP",
+    "TANH",
+    "ELU",
+    "RELU",
+    "GELU",
+    "GELU_QUICK",
+    "SILU",
+};
+
+static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
+
+
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
 // WARN:
-// Mis-
+// Mis-configuration can lead to problem that's hard to reason about:
 // * At best it crash or talks nosense.
 // * At worst it talks slightly difference but hard to perceive.
 //
@@ -1771,6 +1778,7 @@ static void ggml_setup_op_has_task_pass(void) {
 
         p[GGML_OP_ACC ] = true;
         p[GGML_OP_MUL_MAT ] = true;
+        p[GGML_OP_MUL_MAT_ID ] = true;
         p[GGML_OP_OUT_PROD ] = true;
         p[GGML_OP_SET ] = true;
         p[GGML_OP_GET_ROWS_BACK ] = true;
@@ -1989,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
     return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }
 
-size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
-}
-
 int ggml_blck_size(enum ggml_type type) {
     return type_traits[type].blck_size;
 }
@@ -2003,8 +2005,13 @@ size_t ggml_type_size(enum ggml_type type) {
     return type_traits[type].type_size;
 }
 
-
-
+size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+    assert(ne % ggml_blck_size(type) == 0);
+    return ggml_type_size(type)*ne/ggml_blck_size(type);
+}
+
+double ggml_type_sizef(enum ggml_type type) {
+    return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
 
 const char * ggml_type_name(enum ggml_type type) {
@@ -2023,28 +2030,55 @@ const char * ggml_op_symbol(enum ggml_op op) {
     return GGML_OP_SYMBOL[op];
 }
 
+const char * ggml_unary_op_name(enum ggml_unary_op op) {
+    return GGML_UNARY_OP_NAME[op];
+}
+
+const char * ggml_op_desc(const struct ggml_tensor * t) {
+    if (t->op == GGML_OP_UNARY) {
+        enum ggml_unary_op uop = ggml_get_unary_op(t);
+        return ggml_unary_op_name(uop);
+    }
+    else {
+        return ggml_op_name(t->op);
+    }
+}
+
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return ggml_type_size(tensor->type);
 }
 
-
+bool ggml_is_scalar(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-
+bool ggml_is_vector(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-
+bool ggml_is_matrix(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
+bool ggml_is_3d(const struct ggml_tensor * tensor) {
+    return tensor->ne[3] == 1;
+}
+
+int ggml_n_dims(const struct ggml_tensor * tensor) {
+    for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
+        if (tensor->ne[i] > 1) {
+            return i + 1;
+        }
+    }
+    return 1;
+}
+
 static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
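Note on the hunk above: with the n_dims field dropped from ggml_tensor (see the hunk at -2494 further down), dimensionality is now derived from ne[] on demand by ggml_n_dims. A standalone sketch of the same rule, shown here only to make the convention explicit (trailing size-1 axes do not count, and the minimum is 1):

#include <stdint.h>
#include <stdio.h>

#define MAX_DIMS 4

// Same rule as ggml_n_dims(): highest axis with more than one element, plus one.
static int n_dims(const int64_t ne[MAX_DIMS]) {
    for (int i = MAX_DIMS - 1; i >= 1; --i) {
        if (ne[i] > 1) {
            return i + 1;
        }
    }
    return 1;
}

int main(void) {
    int64_t a[MAX_DIMS] = { 32, 1, 1, 1 };  // vector -> 1
    int64_t b[MAX_DIMS] = { 32, 8, 1, 1 };  // matrix -> 2
    int64_t c[MAX_DIMS] = { 32, 1, 4, 1 };  // 3d even though ne[1] == 1 -> 3
    printf("%d %d %d\n", n_dims(a), n_dims(b), n_dims(c));
    return 0;
}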
@@ -2451,7 +2485,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         view_src = view_src->view_src;
     }
 
-    size_t data_size =
+    size_t data_size = ggml_row_size(type, ne[0]);
     for (int i = 1; i < n_dims; i++) {
         data_size *= ne[i];
     }
@@ -2494,7 +2528,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.type =*/ type,
         /*.backend =*/ GGML_BACKEND_CPU,
         /*.buffer =*/ NULL,
-        /*.n_dims =*/ n_dims,
         /*.ne =*/ { 1, 1, 1, 1 },
         /*.nb =*/ { 0, 0, 0, 0 },
         /*.op =*/ GGML_OP_NONE,
@@ -2601,7 +2634,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
 }
 
 struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
-    return ggml_new_tensor(ctx, src->type,
+    return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
 }
 
 static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
@@ -3050,7 +3083,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         struct ggml_tensor * src) {
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type,
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
     ggml_format_name(result, "%s (view)", src->name);
 
     for (int i = 0; i < GGML_MAX_DIMS; i++) {
@@ -3154,9 +3187,7 @@ static struct ggml_tensor * ggml_add_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-
-    // GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+    GGML_ASSERT(ggml_can_repeat(b, a));
 
     bool is_node = false;
 
@@ -3210,10 +3241,10 @@ static struct ggml_tensor * ggml_add_cast_impl(
         is_node = true;
     }
 
-    struct ggml_tensor * result = ggml_new_tensor(ctx, type,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
 
     result->op = GGML_OP_ADD;
-    result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32,
+    result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
     result->src[0] = a;
     result->src[1] = b;
 
@@ -3371,9 +3402,7 @@ static struct ggml_tensor * ggml_mul_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-
-    // GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+    GGML_ASSERT(ggml_can_repeat(b, a));
 
     bool is_node = false;
 
@@ -3418,7 +3447,7 @@ static struct ggml_tensor * ggml_div_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_can_repeat(b, a));
 
     bool is_node = false;
 
@@ -3584,12 +3613,12 @@ struct ggml_tensor * ggml_sum_rows(
         is_node = true;
     }
 
-    int64_t ne[
-    for (int i=1; i<
+    int64_t ne[GGML_MAX_DIMS] = { 1 };
+    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
         ne[i] = a->ne[i];
     }
 
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
 
     result->op = GGML_OP_SUM_ROWS;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3610,8 +3639,8 @@ struct ggml_tensor * ggml_mean(
         is_node = true;
     }
 
-    int64_t ne[
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+    int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     result->op = GGML_OP_MEAN;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3633,8 +3662,7 @@ struct ggml_tensor * ggml_argmax(
         is_node = true;
     }
 
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
 
     result->op = GGML_OP_ARGMAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3657,7 +3685,7 @@ struct ggml_tensor * ggml_repeat(
         is_node = true;
     }
 
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
 
     result->op = GGML_OP_REPEAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3684,7 +3712,7 @@ struct ggml_tensor * ggml_repeat_back(
         return a;
     }
 
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
 
     result->op = GGML_OP_REPEAT_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -3815,12 +3843,25 @@ struct ggml_tensor * ggml_relu_inplace(
     return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
 }
 
-//
+// ggml_leaky_relu
 
-struct ggml_tensor *
+struct ggml_tensor * ggml_leaky_relu(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-
+        struct ggml_tensor * a, float negative_slope, bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
+
+    result->op = GGML_OP_LEAKY_RELU;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
 }
 
 // ggml_gelu
@@ -4007,8 +4048,9 @@ static struct ggml_tensor * ggml_group_norm_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op = GGML_OP_GROUP_NORM;
     result->op_params[0] = n_groups;
+
+    result->op = GGML_OP_GROUP_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = NULL; // TODO: maybe store epsilon here?
@@ -4046,7 +4088,7 @@ struct ggml_tensor * ggml_mul_mat(
     }
 
     const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     result->op = GGML_OP_MUL_MAT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4056,6 +4098,51 @@ struct ggml_tensor * ggml_mul_mat(
     return result;
 }
 
+// ggml_mul_mat_id
+
+struct ggml_tensor * ggml_mul_mat_id(
+        struct ggml_context * ctx,
+        struct ggml_tensor * const as[],
+        int n_as,
+        struct ggml_tensor * ids,
+        int id,
+        struct ggml_tensor * b) {
+
+    GGML_ASSERT(ids->type == GGML_TYPE_I32);
+    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
+    GGML_ASSERT(ids->ne[1] == b->ne[1]);
+    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
+    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
+    GGML_ASSERT(id >= 0 && id < ids->ne[0]);
+
+    bool is_node = false;
+
+    if (as[0]->grad || b->grad) {
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    ggml_set_op_params_i32(result, 0, id);
+    ggml_set_op_params_i32(result, 1, n_as);
+
+    result->op = GGML_OP_MUL_MAT_ID;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = ids;
+    result->src[1] = b;
+
+    for (int i = 0; i < n_as; i++) {
+        struct ggml_tensor * a = as[i];
+        GGML_ASSERT(ggml_are_same_shape(as[0], a));
+        GGML_ASSERT(ggml_can_mul_mat(a, b));
+        GGML_ASSERT(!ggml_is_transposed(a));
+        result->src[i + 2] = a;
+    }
+
+    return result;
+}
+
 // ggml_out_prod
 
 struct ggml_tensor * ggml_out_prod(
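Note on the hunk above: ggml_mul_mat_id is the graph-level building block for mixture-of-experts routing: as[] holds n_as expert matrices of identical shape, ids is an I32 tensor with one row of candidate expert indices per column of b, and id selects which candidate to use. A minimal, hedged usage sketch assuming the ggml.h shipped with this gem; it only builds the node (context setup, filling ids with values in [0, n_as), and graph execution are omitted), and every name other than ggml_mul_mat_id itself is ordinary pre-existing ggml API:

#include "ggml.h"

// Route each of 3 input columns to one of 2 expert matrices.
static struct ggml_tensor * build_routed_matmul(struct ggml_context * ctx) {
    const int n_embd = 4, n_out = 5, n_tokens = 3, n_as = 2;

    struct ggml_tensor * experts[2];
    for (int i = 0; i < n_as; ++i) {
        experts[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_out);
    }

    // one selected expert per token: ids->ne[0] = 1 candidate, ids->ne[1] = n_tokens
    struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 1, n_tokens);
    struct ggml_tensor * b   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);

    // column j of the result is experts[ids[0, j]]^T * b[:, j]
    return ggml_mul_mat_id(ctx, experts, n_as, ids, /*id =*/ 0, b);
}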
@@ -4073,7 +4160,7 @@ struct ggml_tensor * ggml_out_prod(
 
     // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
     const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     result->op = GGML_OP_OUT_PROD;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4209,7 +4296,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
         struct ggml_tensor * b,
         size_t nb1,
         size_t offset) {
-    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset,
+    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
 }
 
 // ggml_cpy
@@ -4358,7 +4445,7 @@ struct ggml_tensor * ggml_reshape(
         //GGML_ASSERT(false);
     }
 
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type,
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
@@ -4673,7 +4760,9 @@ struct ggml_tensor * ggml_get_rows(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
-    GGML_ASSERT(
+    GGML_ASSERT(a->ne[2] == b->ne[1]);
+    GGML_ASSERT(b->ne[3] == 1);
+    GGML_ASSERT(b->type == GGML_TYPE_I32);
 
     bool is_node = false;
 
@@ -4683,7 +4772,7 @@ struct ggml_tensor * ggml_get_rows(
 
     // TODO: implement non F32 return
     //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
-    struct ggml_tensor * result =
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
 
     result->op = GGML_OP_GET_ROWS;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4734,7 +4823,7 @@ struct ggml_tensor * ggml_diag(
     }
 
     const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
 
     result->op = GGML_OP_DIAG;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5381,7 +5470,7 @@ struct ggml_tensor * ggml_pool_1d(
         is_node = true;
     }
 
-    const int64_t ne[
+    const int64_t ne[2] = {
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         a->ne[1],
     };
@@ -5461,6 +5550,30 @@ static struct ggml_tensor * ggml_upscale_impl(
     return result;
 }
 
+struct ggml_tensor * ggml_pad(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int p0, int p1, int p2, int p3) {
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] + p0,
+            a->ne[1] + p1,
+            a->ne[2] + p2,
+            a->ne[3] + p3);
+
+    result->op = GGML_OP_PAD;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
 struct ggml_tensor * ggml_upscale(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
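Note on the hunk above: ggml_pad(a, p0, p1, p2, p3) appends p_i zero elements at the end of dimension i (no leading padding), matching the ggml_compute_forward_pad_f32 kernel added further down. A standalone sketch of the same behavior on a plain row-major 2D buffer:

#include <stdio.h>

// Zero-pad a ne0 x ne1 row-major matrix to (ne0 + p0) x (ne1 + p1),
// copying the source into the top-left corner -- the ggml_pad convention.
static void pad_2d(const float * src, int ne0, int ne1,
                   float * dst, int p0, int p1) {
    const int de0 = ne0 + p0, de1 = ne1 + p1;
    for (int i1 = 0; i1 < de1; ++i1) {
        for (int i0 = 0; i0 < de0; ++i0) {
            dst[i1*de0 + i0] = (i0 < ne0 && i1 < ne1) ? src[i1*ne0 + i0] : 0.0f;
        }
    }
}

int main(void) {
    const float src[2*2] = { 1, 2, 3, 4 };
    float dst[3*4] = { 0 };              // (2+1) x (2+2)
    pad_2d(src, 2, 2, dst, 1, 2);
    for (int i1 = 0; i1 < 4; ++i1) {
        for (int i0 = 0; i0 < 3; ++i0) printf("%g ", dst[i1*3 + i0]);
        printf("\n");
    }
    return 0;
}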
@@ -5468,6 +5581,43 @@ struct ggml_tensor * ggml_upscale(
     return ggml_upscale_impl(ctx, a, scale_factor);
 }
 
+// ggml_argsort
+
+struct ggml_tensor * ggml_argsort(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        enum ggml_sort_order order) {
+    bool is_node = false;
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
+
+    ggml_set_op_params_i32(result, 0, (int32_t) order);
+
+    result->op = GGML_OP_ARGSORT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_top_k
+
+struct ggml_tensor * ggml_top_k(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int k) {
+    GGML_ASSERT(a->ne[0] >= k);
+
+    struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
+
+    result = ggml_view_4d(ctx, result,
+                k, result->ne[1], result->ne[2], result->ne[3],
+                   result->nb[1], result->nb[2], result->nb[3],
+                0);
+
+    return result;
+}
+
 // ggml_flash_attn
 
 struct ggml_tensor * ggml_flash_attn(
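Note on the hunk above: ggml_argsort returns, per row, the permutation of indices that sorts that row, and ggml_top_k is simply a view of the first k columns of a descending argsort. A standalone sketch of that composition on one row (the backends replace the O(n^2) exchange sort used by the CPU kernel):

#include <stdio.h>

// Descending argsort of one row, same exchange sort as the CPU kernel.
static void argsort_desc(const float * x, int * idx, int n) {
    for (int j = 0; j < n; ++j) idx[j] = j;
    for (int j = 0; j < n; ++j) {
        for (int k = j + 1; k < n; ++k) {
            if (x[idx[j]] < x[idx[k]]) {
                int t = idx[j]; idx[j] = idx[k]; idx[k] = t;
            }
        }
    }
}

int main(void) {
    const float logits[6] = { 0.1f, 2.5f, -1.0f, 0.7f, 2.4f, 0.0f };
    int idx[6];
    argsort_desc(logits, idx, 6);

    const int k = 3;   // "top_k" = first k entries of the descending argsort
    for (int j = 0; j < k; ++j) {
        printf("top[%d] = index %d (%.1f)\n", j, idx[j], logits[idx[j]]);
    }
    return 0;
}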
@@ -5486,7 +5636,7 @@ struct ggml_tensor * ggml_flash_attn(
     }
 
     //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
 
     int32_t t = masked ? 1 : 0;
     ggml_set_op_params(result, &t, sizeof(t));
@@ -5519,7 +5669,7 @@ struct ggml_tensor * ggml_flash_ff(
     }
 
     //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32,
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
 
     result->op = GGML_OP_FLASH_FF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5635,7 +5785,6 @@ struct ggml_tensor * ggml_win_part(
     const int np = npx*npy;
 
     const int64_t ne[4] = { a->ne[0], w, w, np, };
-
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { npx, npy, w };
@@ -6827,7 +6976,7 @@ static void ggml_compute_forward_add_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -6860,16 +7009,19 @@ static void ggml_compute_forward_add_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
 
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
+            for (int64_t r = 0; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-
+                vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-
+                ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
+            }
         }
     } else {
         // src1 is not contiguous
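Note on the hunks above: the binary ops now only require ggml_can_repeat(src1, src0), so src1 may also be shorter than src0 along dim 0; the kernel repeats it nr0 = ne00/ne10 times within each row. A standalone sketch of that inner loop for addition:

#include <stdio.h>

// dst[r*ne10 + i] = src0[r*ne10 + i] + src1[i], for r in [0, ne00/ne10)
static void add_broadcast_row(float * dst, const float * src0, const float * src1,
                              int ne00, int ne10) {
    const int nr0 = ne00 / ne10;   // integral by the ggml_can_repeat check
    for (int r = 0; r < nr0; ++r) {
        for (int i = 0; i < ne10; ++i) {
            dst[r*ne10 + i] = src0[r*ne10 + i] + src1[i];
        }
    }
}

int main(void) {
    const float src0[6] = { 1, 2, 3, 4, 5, 6 };
    const float src1[2] = { 10, 20 };   // broadcast 3x along dim 0
    float dst[6];
    add_broadcast_row(dst, src0, src1, 6, 2);
    for (int i = 0; i < 6; ++i) printf("%g ", dst[i]);   // 11 22 13 24 15 26
    printf("\n");
    return 0;
}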
@@ -6886,8 +7038,9 @@ static void ggml_compute_forward_add_f32(
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
-            for (
-
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
             }
@@ -7421,7 +7574,7 @@ static void ggml_compute_forward_acc_f32(
     GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
 
     // view src0 and dst with these strides and data offset inbytes during acc
-    // nb0 is
+    // nb0 is implicitly element_size because src0 and dst are contiguous
     size_t nb1 = ((int32_t *) dst->op_params)[0];
     size_t nb2 = ((int32_t *) dst->op_params)[1];
     size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -7607,7 +7760,7 @@ static void ggml_compute_forward_mul_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -7617,6 +7770,8 @@ static void ggml_compute_forward_mul_f32(
 
 #ifdef GGML_USE_CLBLAST
     if (src1->backend == GGML_BACKEND_GPU) {
+        // TODO: OpenCL kernel support full broadcast
+        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
         if (ith == 0) {
             ggml_cl_mul(src0, src1, dst);
         }
@@ -7630,7 +7785,6 @@ static void ggml_compute_forward_mul_f32(
 
     GGML_ASSERT( nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(ne00 == ne10);
 
     if (nb10 == sizeof(float)) {
         for (int64_t ir = ith; ir < nr; ir += nth) {
@@ -7642,20 +7796,21 @@ static void ggml_compute_forward_mul_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
 
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
+            for (int64_t r = 0 ; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-
+                UNUSED(ggml_vec_mul_f32);
 
-
+                vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-
+                ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-
-                // }
+            }
         }
     } else {
         // src1 is not contiguous
@@ -7673,8 +7828,9 @@ static void ggml_compute_forward_mul_f32(
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
-            for (int64_t i0 = 0; i0 < ne00; i0
-
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
             }
@@ -7708,14 +7864,16 @@ static void ggml_compute_forward_div_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const int
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -7723,41 +7881,50 @@ static void ggml_compute_forward_div_f32(
     GGML_ASSERT(nb00 == sizeof(float));
 
     if (nb10 == sizeof(float)) {
-        for (
-            // src0
-            const
-            const
-            const
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
+            for (int64_t r = 0; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-
+                UNUSED(ggml_vec_div_f32);
 
-
-                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
-                ne0);
+                vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-
-                (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+                ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-
-                // }
+            }
         }
     } else {
         // src1 is not contiguous
-        for (
-            // src0
-
-            const
-            const
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
 
-
-
-
-
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
             }
@@ -8203,7 +8370,7 @@ static void ggml_compute_forward_repeat_f16(
         return;
     }
 
-    GGML_TENSOR_UNARY_OP_LOCALS
+    GGML_TENSOR_UNARY_OP_LOCALS
 
     // guaranteed to be an integer due to the check in ggml_can_repeat
     const int nr0 = (int)(ne0/ne00);
@@ -8348,6 +8515,7 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(src0->nb[0] == sizeof(float));
 
     const int ith = params->ith;
+    const int nth = params->nth;
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -8357,7 +8525,7 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = ith; i2 < ne2; i2
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
             if (i2 < ne02) { // src0
                 for (int i1 = 0; i1 < ne1; i1++) {
                     for (int i0 = 0; i0 < ne0; i0++) {
@@ -8869,10 +9037,9 @@ static void ggml_compute_forward_silu(
             } break;
     }
 }
+// ggml_compute_forward_leaky_relu
 
-
-
-static void ggml_compute_forward_leaky_f32(
+static void ggml_compute_forward_leaky_relu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
@@ -8886,24 +9053,27 @@ static void ggml_compute_forward_leaky_f32(
     const int n  = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
     assert(dst->nb[0]  == sizeof(float));
     assert(src0->nb[0] == sizeof(float));
 
     for (int i = 0; i < n; i++) {
-
+        ggml_vec_leaky_relu_f32(nc,
                 (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
+                (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
     }
 }
 
-static void
+static void ggml_compute_forward_leaky_relu(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-
+                ggml_compute_forward_leaky_relu_f32(params, src0, dst);
             } break;
         default:
             {
@@ -9392,8 +9562,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
 
+    // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
+    //       all the experts for each batch element and the processing would become incredibly slow
     // TODO: find the optimal values for these
-    if (
+    if (dst->op != GGML_OP_MUL_MAT_ID &&
+        ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
         //src0->type == GGML_TYPE_F32 &&
         src1->type == GGML_TYPE_F32 &&
@@ -9407,11 +9580,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif
 
+// off1 = offset in i11 and i1
+// cne1 = ne11 and ne1
+// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
+// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
 static void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor * dst
+        struct ggml_tensor * dst,
+        int64_t off1, int64_t cne1) {
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
@@ -9479,10 +9657,9 @@ static void ggml_compute_forward_mul_mat(
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
 
-                const void * x = (char *) src0->data +
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-
-                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
+                float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
                     float * const wdata = params->wdata;
@@ -9499,10 +9676,10 @@ static void ggml_compute_forward_mul_mat(
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-
-
-
-
+                        cne1, ne01, ne10,
+                        1.0f,    y, ne10,
+                                 x, ne00,
+                        0.0f,    d, ne01);
             }
         }
 
@@ -9515,7 +9692,10 @@ static void ggml_compute_forward_mul_mat(
     if (params->type == GGML_TASK_INIT) {
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
-            const size_t row_size =
+            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+            assert(params->wsize >= ne11*ne12*ne13*row_size);
+            assert(src1->type == GGML_TYPE_F32);
 
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -9535,10 +9715,10 @@ static void ggml_compute_forward_mul_mat(
     }
 
     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size =
+    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
     const int64_t nr0 = ne01; // src0 rows
-    const int64_t nr1 =
+    const int64_t nr1 = cne1*ne12*ne13; // src1 rows
 
     //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
@@ -9580,9 +9760,9 @@ static void ggml_compute_forward_mul_mat(
     for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
         for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
             for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                const int64_t i13 = (ir1/(ne12*
-                const int64_t i12 = (ir1 - i13*ne12*
-                const int64_t i11 = (ir1 - i13*ne12*
+                const int64_t i13 = (ir1/(ne12*cne1));
+                const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+                const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
 
                 // broadcast src0 into src1
                 const int64_t i03 = i13/r3;
@@ -9618,6 +9798,34 @@ static void ggml_compute_forward_mul_mat(
     }
 }
 
+// ggml_compute_forward_mul_mat_id
+
+static void ggml_compute_forward_mul_mat_id(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
+        ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
+        return;
+    }
+
+    const struct ggml_tensor * ids = src0;
+    const int id = ggml_get_op_params_i32(dst, 0);
+    const int n_as = ggml_get_op_params_i32(dst, 1);
+
+    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+        const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+
+        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+        ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
+    }
+}
+
 // ggml_compute_forward_out_prod
 
 static void ggml_compute_forward_out_prod_f32(
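Note on the hunk above: the compute kernel for MUL_MAT_ID runs one single-column matrix multiplication per token, using the new off1/cne1 slice parameters of ggml_compute_forward_mul_mat to land each result in the right column of dst. A standalone sketch of the routing idea, with a dense per-expert matvec standing in for the real kernel (all names below are local to the sketch):

#include <stdint.h>
#include <stdio.h>

// out[:, t] = experts[ids[t]] * x[:, t]; each expert is n_out x k, row-major.
static void routed_matmul(float * out, int n_out,
                          const float * const experts[],
                          const int32_t * ids,
                          const float * x, int k, int n_tokens) {
    for (int t = 0; t < n_tokens; ++t) {
        const float * w = experts[ids[t]];   // pick the expert routed to this token
        for (int i = 0; i < n_out; ++i) {
            float sum = 0.0f;
            for (int c = 0; c < k; ++c) sum += w[i*k + c] * x[t*k + c];
            out[t*n_out + i] = sum;
        }
    }
}

int main(void) {
    const float e0[4] = { 1, 0, 0, 1 };   // identity expert
    const float e1[4] = { 2, 0, 0, 2 };   // doubling expert
    const float * const experts[2] = { e0, e1 };
    const int32_t ids[2] = { 0, 1 };      // token 0 -> expert 0, token 1 -> expert 1
    const float x[4] = { 1, 2, 3, 4 };    // two k=2 columns
    float out[4];
    routed_matmul(out, 2, experts, ids, x, 2, 2);
    printf("%g %g | %g %g\n", out[0], out[1], out[2], out[3]);   // 1 2 | 6 8
    return 0;
}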
@@ -10027,7 +10235,7 @@ static void ggml_compute_forward_set_f32(
     GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
 
     // view src0 and dst with these strides and data offset inbytes during set
-    // nb0 is
+    // nb0 is implicitly element_size because src0 and dst are contiguous
     size_t nb1 = ((int32_t *) dst->op_params)[0];
     size_t nb2 = ((int32_t *) dst->op_params)[1];
     size_t nb3 = ((int32_t *) dst->op_params)[2];
@@ -10191,21 +10399,30 @@ static void ggml_compute_forward_get_rows_q(
         return;
     }
 
-
-
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+
     const enum ggml_type type = src0->type;
     ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
 
-    assert(
-    assert(
-    assert(
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == ggml_type_size(type));
+    assert(ggml_nrows(dst) == nr);
 
-
-
+    // TODO: multi-thread
+    for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        for (int64_t i11 = 0; i11 < ne11; ++i11) {
+            for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
-
-
-
+                dequantize_row_q(
+                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                             (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
+            }
+        }
     }
 }
 
@@ -10220,19 +10437,26 @@ static void ggml_compute_forward_get_rows_f16(
         return;
     }
 
-
-    const int nr = ggml_nelements(src1);
+    GGML_TENSOR_BINARY_OP_LOCALS
 
-
-
-    assert(src0->nb[0] == sizeof(ggml_fp16_t));
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
 
-
-
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(ggml_fp16_t));
+    assert(ggml_nrows(dst) == nr);
 
-
-
-
+    // TODO: multi-thread
+    for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        for (int64_t i11 = 0; i11 < ne11; ++i11) {
+            for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+                ggml_fp16_to_fp32_row(
+                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                             (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
+            }
         }
     }
 }
@@ -10248,19 +10472,27 @@ static void ggml_compute_forward_get_rows_f32(
         return;
     }
 
-
-    const int nr = ggml_nelements(src1);
+    GGML_TENSOR_BINARY_OP_LOCALS
 
-
-
-    assert(src0->nb[0] == sizeof(float));
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
 
-
-
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(float));
+    assert(ggml_nrows(dst) == nr);
 
-
-
-
+    // TODO: multi-thread
+    for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        for (int64_t i11 = 0; i11 < ne11; ++i11) {
+            for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+                ggml_vec_cpy_f32(nc,
+                        (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3),
+                        (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
+            }
+        }
     }
 }
 
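Note on the hunks above: get_rows is generalized from a flat index list to a batched I32 index tensor: for every (i10, i11, i12), the row src0[ids[i10, i11, i12], i11, i12] is copied (or dequantized) into dst[i10, i11, i12]. A standalone sketch of the contiguous f32 case, collapsed to a single batch axis:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// dst[r, b] = src[ids[r, b], b] for nc-wide rows, nb batches.
static void get_rows_f32(float * dst, const float * src, const int32_t * ids,
                         int nc, int nrows_src, int nr, int nb) {
    for (int b = 0; b < nb; ++b) {
        for (int r = 0; r < nr; ++r) {
            const int32_t row = ids[b*nr + r];
            memcpy(&dst[(b*nr + r)*nc],
                   &src[(b*nrows_src + row)*nc],
                   nc*sizeof(float));
        }
    }
}

int main(void) {
    // one batch, 3 source rows of width 2, gather rows {2, 0}
    const float src[3*2] = { 1, 2, 3, 4, 5, 6 };
    const int32_t ids[2] = { 2, 0 };
    float dst[2*2];
    get_rows_f32(dst, src, ids, 2, 3, 2, 1);
    printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);  // 5 6 1 2
    return 0;
}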
@@ -11980,6 +12212,7 @@ static void ggml_compute_forward_upscale_f32(
     GGML_ASSERT(src0->nb[0] == sizeof(float));
 
     const int ith = params->ith;
+    const int nth = params->nth;
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
@@ -11987,16 +12220,17 @@ static void ggml_compute_forward_upscale_f32(
 
     // TODO: optimize
 
-    for (
-
-
-
-
-
-
-        const
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        const int64_t i03 = i3;
+        for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+            const int64_t i02 = i2;
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                const int64_t i01 = i1 / scale_factor;
+                for (int64_t i0 = 0; i0 < ne0; i0++) {
+                    const int64_t i00 = i0 / scale_factor;
 
-        float *
+                    const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                          float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
 
                     *y = *x;
                 }
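Note on the hunk above: the rewritten upscale loop makes the nearest-neighbor mapping explicit (every output coordinate maps back to src index i / scale_factor) and splits work across threads on dim 2. A standalone sketch for a single row:

#include <stdio.h>

// Nearest-neighbor upscale of one row: dst[i] = src[i / scale_factor].
static void upscale_1d(float * dst, const float * src, int ne0, int scale_factor) {
    for (int i0 = 0; i0 < ne0 * scale_factor; ++i0) {
        dst[i0] = src[i0 / scale_factor];
    }
}

int main(void) {
    const float src[3] = { 1.0f, 2.0f, 3.0f };
    float dst[6];
    upscale_1d(dst, src, 3, 2);
    for (int i = 0; i < 6; ++i) printf("%g ", dst[i]);   // 1 1 2 2 3 3
    printf("\n");
    return 0;
}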
@@ -12021,6 +12255,125 @@ static void ggml_compute_forward_upscale(
|
|
12021
12255
|
}
|
12022
12256
|
}
|
12023
12257
|
|
12258
|
+
// ggml_compute_forward_pad
|
12259
|
+
|
12260
|
+
static void ggml_compute_forward_pad_f32(
|
12261
|
+
const struct ggml_compute_params * params,
|
12262
|
+
const struct ggml_tensor * src0,
|
12263
|
+
struct ggml_tensor * dst) {
|
12264
|
+
|
12265
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
12266
|
+
return;
|
12267
|
+
}
|
12268
|
+
|
12269
|
+
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
12270
|
+
GGML_ASSERT( dst->nb[0] == sizeof(float));
|
12271
|
+
|
12272
|
+
const int ith = params->ith;
|
12273
|
+
const int nth = params->nth;
|
12274
|
+
|
12275
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
12276
|
+
|
12277
|
+
float * dst_ptr = (float *) dst->data;
|
12278
|
+
|
12279
|
+
// TODO: optimize
|
12280
|
+
|
12281
|
+
for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
12282
|
+
for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
|
12283
|
+
for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
12284
|
+
for (int64_t i3 = 0; i3 < ne3; ++i3) {
|
12285
|
+
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
|
12286
|
+
|
12287
|
+
const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
12288
|
+
|
12289
|
+
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
12290
|
+
dst_ptr[dst_idx] = *src_ptr;
|
12291
|
+
} else {
|
12292
|
+
dst_ptr[dst_idx] = 0;
|
12293
|
+
}
|
12294
|
+
}
|
12295
|
+
}
|
12296
|
+
}
|
12297
|
+
}
|
12298
|
+
}
|
12299
|
+
|
12300
|
+
static void ggml_compute_forward_pad(
|
12301
|
+
const struct ggml_compute_params * params,
|
12302
|
+
const struct ggml_tensor * src0,
|
12303
|
+
struct ggml_tensor * dst) {
|
12304
|
+
switch (src0->type) {
|
12305
|
+
case GGML_TYPE_F32:
|
12306
|
+
{
|
12307
|
+
ggml_compute_forward_pad_f32(params, src0, dst);
|
12308
|
+
} break;
|
12309
|
+
default:
|
12310
|
+
{
|
12311
|
+
GGML_ASSERT(false);
|
12312
|
+
} break;
|
12313
|
+
}
|
12314
|
+
}
|
12315
|
+
|
12316
|
+
// ggml_compute_forward_argsort
|
12317
|
+
|
12318
|
+
static void ggml_compute_forward_argsort_f32(
|
12319
|
+
const struct ggml_compute_params * params,
|
12320
|
+
const struct ggml_tensor * src0,
|
12321
|
+
struct ggml_tensor * dst) {
|
12322
|
+
|
12323
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
12324
|
+
return;
|
12325
|
+
}
|
12326
|
+
|
12327
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
12328
|
+
|
12329
|
+
GGML_ASSERT(nb0 == sizeof(float));
|
12330
|
+
|
12331
|
+
const int ith = params->ith;
|
12332
|
+
const int nth = params->nth;
|
12333
|
+
|
12334
|
+
const int64_t nr = ggml_nrows(src0);
|
12335
|
+
|
12336
|
+
enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
|
12337
|
+
|
12338
|
+
for (int64_t i = ith; i < nr; i += nth) {
|
12339
|
+
int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
|
12340
|
+
const float * src_data = (float *)((char *) src0->data + i*nb01);
|
12341
|
+
|
12342
|
+
for (int64_t j = 0; j < ne0; j++) {
|
12343
|
+
dst_data[j] = j;
|
12344
|
+
}
|
12345
|
+
|
12346
|
+
// C doesn't have a functional sort, so we do a bubble sort instead
|
12347
|
+
for (int64_t j = 0; j < ne0; j++) {
|
12348
|
+
for (int64_t k = j + 1; k < ne0; k++) {
|
12349
|
+
if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
|
12350
|
+
(order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
|
12351
|
+
int32_t tmp = dst_data[j];
|
12352
|
+
dst_data[j] = dst_data[k];
|
12353
|
+
dst_data[k] = tmp;
|
12354
|
+
}
|
12355
|
+
}
|
12356
|
+
}
|
12357
|
+
}
|
12358
|
+
}
|
12359
|
+
|
12360
|
+
static void ggml_compute_forward_argsort(
|
12361
|
+
const struct ggml_compute_params * params,
|
12362
|
+
const struct ggml_tensor * src0,
|
12363
|
+
struct ggml_tensor * dst) {
|
12364
|
+
|
12365
|
+
switch (src0->type) {
|
12366
|
+
case GGML_TYPE_F32:
|
12367
|
+
{
|
12368
|
+
ggml_compute_forward_argsort_f32(params, src0, dst);
|
12369
|
+
} break;
|
12370
|
+
default:
|
12371
|
+
{
|
12372
|
+
GGML_ASSERT(false);
|
12373
|
+
} break;
|
12374
|
+
}
|
12375
|
+
}
|
12376
|
+
|
12024
12377
|
// ggml_compute_forward_flash_attn
|
12025
12378
|
|
12026
12379
|
static void ggml_compute_forward_flash_attn_f32(
|
@@ -13167,10 +13520,6 @@ static void ggml_compute_forward_unary(
|
|
13167
13520
|
{
|
13168
13521
|
ggml_compute_forward_silu(params, src0, dst);
|
13169
13522
|
} break;
|
13170
|
-
case GGML_UNARY_OP_LEAKY:
|
13171
|
-
{
|
13172
|
-
ggml_compute_forward_leaky(params, src0, dst);
|
13173
|
-
} break;
|
13174
13523
|
default:
|
13175
13524
|
{
|
13176
13525
|
GGML_ASSERT(false);
|
@@ -13842,7 +14191,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
13842
14191
|
} break;
|
13843
14192
|
case GGML_OP_MUL_MAT:
|
13844
14193
|
{
|
13845
|
-
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
|
14194
|
+
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
|
14195
|
+
} break;
|
14196
|
+
case GGML_OP_MUL_MAT_ID:
|
14197
|
+
{
|
14198
|
+
ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
|
13846
14199
|
} break;
|
13847
14200
|
case GGML_OP_OUT_PROD:
|
13848
14201
|
{
|
@@ -13948,6 +14301,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
13948
14301
|
{
|
13949
14302
|
ggml_compute_forward_upscale(params, tensor->src[0], tensor);
|
13950
14303
|
} break;
|
14304
|
+
case GGML_OP_PAD:
|
14305
|
+
{
|
14306
|
+
ggml_compute_forward_pad(params, tensor->src[0], tensor);
|
14307
|
+
} break;
|
14308
|
+
case GGML_OP_ARGSORT:
|
14309
|
+
{
|
14310
|
+
ggml_compute_forward_argsort(params, tensor->src[0], tensor);
|
14311
|
+
} break;
|
14312
|
+
case GGML_OP_LEAKY_RELU:
|
14313
|
+
{
|
14314
|
+
ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
|
14315
|
+
} break;
|
13951
14316
|
case GGML_OP_FLASH_ATTN:
|
13952
14317
|
{
|
13953
14318
|
const int32_t t = ggml_get_op_params_i32(tensor, 0);
|
@@ -14202,7 +14567,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
|
|
14202
14567
|
return replacements->vals[i];
|
14203
14568
|
}
|
14204
14569
|
|
14205
|
-
struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
|
14570
|
+
struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
|
14206
14571
|
|
14207
14572
|
// insert clone into replacements
|
14208
14573
|
GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
|
@@ -14272,7 +14637,7 @@ void ggml_build_backward_gradient_checkpointing(
|
|
14272
14637
|
// insert new tensors recomputing src, reusing already made replacements,
|
14273
14638
|
// remember replacements: remember new tensors with mapping from corresponding gf nodes
|
14274
14639
|
// recurse for input tensors,
|
14275
|
-
// unless (i.e. terminating when) input tensors are
|
14640
|
+
// unless (i.e. terminating when) input tensors are replacements (like checkpoints)
|
14276
14641
|
node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
|
14277
14642
|
}
|
14278
14643
|
// insert rewritten backward node with replacements made into resulting backward graph gb
|
@@ -14598,6 +14963,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
14598
14963
|
zero_table);
|
14599
14964
|
}
|
14600
14965
|
} break;
|
14966
|
+
case GGML_OP_MUL_MAT_ID:
|
14967
|
+
{
|
14968
|
+
GGML_ASSERT(false); // TODO: not implemented
|
14969
|
+
} break;
|
14601
14970
|
case GGML_OP_OUT_PROD:
|
14602
14971
|
{
|
14603
14972
|
GGML_ASSERT(false); // TODO: not implemented
|
@@ -14936,6 +15305,18 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
14936
15305
|
{
|
14937
15306
|
GGML_ASSERT(false); // TODO: not implemented
|
14938
15307
|
} break;
|
15308
|
+
case GGML_OP_PAD:
|
15309
|
+
{
|
15310
|
+
GGML_ASSERT(false); // TODO: not implemented
|
15311
|
+
} break;
|
15312
|
+
case GGML_OP_ARGSORT:
|
15313
|
+
{
|
15314
|
+
GGML_ASSERT(false); // TODO: not implemented
|
15315
|
+
} break;
|
15316
|
+
case GGML_OP_LEAKY_RELU:
|
15317
|
+
{
|
15318
|
+
GGML_ASSERT(false); // TODO: not implemented
|
15319
|
+
} break;
|
14939
15320
|
case GGML_OP_FLASH_ATTN:
|
14940
15321
|
{
|
14941
15322
|
struct ggml_tensor * flash_grad = NULL;
|
@@ -15296,12 +15677,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
|
|
15296
15677
|
return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
|
15297
15678
|
}
|
15298
15679
|
|
15299
|
-
struct ggml_cgraph
|
15300
|
-
|
15301
|
-
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
|
15302
|
-
struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
|
15303
|
-
|
15304
|
-
*cgraph = (struct ggml_cgraph) {
|
15680
|
+
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
|
15681
|
+
struct ggml_cgraph cgraph = {
|
15305
15682
|
/*.size =*/ 0,
|
15306
15683
|
/*.n_nodes =*/ i1 - i0,
|
15307
15684
|
/*.n_leafs =*/ 0,
|
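Editor's note: ggml_graph_view now builds and returns a struct ggml_cgraph by value covering the node range [i0, i1) of the parent graph, rather than allocating a graph object inside the context as the removed lines did. Presumably the view just borrows the parent's node storage; the general non-owning slice pattern looks like this (illustrative types, not the ggml structs):

    #include <stdio.h>

    typedef struct {
        int   n_nodes;
        int * nodes;    // borrowed pointer into the parent's storage; nothing is copied or freed here
    } graph_view;

    // Build a view over nodes [i0, i1) of the parent array.
    static graph_view make_view(int * parent_nodes, int i0, int i1) {
        graph_view v = {
            .n_nodes = i1 - i0,
            .nodes   = parent_nodes + i0,
        };
        return v;
    }

    int main(void) {
        int nodes[6] = { 10, 11, 12, 13, 14, 15 };
        graph_view v = make_view(nodes, 2, 5);
        for (int i = 0; i < v.n_nodes; i++) {
            printf("%d ", v.nodes[i]);   // prints: 12 13 14
        }
        printf("\n");
        return 0;
    }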
@@ -15536,7 +15913,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15536
15913
|
n_tasks = n_threads;
|
15537
15914
|
} break;
|
15538
15915
|
case GGML_OP_SUB:
|
15539
|
-
case GGML_OP_DIV:
|
15540
15916
|
case GGML_OP_SQR:
|
15541
15917
|
case GGML_OP_SQRT:
|
15542
15918
|
case GGML_OP_LOG:
|
@@ -15546,6 +15922,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15546
15922
|
case GGML_OP_ARGMAX:
|
15547
15923
|
case GGML_OP_REPEAT:
|
15548
15924
|
case GGML_OP_REPEAT_BACK:
|
15925
|
+
case GGML_OP_LEAKY_RELU:
|
15549
15926
|
{
|
15550
15927
|
n_tasks = 1;
|
15551
15928
|
} break;
|
@@ -15558,7 +15935,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15558
15935
|
case GGML_UNARY_OP_TANH:
|
15559
15936
|
case GGML_UNARY_OP_ELU:
|
15560
15937
|
case GGML_UNARY_OP_RELU:
|
15561
|
-
case GGML_UNARY_OP_LEAKY:
|
15562
15938
|
{
|
15563
15939
|
n_tasks = 1;
|
15564
15940
|
} break;
|
@@ -15569,10 +15945,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15569
15945
|
{
|
15570
15946
|
n_tasks = n_threads;
|
15571
15947
|
} break;
|
15948
|
+
default:
|
15949
|
+
GGML_ASSERT(false);
|
15572
15950
|
}
|
15573
15951
|
break;
|
15574
15952
|
case GGML_OP_SILU_BACK:
|
15575
15953
|
case GGML_OP_MUL:
|
15954
|
+
case GGML_OP_DIV:
|
15576
15955
|
case GGML_OP_NORM:
|
15577
15956
|
case GGML_OP_RMS_NORM:
|
15578
15957
|
case GGML_OP_RMS_NORM_BACK:
|
@@ -15610,6 +15989,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15610
15989
|
}
|
15611
15990
|
#endif
|
15612
15991
|
} break;
|
15992
|
+
case GGML_OP_MUL_MAT_ID:
|
15993
|
+
{
|
15994
|
+
// FIXME: blas
|
15995
|
+
n_tasks = n_threads;
|
15996
|
+
} break;
|
15613
15997
|
case GGML_OP_OUT_PROD:
|
15614
15998
|
{
|
15615
15999
|
n_tasks = n_threads;
|
@@ -15629,7 +16013,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15629
16013
|
} break;
|
15630
16014
|
case GGML_OP_DIAG_MASK_ZERO:
|
15631
16015
|
case GGML_OP_DIAG_MASK_INF:
|
15632
|
-
case GGML_OP_SOFT_MAX:
|
15633
16016
|
case GGML_OP_SOFT_MAX_BACK:
|
15634
16017
|
case GGML_OP_ROPE:
|
15635
16018
|
case GGML_OP_ROPE_BACK:
|
@@ -15645,6 +16028,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15645
16028
|
{
|
15646
16029
|
n_tasks = 1; //TODO
|
15647
16030
|
} break;
|
16031
|
+
case GGML_OP_SOFT_MAX:
|
16032
|
+
{
|
16033
|
+
n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
|
16034
|
+
} break;
|
15648
16035
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
15649
16036
|
{
|
15650
16037
|
n_tasks = n_threads;
|
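Editor's note: soft-max task scheduling moves into ggml_get_n_tasks above, with the task count clamped to at most four threads and never more than the number of rows, so small tensors do not fan out to idle workers. The clamp is just nested minima; a tiny sketch with example values:

    #include <stdio.h>

    static int min_int(int a, int b) { return a < b ? a : b; }

    // Clamp the soft-max task count: at most 4, at most n_threads, at most one task per row.
    static int soft_max_n_tasks(int n_threads, int n_rows) {
        return min_int(min_int(4, n_threads), n_rows);
    }

    int main(void) {
        printf("%d\n", soft_max_n_tasks(8, 100)); // 4
        printf("%d\n", soft_max_n_tasks(2, 100)); // 2
        printf("%d\n", soft_max_n_tasks(8, 1));   // 1
        return 0;
    }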
@@ -15666,6 +16053,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15666
16053
|
{
|
15667
16054
|
n_tasks = n_threads;
|
15668
16055
|
} break;
|
16056
|
+
case GGML_OP_PAD:
|
16057
|
+
{
|
16058
|
+
n_tasks = n_threads;
|
16059
|
+
} break;
|
16060
|
+
case GGML_OP_ARGSORT:
|
16061
|
+
{
|
16062
|
+
n_tasks = n_threads;
|
16063
|
+
} break;
|
15669
16064
|
case GGML_OP_FLASH_ATTN:
|
15670
16065
|
{
|
15671
16066
|
n_tasks = n_threads;
|
@@ -15728,6 +16123,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15728
16123
|
{
|
15729
16124
|
n_tasks = 1;
|
15730
16125
|
} break;
|
16126
|
+
case GGML_OP_COUNT:
|
16127
|
+
{
|
16128
|
+
GGML_ASSERT(false);
|
16129
|
+
} break;
|
15731
16130
|
default:
|
15732
16131
|
{
|
15733
16132
|
fprintf(stderr, "%s: op not implemented: ", __func__);
|
@@ -15876,18 +16275,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15876
16275
|
|
15877
16276
|
// thread scheduling for the different operations + work buffer size estimation
|
15878
16277
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
15879
|
-
int n_tasks = 1;
|
15880
|
-
|
15881
16278
|
struct ggml_tensor * node = cgraph->nodes[i];
|
15882
16279
|
|
16280
|
+
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
16281
|
+
|
15883
16282
|
size_t cur = 0;
|
15884
16283
|
|
15885
16284
|
switch (node->op) {
|
15886
16285
|
case GGML_OP_CPY:
|
15887
16286
|
case GGML_OP_DUP:
|
15888
16287
|
{
|
15889
|
-
n_tasks = n_threads;
|
15890
|
-
|
15891
16288
|
if (ggml_is_quantized(node->type)) {
|
15892
16289
|
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
15893
16290
|
}
|
@@ -15895,16 +16292,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15895
16292
|
case GGML_OP_ADD:
|
15896
16293
|
case GGML_OP_ADD1:
|
15897
16294
|
{
|
15898
|
-
n_tasks = n_threads;
|
15899
|
-
|
15900
16295
|
if (ggml_is_quantized(node->src[0]->type)) {
|
15901
16296
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
|
15902
16297
|
}
|
15903
16298
|
} break;
|
15904
16299
|
case GGML_OP_ACC:
|
15905
16300
|
{
|
15906
|
-
n_tasks = n_threads;
|
15907
|
-
|
15908
16301
|
if (ggml_is_quantized(node->src[0]->type)) {
|
15909
16302
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
|
15910
16303
|
}
|
@@ -15927,21 +16320,34 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15927
16320
|
} else
|
15928
16321
|
#endif
|
15929
16322
|
if (node->src[1]->type != vec_dot_type) {
|
15930
|
-
cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
|
16323
|
+
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
|
16324
|
+
}
|
16325
|
+
} break;
|
16326
|
+
case GGML_OP_MUL_MAT_ID:
|
16327
|
+
{
|
16328
|
+
const struct ggml_tensor * a = node->src[2];
|
16329
|
+
const struct ggml_tensor * b = node->src[1];
|
16330
|
+
const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
|
16331
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
16332
|
+
if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
|
16333
|
+
if (a->type != GGML_TYPE_F32) {
|
16334
|
+
// here we need memory just for single 2D matrix from src0
|
16335
|
+
cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
|
16336
|
+
}
|
16337
|
+
} else
|
16338
|
+
#endif
|
16339
|
+
if (b->type != vec_dot_type) {
|
16340
|
+
cur = ggml_row_size(vec_dot_type, ggml_nelements(b));
|
15931
16341
|
}
|
15932
16342
|
} break;
|
15933
16343
|
case GGML_OP_OUT_PROD:
|
15934
16344
|
{
|
15935
|
-
n_tasks = n_threads;
|
15936
|
-
|
15937
16345
|
if (ggml_is_quantized(node->src[0]->type)) {
|
15938
16346
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
|
15939
16347
|
}
|
15940
16348
|
} break;
|
15941
16349
|
case GGML_OP_SOFT_MAX:
|
15942
16350
|
{
|
15943
|
-
n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
|
15944
|
-
|
15945
16351
|
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
15946
16352
|
} break;
|
15947
16353
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
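Editor's note: in the work-buffer estimates above, the converted copy of src1 is now sized with ggml_row_size(vec_dot_type, ggml_nelements(src1)). For block-quantized types, a row of n elements occupies n divided by the block size, times the bytes per block; a hedged sketch of that arithmetic with made-up block constants (not the real quantization parameters):

    #include <stddef.h>
    #include <stdio.h>

    // Row size of a block-quantized type: n elements packed into n/block_size blocks
    // of block_bytes each (assumes n is a multiple of block_size).
    static size_t row_size(size_t n, size_t block_size, size_t block_bytes) {
        return (n / block_size) * block_bytes;
    }

    int main(void) {
        // Example: a 4096-element row of a type that packs 32 values into 34 bytes;
        // treat these numbers as assumptions chosen only for the illustration.
        printf("%zu bytes per row\n", row_size(4096, 32, 34)); // 4352
        return 0;
    }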
@@ -15969,10 +16375,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15969
16375
|
GGML_ASSERT(false);
|
15970
16376
|
}
|
15971
16377
|
} break;
|
15972
|
-
case GGML_OP_IM2COL:
|
15973
|
-
{
|
15974
|
-
n_tasks = n_threads;
|
15975
|
-
} break;
|
15976
16378
|
case GGML_OP_CONV_TRANSPOSE_2D:
|
15977
16379
|
{
|
15978
16380
|
const int64_t ne00 = node->src[0]->ne[0]; // W
|
@@ -15989,8 +16391,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15989
16391
|
} break;
|
15990
16392
|
case GGML_OP_FLASH_ATTN:
|
15991
16393
|
{
|
15992
|
-
n_tasks = n_threads;
|
15993
|
-
|
15994
16394
|
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
|
15995
16395
|
|
15996
16396
|
if (node->src[1]->type == GGML_TYPE_F32) {
|
@@ -16003,8 +16403,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
16003
16403
|
} break;
|
16004
16404
|
case GGML_OP_FLASH_FF:
|
16005
16405
|
{
|
16006
|
-
n_tasks = n_threads;
|
16007
|
-
|
16008
16406
|
if (node->src[1]->type == GGML_TYPE_F32) {
|
16009
16407
|
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
16010
16408
|
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
|
@@ -16015,8 +16413,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
16015
16413
|
} break;
|
16016
16414
|
case GGML_OP_FLASH_ATTN_BACK:
|
16017
16415
|
{
|
16018
|
-
n_tasks = n_threads;
|
16019
|
-
|
16020
16416
|
const int64_t D = node->src[0]->ne[0];
|
16021
16417
|
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
|
16022
16418
|
const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
|
@@ -16031,8 +16427,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
16031
16427
|
|
16032
16428
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
16033
16429
|
{
|
16034
|
-
n_tasks = n_threads;
|
16035
|
-
|
16036
16430
|
cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
|
16037
16431
|
} break;
|
16038
16432
|
case GGML_OP_COUNT:
|
@@ -16174,7 +16568,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
|
|
16174
16568
|
fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
|
16175
16569
|
ggml_type_name(tensor->type),
|
16176
16570
|
ggml_op_name (tensor->op),
|
16177
|
-
tensor->n_dims,
|
16571
|
+
ggml_n_dims(tensor),
|
16178
16572
|
ne[0], ne[1], ne[2], ne[3],
|
16179
16573
|
nb[0], nb[1], nb[2], nb[3],
|
16180
16574
|
tensor->data,
|
@@ -16189,7 +16583,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
|
|
16189
16583
|
arg,
|
16190
16584
|
ggml_type_name(tensor->type),
|
16191
16585
|
ggml_op_name (tensor->op),
|
16192
|
-
tensor->n_dims,
|
16586
|
+
ggml_n_dims(tensor),
|
16193
16587
|
ne[0], ne[1], ne[2], ne[3],
|
16194
16588
|
nb[0], nb[1], nb[2], nb[3],
|
16195
16589
|
tensor->data,
|
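Editor's note: both graph-export helpers above now print ggml_n_dims(tensor) instead of a stored tensor->n_dims field. The derived count can be obtained by finding the last extent greater than one, with a floor of one dimension; a sketch over a fixed four-slot shape array (four dims assumed, mirroring GGML_MAX_DIMS):

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_DIMS 4

    // Effective number of dimensions: trailing extents of 1 do not count,
    // but a scalar or vector still reports at least 1.
    static int n_dims(const int64_t ne[MAX_DIMS]) {
        for (int i = MAX_DIMS - 1; i >= 1; --i) {
            if (ne[i] > 1) {
                return i + 1;
            }
        }
        return 1;
    }

    int main(void) {
        const int64_t a[MAX_DIMS] = { 32,  1, 1, 1 };  // vector
        const int64_t b[MAX_DIMS] = { 32, 64, 1, 1 };  // matrix
        const int64_t c[MAX_DIMS] = { 32, 64, 8, 1 };  // 3-D tensor
        printf("%d %d %d\n", n_dims(a), n_dims(b), n_dims(c)); // 1 2 3
        return 0;
    }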
@@ -16279,11 +16673,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
16279
16673
|
|
16280
16674
|
const uint32_t type = tensor->type;
|
16281
16675
|
const uint32_t op = tensor->op;
|
16282
|
-
const uint32_t n_dims = tensor->n_dims;
|
16283
16676
|
|
16284
16677
|
fwrite(&type, sizeof(uint32_t), 1, fout);
|
16285
16678
|
fwrite(&op, sizeof(uint32_t), 1, fout);
|
16286
|
-
fwrite(&n_dims, sizeof(uint32_t), 1, fout);
|
16287
16679
|
|
16288
16680
|
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
16289
16681
|
const uint64_t ne = tensor->ne[j];
|
@@ -16313,11 +16705,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
16313
16705
|
|
16314
16706
|
const uint32_t type = tensor->type;
|
16315
16707
|
const uint32_t op = tensor->op;
|
16316
|
-
const uint32_t n_dims = tensor->n_dims;
|
16317
16708
|
|
16318
16709
|
fwrite(&type, sizeof(uint32_t), 1, fout);
|
16319
16710
|
fwrite(&op, sizeof(uint32_t), 1, fout);
|
16320
|
-
fwrite(&n_dims, sizeof(uint32_t), 1, fout);
|
16321
16711
|
|
16322
16712
|
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
16323
16713
|
const uint64_t ne = tensor->ne[j];
|
@@ -16489,12 +16879,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
16489
16879
|
{
|
16490
16880
|
uint32_t type;
|
16491
16881
|
uint32_t op;
|
16492
|
-
uint32_t n_dims;
|
16493
16882
|
|
16494
16883
|
for (uint32_t i = 0; i < n_leafs; ++i) {
|
16495
16884
|
type = *(const uint32_t *) ptr; ptr += sizeof(type);
|
16496
16885
|
op = *(const uint32_t *) ptr; ptr += sizeof(op);
|
16497
|
-
n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
|
16498
16886
|
|
16499
16887
|
int64_t ne[GGML_MAX_DIMS];
|
16500
16888
|
size_t nb[GGML_MAX_DIMS];
|
@@ -16510,7 +16898,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
16510
16898
|
nb[j] = nb_cur;
|
16511
16899
|
}
|
16512
16900
|
|
16513
|
-
struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
|
16901
|
+
struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
|
16514
16902
|
|
16515
16903
|
tensor->op = (enum ggml_op) op;
|
16516
16904
|
|
@@ -16527,7 +16915,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
16527
16915
|
|
16528
16916
|
ptr += ggml_nbytes(tensor);
|
16529
16917
|
|
16530
|
-
fprintf(stderr, "%s: loaded leaf %d: '%16s', %
|
16918
|
+
fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
|
16531
16919
|
}
|
16532
16920
|
}
|
16533
16921
|
|
@@ -16537,12 +16925,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
16537
16925
|
{
|
16538
16926
|
uint32_t type;
|
16539
16927
|
uint32_t op;
|
16540
|
-
uint32_t n_dims;
|
16541
16928
|
|
16542
16929
|
for (uint32_t i = 0; i < n_nodes; ++i) {
|
16543
16930
|
type = *(const uint32_t *) ptr; ptr += sizeof(type);
|
16544
16931
|
op = *(const uint32_t *) ptr; ptr += sizeof(op);
|
16545
|
-
n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
|
16546
16932
|
|
16547
16933
|
enum ggml_op eop = (enum ggml_op) op;
|
16548
16934
|
|
@@ -16613,7 +16999,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
16613
16999
|
} break;
|
16614
17000
|
default:
|
16615
17001
|
{
|
16616
|
-
tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
|
17002
|
+
tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
|
16617
17003
|
|
16618
17004
|
tensor->op = eop;
|
16619
17005
|
} break;
|
@@ -16632,7 +17018,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
16632
17018
|
|
16633
17019
|
result->nodes[i] = tensor;
|
16634
17020
|
|
16635
|
-
fprintf(stderr, "%s: loaded node %d: '%16s', %
|
17021
|
+
fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
|
16636
17022
|
}
|
16637
17023
|
}
|
16638
17024
|
}
|
@@ -16770,7 +17156,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
16770
17156
|
fprintf(fp, "(%s)|", ggml_type_name(node->type));
|
16771
17157
|
}
|
16772
17158
|
|
16773
|
-
if (node->n_dims == 2) {
|
17159
|
+
if (ggml_is_matrix(node)) {
|
16774
17160
|
fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
|
16775
17161
|
} else {
|
16776
17162
|
fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
|
@@ -17037,7 +17423,7 @@ static enum ggml_opt_result ggml_opt_adam(
|
|
17037
17423
|
int64_t i = 0;
|
17038
17424
|
for (int p = 0; p < np; ++p) {
|
17039
17425
|
const int64_t ne = ggml_nelements(ps[p]);
|
17040
|
-
const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
|
17426
|
+
const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
|
17041
17427
|
for (int64_t j = 0; j < ne; ++j) {
|
17042
17428
|
float x = ggml_get_f32_1d(ps[p], j);
|
17043
17429
|
float g_ = g[i]*gnorm;
|
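Editor's note: the updated Adam step gates weight decay per parameter: decay is applied only when the parameter has at least decay_min_ndim dimensions (typically excluding 1-D tensors such as biases) and is additionally scaled by the schedule value sched. A hedged sketch of that gating, with illustrative names rather than the ggml optimizer API:

    #include <stdio.h>

    // Per-parameter decay factor: decay higher-rank tensors, skip low-rank ones.
    static float param_decay(int n_dims, int decay_min_ndim, float decay, float sched) {
        return ((n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
    }

    int main(void) {
        const float decay = 0.1f, sched = 0.5f;
        printf("%f\n", param_decay(2, 2, decay, sched)); // decayed: 0.1 * 0.5
        printf("%f\n", param_decay(1, 2, decay, sched)); // not decayed: 0.0
        return 0;
    }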
@@ -17819,8 +18205,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
|
|
17819
18205
|
memcpy(&qh, &y[i].qh, sizeof(qh));
|
17820
18206
|
|
17821
18207
|
for (int j = 0; j < QK5_0; j += 2) {
|
17822
|
-
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
17823
|
-
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
18208
|
+
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
18209
|
+
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
17824
18210
|
|
17825
18211
|
// cast to 16 bins
|
17826
18212
|
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
@@ -17849,8 +18235,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
|
|
17849
18235
|
memcpy(&qh, &y[i].qh, sizeof(qh));
|
17850
18236
|
|
17851
18237
|
for (int j = 0; j < QK5_1; j += 2) {
|
17852
|
-
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
17853
|
-
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
18238
|
+
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
18239
|
+
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
17854
18240
|
|
17855
18241
|
// cast to 16 bins
|
17856
18242
|
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
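Editor's note: the two quantization fixes above correct the bit index used to recover the fifth (high) bit of each value from the packed qh word: the loop advances j by 2, so the packed byte is qs[j/2] and the matching high bits live at qh bits j/2 and j/2 + 16, not j and j + 16. A standalone sketch of packing and unpacking 5-bit values with this layout (plain C, 32 values per block as in the Q5 formats; the helper names are made up):

    #include <stdint.h>
    #include <stdio.h>

    #define QK 32

    // Pack 32 5-bit values: the low nibbles of v[j] and v[j + 16] share byte qs[j],
    // and the fifth (high) bit of each value goes into one bit of qh.
    static void pack5(const uint8_t v[QK], uint8_t qs[QK/2], uint32_t * qh) {
        *qh = 0;
        for (int j = 0; j < QK/2; ++j) {
            qs[j] = (v[j] & 0x0F) | ((v[j + QK/2] & 0x0F) << 4);
            *qh |= (uint32_t)((v[j]        >> 4) & 1) << (j + 0);
            *qh |= (uint32_t)((v[j + QK/2] >> 4) & 1) << (j + QK/2);
        }
    }

    // Unpack in the same pair-wise order the fixed code uses: byte j/2 holds two
    // values whose high bits sit at qh bits j/2 and j/2 + 16.
    static void unpack5(const uint8_t qs[QK/2], uint32_t qh, uint8_t out[QK]) {
        for (int j = 0; j < QK; j += 2) {
            const uint8_t vh0 = ((qh >> (j/2 + 0 )) & 1) << 4;
            const uint8_t vh1 = ((qh >> (j/2 + 16)) & 1) << 4;
            out[j/2]        = (uint8_t)((qs[j/2] & 0x0F) | vh0);
            out[j/2 + QK/2] = (uint8_t)((qs[j/2] >>   4) | vh1);
        }
    }

    int main(void) {
        uint8_t v[QK], qs[QK/2], out[QK];
        uint32_t qh;
        for (int i = 0; i < QK; ++i) v[i] = (uint8_t)((i * 7) % 32); // arbitrary 5-bit values
        pack5(v, qs, &qh);
        unpack5(qs, qh, out);
        int ok = 1;
        for (int i = 0; i < QK; ++i) ok &= (out[i] == v[i]);
        printf("round trip %s\n", ok ? "ok" : "FAILED");
        return 0;
    }

In the fixed code vh1 is kept at bit position 4 by shifting right by j/2 + 12 instead of masking and re-shifting; the two forms are equivalent.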
@@ -18040,6 +18426,7 @@ struct gguf_kv {
|
|
18040
18426
|
|
18041
18427
|
struct gguf_header {
|
18042
18428
|
char magic[4];
|
18429
|
+
|
18043
18430
|
uint32_t version;
|
18044
18431
|
uint64_t n_tensors; // GGUFv2
|
18045
18432
|
uint64_t n_kv; // GGUFv2
|
@@ -18129,7 +18516,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
18129
18516
|
|
18130
18517
|
for (uint32_t i = 0; i < sizeof(magic); i++) {
|
18131
18518
|
if (magic[i] != GGUF_MAGIC[i]) {
|
18132
|
-
fprintf(stderr, "%s: invalid magic characters %
|
18519
|
+
fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
|
18133
18520
|
fclose(file);
|
18134
18521
|
return NULL;
|
18135
18522
|
}
|
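Editor's note: the GGUF magic mismatch message now prints the four magic bytes one character at a time ('%c%c%c%c'), which stays well defined even though the 4-byte magic array is not NUL-terminated. A small sketch of the byte-wise check and report (the expected magic is assumed to be "GGUF"):

    #include <stdio.h>

    #define MAGIC "GGUF"

    // Compare the first four bytes read from a file against the expected magic.
    static int check_magic(const char magic[4]) {
        for (int i = 0; i < 4; i++) {
            if (magic[i] != MAGIC[i]) {
                // print each byte explicitly: magic[] is not NUL-terminated, so "%s" could overread
                fprintf(stderr, "invalid magic characters '%c%c%c%c'\n",
                        magic[0], magic[1], magic[2], magic[3]);
                return 0;
            }
        }
        return 1;
    }

    int main(void) {
        const char good[4] = { 'G', 'G', 'U', 'F' };
        const char bad[4]  = { 'G', 'G', 'M', 'L' };
        printf("%d %d\n", check_magic(good), check_magic(bad)); // 1 0
        return 0;
    }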
@@ -18144,7 +18531,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
18144
18531
|
{
|
18145
18532
|
strncpy(ctx->header.magic, magic, 4);
|
18146
18533
|
|
18147
|
-
|
18148
18534
|
ctx->kv = NULL;
|
18149
18535
|
ctx->infos = NULL;
|
18150
18536
|
ctx->data = NULL;
|
@@ -18311,7 +18697,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
18311
18697
|
return NULL;
|
18312
18698
|
}
|
18313
18699
|
|
18314
|
-
const size_t size_cur = (
|
18700
|
+
const size_t size_cur = ggml_row_size(info->type, ne);
|
18315
18701
|
|
18316
18702
|
ctx->size += GGML_PAD(size_cur, ctx->alignment);
|
18317
18703
|
}
|
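Editor's note: each tensor's data size is now computed via ggml_row_size and then rounded up to the context alignment with GGML_PAD before being accumulated into ctx->size. The round-up idiom for a power-of-two alignment can be sketched as follows (local macro name, not the ggml one):

    #include <stddef.h>
    #include <stdio.h>

    // Round x up to the next multiple of align (align must be a power of two).
    #define PAD(x, align) (((x) + (align) - 1) & ~((size_t)(align) - 1))

    int main(void) {
        const size_t alignment = 32;
        const size_t sizes[3] = { 100, 64, 33 };   // example tensor byte sizes
        size_t total = 0;
        for (int i = 0; i < 3; i++) {
            total += PAD(sizes[i], alignment);     // 128 + 64 + 64
        }
        printf("%zu\n", total);                    // 256
        return 0;
    }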
@@ -18815,8 +19201,8 @@ void gguf_add_tensor(
|
|
18815
19201
|
ctx->infos[idx].ne[i] = 1;
|
18816
19202
|
}
|
18817
19203
|
|
18818
|
-
ctx->infos[idx].n_dims = tensor->n_dims;
|
18819
|
-
for (
|
19204
|
+
ctx->infos[idx].n_dims = ggml_n_dims(tensor);
|
19205
|
+
for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
|
18820
19206
|
ctx->infos[idx].ne[i] = tensor->ne[i];
|
18821
19207
|
}
|
18822
19208
|
|