llama_cpp 0.9.4 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +43 -8
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1270 -434
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +535 -175
- data/ext/llama_cpp/src/ggml-metal.metal +888 -237
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml.c +393 -127
- data/ext/llama_cpp/src/ggml.h +59 -7
- data/ext/llama_cpp/src/llama.cpp +791 -357
- data/ext/llama_cpp/src/llama.h +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +3 -3
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -233,24 +233,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|
233
233
|
#define UNUSED GGML_UNUSED
|
234
234
|
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
235
235
|
|
236
|
-
//
|
237
|
-
// tensor access macros
|
238
|
-
//
|
239
|
-
|
240
|
-
#define GGML_TENSOR_UNARY_OP_LOCALS \
|
241
|
-
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
242
|
-
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
243
|
-
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
244
|
-
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
245
|
-
|
246
|
-
#define GGML_TENSOR_BINARY_OP_LOCALS \
|
247
|
-
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
248
|
-
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
249
|
-
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
|
250
|
-
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
|
251
|
-
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
252
|
-
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
253
|
-
|
254
236
|
#if defined(GGML_USE_ACCELERATE)
|
255
237
|
#include <Accelerate/Accelerate.h>
|
256
238
|
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
@@ -1613,6 +1595,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
1613
1595
|
"GROUP_NORM",
|
1614
1596
|
|
1615
1597
|
"MUL_MAT",
|
1598
|
+
"MUL_MAT_ID",
|
1616
1599
|
"OUT_PROD",
|
1617
1600
|
|
1618
1601
|
"SCALE",
|
@@ -1640,6 +1623,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
1640
1623
|
"POOL_1D",
|
1641
1624
|
"POOL_2D",
|
1642
1625
|
"UPSCALE",
|
1626
|
+
"ARGSORT",
|
1643
1627
|
|
1644
1628
|
"FLASH_ATTN",
|
1645
1629
|
"FLASH_FF",
|
@@ -1666,7 +1650,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
1666
1650
|
"CROSS_ENTROPY_LOSS_BACK",
|
1667
1651
|
};
|
1668
1652
|
|
1669
|
-
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
|
1653
|
+
static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
|
1670
1654
|
|
1671
1655
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
1672
1656
|
"none",
|
@@ -1695,6 +1679,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
1695
1679
|
"group_norm(x)",
|
1696
1680
|
|
1697
1681
|
"X*Y",
|
1682
|
+
"X[i]*Y",
|
1698
1683
|
"X*Y",
|
1699
1684
|
|
1700
1685
|
"x*v",
|
@@ -1722,6 +1707,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
1722
1707
|
"pool_1d(x)",
|
1723
1708
|
"pool_2d(x)",
|
1724
1709
|
"upscale(x)",
|
1710
|
+
"argsort(x)",
|
1725
1711
|
|
1726
1712
|
"flash_attn(x)",
|
1727
1713
|
"flash_ff(x)",
|
@@ -1748,10 +1734,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
1748
1734
|
"cross_entropy_loss_back(x,y)",
|
1749
1735
|
};
|
1750
1736
|
|
1751
|
-
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
|
1737
|
+
static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
|
1752
1738
|
|
1753
1739
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
1754
1740
|
|
1741
|
+
|
1742
|
+
static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
|
1743
|
+
"ABS",
|
1744
|
+
"SGN",
|
1745
|
+
"NEG",
|
1746
|
+
"STEP",
|
1747
|
+
"TANH",
|
1748
|
+
"ELU",
|
1749
|
+
"RELU",
|
1750
|
+
"GELU",
|
1751
|
+
"GELU_QUICK",
|
1752
|
+
"SILU",
|
1753
|
+
"LEAKY",
|
1754
|
+
};
|
1755
|
+
|
1756
|
+
static_assert(GGML_UNARY_OP_COUNT == 11, "GGML_UNARY_OP_COUNT != 11");
|
1757
|
+
|
1758
|
+
|
1755
1759
|
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
1756
1760
|
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
1757
1761
|
|
@@ -1771,6 +1775,7 @@ static void ggml_setup_op_has_task_pass(void) {
|
|
1771
1775
|
|
1772
1776
|
p[GGML_OP_ACC ] = true;
|
1773
1777
|
p[GGML_OP_MUL_MAT ] = true;
|
1778
|
+
p[GGML_OP_MUL_MAT_ID ] = true;
|
1774
1779
|
p[GGML_OP_OUT_PROD ] = true;
|
1775
1780
|
p[GGML_OP_SET ] = true;
|
1776
1781
|
p[GGML_OP_GET_ROWS_BACK ] = true;
|
@@ -2023,6 +2028,20 @@ const char * ggml_op_symbol(enum ggml_op op) {
|
|
2023
2028
|
return GGML_OP_SYMBOL[op];
|
2024
2029
|
}
|
2025
2030
|
|
2031
|
+
const char * ggml_unary_op_name(enum ggml_unary_op op) {
|
2032
|
+
return GGML_UNARY_OP_NAME[op];
|
2033
|
+
}
|
2034
|
+
|
2035
|
+
const char * ggml_op_desc(const struct ggml_tensor * t) {
|
2036
|
+
if (t->op == GGML_OP_UNARY) {
|
2037
|
+
enum ggml_unary_op uop = ggml_get_unary_op(t);
|
2038
|
+
return ggml_unary_op_name(uop);
|
2039
|
+
}
|
2040
|
+
else {
|
2041
|
+
return ggml_op_name(t->op);
|
2042
|
+
}
|
2043
|
+
}
|
2044
|
+
|
2026
2045
|
size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
2027
2046
|
return ggml_type_size(tensor->type);
|
2028
2047
|
}
|
@@ -3154,9 +3173,7 @@ static struct ggml_tensor * ggml_add_impl(
|
|
3154
3173
|
struct ggml_tensor * a,
|
3155
3174
|
struct ggml_tensor * b,
|
3156
3175
|
bool inplace) {
|
3157
|
-
|
3158
|
-
// GGML_ASSERT(ggml_can_repeat(b, a));
|
3159
|
-
GGML_ASSERT(ggml_can_repeat_rows(b, a));
|
3176
|
+
GGML_ASSERT(ggml_can_repeat(b, a));
|
3160
3177
|
|
3161
3178
|
bool is_node = false;
|
3162
3179
|
|
@@ -3371,9 +3388,7 @@ static struct ggml_tensor * ggml_mul_impl(
|
|
3371
3388
|
struct ggml_tensor * a,
|
3372
3389
|
struct ggml_tensor * b,
|
3373
3390
|
bool inplace) {
|
3374
|
-
|
3375
|
-
// GGML_ASSERT(ggml_can_repeat(b, a));
|
3376
|
-
GGML_ASSERT(ggml_can_repeat_rows(b, a));
|
3391
|
+
GGML_ASSERT(ggml_can_repeat(b, a));
|
3377
3392
|
|
3378
3393
|
bool is_node = false;
|
3379
3394
|
|
@@ -3418,7 +3433,7 @@ static struct ggml_tensor * ggml_div_impl(
|
|
3418
3433
|
struct ggml_tensor * a,
|
3419
3434
|
struct ggml_tensor * b,
|
3420
3435
|
bool inplace) {
|
3421
|
-
GGML_ASSERT(
|
3436
|
+
GGML_ASSERT(ggml_can_repeat(b, a));
|
3422
3437
|
|
3423
3438
|
bool is_node = false;
|
3424
3439
|
|
@@ -4056,6 +4071,49 @@ struct ggml_tensor * ggml_mul_mat(
|
|
4056
4071
|
return result;
|
4057
4072
|
}
|
4058
4073
|
|
4074
|
+
// ggml_mul_mat_id
|
4075
|
+
|
4076
|
+
struct ggml_tensor * ggml_mul_mat_id(
|
4077
|
+
struct ggml_context * ctx,
|
4078
|
+
struct ggml_tensor * as[],
|
4079
|
+
struct ggml_tensor * ids,
|
4080
|
+
int id,
|
4081
|
+
struct ggml_tensor * b) {
|
4082
|
+
|
4083
|
+
int64_t n_as = ids->ne[0];
|
4084
|
+
|
4085
|
+
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
4086
|
+
GGML_ASSERT(ggml_is_vector(ids));
|
4087
|
+
GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
|
4088
|
+
GGML_ASSERT(id >= 0 && id < n_as);
|
4089
|
+
|
4090
|
+
bool is_node = false;
|
4091
|
+
|
4092
|
+
if (as[0]->grad || b->grad) {
|
4093
|
+
is_node = true;
|
4094
|
+
}
|
4095
|
+
|
4096
|
+
const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
|
4097
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
|
4098
|
+
|
4099
|
+
ggml_set_op_params_i32(result, 0, id);
|
4100
|
+
|
4101
|
+
result->op = GGML_OP_MUL_MAT_ID;
|
4102
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
4103
|
+
result->src[0] = ids;
|
4104
|
+
result->src[1] = b;
|
4105
|
+
|
4106
|
+
for (int64_t i = 0; i < n_as; i++) {
|
4107
|
+
struct ggml_tensor * a = as[i];
|
4108
|
+
GGML_ASSERT(ggml_are_same_shape(as[0], a));
|
4109
|
+
GGML_ASSERT(ggml_can_mul_mat(a, b));
|
4110
|
+
GGML_ASSERT(!ggml_is_transposed(a));
|
4111
|
+
result->src[i + 2] = a;
|
4112
|
+
}
|
4113
|
+
|
4114
|
+
return result;
|
4115
|
+
}
|
4116
|
+
|
4059
4117
|
// ggml_out_prod
|
4060
4118
|
|
4061
4119
|
struct ggml_tensor * ggml_out_prod(
|
@@ -4209,7 +4267,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
|
|
4209
4267
|
struct ggml_tensor * b,
|
4210
4268
|
size_t nb1,
|
4211
4269
|
size_t offset) {
|
4212
|
-
return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
|
4270
|
+
return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
|
4213
4271
|
}
|
4214
4272
|
|
4215
4273
|
// ggml_cpy
|
@@ -4826,7 +4884,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
|
|
4826
4884
|
static struct ggml_tensor * ggml_soft_max_impl(
|
4827
4885
|
struct ggml_context * ctx,
|
4828
4886
|
struct ggml_tensor * a,
|
4887
|
+
struct ggml_tensor * mask,
|
4888
|
+
float scale,
|
4829
4889
|
bool inplace) {
|
4890
|
+
GGML_ASSERT(ggml_is_contiguous(a));
|
4891
|
+
if (mask) {
|
4892
|
+
GGML_ASSERT(ggml_is_contiguous(mask));
|
4893
|
+
GGML_ASSERT(mask->ne[2] == 1);
|
4894
|
+
GGML_ASSERT(mask->ne[3] == 1);
|
4895
|
+
GGML_ASSERT(ggml_can_repeat_rows(mask, a));
|
4896
|
+
}
|
4897
|
+
|
4830
4898
|
bool is_node = false;
|
4831
4899
|
|
4832
4900
|
if (a->grad) {
|
@@ -4835,9 +4903,13 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
4835
4903
|
|
4836
4904
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
4837
4905
|
|
4906
|
+
float params[] = { scale };
|
4907
|
+
ggml_set_op_params(result, params, sizeof(params));
|
4908
|
+
|
4838
4909
|
result->op = GGML_OP_SOFT_MAX;
|
4839
4910
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
4840
4911
|
result->src[0] = a;
|
4912
|
+
result->src[1] = mask;
|
4841
4913
|
|
4842
4914
|
return result;
|
4843
4915
|
}
|
@@ -4845,13 +4917,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
4845
4917
|
struct ggml_tensor * ggml_soft_max(
|
4846
4918
|
struct ggml_context * ctx,
|
4847
4919
|
struct ggml_tensor * a) {
|
4848
|
-
return ggml_soft_max_impl(ctx, a, false);
|
4920
|
+
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
|
4849
4921
|
}
|
4850
4922
|
|
4851
4923
|
struct ggml_tensor * ggml_soft_max_inplace(
|
4852
4924
|
struct ggml_context * ctx,
|
4853
4925
|
struct ggml_tensor * a) {
|
4854
|
-
return ggml_soft_max_impl(ctx, a, true);
|
4926
|
+
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
|
4927
|
+
}
|
4928
|
+
|
4929
|
+
struct ggml_tensor * ggml_soft_max_ext(
|
4930
|
+
struct ggml_context * ctx,
|
4931
|
+
struct ggml_tensor * a,
|
4932
|
+
struct ggml_tensor * mask,
|
4933
|
+
float scale) {
|
4934
|
+
return ggml_soft_max_impl(ctx, a, mask, scale, false);
|
4855
4935
|
}
|
4856
4936
|
|
4857
4937
|
// ggml_soft_max_back
|
@@ -5446,6 +5526,43 @@ struct ggml_tensor * ggml_upscale(
|
|
5446
5526
|
return ggml_upscale_impl(ctx, a, scale_factor);
|
5447
5527
|
}
|
5448
5528
|
|
5529
|
+
// ggml_argsort
|
5530
|
+
|
5531
|
+
struct ggml_tensor * ggml_argsort(
|
5532
|
+
struct ggml_context * ctx,
|
5533
|
+
struct ggml_tensor * a,
|
5534
|
+
enum ggml_sort_order order) {
|
5535
|
+
bool is_node = false;
|
5536
|
+
|
5537
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
|
5538
|
+
|
5539
|
+
ggml_set_op_params_i32(result, 0, (int32_t) order);
|
5540
|
+
|
5541
|
+
result->op = GGML_OP_ARGSORT;
|
5542
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5543
|
+
result->src[0] = a;
|
5544
|
+
|
5545
|
+
return result;
|
5546
|
+
}
|
5547
|
+
|
5548
|
+
// ggml_top_k
|
5549
|
+
|
5550
|
+
struct ggml_tensor * ggml_top_k(
|
5551
|
+
struct ggml_context * ctx,
|
5552
|
+
struct ggml_tensor * a,
|
5553
|
+
int k) {
|
5554
|
+
GGML_ASSERT(a->ne[0] >= k);
|
5555
|
+
|
5556
|
+
struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
|
5557
|
+
|
5558
|
+
result = ggml_view_4d(ctx, result,
|
5559
|
+
k, result->ne[1], result->ne[2], result->ne[3],
|
5560
|
+
result->nb[1], result->nb[2], result->nb[3],
|
5561
|
+
0);
|
5562
|
+
|
5563
|
+
return result;
|
5564
|
+
}
|
5565
|
+
|
5449
5566
|
// ggml_flash_attn
|
5450
5567
|
|
5451
5568
|
struct ggml_tensor * ggml_flash_attn(
|
@@ -6805,7 +6922,7 @@ static void ggml_compute_forward_add_f32(
|
|
6805
6922
|
const struct ggml_tensor * src0,
|
6806
6923
|
const struct ggml_tensor * src1,
|
6807
6924
|
struct ggml_tensor * dst) {
|
6808
|
-
GGML_ASSERT(
|
6925
|
+
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
|
6809
6926
|
|
6810
6927
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
6811
6928
|
return;
|
@@ -6838,16 +6955,19 @@ static void ggml_compute_forward_add_f32(
|
|
6838
6955
|
const int64_t i13 = i03 % ne13;
|
6839
6956
|
const int64_t i12 = i02 % ne12;
|
6840
6957
|
const int64_t i11 = i01 % ne11;
|
6958
|
+
const int64_t nr0 = ne00 / ne10;
|
6841
6959
|
|
6842
6960
|
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
6843
6961
|
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
6844
6962
|
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
|
6845
6963
|
|
6964
|
+
for (int64_t r = 0; r < nr0; ++r) {
|
6846
6965
|
#ifdef GGML_USE_ACCELERATE
|
6847
|
-
|
6966
|
+
vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
|
6848
6967
|
#else
|
6849
|
-
|
6968
|
+
ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
|
6850
6969
|
#endif
|
6970
|
+
}
|
6851
6971
|
}
|
6852
6972
|
} else {
|
6853
6973
|
// src1 is not contiguous
|
@@ -6864,8 +6984,9 @@ static void ggml_compute_forward_add_f32(
|
|
6864
6984
|
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
6865
6985
|
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
6866
6986
|
|
6867
|
-
for (
|
6868
|
-
|
6987
|
+
for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
6988
|
+
const int64_t i10 = i0 % ne10;
|
6989
|
+
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
|
6869
6990
|
|
6870
6991
|
dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
|
6871
6992
|
}
|
@@ -7585,7 +7706,7 @@ static void ggml_compute_forward_mul_f32(
|
|
7585
7706
|
const struct ggml_tensor * src0,
|
7586
7707
|
const struct ggml_tensor * src1,
|
7587
7708
|
struct ggml_tensor * dst) {
|
7588
|
-
GGML_ASSERT(
|
7709
|
+
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
|
7589
7710
|
|
7590
7711
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
7591
7712
|
return;
|
@@ -7608,7 +7729,6 @@ static void ggml_compute_forward_mul_f32(
|
|
7608
7729
|
|
7609
7730
|
GGML_ASSERT( nb0 == sizeof(float));
|
7610
7731
|
GGML_ASSERT(nb00 == sizeof(float));
|
7611
|
-
GGML_ASSERT(ne00 == ne10);
|
7612
7732
|
|
7613
7733
|
if (nb10 == sizeof(float)) {
|
7614
7734
|
for (int64_t ir = ith; ir < nr; ir += nth) {
|
@@ -7620,20 +7740,21 @@ static void ggml_compute_forward_mul_f32(
|
|
7620
7740
|
const int64_t i13 = i03 % ne13;
|
7621
7741
|
const int64_t i12 = i02 % ne12;
|
7622
7742
|
const int64_t i11 = i01 % ne11;
|
7743
|
+
const int64_t nr0 = ne00 / ne10;
|
7623
7744
|
|
7624
7745
|
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
7625
7746
|
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
7626
7747
|
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
|
7627
7748
|
|
7749
|
+
for (int64_t r = 0 ; r < nr0; ++r) {
|
7628
7750
|
#ifdef GGML_USE_ACCELERATE
|
7629
|
-
|
7751
|
+
UNUSED(ggml_vec_mul_f32);
|
7630
7752
|
|
7631
|
-
|
7753
|
+
vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
|
7632
7754
|
#else
|
7633
|
-
|
7755
|
+
ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
|
7634
7756
|
#endif
|
7635
|
-
|
7636
|
-
// }
|
7757
|
+
}
|
7637
7758
|
}
|
7638
7759
|
} else {
|
7639
7760
|
// src1 is not contiguous
|
@@ -7651,8 +7772,9 @@ static void ggml_compute_forward_mul_f32(
|
|
7651
7772
|
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
7652
7773
|
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
7653
7774
|
|
7654
|
-
for (int64_t i0 = 0; i0 < ne00; i0++) {
|
7655
|
-
|
7775
|
+
for (int64_t i0 = 0; i0 < ne00; ++i0) {
|
7776
|
+
const int64_t i10 = i0 % ne10;
|
7777
|
+
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
|
7656
7778
|
|
7657
7779
|
dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
|
7658
7780
|
}
|
@@ -7686,14 +7808,16 @@ static void ggml_compute_forward_div_f32(
|
|
7686
7808
|
const struct ggml_tensor * src0,
|
7687
7809
|
const struct ggml_tensor * src1,
|
7688
7810
|
struct ggml_tensor * dst) {
|
7689
|
-
|
7690
|
-
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
|
7811
|
+
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
|
7691
7812
|
|
7692
7813
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
7693
7814
|
return;
|
7694
7815
|
}
|
7695
7816
|
|
7696
|
-
const int
|
7817
|
+
const int ith = params->ith;
|
7818
|
+
const int nth = params->nth;
|
7819
|
+
|
7820
|
+
const int64_t nr = ggml_nrows(src0);
|
7697
7821
|
|
7698
7822
|
GGML_TENSOR_BINARY_OP_LOCALS
|
7699
7823
|
|
@@ -7701,41 +7825,50 @@ static void ggml_compute_forward_div_f32(
|
|
7701
7825
|
GGML_ASSERT(nb00 == sizeof(float));
|
7702
7826
|
|
7703
7827
|
if (nb10 == sizeof(float)) {
|
7704
|
-
for (
|
7705
|
-
// src0
|
7706
|
-
const
|
7707
|
-
const
|
7708
|
-
const
|
7828
|
+
for (int64_t ir = ith; ir < nr; ir += nth) {
|
7829
|
+
// src0 and dst are same shape => same indices
|
7830
|
+
const int64_t i03 = ir/(ne02*ne01);
|
7831
|
+
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
7832
|
+
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
7833
|
+
|
7834
|
+
const int64_t i13 = i03 % ne13;
|
7835
|
+
const int64_t i12 = i02 % ne12;
|
7836
|
+
const int64_t i11 = i01 % ne11;
|
7837
|
+
const int64_t nr0 = ne00 / ne10;
|
7709
7838
|
|
7839
|
+
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
7840
|
+
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
7841
|
+
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
|
7842
|
+
|
7843
|
+
for (int64_t r = 0; r < nr0; ++r) {
|
7710
7844
|
#ifdef GGML_USE_ACCELERATE
|
7711
|
-
|
7845
|
+
UNUSED(ggml_vec_div_f32);
|
7712
7846
|
|
7713
|
-
|
7714
|
-
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
|
7715
|
-
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
|
7716
|
-
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
|
7717
|
-
ne0);
|
7847
|
+
vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
|
7718
7848
|
#else
|
7719
|
-
|
7720
|
-
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
|
7721
|
-
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
|
7722
|
-
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
|
7849
|
+
ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
|
7723
7850
|
#endif
|
7724
|
-
|
7725
|
-
// }
|
7851
|
+
}
|
7726
7852
|
}
|
7727
7853
|
} else {
|
7728
7854
|
// src1 is not contiguous
|
7729
|
-
for (
|
7730
|
-
// src0
|
7731
|
-
|
7732
|
-
const
|
7733
|
-
const
|
7855
|
+
for (int64_t ir = ith; ir < nr; ir += nth) {
|
7856
|
+
// src0 and dst are same shape => same indices
|
7857
|
+
// src1 is broadcastable across src0 and dst in i1, i2, i3
|
7858
|
+
const int64_t i03 = ir/(ne02*ne01);
|
7859
|
+
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
7860
|
+
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
7734
7861
|
|
7735
|
-
|
7736
|
-
|
7737
|
-
|
7738
|
-
|
7862
|
+
const int64_t i13 = i03 % ne13;
|
7863
|
+
const int64_t i12 = i02 % ne12;
|
7864
|
+
const int64_t i11 = i01 % ne11;
|
7865
|
+
|
7866
|
+
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
7867
|
+
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
7868
|
+
|
7869
|
+
for (int64_t i0 = 0; i0 < ne00; ++i0) {
|
7870
|
+
const int64_t i10 = i0 % ne10;
|
7871
|
+
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
|
7739
7872
|
|
7740
7873
|
dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
|
7741
7874
|
}
|
@@ -8181,7 +8314,7 @@ static void ggml_compute_forward_repeat_f16(
|
|
8181
8314
|
return;
|
8182
8315
|
}
|
8183
8316
|
|
8184
|
-
GGML_TENSOR_UNARY_OP_LOCALS
|
8317
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
8185
8318
|
|
8186
8319
|
// guaranteed to be an integer due to the check in ggml_can_repeat
|
8187
8320
|
const int nr0 = (int)(ne0/ne00);
|
@@ -8326,6 +8459,7 @@ static void ggml_compute_forward_concat_f32(
|
|
8326
8459
|
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
8327
8460
|
|
8328
8461
|
const int ith = params->ith;
|
8462
|
+
const int nth = params->nth;
|
8329
8463
|
|
8330
8464
|
GGML_TENSOR_BINARY_OP_LOCALS
|
8331
8465
|
|
@@ -8335,7 +8469,7 @@ static void ggml_compute_forward_concat_f32(
|
|
8335
8469
|
GGML_ASSERT(nb10 == sizeof(float));
|
8336
8470
|
|
8337
8471
|
for (int i3 = 0; i3 < ne3; i3++) {
|
8338
|
-
for (int i2 = ith; i2 < ne2; i2++) {
|
8472
|
+
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
8339
8473
|
if (i2 < ne02) { // src0
|
8340
8474
|
for (int i1 = 0; i1 < ne1; i1++) {
|
8341
8475
|
for (int i0 = 0; i0 < ne0; i0++) {
|
@@ -9373,7 +9507,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
|
9373
9507
|
// TODO: find the optimal values for these
|
9374
9508
|
if (ggml_is_contiguous(src0) &&
|
9375
9509
|
ggml_is_contiguous(src1) &&
|
9376
|
-
|
9510
|
+
//src0->type == GGML_TYPE_F32 &&
|
9377
9511
|
src1->type == GGML_TYPE_F32 &&
|
9378
9512
|
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
9379
9513
|
|
@@ -9495,6 +9629,8 @@ static void ggml_compute_forward_mul_mat(
|
|
9495
9629
|
char * wdata = params->wdata;
|
9496
9630
|
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
|
9497
9631
|
|
9632
|
+
assert(params->wsize >= ne11*ne12*ne13*row_size);
|
9633
|
+
|
9498
9634
|
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
9499
9635
|
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
9500
9636
|
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
@@ -9596,6 +9732,26 @@ static void ggml_compute_forward_mul_mat(
|
|
9596
9732
|
}
|
9597
9733
|
}
|
9598
9734
|
|
9735
|
+
// ggml_compute_forward_mul_mat_id
|
9736
|
+
|
9737
|
+
static void ggml_compute_forward_mul_mat_id(
|
9738
|
+
const struct ggml_compute_params * params,
|
9739
|
+
struct ggml_tensor * dst) {
|
9740
|
+
|
9741
|
+
const struct ggml_tensor * ids = dst->src[0];
|
9742
|
+
const struct ggml_tensor * src1 = dst->src[1];
|
9743
|
+
|
9744
|
+
const int id = ggml_get_op_params_i32(dst, 0);
|
9745
|
+
|
9746
|
+
const int a_id = ((int32_t *)ids->data)[id];
|
9747
|
+
|
9748
|
+
GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
|
9749
|
+
|
9750
|
+
const struct ggml_tensor * src0 = dst->src[a_id + 2];
|
9751
|
+
|
9752
|
+
ggml_compute_forward_mul_mat(params, src0, src1, dst);
|
9753
|
+
}
|
9754
|
+
|
9599
9755
|
// ggml_compute_forward_out_prod
|
9600
9756
|
|
9601
9757
|
static void ggml_compute_forward_out_prod_f32(
|
@@ -10551,20 +10707,25 @@ static void ggml_compute_forward_diag_mask_zero(
|
|
10551
10707
|
static void ggml_compute_forward_soft_max_f32(
|
10552
10708
|
const struct ggml_compute_params * params,
|
10553
10709
|
const struct ggml_tensor * src0,
|
10554
|
-
struct ggml_tensor * dst) {
|
10555
|
-
|
10556
|
-
|
10557
|
-
|
10710
|
+
const struct ggml_tensor * src1,
|
10711
|
+
struct ggml_tensor * dst) {
|
10712
|
+
assert(ggml_is_contiguous(dst));
|
10713
|
+
assert(ggml_are_same_shape(src0, dst));
|
10558
10714
|
|
10559
10715
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
10560
10716
|
return;
|
10561
10717
|
}
|
10562
10718
|
|
10719
|
+
float scale = 1.0f;
|
10720
|
+
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
10721
|
+
|
10563
10722
|
// TODO: handle transposed/permuted matrices
|
10564
10723
|
|
10565
10724
|
const int ith = params->ith;
|
10566
10725
|
const int nth = params->nth;
|
10567
10726
|
|
10727
|
+
const int64_t ne11 = src1 ? src1->ne[1] : 1;
|
10728
|
+
|
10568
10729
|
const int nc = src0->ne[0];
|
10569
10730
|
const int nr = ggml_nrows(src0);
|
10570
10731
|
|
@@ -10575,29 +10736,40 @@ static void ggml_compute_forward_soft_max_f32(
|
|
10575
10736
|
const int ir0 = dr*ith;
|
10576
10737
|
const int ir1 = MIN(ir0 + dr, nr);
|
10577
10738
|
|
10739
|
+
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
|
10740
|
+
|
10578
10741
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
10579
|
-
float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
10580
|
-
float *dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
10742
|
+
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
10743
|
+
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
10744
|
+
|
10745
|
+
// broadcast the mask across rows
|
10746
|
+
float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
|
10747
|
+
|
10748
|
+
ggml_vec_cpy_f32 (nc, wp, sp);
|
10749
|
+
ggml_vec_scale_f32(nc, wp, scale);
|
10750
|
+
if (mp) {
|
10751
|
+
ggml_vec_acc_f32(nc, wp, mp);
|
10752
|
+
}
|
10581
10753
|
|
10582
10754
|
#ifndef NDEBUG
|
10583
10755
|
for (int i = 0; i < nc; ++i) {
|
10584
10756
|
//printf("p[%d] = %f\n", i, p[i]);
|
10585
|
-
assert(!isnan(sp[i]));
|
10757
|
+
assert(!isnan(wp[i]));
|
10586
10758
|
}
|
10587
10759
|
#endif
|
10588
10760
|
|
10589
10761
|
float max = -INFINITY;
|
10590
|
-
ggml_vec_max_f32(nc, &max, sp);
|
10762
|
+
ggml_vec_max_f32(nc, &max, wp);
|
10591
10763
|
|
10592
10764
|
ggml_float sum = 0.0;
|
10593
10765
|
|
10594
10766
|
uint16_t scvt;
|
10595
10767
|
for (int i = 0; i < nc; i++) {
|
10596
|
-
if (sp[i] == -INFINITY) {
|
10768
|
+
if (wp[i] == -INFINITY) {
|
10597
10769
|
dp[i] = 0.0f;
|
10598
10770
|
} else {
|
10599
|
-
// const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
|
10600
|
-
ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
|
10771
|
+
// const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
|
10772
|
+
ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
|
10601
10773
|
memcpy(&scvt, &s, sizeof(scvt));
|
10602
10774
|
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
|
10603
10775
|
sum += (ggml_float)val;
|
@@ -10622,11 +10794,12 @@ static void ggml_compute_forward_soft_max_f32(
|
|
10622
10794
|
static void ggml_compute_forward_soft_max(
|
10623
10795
|
const struct ggml_compute_params * params,
|
10624
10796
|
const struct ggml_tensor * src0,
|
10625
|
-
struct ggml_tensor * dst) {
|
10797
|
+
const struct ggml_tensor * src1,
|
10798
|
+
struct ggml_tensor * dst) {
|
10626
10799
|
switch (src0->type) {
|
10627
10800
|
case GGML_TYPE_F32:
|
10628
10801
|
{
|
10629
|
-
ggml_compute_forward_soft_max_f32(params, src0, dst);
|
10802
|
+
ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
|
10630
10803
|
} break;
|
10631
10804
|
default:
|
10632
10805
|
{
|
@@ -11982,6 +12155,67 @@ static void ggml_compute_forward_upscale(
|
|
11982
12155
|
}
|
11983
12156
|
}
|
11984
12157
|
|
12158
|
+
// ggml_compute_forward_argsort
|
12159
|
+
|
12160
|
+
static void ggml_compute_forward_argsort_f32(
|
12161
|
+
const struct ggml_compute_params * params,
|
12162
|
+
const struct ggml_tensor * src0,
|
12163
|
+
struct ggml_tensor * dst) {
|
12164
|
+
|
12165
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
12166
|
+
return;
|
12167
|
+
}
|
12168
|
+
|
12169
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
12170
|
+
|
12171
|
+
GGML_ASSERT(nb0 == sizeof(float));
|
12172
|
+
|
12173
|
+
const int ith = params->ith;
|
12174
|
+
const int nth = params->nth;
|
12175
|
+
|
12176
|
+
const int64_t nr = ggml_nrows(src0);
|
12177
|
+
|
12178
|
+
enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
|
12179
|
+
|
12180
|
+
for (int64_t i = ith; i < nr; i += nth) {
|
12181
|
+
int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
|
12182
|
+
const float * src_data = (float *)((char *) src0->data + i*nb01);
|
12183
|
+
|
12184
|
+
for (int64_t j = 0; j < ne0; j++) {
|
12185
|
+
dst_data[j] = j;
|
12186
|
+
}
|
12187
|
+
|
12188
|
+
// C doesn't have a functional sort, so we do a bubble sort instead
|
12189
|
+
for (int64_t j = 0; j < ne0; j++) {
|
12190
|
+
for (int64_t k = j + 1; k < ne0; k++) {
|
12191
|
+
if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
|
12192
|
+
(order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
|
12193
|
+
int32_t tmp = dst_data[j];
|
12194
|
+
dst_data[j] = dst_data[k];
|
12195
|
+
dst_data[k] = tmp;
|
12196
|
+
}
|
12197
|
+
}
|
12198
|
+
}
|
12199
|
+
}
|
12200
|
+
}
|
12201
|
+
|
12202
|
+
static void ggml_compute_forward_argsort(
|
12203
|
+
const struct ggml_compute_params * params,
|
12204
|
+
const struct ggml_tensor * src0,
|
12205
|
+
struct ggml_tensor * dst) {
|
12206
|
+
|
12207
|
+
switch (src0->type) {
|
12208
|
+
case GGML_TYPE_F32:
|
12209
|
+
{
|
12210
|
+
ggml_compute_forward_argsort_f32(params, src0, dst);
|
12211
|
+
} break;
|
12212
|
+
default:
|
12213
|
+
{
|
12214
|
+
GGML_ASSERT(false);
|
12215
|
+
} break;
|
12216
|
+
}
|
12217
|
+
}
|
12218
|
+
|
11985
12219
|
// ggml_compute_forward_flash_attn
|
11986
12220
|
|
11987
12221
|
static void ggml_compute_forward_flash_attn_f32(
|
@@ -13805,6 +14039,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
13805
14039
|
{
|
13806
14040
|
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
|
13807
14041
|
} break;
|
14042
|
+
case GGML_OP_MUL_MAT_ID:
|
14043
|
+
{
|
14044
|
+
ggml_compute_forward_mul_mat_id(params, tensor);
|
14045
|
+
} break;
|
13808
14046
|
case GGML_OP_OUT_PROD:
|
13809
14047
|
{
|
13810
14048
|
ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
|
@@ -13863,7 +14101,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
13863
14101
|
} break;
|
13864
14102
|
case GGML_OP_SOFT_MAX:
|
13865
14103
|
{
|
13866
|
-
ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
|
14104
|
+
ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
|
13867
14105
|
} break;
|
13868
14106
|
case GGML_OP_SOFT_MAX_BACK:
|
13869
14107
|
{
|
@@ -13909,6 +14147,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
13909
14147
|
{
|
13910
14148
|
ggml_compute_forward_upscale(params, tensor->src[0], tensor);
|
13911
14149
|
} break;
|
14150
|
+
case GGML_OP_ARGSORT:
|
14151
|
+
{
|
14152
|
+
ggml_compute_forward_argsort(params, tensor->src[0], tensor);
|
14153
|
+
} break;
|
13912
14154
|
case GGML_OP_FLASH_ATTN:
|
13913
14155
|
{
|
13914
14156
|
const int32_t t = ggml_get_op_params_i32(tensor, 0);
|
@@ -14559,6 +14801,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
14559
14801
|
zero_table);
|
14560
14802
|
}
|
14561
14803
|
} break;
|
14804
|
+
case GGML_OP_MUL_MAT_ID:
|
14805
|
+
{
|
14806
|
+
GGML_ASSERT(false); // TODO: not implemented
|
14807
|
+
} break;
|
14562
14808
|
case GGML_OP_OUT_PROD:
|
14563
14809
|
{
|
14564
14810
|
GGML_ASSERT(false); // TODO: not implemented
|
@@ -14897,6 +15143,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
14897
15143
|
{
|
14898
15144
|
GGML_ASSERT(false); // TODO: not implemented
|
14899
15145
|
} break;
|
15146
|
+
case GGML_OP_ARGSORT:
|
15147
|
+
{
|
15148
|
+
GGML_ASSERT(false); // TODO: not implemented
|
15149
|
+
} break;
|
14900
15150
|
case GGML_OP_FLASH_ATTN:
|
14901
15151
|
{
|
14902
15152
|
struct ggml_tensor * flash_grad = NULL;
|
@@ -15257,12 +15507,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
|
|
15257
15507
|
return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
|
15258
15508
|
}
|
15259
15509
|
|
15260
|
-
struct ggml_cgraph
|
15261
|
-
|
15262
|
-
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
|
15263
|
-
struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
|
15264
|
-
|
15265
|
-
*cgraph = (struct ggml_cgraph) {
|
15510
|
+
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
|
15511
|
+
struct ggml_cgraph cgraph = {
|
15266
15512
|
/*.size =*/ 0,
|
15267
15513
|
/*.n_nodes =*/ i1 - i0,
|
15268
15514
|
/*.n_leafs =*/ 0,
|
@@ -15497,7 +15743,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15497
15743
|
n_tasks = n_threads;
|
15498
15744
|
} break;
|
15499
15745
|
case GGML_OP_SUB:
|
15500
|
-
case GGML_OP_DIV:
|
15501
15746
|
case GGML_OP_SQR:
|
15502
15747
|
case GGML_OP_SQRT:
|
15503
15748
|
case GGML_OP_LOG:
|
@@ -15530,10 +15775,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15530
15775
|
{
|
15531
15776
|
n_tasks = n_threads;
|
15532
15777
|
} break;
|
15778
|
+
default:
|
15779
|
+
GGML_ASSERT(false);
|
15533
15780
|
}
|
15534
15781
|
break;
|
15535
15782
|
case GGML_OP_SILU_BACK:
|
15536
15783
|
case GGML_OP_MUL:
|
15784
|
+
case GGML_OP_DIV:
|
15537
15785
|
case GGML_OP_NORM:
|
15538
15786
|
case GGML_OP_RMS_NORM:
|
15539
15787
|
case GGML_OP_RMS_NORM_BACK:
|
@@ -15571,6 +15819,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15571
15819
|
}
|
15572
15820
|
#endif
|
15573
15821
|
} break;
|
15822
|
+
case GGML_OP_MUL_MAT_ID:
|
15823
|
+
{
|
15824
|
+
// FIXME: blas
|
15825
|
+
n_tasks = n_threads;
|
15826
|
+
} break;
|
15574
15827
|
case GGML_OP_OUT_PROD:
|
15575
15828
|
{
|
15576
15829
|
n_tasks = n_threads;
|
@@ -15590,7 +15843,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15590
15843
|
} break;
|
15591
15844
|
case GGML_OP_DIAG_MASK_ZERO:
|
15592
15845
|
case GGML_OP_DIAG_MASK_INF:
|
15593
|
-
case GGML_OP_SOFT_MAX:
|
15594
15846
|
case GGML_OP_SOFT_MAX_BACK:
|
15595
15847
|
case GGML_OP_ROPE:
|
15596
15848
|
case GGML_OP_ROPE_BACK:
|
@@ -15606,6 +15858,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15606
15858
|
{
|
15607
15859
|
n_tasks = 1; //TODO
|
15608
15860
|
} break;
|
15861
|
+
case GGML_OP_SOFT_MAX:
|
15862
|
+
{
|
15863
|
+
n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
|
15864
|
+
} break;
|
15609
15865
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
15610
15866
|
{
|
15611
15867
|
n_tasks = n_threads;
|
@@ -15627,6 +15883,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15627
15883
|
{
|
15628
15884
|
n_tasks = n_threads;
|
15629
15885
|
} break;
|
15886
|
+
case GGML_OP_ARGSORT:
|
15887
|
+
{
|
15888
|
+
n_tasks = n_threads;
|
15889
|
+
} break;
|
15630
15890
|
case GGML_OP_FLASH_ATTN:
|
15631
15891
|
{
|
15632
15892
|
n_tasks = n_threads;
|
@@ -15695,7 +15955,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15695
15955
|
} break;
|
15696
15956
|
default:
|
15697
15957
|
{
|
15698
|
-
|
15958
|
+
fprintf(stderr, "%s: op not implemented: ", __func__);
|
15959
|
+
if (node->op < GGML_OP_COUNT) {
|
15960
|
+
fprintf(stderr, "%s\n", ggml_op_name(node->op));
|
15961
|
+
} else {
|
15962
|
+
fprintf(stderr, "%d\n", node->op);
|
15963
|
+
}
|
15699
15964
|
GGML_ASSERT(false);
|
15700
15965
|
} break;
|
15701
15966
|
}
|
@@ -15836,18 +16101,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15836
16101
|
|
15837
16102
|
// thread scheduling for the different operations + work buffer size estimation
|
15838
16103
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
15839
|
-
int n_tasks = 1;
|
15840
|
-
|
15841
16104
|
struct ggml_tensor * node = cgraph->nodes[i];
|
15842
16105
|
|
16106
|
+
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
16107
|
+
|
15843
16108
|
size_t cur = 0;
|
15844
16109
|
|
15845
16110
|
switch (node->op) {
|
15846
16111
|
case GGML_OP_CPY:
|
15847
16112
|
case GGML_OP_DUP:
|
15848
16113
|
{
|
15849
|
-
n_tasks = n_threads;
|
15850
|
-
|
15851
16114
|
if (ggml_is_quantized(node->type)) {
|
15852
16115
|
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
15853
16116
|
}
|
@@ -15855,16 +16118,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15855
16118
|
case GGML_OP_ADD:
|
15856
16119
|
case GGML_OP_ADD1:
|
15857
16120
|
{
|
15858
|
-
n_tasks = n_threads;
|
15859
|
-
|
15860
16121
|
if (ggml_is_quantized(node->src[0]->type)) {
|
15861
16122
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
|
15862
16123
|
}
|
15863
16124
|
} break;
|
15864
16125
|
case GGML_OP_ACC:
|
15865
16126
|
{
|
15866
|
-
n_tasks = n_threads;
|
15867
|
-
|
15868
16127
|
if (ggml_is_quantized(node->src[0]->type)) {
|
15869
16128
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
|
15870
16129
|
}
|
@@ -15890,14 +16149,33 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15890
16149
|
cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
|
15891
16150
|
}
|
15892
16151
|
} break;
|
16152
|
+
case GGML_OP_MUL_MAT_ID:
|
16153
|
+
{
|
16154
|
+
const struct ggml_tensor * a = node->src[2];
|
16155
|
+
const struct ggml_tensor * b = node->src[1];
|
16156
|
+
const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
|
16157
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
16158
|
+
if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
|
16159
|
+
if (a->type != GGML_TYPE_F32) {
|
16160
|
+
// here we need memory just for single 2D matrix from src0
|
16161
|
+
cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
|
16162
|
+
}
|
16163
|
+
} else
|
16164
|
+
#endif
|
16165
|
+
if (b->type != vec_dot_type) {
|
16166
|
+
cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
|
16167
|
+
}
|
16168
|
+
} break;
|
15893
16169
|
case GGML_OP_OUT_PROD:
|
15894
16170
|
{
|
15895
|
-
n_tasks = n_threads;
|
15896
|
-
|
15897
16171
|
if (ggml_is_quantized(node->src[0]->type)) {
|
15898
16172
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
|
15899
16173
|
}
|
15900
16174
|
} break;
|
16175
|
+
case GGML_OP_SOFT_MAX:
|
16176
|
+
{
|
16177
|
+
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
16178
|
+
} break;
|
15901
16179
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
15902
16180
|
{
|
15903
16181
|
GGML_ASSERT(node->src[0]->ne[3] == 1);
|
@@ -15923,10 +16201,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15923
16201
|
GGML_ASSERT(false);
|
15924
16202
|
}
|
15925
16203
|
} break;
|
15926
|
-
case GGML_OP_IM2COL:
|
15927
|
-
{
|
15928
|
-
n_tasks = n_threads;
|
15929
|
-
} break;
|
15930
16204
|
case GGML_OP_CONV_TRANSPOSE_2D:
|
15931
16205
|
{
|
15932
16206
|
const int64_t ne00 = node->src[0]->ne[0]; // W
|
@@ -15943,8 +16217,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15943
16217
|
} break;
|
15944
16218
|
case GGML_OP_FLASH_ATTN:
|
15945
16219
|
{
|
15946
|
-
n_tasks = n_threads;
|
15947
|
-
|
15948
16220
|
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
|
15949
16221
|
|
15950
16222
|
if (node->src[1]->type == GGML_TYPE_F32) {
|
@@ -15957,8 +16229,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15957
16229
|
} break;
|
15958
16230
|
case GGML_OP_FLASH_FF:
|
15959
16231
|
{
|
15960
|
-
n_tasks = n_threads;
|
15961
|
-
|
15962
16232
|
if (node->src[1]->type == GGML_TYPE_F32) {
|
15963
16233
|
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
15964
16234
|
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
|
@@ -15969,8 +16239,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15969
16239
|
} break;
|
15970
16240
|
case GGML_OP_FLASH_ATTN_BACK:
|
15971
16241
|
{
|
15972
|
-
n_tasks = n_threads;
|
15973
|
-
|
15974
16242
|
const int64_t D = node->src[0]->ne[0];
|
15975
16243
|
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
|
15976
16244
|
const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
|
@@ -15985,8 +16253,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15985
16253
|
|
15986
16254
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
15987
16255
|
{
|
15988
|
-
n_tasks = n_threads;
|
15989
|
-
|
15990
16256
|
cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
|
15991
16257
|
} break;
|
15992
16258
|
case GGML_OP_COUNT:
|
@@ -17773,8 +18039,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
|
|
17773
18039
|
memcpy(&qh, &y[i].qh, sizeof(qh));
|
17774
18040
|
|
17775
18041
|
for (int j = 0; j < QK5_0; j += 2) {
|
17776
|
-
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
17777
|
-
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
18042
|
+
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
18043
|
+
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
17778
18044
|
|
17779
18045
|
// cast to 16 bins
|
17780
18046
|
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
@@ -17803,8 +18069,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
|
|
17803
18069
|
memcpy(&qh, &y[i].qh, sizeof(qh));
|
17804
18070
|
|
17805
18071
|
for (int j = 0; j < QK5_1; j += 2) {
|
17806
|
-
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
17807
|
-
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
18072
|
+
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
18073
|
+
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
17808
18074
|
|
17809
18075
|
// cast to 16 bins
|
17810
18076
|
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
@@ -17994,6 +18260,7 @@ struct gguf_kv {
|
|
17994
18260
|
|
17995
18261
|
struct gguf_header {
|
17996
18262
|
char magic[4];
|
18263
|
+
|
17997
18264
|
uint32_t version;
|
17998
18265
|
uint64_t n_tensors; // GGUFv2
|
17999
18266
|
uint64_t n_kv; // GGUFv2
|
@@ -18083,7 +18350,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
18083
18350
|
|
18084
18351
|
for (uint32_t i = 0; i < sizeof(magic); i++) {
|
18085
18352
|
if (magic[i] != GGUF_MAGIC[i]) {
|
18086
|
-
fprintf(stderr, "%s: invalid magic characters %
|
18353
|
+
fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
|
18087
18354
|
fclose(file);
|
18088
18355
|
return NULL;
|
18089
18356
|
}
|
@@ -18098,7 +18365,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
18098
18365
|
{
|
18099
18366
|
strncpy(ctx->header.magic, magic, 4);
|
18100
18367
|
|
18101
|
-
|
18102
18368
|
ctx->kv = NULL;
|
18103
18369
|
ctx->infos = NULL;
|
18104
18370
|
ctx->data = NULL;
|