llama_cpp 0.9.4 → 0.10.0
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +43 -8
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1270 -434
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +535 -175
- data/ext/llama_cpp/src/ggml-metal.metal +888 -237
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml.c +393 -127
- data/ext/llama_cpp/src/ggml.h +59 -7
- data/ext/llama_cpp/src/llama.cpp +791 -357
- data/ext/llama_cpp/src/llama.h +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +3 -3
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -233,24 +233,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
 
-//
-// tensor access macros
-//
-
-#define GGML_TENSOR_UNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
-#define GGML_TENSOR_BINARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
@@ -1613,6 +1595,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GROUP_NORM",
 
     "MUL_MAT",
+    "MUL_MAT_ID",
     "OUT_PROD",
 
     "SCALE",
@@ -1666,7 +1650,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1695,6 +1679,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "group_norm(x)",
 
     "X*Y",
+    "X[i]*Y",
     "X*Y",
 
     "x*v",
@@ -1722,6 +1707,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_1d(x)",
     "pool_2d(x)",
     "upscale(x)",
+    "argsort(x)",
 
     "flash_attn(x)",
     "flash_ff(x)",
@@ -1748,10 +1734,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
+
+static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
+    "ABS",
+    "SGN",
+    "NEG",
+    "STEP",
+    "TANH",
+    "ELU",
+    "RELU",
+    "GELU",
+    "GELU_QUICK",
+    "SILU",
+    "LEAKY",
+};
+
+static_assert(GGML_UNARY_OP_COUNT == 11, "GGML_UNARY_OP_COUNT != 11");
+
+
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -1771,6 +1775,7 @@ static void ggml_setup_op_has_task_pass(void) {
 
     p[GGML_OP_ACC           ] = true;
     p[GGML_OP_MUL_MAT       ] = true;
+    p[GGML_OP_MUL_MAT_ID    ] = true;
     p[GGML_OP_OUT_PROD      ] = true;
     p[GGML_OP_SET           ] = true;
     p[GGML_OP_GET_ROWS_BACK ] = true;
@@ -2023,6 +2028,20 @@ const char * ggml_op_symbol(enum ggml_op op) {
     return GGML_OP_SYMBOL[op];
 }
 
+const char * ggml_unary_op_name(enum ggml_unary_op op) {
+    return GGML_UNARY_OP_NAME[op];
+}
+
+const char * ggml_op_desc(const struct ggml_tensor * t) {
+    if (t->op == GGML_OP_UNARY) {
+        enum ggml_unary_op uop = ggml_get_unary_op(t);
+        return ggml_unary_op_name(uop);
+    }
+    else {
+        return ggml_op_name(t->op);
+    }
+}
+
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return ggml_type_size(tensor->type);
 }
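The new ggml_op_desc resolves GGML_OP_UNARY nodes to their concrete unary op name, which makes graph dumps more readable than the generic "UNARY" label. A minimal illustrative sketch of how a caller might use it, assuming an already-built graph gf (the loop itself is not part of this diff):

    // print one line per node of a built graph
    for (int i = 0; i < gf->n_nodes; ++i) {
        const struct ggml_tensor * node = gf->nodes[i];
        // e.g. prints "RELU" for a unary node instead of "UNARY"
        printf("node %3d: %-12s %s\n", i, ggml_op_desc(node), node->name);
    }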
@@ -3154,9 +3173,7 @@ static struct ggml_tensor * ggml_add_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    // TODO: support less-strict constraint
-    // GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+    GGML_ASSERT(ggml_can_repeat(b, a));
 
     bool is_node = false;
 
@@ -3371,9 +3388,7 @@ static struct ggml_tensor * ggml_mul_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    // TODO: support less-strict constraint
-    // GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+    GGML_ASSERT(ggml_can_repeat(b, a));
 
     bool is_node = false;
 
@@ -3418,7 +3433,7 @@ static struct ggml_tensor * ggml_div_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
+    GGML_ASSERT(ggml_can_repeat(b, a));
 
     bool is_node = false;
 
@@ -4056,6 +4071,49 @@ struct ggml_tensor * ggml_mul_mat(
     return result;
 }
 
+// ggml_mul_mat_id
+
+struct ggml_tensor * ggml_mul_mat_id(
+        struct ggml_context * ctx,
+        struct ggml_tensor * as[],
+        struct ggml_tensor * ids,
+        int id,
+        struct ggml_tensor * b) {
+
+    int64_t n_as = ids->ne[0];
+
+    GGML_ASSERT(ids->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_vector(ids));
+    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
+    GGML_ASSERT(id >= 0 && id < n_as);
+
+    bool is_node = false;
+
+    if (as[0]->grad || b->grad) {
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
+
+    ggml_set_op_params_i32(result, 0, id);
+
+    result->op   = GGML_OP_MUL_MAT_ID;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = ids;
+    result->src[1] = b;
+
+    for (int64_t i = 0; i < n_as; i++) {
+        struct ggml_tensor * a = as[i];
+        GGML_ASSERT(ggml_are_same_shape(as[0], a));
+        GGML_ASSERT(ggml_can_mul_mat(a, b));
+        GGML_ASSERT(!ggml_is_transposed(a));
+        result->src[i + 2] = a;
+    }
+
+    return result;
+}
+
 // ggml_out_prod
 
 struct ggml_tensor * ggml_out_prod(
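ggml_mul_mat_id is the building block for mixture-of-experts layers: it takes an array of same-shaped candidate matrices plus an I32 vector of expert indices, and id selects which entry of that vector names the matrix actually multiplied. All candidates are attached as extra sources so the scheduler can see them. An illustrative usage sketch (the expert count and tensor names here are assumptions, not from the diff):

    // four expert FFN weight matrices, all the same shape
    struct ggml_tensor * experts[4] = { w0, w1, w2, w3 };

    // ids: 1-D GGML_TYPE_I32 tensor whose length matches the number of
    // experts; at evaluation time, ids[id] names the expert that is used
    struct ggml_tensor * out = ggml_mul_mat_id(ctx, experts, ids, /*id =*/ 0, cur);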
@@ -4209,7 +4267,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
         struct ggml_tensor * b,
         size_t nb1,
         size_t offset) {
-    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
+    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
 }
 
 // ggml_cpy
@@ -4826,7 +4884,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
 static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
+        struct ggml_tensor * mask,
+        float scale,
         bool inplace) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+    if (mask) {
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+    }
+
     bool is_node = false;
 
     if (a->grad) {
@@ -4835,9 +4903,13 @@ static struct ggml_tensor * ggml_soft_max_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    float params[] = { scale };
+    ggml_set_op_params(result, params, sizeof(params));
+
     result->op   = GGML_OP_SOFT_MAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
+    result->src[1] = mask;
 
     return result;
 }
@@ -4845,13 +4917,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_soft_max_impl(ctx, a, false);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
 }
 
 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_soft_max_impl(ctx, a, true);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+}
+
+struct ggml_tensor * ggml_soft_max_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * mask,
+        float scale) {
+    return ggml_soft_max_impl(ctx, a, mask, scale, false);
 }
 
 // ggml_soft_max_back
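ggml_soft_max_ext folds the usual attention-score sequence (scale the scores, add the mask, softmax) into a single op; the scale is stored in op_params and the mask rides along as src[1], broadcast across rows. A sketch of a typical call site in an attention block, assuming illustrative tensor and variable names:

    // kq: attention scores; kq_mask: additive mask with -INFINITY at masked positions
    // replaces separate scale + add + soft_max graph nodes:
    struct ggml_tensor * probs = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf((float) n_embd_head));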
@@ -5446,6 +5526,43 @@ struct ggml_tensor * ggml_upscale(
     return ggml_upscale_impl(ctx, a, scale_factor);
 }
 
+// ggml_argsort
+
+struct ggml_tensor * ggml_argsort(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        enum ggml_sort_order order) {
+    bool is_node = false;
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
+
+    ggml_set_op_params_i32(result, 0, (int32_t) order);
+
+    result->op   = GGML_OP_ARGSORT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_top_k
+
+struct ggml_tensor * ggml_top_k(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int k) {
+    GGML_ASSERT(a->ne[0] >= k);
+
+    struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
+
+    result = ggml_view_4d(ctx, result,
+                k, result->ne[1], result->ne[2], result->ne[3],
+                   result->nb[1], result->nb[2], result->nb[3],
+                0);
+
+    return result;
+}
+
 // ggml_flash_attn
 
 struct ggml_tensor * ggml_flash_attn(
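ggml_top_k is simply ggml_argsort in descending order followed by a k-wide view, so it yields the indices of the k largest values per row (as GGML_TYPE_I32), not the values themselves. An illustrative example, with assumed tensor names:

    // pick the 2 highest-scoring experts per token from the router logits
    struct ggml_tensor * selected = ggml_top_k(ctx, router_logits, 2); // ne[0] == 2, I32 indices

The values behind those indices must be gathered with a separate op if they are needed downstream.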
@@ -6805,7 +6922,7 @@ static void ggml_compute_forward_add_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -6838,16 +6955,19 @@ static void ggml_compute_forward_add_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
 
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
+            for (int64_t r = 0; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-            vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne0);
+                vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            ggml_vec_add_f32(ne0, dst_ptr, src0_ptr, src1_ptr);
+                ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
+            }
         }
     } else {
         // src1 is not contiguous
@@ -6864,8 +6984,9 @@ static void ggml_compute_forward_add_f32(
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
-            for (int64_t i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
             }
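Replacing ggml_can_repeat_rows with ggml_can_repeat means add/mul/div now also broadcast inside a row: a src1 row of length ne10 is repeated ne00/ne10 times along dim 0, which is exactly what the new nr0/r loop implements. An illustrative shape example, assuming the usual ggml 2-D constructors:

    // a is [8, 4]; b is [2, 4]: b's 2-element rows tile 4x along dim 0
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 4);
    struct ggml_tensor * c = ggml_add(ctx, a, b); // accepted now; previously ne[0] had to match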
@@ -7585,7 +7706,7 @@ static void ggml_compute_forward_mul_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -7608,7 +7729,6 @@ static void ggml_compute_forward_mul_f32(
 
     GGML_ASSERT( nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(ne00 == ne10);
 
     if (nb10 == sizeof(float)) {
         for (int64_t ir = ith; ir < nr; ir += nth) {
@@ -7620,20 +7740,21 @@ static void ggml_compute_forward_mul_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
 
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
+            for (int64_t r = 0 ; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-            UNUSED(ggml_vec_mul_f32);
+                UNUSED(ggml_vec_mul_f32);
 
-            vDSP_vmul(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
+                vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
+                ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-                // }
-            // }
+            }
         }
     } else {
         // src1 is not contiguous
@@ -7651,8 +7772,9 @@ static void ggml_compute_forward_mul_f32(
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
-            for (int64_t i0 = 0; i0 < ne00; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
             }
@@ -7686,14 +7808,16 @@ static void ggml_compute_forward_div_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const int nr  = ggml_nrows(src0);
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -7701,41 +7825,50 @@ static void ggml_compute_forward_div_f32(
     GGML_ASSERT(nb00 == sizeof(float));
 
     if (nb10 == sizeof(float)) {
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
 
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-            UNUSED(ggml_vec_div_f32);
+                UNUSED(ggml_vec_div_f32);
 
-            vDSP_vdiv(
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                    ne0);
+                vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            ggml_vec_div_f32(ne0,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+                ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-                // }
-            // }
+            }
         }
     } else {
         // src1 is not contiguous
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
 
-            float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            for (int i0 = 0; i0 < ne0; ++i0) {
-                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
             }
@@ -8181,7 +8314,7 @@ static void ggml_compute_forward_repeat_f16(
         return;
     }
 
-    GGML_TENSOR_UNARY_OP_LOCALS;
+    GGML_TENSOR_UNARY_OP_LOCALS
 
     // guaranteed to be an integer due to the check in ggml_can_repeat
     const int nr0 = (int)(ne0/ne00);
@@ -8326,6 +8459,7 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(src0->nb[0] == sizeof(float));
 
     const int ith = params->ith;
+    const int nth = params->nth;
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -8335,7 +8469,7 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = ith; i2 < ne2; i2++) {
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
             if (i2 < ne02) { // src0
                 for (int i1 = 0; i1 < ne1; i1++) {
                     for (int i0 = 0; i0 < ne0; i0++) {
@@ -9373,7 +9507,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
-        src0->type == GGML_TYPE_F32 &&
+        //src0->type == GGML_TYPE_F32 &&
         src1->type == GGML_TYPE_F32 &&
         (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
 
@@ -9495,6 +9629,8 @@ static void ggml_compute_forward_mul_mat(
             char * wdata = params->wdata;
             const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
+            assert(params->wsize >= ne11*ne12*ne13*row_size);
+
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
                     for (int64_t i11 = 0; i11 < ne11; ++i11) {
@@ -9596,6 +9732,26 @@ static void ggml_compute_forward_mul_mat(
     }
 }
 
+// ggml_compute_forward_mul_mat_id
+
+static void ggml_compute_forward_mul_mat_id(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    const int id = ggml_get_op_params_i32(dst, 0);
+
+    const int a_id = ((int32_t *)ids->data)[id];
+
+    GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
+
+    const struct ggml_tensor * src0 = dst->src[a_id + 2];
+
+    ggml_compute_forward_mul_mat(params, src0, src1, dst);
+}
+
 // ggml_compute_forward_out_prod
 
 static void ggml_compute_forward_out_prod_f32(
@@ -10551,20 +10707,25 @@ static void ggml_compute_forward_diag_mask_zero(
 static void ggml_compute_forward_soft_max_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    assert(ggml_is_contiguous(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
+    float scale = 1.0f;
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
     // TODO: handle transposed/permuted matrices
 
     const int ith = params->ith;
     const int nth = params->nth;
 
+    const int64_t ne11 = src1 ? src1->ne[1] : 1;
+
     const int nc = src0->ne[0];
     const int nr = ggml_nrows(src0);
 
@@ -10575,29 +10736,40 @@ static void ggml_compute_forward_soft_max_f32(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
 
+    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
+
     for (int i1 = ir0; i1 < ir1; i1++) {
-        float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float *dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
+        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
+        float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
+
+        // broadcast the mask across rows
+        float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
+
+        ggml_vec_cpy_f32  (nc, wp, sp);
+        ggml_vec_scale_f32(nc, wp, scale);
+        if (mp) {
+            ggml_vec_acc_f32(nc, wp, mp);
+        }
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
             //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(sp[i]));
+            assert(!isnan(wp[i]));
         }
 #endif
 
         float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, sp);
+        ggml_vec_max_f32(nc, &max, wp);
 
         ggml_float sum = 0.0;
 
         uint16_t scvt;
         for (int i = 0; i < nc; i++) {
-            if (sp[i] == -INFINITY) {
+            if (wp[i] == -INFINITY) {
                 dp[i] = 0.0f;
             } else {
-                // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
+                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
+                ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
                 memcpy(&scvt, &s, sizeof(scvt));
                 const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
                 sum += (ggml_float)val;
|
 static void ggml_compute_forward_soft_max(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_soft_max_f32(params, src0, dst);
+                ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
             } break;
         default:
             {
@@ -11982,6 +12155,67 @@ static void ggml_compute_forward_upscale(
     }
 }
 
+// ggml_compute_forward_argsort
+
+static void ggml_compute_forward_argsort_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(nb0 == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
+
+    for (int64_t i = ith; i < nr; i += nth) {
+        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
+        const float * src_data = (float *)((char *) src0->data + i*nb01);
+
+        for (int64_t j = 0; j < ne0; j++) {
+            dst_data[j] = j;
+        }
+
+        // C doesn't have a functional sort, so we do a bubble sort instead
+        for (int64_t j = 0; j < ne0; j++) {
+            for (int64_t k = j + 1; k < ne0; k++) {
+                if ((order == GGML_SORT_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
+                    (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
+                    int32_t tmp = dst_data[j];
+                    dst_data[j] = dst_data[k];
+                    dst_data[k] = tmp;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_argsort(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_argsort_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_flash_attn
 
 static void ggml_compute_forward_flash_attn_f32(
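The CPU argsort kernel initializes each row's index array and then orders it with an O(ne0^2) bubble sort; rows are distributed across threads, so this is cheap for short rows but costly for vocab-sized ones. For comparison only, the same per-row ordering could be written with qsort and a file-scope data pointer; the helper below is hypothetical, not part of ggml, and its global state would defeat the per-row threading used above:

    static const float * g_sort_data; // hypothetical: the values being indexed

    static int cmp_idx_desc(const void * pa, const void * pb) {
        const int32_t ia = *(const int32_t *) pa;
        const int32_t ib = *(const int32_t *) pb;
        // descending by value; swap the comparisons for ascending order
        return (g_sort_data[ia] < g_sort_data[ib]) - (g_sort_data[ia] > g_sort_data[ib]);
    }

    // per row: g_sort_data = src_data; qsort(dst_data, ne0, sizeof(int32_t), cmp_idx_desc);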
@@ -13805,6 +14039,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
             } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                ggml_compute_forward_mul_mat_id(params, tensor);
+            } break;
         case GGML_OP_OUT_PROD:
             {
                 ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
@@ -13863,7 +14101,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
+                ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_SOFT_MAX_BACK:
             {
@@ -13909,6 +14147,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_upscale(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_ARGSORT:
+            {
+                ggml_compute_forward_argsort(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -14559,6 +14801,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_OUT_PROD:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -14897,6 +15143,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_ARGSORT:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 struct ggml_tensor * flash_grad = NULL;
@@ -15257,12 +15507,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
     return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
 }
 
-struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
-    const size_t obj_size = sizeof(struct ggml_cgraph);
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
-    struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
-
-    *cgraph = (struct ggml_cgraph) {
+struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
+    struct ggml_cgraph cgraph = {
         /*.size    =*/ 0,
         /*.n_nodes =*/ i1 - i0,
         /*.n_leafs =*/ 0,
@@ -15497,7 +15743,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 n_tasks = n_threads;
             } break;
         case GGML_OP_SUB:
-        case GGML_OP_DIV:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
         case GGML_OP_LOG:
@@ -15530,10 +15775,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                     {
                         n_tasks = n_threads;
                     } break;
+                default:
+                    GGML_ASSERT(false);
             }
             break;
         case GGML_OP_SILU_BACK:
         case GGML_OP_MUL:
+        case GGML_OP_DIV:
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
         case GGML_OP_RMS_NORM_BACK:
@@ -15571,6 +15819,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 }
 #endif
             } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                // FIXME: blas
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_OUT_PROD:
             {
                 n_tasks = n_threads;
@@ -15590,7 +15843,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_DIAG_MASK_ZERO:
         case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
         case GGML_OP_SOFT_MAX_BACK:
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK:
@@ -15606,6 +15858,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1; //TODO
             } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 n_tasks = n_threads;
@@ -15627,6 +15883,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = n_threads;
             } break;
+        case GGML_OP_ARGSORT:
+            {
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 n_tasks = n_threads;
@@ -15695,7 +15955,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         default:
             {
-                fprintf(stderr, "%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
+                fprintf(stderr, "%s: op not implemented: ", __func__);
+                if (node->op < GGML_OP_COUNT) {
+                    fprintf(stderr, "%s\n", ggml_op_name(node->op));
+                } else {
+                    fprintf(stderr, "%d\n", node->op);
+                }
                 GGML_ASSERT(false);
             } break;
     }
@@ -15836,18 +16101,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
     // thread scheduling for the different operations + work buffer size estimation
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        int n_tasks = 1;
-
         struct ggml_tensor * node = cgraph->nodes[i];
 
+        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+
         size_t cur = 0;
 
         switch (node->op) {
             case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }
@@ -15855,16 +16118,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_ADD:
             case GGML_OP_ADD1:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
             case GGML_OP_ACC:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                     }
@@ -15890,14 +16149,33 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
                     }
                 } break;
+            case GGML_OP_MUL_MAT_ID:
+                {
+                    const struct ggml_tensor * a = node->src[2];
+                    const struct ggml_tensor * b = node->src[1];
+                    const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                    if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
+                        if (a->type != GGML_TYPE_F32) {
+                            // here we need memory just for single 2D matrix from src0
+                            cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
+                        }
+                    } else
+#endif
+                    if (b->type != vec_dot_type) {
+                        cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
+                    }
+                } break;
             case GGML_OP_OUT_PROD:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
+            case GGML_OP_SOFT_MAX:
+                {
+                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+                } break;
             case GGML_OP_CONV_TRANSPOSE_1D:
                 {
                     GGML_ASSERT(node->src[0]->ne[3] == 1);
@@ -15923,10 +16201,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         GGML_ASSERT(false);
                     }
                 } break;
-            case GGML_OP_IM2COL:
-                {
-                    n_tasks = n_threads;
-                } break;
             case GGML_OP_CONV_TRANSPOSE_2D:
                 {
                     const int64_t ne00 = node->src[0]->ne[0]; // W
@@ -15943,8 +16217,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_ATTN:
                 {
-                    n_tasks = n_threads;
-
                     const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
 
                     if (node->src[1]->type == GGML_TYPE_F32) {
@@ -15957,8 +16229,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_FF:
                 {
-                    n_tasks = n_threads;
-
                     if (node->src[1]->type == GGML_TYPE_F32) {
                         cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
|
                 } break;
             case GGML_OP_FLASH_ATTN_BACK:
                 {
-                    n_tasks = n_threads;
-
                     const int64_t    D = node->src[0]->ne[0];
                     const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
                     const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
|
 
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {
-                    n_tasks = n_threads;
-
                     cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
                 } break;
             case GGML_OP_COUNT:
@@ -17773,8 +18039,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
         memcpy(&qh, &y[i].qh, sizeof(qh));
 
         for (int j = 0; j < QK5_0; j += 2) {
-            const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
+            const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
+            const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
 
             // cast to 16 bins
             const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@@ -17803,8 +18069,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
         memcpy(&qh, &y[i].qh, sizeof(qh));
 
         for (int j = 0; j < QK5_1; j += 2) {
-            const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
+            const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
+            const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
 
             // cast to 16 bins
             const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@@ -17994,6 +18260,7 @@ struct gguf_kv {
 
 struct gguf_header {
     char magic[4];
+
     uint32_t version;
     uint64_t n_tensors; // GGUFv2
     uint64_t n_kv;      // GGUFv2
@@ -18083,7 +18350,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     for (uint32_t i = 0; i < sizeof(magic); i++) {
         if (magic[i] != GGUF_MAGIC[i]) {
-            fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
+            fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
             fclose(file);
             return NULL;
         }
@@ -18098,7 +18365,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         strncpy(ctx->header.magic, magic, 4);
 
-
         ctx->kv    = NULL;
         ctx->infos = NULL;
         ctx->data  = NULL;