llama_cpp 0.9.5 → 0.10.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1140 -355
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +506 -158
- data/ext/llama_cpp/src/ggml-metal.metal +795 -144
- data/ext/llama_cpp/src/ggml.c +331 -111
- data/ext/llama_cpp/src/ggml.h +49 -4
- data/ext/llama_cpp/src/llama.cpp +749 -329
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -233,24 +233,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)

-//
-// tensor access macros
-//
-
-#define GGML_TENSOR_UNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
-#define GGML_TENSOR_BINARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
@@ -1613,6 +1595,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GROUP_NORM",

     "MUL_MAT",
+    "MUL_MAT_ID",
     "OUT_PROD",

     "SCALE",
@@ -1640,6 +1623,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_1D",
     "POOL_2D",
     "UPSCALE",
+    "ARGSORT",

     "FLASH_ATTN",
     "FLASH_FF",
@@ -1666,7 +1650,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1695,6 +1679,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "group_norm(x)",

     "X*Y",
+    "X[i]*Y",
     "X*Y",

     "x*v",
@@ -1722,6 +1707,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_1d(x)",
     "pool_2d(x)",
     "upscale(x)",
+    "argsort(x)",

     "flash_attn(x)",
     "flash_ff(x)",
@@ -1748,10 +1734,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

+
+static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
+    "ABS",
+    "SGN",
+    "NEG",
+    "STEP",
+    "TANH",
+    "ELU",
+    "RELU",
+    "GELU",
+    "GELU_QUICK",
+    "SILU",
+    "LEAKY",
+};
+
+static_assert(GGML_UNARY_OP_COUNT == 11, "GGML_UNARY_OP_COUNT != 11");
+
+
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");

@@ -1771,6 +1775,7 @@ static void ggml_setup_op_has_task_pass(void) {

         p[GGML_OP_ACC ] = true;
         p[GGML_OP_MUL_MAT ] = true;
+        p[GGML_OP_MUL_MAT_ID ] = true;
         p[GGML_OP_OUT_PROD ] = true;
         p[GGML_OP_SET ] = true;
         p[GGML_OP_GET_ROWS_BACK ] = true;
@@ -2023,6 +2028,20 @@ const char * ggml_op_symbol(enum ggml_op op) {
     return GGML_OP_SYMBOL[op];
 }

+const char * ggml_unary_op_name(enum ggml_unary_op op) {
+    return GGML_UNARY_OP_NAME[op];
+}
+
+const char * ggml_op_desc(const struct ggml_tensor * t) {
+    if (t->op == GGML_OP_UNARY) {
+        enum ggml_unary_op uop = ggml_get_unary_op(t);
+        return ggml_unary_op_name(uop);
+    }
+    else {
+        return ggml_op_name(t->op);
+    }
+}
+
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return ggml_type_size(tensor->type);
 }
@@ -3154,9 +3173,7 @@ static struct ggml_tensor * ggml_add_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    // TODO: support less-strict constraint
-    //       GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+    GGML_ASSERT(ggml_can_repeat(b, a));

     bool is_node = false;

@@ -3371,9 +3388,7 @@ static struct ggml_tensor * ggml_mul_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    // TODO: support less-strict constraint
-    //       GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+    GGML_ASSERT(ggml_can_repeat(b, a));

     bool is_node = false;

@@ -3418,7 +3433,7 @@ static struct ggml_tensor * ggml_div_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
+    GGML_ASSERT(ggml_can_repeat(b, a));

     bool is_node = false;

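Note: the three hunks above relax the operand check from ggml_can_repeat_rows (or same-shape, for div) to ggml_can_repeat, so the second operand of add/mul/div may now broadcast along dimension 0 as well as the higher dimensions. A minimal sketch of what this permits (illustrative shapes; assumes a valid struct ggml_context * ctx):

    // One scalar per row (ne = [1, 32]) multiplied into a 4096-wide, 32-row matrix.
    // Previously ne[0] had to match; now s is repeated along dim 0 at compute time.
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 32);
    struct ggml_tensor * s = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,    1, 32);
    struct ggml_tensor * y = ggml_mul(ctx, x, s); // broadcast multiply, y has the shape of x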
@@ -4056,6 +4071,49 @@ struct ggml_tensor * ggml_mul_mat(
     return result;
 }

+// ggml_mul_mat_id
+
+struct ggml_tensor * ggml_mul_mat_id(
+        struct ggml_context * ctx,
+        struct ggml_tensor * as[],
+        struct ggml_tensor * ids,
+        int id,
+        struct ggml_tensor * b) {
+
+    int64_t n_as = ids->ne[0];
+
+    GGML_ASSERT(ids->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_vector(ids));
+    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
+    GGML_ASSERT(id >= 0 && id < n_as);
+
+    bool is_node = false;
+
+    if (as[0]->grad || b->grad) {
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
+
+    ggml_set_op_params_i32(result, 0, id);
+
+    result->op   = GGML_OP_MUL_MAT_ID;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = ids;
+    result->src[1] = b;
+
+    for (int64_t i = 0; i < n_as; i++) {
+        struct ggml_tensor * a = as[i];
+        GGML_ASSERT(ggml_are_same_shape(as[0], a));
+        GGML_ASSERT(ggml_can_mul_mat(a, b));
+        GGML_ASSERT(!ggml_is_transposed(a));
+        result->src[i + 2] = a;
+    }
+
+    return result;
+}
+
 // ggml_out_prod

 struct ggml_tensor * ggml_out_prod(
@@ -4209,7 +4267,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
         struct ggml_tensor * b,
         size_t nb1,
         size_t offset) {
-    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
+    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
 }

 // ggml_cpy
@@ -5468,6 +5526,43 @@ struct ggml_tensor * ggml_upscale(
     return ggml_upscale_impl(ctx, a, scale_factor);
 }

+// ggml_argsort
+
+struct ggml_tensor * ggml_argsort(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        enum ggml_sort_order order) {
+    bool is_node = false;
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
+
+    ggml_set_op_params_i32(result, 0, (int32_t) order);
+
+    result->op   = GGML_OP_ARGSORT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_top_k
+
+struct ggml_tensor * ggml_top_k(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int k) {
+    GGML_ASSERT(a->ne[0] >= k);
+
+    struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
+
+    result = ggml_view_4d(ctx, result,
+                k, result->ne[1], result->ne[2], result->ne[3],
+                   result->nb[1], result->nb[2], result->nb[3],
+                0);
+
+    return result;
+}
+
 // ggml_flash_attn

 struct ggml_tensor * ggml_flash_attn(
@@ -6827,7 +6922,7 @@ static void ggml_compute_forward_add_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -6860,16 +6955,19 @@ static void ggml_compute_forward_add_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;

             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);

+            for (int64_t r = 0; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-            vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne0);
+                vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            ggml_vec_add_f32(ne0, dst_ptr, src0_ptr, src1_ptr);
+                ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
+            }
         }
     } else {
         // src1 is not contiguous
@@ -6886,8 +6984,9 @@ static void ggml_compute_forward_add_f32(
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);

-            for (int64_t i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);

                 dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
             }
@@ -7607,7 +7706,7 @@ static void ggml_compute_forward_mul_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -7630,7 +7729,6 @@ static void ggml_compute_forward_mul_f32(

     GGML_ASSERT( nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(ne00 == ne10);

     if (nb10 == sizeof(float)) {
         for (int64_t ir = ith; ir < nr; ir += nth) {
@@ -7642,20 +7740,21 @@ static void ggml_compute_forward_mul_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;

             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);

+            for (int64_t r = 0 ; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-            UNUSED(ggml_vec_mul_f32);
+                UNUSED(ggml_vec_mul_f32);

-            vDSP_vmul(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
+                vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
+                ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-
-            // }
+            }
         }
     } else {
         // src1 is not contiguous
@@ -7673,8 +7772,9 @@ static void ggml_compute_forward_mul_f32(
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);

-            for (int64_t i0 = 0; i0 < ne00; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);

                 dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
             }
@@ -7708,14 +7808,16 @@ static void ggml_compute_forward_div_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }

-    const int nr = ggml_nrows(src0);
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);

     GGML_TENSOR_BINARY_OP_LOCALS

@@ -7723,41 +7825,50 @@ static void ggml_compute_forward_div_f32(
     GGML_ASSERT(nb00 == sizeof(float));

     if (nb10 == sizeof(float)) {
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);

+            for (int64_t r = 0; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-            UNUSED(ggml_vec_div_f32);
+                UNUSED(ggml_vec_div_f32);

-            vDSP_vdiv(
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                    ne0);
+                vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            ggml_vec_div_f32(ne0,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+                ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-
-            // }
+            }
         }
     } else {
         // src1 is not contiguous
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);

-            float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            for (int i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);

                 dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
             }
@@ -8203,7 +8314,7 @@ static void ggml_compute_forward_repeat_f16(
         return;
     }

-    GGML_TENSOR_UNARY_OP_LOCALS
+    GGML_TENSOR_UNARY_OP_LOCALS

     // guaranteed to be an integer due to the check in ggml_can_repeat
     const int nr0 = (int)(ne0/ne00);
@@ -8348,6 +8459,7 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(src0->nb[0] == sizeof(float));

     const int ith = params->ith;
+    const int nth = params->nth;

     GGML_TENSOR_BINARY_OP_LOCALS

@@ -8357,7 +8469,7 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(nb10 == sizeof(float));

     for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = ith; i2 < ne2; i2++) {
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
             if (i2 < ne02) { // src0
                 for (int i1 = 0; i1 < ne1; i1++) {
                     for (int i0 = 0; i0 < ne0; i0++) {
@@ -9517,6 +9629,8 @@ static void ggml_compute_forward_mul_mat(
             char * wdata = params->wdata;
             const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

+            assert(params->wsize >= ne11*ne12*ne13*row_size);
+
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
                     for (int64_t i11 = 0; i11 < ne11; ++i11) {
@@ -9618,6 +9732,26 @@ static void ggml_compute_forward_mul_mat(
     }
 }

+// ggml_compute_forward_mul_mat_id
+
+static void ggml_compute_forward_mul_mat_id(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    const int id = ggml_get_op_params_i32(dst, 0);
+
+    const int a_id = ((int32_t *)ids->data)[id];
+
+    GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
+
+    const struct ggml_tensor * src0 = dst->src[a_id + 2];
+
+    ggml_compute_forward_mul_mat(params, src0, src1, dst);
+}
+
 // ggml_compute_forward_out_prod

 static void ggml_compute_forward_out_prod_f32(
@@ -12021,6 +12155,67 @@ static void ggml_compute_forward_upscale(
     }
 }

+// ggml_compute_forward_argsort
+
+static void ggml_compute_forward_argsort_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(nb0 == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
+
+    for (int64_t i = ith; i < nr; i += nth) {
+        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
+        const float * src_data = (float *)((char *) src0->data + i*nb01);
+
+        for (int64_t j = 0; j < ne0; j++) {
+            dst_data[j] = j;
+        }
+
+        // C doesn't have a functional sort, so we do a bubble sort instead
+        for (int64_t j = 0; j < ne0; j++) {
+            for (int64_t k = j + 1; k < ne0; k++) {
+                if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
+                        (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
+                    int32_t tmp = dst_data[j];
+                    dst_data[j] = dst_data[k];
+                    dst_data[k] = tmp;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_argsort(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_argsort_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_flash_attn

 static void ggml_compute_forward_flash_attn_f32(
@@ -13844,6 +14039,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
             } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                ggml_compute_forward_mul_mat_id(params, tensor);
+            } break;
         case GGML_OP_OUT_PROD:
             {
                 ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
@@ -13948,6 +14147,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_upscale(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_ARGSORT:
+            {
+                ggml_compute_forward_argsort(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -14598,6 +14801,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
+        case GGML_OP_MUL_MAT_ID:
+        {
+            GGML_ASSERT(false); // TODO: not implemented
+        } break;
         case GGML_OP_OUT_PROD:
         {
             GGML_ASSERT(false); // TODO: not implemented
@@ -14936,6 +15143,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
         {
             GGML_ASSERT(false); // TODO: not implemented
         } break;
+        case GGML_OP_ARGSORT:
+        {
+            GGML_ASSERT(false); // TODO: not implemented
+        } break;
         case GGML_OP_FLASH_ATTN:
         {
             struct ggml_tensor * flash_grad = NULL;
@@ -15296,12 +15507,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
     return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
 }

-struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
-    const size_t obj_size = sizeof(struct ggml_cgraph);
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
-    struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
-
-    *cgraph = (struct ggml_cgraph) {
+struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
+    struct ggml_cgraph cgraph = {
         /*.size    =*/ 0,
         /*.n_nodes =*/ i1 - i0,
         /*.n_leafs =*/ 0,
@@ -15536,7 +15743,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 n_tasks = n_threads;
             } break;
         case GGML_OP_SUB:
-        case GGML_OP_DIV:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
         case GGML_OP_LOG:
@@ -15569,10 +15775,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                     {
                         n_tasks = n_threads;
                     } break;
+                default:
+                    GGML_ASSERT(false);
             }
             break;
         case GGML_OP_SILU_BACK:
         case GGML_OP_MUL:
+        case GGML_OP_DIV:
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
         case GGML_OP_RMS_NORM_BACK:
@@ -15610,6 +15819,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 }
 #endif
             } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                // FIXME: blas
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_OUT_PROD:
             {
                 n_tasks = n_threads;
@@ -15629,7 +15843,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_DIAG_MASK_ZERO:
         case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
         case GGML_OP_SOFT_MAX_BACK:
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK:
@@ -15645,6 +15858,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1; //TODO
             } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 n_tasks = n_threads;
@@ -15666,6 +15883,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = n_threads;
             } break;
+        case GGML_OP_ARGSORT:
+            {
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 n_tasks = n_threads;
@@ -15728,6 +15949,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1;
             } break;
+        case GGML_OP_COUNT:
+            {
+                GGML_ASSERT(false);
+            } break;
         default:
             {
                 fprintf(stderr, "%s: op not implemented: ", __func__);
@@ -15876,18 +16101,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {

     // thread scheduling for the different operations + work buffer size estimation
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        int n_tasks = 1;
-
         struct ggml_tensor * node = cgraph->nodes[i];

+        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+
         size_t cur = 0;

         switch (node->op) {
             case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }
@@ -15895,16 +16118,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_ADD:
             case GGML_OP_ADD1:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
             case GGML_OP_ACC:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                     }
@@ -15930,18 +16149,31 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
                     }
                 } break;
+            case GGML_OP_MUL_MAT_ID:
+                {
+                    const struct ggml_tensor * a = node->src[2];
+                    const struct ggml_tensor * b = node->src[1];
+                    const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                    if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
+                        if (a->type != GGML_TYPE_F32) {
+                            // here we need memory just for single 2D matrix from src0
+                            cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
+                        }
+                    } else
+#endif
+                    if (b->type != vec_dot_type) {
+                        cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
+                    }
+                } break;
             case GGML_OP_OUT_PROD:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
             case GGML_OP_SOFT_MAX:
                 {
-                    n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
-
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 } break;
             case GGML_OP_CONV_TRANSPOSE_1D:
@@ -15969,10 +16201,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         GGML_ASSERT(false);
                     }
                 } break;
-            case GGML_OP_IM2COL:
-                {
-                    n_tasks = n_threads;
-                } break;
             case GGML_OP_CONV_TRANSPOSE_2D:
                 {
                     const int64_t ne00 = node->src[0]->ne[0]; // W
@@ -15989,8 +16217,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_ATTN:
                 {
-                    n_tasks = n_threads;
-
                     const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);

                     if (node->src[1]->type == GGML_TYPE_F32) {
@@ -16003,8 +16229,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_FF:
                 {
-                    n_tasks = n_threads;
-
                     if (node->src[1]->type == GGML_TYPE_F32) {
                         cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
@@ -16015,8 +16239,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_ATTN_BACK:
                 {
-                    n_tasks = n_threads;
-
                     const int64_t D = node->src[0]->ne[0];
                     const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
                     const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
@@ -16031,8 +16253,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {

             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {
-                    n_tasks = n_threads;
-
                     cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
                 } break;
             case GGML_OP_COUNT:
@@ -17819,8 +18039,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
         memcpy(&qh, &y[i].qh, sizeof(qh));

         for (int j = 0; j < QK5_0; j += 2) {
-            const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
+            const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
+            const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));

             // cast to 16 bins
             const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@@ -17849,8 +18069,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
         memcpy(&qh, &y[i].qh, sizeof(qh));

         for (int j = 0; j < QK5_1; j += 2) {
-            const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
+            const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
+            const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));

             // cast to 16 bins
             const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
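Note: in a q5_0/q5_1 block the 5th bit of each of the 32 values is kept in the 32-bit qh word: value j/2 uses bit j/2 and value j/2 + 16 uses bit j/2 + 16, while the loop above walks the packed nibbles with j stepping by 2, so the old code, which indexed qh with j itself, read the wrong bits. A worked illustration of the corrected extraction (assumes QK5_0 == 32; the helper name is hypothetical, not ggml API):

    #include <stdint.h>

    // Recover the two 5-bit values stored in nibble pair qs[j/2]; j = 0, 2, ..., 30.
    static void unpack_q5_pair(uint32_t qh, const uint8_t * qs, int j,
                               uint8_t * v_lo /* value j/2 */, uint8_t * v_hi /* value j/2 + 16 */) {
        const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4; // 5th bit of value j/2
        const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));      // 5th bit of value j/2 + 16, lands at bit 4
        *v_lo = (uint8_t)((qs[j/2] & 0x0F) | vh0);
        *v_hi = (uint8_t)((qs[j/2] >>   4) | vh1);
    }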
@@ -18040,6 +18260,7 @@ struct gguf_kv {

 struct gguf_header {
     char magic[4];
+
     uint32_t version;
     uint64_t n_tensors; // GGUFv2
     uint64_t n_kv;      // GGUFv2
@@ -18129,7 +18350,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

     for (uint32_t i = 0; i < sizeof(magic); i++) {
         if (magic[i] != GGUF_MAGIC[i]) {
-            fprintf(stderr, "%s: invalid magic characters %c%c%c%c\n", __func__, magic[0], magic[1], magic[2], magic[3]);
+            fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
             fclose(file);
             return NULL;
         }
@@ -18144,7 +18365,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         strncpy(ctx->header.magic, magic, 4);

-
         ctx->kv    = NULL;
         ctx->infos = NULL;
         ctx->data  = NULL;