llama_cpp 0.9.5 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1140 -355
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +506 -158
- data/ext/llama_cpp/src/ggml-metal.metal +795 -144
- data/ext/llama_cpp/src/ggml.c +331 -111
- data/ext/llama_cpp/src/ggml.h +49 -4
- data/ext/llama_cpp/src/llama.cpp +749 -329
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -233,24 +233,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
 
-//
-// tensor access macros
-//
-
-#define GGML_TENSOR_UNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
-#define GGML_TENSOR_BINARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
@@ -1613,6 +1595,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GROUP_NORM",
 
     "MUL_MAT",
+    "MUL_MAT_ID",
     "OUT_PROD",
 
     "SCALE",
@@ -1640,6 +1623,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_1D",
     "POOL_2D",
     "UPSCALE",
+    "ARGSORT",
 
     "FLASH_ATTN",
     "FLASH_FF",
@@ -1666,7 +1650,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1695,6 +1679,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "group_norm(x)",
 
     "X*Y",
+    "X[i]*Y",
     "X*Y",
 
     "x*v",
@@ -1722,6 +1707,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_1d(x)",
     "pool_2d(x)",
     "upscale(x)",
+    "argsort(x)",
 
     "flash_attn(x)",
     "flash_ff(x)",
@@ -1748,10 +1734,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
+
+static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
+    "ABS",
+    "SGN",
+    "NEG",
+    "STEP",
+    "TANH",
+    "ELU",
+    "RELU",
+    "GELU",
+    "GELU_QUICK",
+    "SILU",
+    "LEAKY",
+};
+
+static_assert(GGML_UNARY_OP_COUNT == 11, "GGML_UNARY_OP_COUNT != 11");
+
+
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
@@ -1771,6 +1775,7 @@ static void ggml_setup_op_has_task_pass(void) {
 
         p[GGML_OP_ACC                    ] = true;
         p[GGML_OP_MUL_MAT                ] = true;
+        p[GGML_OP_MUL_MAT_ID             ] = true;
         p[GGML_OP_OUT_PROD               ] = true;
         p[GGML_OP_SET                    ] = true;
         p[GGML_OP_GET_ROWS_BACK          ] = true;
@@ -2023,6 +2028,20 @@ const char * ggml_op_symbol(enum ggml_op op) {
     return GGML_OP_SYMBOL[op];
 }
 
+const char * ggml_unary_op_name(enum ggml_unary_op op) {
+    return GGML_UNARY_OP_NAME[op];
+}
+
+const char * ggml_op_desc(const struct ggml_tensor * t) {
+    if (t->op == GGML_OP_UNARY) {
+        enum ggml_unary_op uop = ggml_get_unary_op(t);
+        return ggml_unary_op_name(uop);
+    }
+    else {
+        return ggml_op_name(t->op);
+    }
+}
+
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return ggml_type_size(tensor->type);
 }
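
The two helpers added above give a human-readable description of any graph node: ggml_unary_op_name() maps a ggml_unary_op value to its name, and ggml_op_desc() prefers the unary sub-op name when the node's op is GGML_OP_UNARY. The following sketch is illustrative only and not part of the diff; the graph variable and the helper name are assumptions:

    #include <stdio.h>
    #include "ggml.h"

    // Print a one-line description for every node in a graph, using the
    // ggml_op_desc() helper introduced in this version.
    static void print_graph_ops(const struct ggml_cgraph * gf) {
        for (int i = 0; i < gf->n_nodes; ++i) {
            const struct ggml_tensor * node = gf->nodes[i];
            // "RELU", "GELU", ... for GGML_OP_UNARY nodes; "MUL_MAT", "ARGSORT", ... otherwise
            printf("node %3d: %-16s %s\n", i, ggml_op_desc(node), node->name);
        }
    }
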
@@ -3154,9 +3173,7 @@ static struct ggml_tensor * ggml_add_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-
-    // GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+    GGML_ASSERT(ggml_can_repeat(b, a));
 
     bool is_node = false;
 
@@ -3371,9 +3388,7 @@ static struct ggml_tensor * ggml_mul_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-
-    // GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+    GGML_ASSERT(ggml_can_repeat(b, a));
 
     bool is_node = false;
 
@@ -3418,7 +3433,7 @@ static struct ggml_tensor * ggml_div_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
+    GGML_ASSERT(ggml_can_repeat(b, a));
 
     bool is_node = false;
 
@@ -4056,6 +4071,49 @@ struct ggml_tensor * ggml_mul_mat(
     return result;
 }
 
+// ggml_mul_mat_id
+
+struct ggml_tensor * ggml_mul_mat_id(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * as[],
+        struct ggml_tensor  * ids,
+        int                   id,
+        struct ggml_tensor  * b) {
+
+    int64_t n_as = ids->ne[0];
+
+    GGML_ASSERT(ids->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_vector(ids));
+    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
+    GGML_ASSERT(id >= 0 && id < n_as);
+
+    bool is_node = false;
+
+    if (as[0]->grad || b->grad) {
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
+
+    ggml_set_op_params_i32(result, 0, id);
+
+    result->op   = GGML_OP_MUL_MAT_ID;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = ids;
+    result->src[1] = b;
+
+    for (int64_t i = 0; i < n_as; i++) {
+        struct ggml_tensor * a = as[i];
+        GGML_ASSERT(ggml_are_same_shape(as[0], a));
+        GGML_ASSERT(ggml_can_mul_mat(a, b));
+        GGML_ASSERT(!ggml_is_transposed(a));
+        result->src[i + 2] = a;
+    }
+
+    return result;
+}
+
 // ggml_out_prod
 
 struct ggml_tensor * ggml_out_prod(
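
ggml_mul_mat_id() is the new mixture-of-experts matrix multiply: it takes an array of candidate matrices as, an I32 vector ids of expert indices, and a position id into that vector, and multiplies b by the matrix that ids selects. A minimal usage sketch follows; it is illustrative only and not part of the diff, and the expert count, tensor shapes, and function name are assumptions:

    #include "ggml.h"

    // Build a graph node that multiplies `cur` by one of four expert matrices.
    // All `experts` must have the same shape; `ids` holds the router's choices.
    static struct ggml_tensor * expert_matmul(
            struct ggml_context * ctx,
            struct ggml_tensor  * experts[4],
            struct ggml_tensor  * cur) {
        // one I32 slot per expert choice, to be filled by the router elsewhere
        struct ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
        // use the expert selected in slot 0 of `ids`
        return ggml_mul_mat_id(ctx, experts, ids, 0, cur);
    }
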
@@ -4209,7 +4267,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
         struct ggml_tensor * b,
         size_t               nb1,
         size_t               offset) {
-    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
+    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
 }
 
 // ggml_cpy
@@ -5468,6 +5526,43 @@ struct ggml_tensor * ggml_upscale(
     return ggml_upscale_impl(ctx, a, scale_factor);
 }
 
+// ggml_argsort
+
+struct ggml_tensor * ggml_argsort(
+        struct ggml_context  * ctx,
+        struct ggml_tensor   * a,
+        enum ggml_sort_order   order) {
+    bool is_node = false;
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
+
+    ggml_set_op_params_i32(result, 0, (int32_t) order);
+
+    result->op   = GGML_OP_ARGSORT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_top_k
+
+struct ggml_tensor * ggml_top_k(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   k) {
+    GGML_ASSERT(a->ne[0] >= k);
+
+    struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
+
+    result = ggml_view_4d(ctx, result,
+                k, result->ne[1], result->ne[2], result->ne[3],
+                   result->nb[1], result->nb[2], result->nb[3],
+                0);
+
+    return result;
+}
+
 // ggml_flash_attn
 
 struct ggml_tensor * ggml_flash_attn(
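
ggml_argsort() returns, for each row of a, the index permutation (as GGML_TYPE_I32) that sorts the row in the requested ggml_sort_order (GGML_SORT_ASC or GGML_SORT_DESC), and ggml_top_k() builds on it by viewing the first k indices of a descending sort. An illustrative sketch, not part of the diff (the wrapper name and the value of k are assumptions):

    #include "ggml.h"

    // Indices of the 4 largest values in each row of `logits`, as an I32 tensor.
    static struct ggml_tensor * top4_indices(struct ggml_context * ctx,
                                             struct ggml_tensor  * logits) {
        // ggml_top_k() argsorts in GGML_SORT_DESC order and views the first k columns
        return ggml_top_k(ctx, logits, 4);
    }
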
@@ -6827,7 +6922,7 @@ static void ggml_compute_forward_add_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -6860,16 +6955,19 @@ static void ggml_compute_forward_add_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
 
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
+            for (int64_t r = 0; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-            vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne0);
+                vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            ggml_vec_add_f32(ne0, dst_ptr, src0_ptr, src1_ptr);
+                ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
+            }
         }
     } else {
         // src1 is not contiguous
@@ -6886,8 +6984,9 @@ static void ggml_compute_forward_add_f32(
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
-            for (int64_t i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
             }
@@ -7607,7 +7706,7 @@ static void ggml_compute_forward_mul_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -7630,7 +7729,6 @@ static void ggml_compute_forward_mul_f32(
 
     GGML_ASSERT( nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(ne00 == ne10);
 
     if (nb10 == sizeof(float)) {
         for (int64_t ir = ith; ir < nr; ir += nth) {
@@ -7642,20 +7740,21 @@ static void ggml_compute_forward_mul_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
 
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
+            for (int64_t r = 0 ; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-            UNUSED(ggml_vec_mul_f32);
+                UNUSED(ggml_vec_mul_f32);
 
-            vDSP_vmul(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
+                vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
+                ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-
-            // }
+            }
         }
     } else {
         // src1 is not contiguous
@@ -7673,8 +7772,9 @@ static void ggml_compute_forward_mul_f32(
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
-            for (int64_t i0 = 0; i0 < ne00; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
             }
@@ -7708,14 +7808,16 @@ static void ggml_compute_forward_div_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const int nr = ggml_nrows(src0);
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -7723,41 +7825,50 @@ static void ggml_compute_forward_div_f32(
     GGML_ASSERT(nb00 == sizeof(float));
 
     if (nb10 == sizeof(float)) {
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
+            for (int64_t r = 0; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-            UNUSED(ggml_vec_div_f32);
+                UNUSED(ggml_vec_div_f32);
 
-            vDSP_vdiv(
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                    ne0);
+                vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            ggml_vec_div_f32(ne0,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+                ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-
-            // }
+            }
         }
     } else {
         // src1 is not contiguous
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
 
-            float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            for (int i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
             }
@@ -8203,7 +8314,7 @@ static void ggml_compute_forward_repeat_f16(
         return;
     }
 
-    GGML_TENSOR_UNARY_OP_LOCALS
+    GGML_TENSOR_UNARY_OP_LOCALS
 
     // guaranteed to be an integer due to the check in ggml_can_repeat
     const int nr0 = (int)(ne0/ne00);
@@ -8348,6 +8459,7 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(src0->nb[0] == sizeof(float));
 
     const int ith = params->ith;
+    const int nth = params->nth;
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -8357,7 +8469,7 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = ith; i2 < ne2; i2++) {
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
             if (i2 < ne02) { // src0
                 for (int i1 = 0; i1 < ne1; i1++) {
                     for (int i0 = 0; i0 < ne0; i0++) {
@@ -9517,6 +9629,8 @@ static void ggml_compute_forward_mul_mat(
         char * wdata = params->wdata;
         const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
+        assert(params->wsize >= ne11*ne12*ne13*row_size);
+
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
             for (int64_t i12 = 0; i12 < ne12; ++i12) {
                 for (int64_t i11 = 0; i11 < ne11; ++i11) {
@@ -9618,6 +9732,26 @@ static void ggml_compute_forward_mul_mat(
     }
 }
 
+// ggml_compute_forward_mul_mat_id
+
+static void ggml_compute_forward_mul_mat_id(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * ids  = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    const int id = ggml_get_op_params_i32(dst, 0);
+
+    const int a_id = ((int32_t *)ids->data)[id];
+
+    GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
+
+    const struct ggml_tensor * src0 = dst->src[a_id + 2];
+
+    ggml_compute_forward_mul_mat(params, src0, src1, dst);
+}
+
 // ggml_compute_forward_out_prod
 
 static void ggml_compute_forward_out_prod_f32(
@@ -12021,6 +12155,67 @@ static void ggml_compute_forward_upscale(
     }
 }
 
+// ggml_compute_forward_argsort
+
+static void ggml_compute_forward_argsort_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+              struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(nb0 == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
+
+    for (int64_t i = ith; i < nr; i += nth) {
+        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
+        const float * src_data = (float *)((char *) src0->data + i*nb01);
+
+        for (int64_t j = 0; j < ne0; j++) {
+            dst_data[j] = j;
+        }
+
+        // C doesn't have a functional sort, so we do a bubble sort instead
+        for (int64_t j = 0; j < ne0; j++) {
+            for (int64_t k = j + 1; k < ne0; k++) {
+                if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
+                    (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
+                    int32_t tmp = dst_data[j];
+                    dst_data[j] = dst_data[k];
+                    dst_data[k] = tmp;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_argsort(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+              struct ggml_tensor * dst) {
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_argsort_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_flash_attn
 
 static void ggml_compute_forward_flash_attn_f32(
@@ -13844,6 +14039,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
             } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                ggml_compute_forward_mul_mat_id(params, tensor);
+            } break;
         case GGML_OP_OUT_PROD:
             {
                 ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
@@ -13948,6 +14147,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_upscale(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_ARGSORT:
+            {
+                ggml_compute_forward_argsort(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_FLASH_ATTN:
            {
                const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -14598,6 +14801,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
                 }
             } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_OUT_PROD:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -14936,6 +15143,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_ARGSORT:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 struct ggml_tensor * flash_grad = NULL;
@@ -15296,12 +15507,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
     return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
 }
 
-struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
-
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
-    struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
-
-    *cgraph = (struct ggml_cgraph) {
+struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
+    struct ggml_cgraph cgraph = {
         /*.size    =*/ 0,
         /*.n_nodes =*/ i1 - i0,
         /*.n_leafs =*/ 0,
@@ -15536,7 +15743,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 n_tasks = n_threads;
             } break;
         case GGML_OP_SUB:
-        case GGML_OP_DIV:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
         case GGML_OP_LOG:
@@ -15569,10 +15775,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                     {
                         n_tasks = n_threads;
                     } break;
+                default:
+                    GGML_ASSERT(false);
             }
             break;
         case GGML_OP_SILU_BACK:
         case GGML_OP_MUL:
+        case GGML_OP_DIV:
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
         case GGML_OP_RMS_NORM_BACK:
@@ -15610,6 +15819,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 }
 #endif
             } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                // FIXME: blas
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_OUT_PROD:
             {
                 n_tasks = n_threads;
@@ -15629,7 +15843,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_DIAG_MASK_ZERO:
         case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
         case GGML_OP_SOFT_MAX_BACK:
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK:
@@ -15645,6 +15858,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1; //TODO
             } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 n_tasks = n_threads;
@@ -15666,6 +15883,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = n_threads;
             } break;
+        case GGML_OP_ARGSORT:
+            {
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 n_tasks = n_threads;
@@ -15728,6 +15949,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1;
             } break;
+        case GGML_OP_COUNT:
+            {
+                GGML_ASSERT(false);
+            } break;
         default:
             {
                 fprintf(stderr, "%s: op not implemented: ", __func__);
@@ -15876,18 +16101,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
     // thread scheduling for the different operations + work buffer size estimation
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        int n_tasks = 1;
-
         struct ggml_tensor * node = cgraph->nodes[i];
 
+        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+
         size_t cur = 0;
 
         switch (node->op) {
             case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }
@@ -15895,16 +16118,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_ADD:
             case GGML_OP_ADD1:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
             case GGML_OP_ACC:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                     }
@@ -15930,18 +16149,31 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
                     }
                 } break;
+            case GGML_OP_MUL_MAT_ID:
+                {
+                    const struct ggml_tensor * a = node->src[2];
+                    const struct ggml_tensor * b = node->src[1];
+                    const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                    if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
+                        if (a->type != GGML_TYPE_F32) {
+                            // here we need memory just for single 2D matrix from src0
+                            cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
+                        }
+                    } else
+#endif
+                    if (b->type != vec_dot_type) {
+                        cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
+                    }
+                } break;
             case GGML_OP_OUT_PROD:
                 {
-                    n_tasks = n_threads;
-
                     if (ggml_is_quantized(node->src[0]->type)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
             case GGML_OP_SOFT_MAX:
                 {
-                    n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
-
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 } break;
             case GGML_OP_CONV_TRANSPOSE_1D:
@@ -15969,10 +16201,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         GGML_ASSERT(false);
                     }
                 } break;
-            case GGML_OP_IM2COL:
-                {
-                    n_tasks = n_threads;
-                } break;
             case GGML_OP_CONV_TRANSPOSE_2D:
                 {
                     const int64_t ne00 = node->src[0]->ne[0]; // W
@@ -15989,8 +16217,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_ATTN:
                 {
-                    n_tasks = n_threads;
-
                     const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
 
                     if (node->src[1]->type == GGML_TYPE_F32) {
@@ -16003,8 +16229,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_FF:
                 {
-                    n_tasks = n_threads;
-
                     if (node->src[1]->type == GGML_TYPE_F32) {
                         cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
@@ -16015,8 +16239,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_FLASH_ATTN_BACK:
                 {
-                    n_tasks = n_threads;
-
                     const int64_t    D = node->src[0]->ne[0];
                     const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
                     const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
@@ -16031,8 +16253,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {
-                    n_tasks = n_threads;
-
                     cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
                 } break;
             case GGML_OP_COUNT:
@@ -17819,8 +18039,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
         memcpy(&qh, &y[i].qh, sizeof(qh));
 
         for (int j = 0; j < QK5_0; j += 2) {
-            const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
+            const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
+            const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
 
             // cast to 16 bins
             const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@@ -17849,8 +18069,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
         memcpy(&qh, &y[i].qh, sizeof(qh));
 
         for (int j = 0; j < QK5_1; j += 2) {
-            const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
+            const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
+            const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
 
             // cast to 16 bins
             const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
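
The q5_0/q5_1 change above fixes which bit of qh supplies the fifth quantization bit: the loop advances j by two elements per step while consuming one byte of qs, so the bit index must be j/2 (and j/2 + 16 for the element in the second half of the block), not j. A standalone sketch of the corrected unpacking follows; it is illustrative only, and the function and variable names are not from the diff:

    #include <stdint.h>

    // Recover the two 5-bit values packed in qs[j/2] plus the qh bit-plane,
    // using the corrected j/2 indexing. The low nibble pairs with qh bit j/2
    // (element j/2 of the block); the high nibble pairs with qh bit j/2 + 16
    // (element j/2 + 16, i.e. the second half of the block).
    static void unpack_q5_pair(uint32_t qh, const uint8_t * qs, int j,
                               uint8_t * v_lo, uint8_t * v_hi) {
        const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
        const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
        *v_lo = (qs[j/2] & 0x0F) | vh0;
        *v_hi = (qs[j/2] >>   4) | vh1;
    }
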
@@ -18040,6 +18260,7 @@ struct gguf_kv {
 
 struct gguf_header {
     char magic[4];
+
     uint32_t version;
     uint64_t n_tensors; // GGUFv2
     uint64_t n_kv;      // GGUFv2
@@ -18129,7 +18350,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     for (uint32_t i = 0; i < sizeof(magic); i++) {
         if (magic[i] != GGUF_MAGIC[i]) {
-            fprintf(stderr, "%s: invalid magic characters %
+            fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
             fclose(file);
             return NULL;
         }
@@ -18144,7 +18365,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         strncpy(ctx->header.magic, magic, 4);
 
-
         ctx->kv    = NULL;
         ctx->infos = NULL;
         ctx->data  = NULL;