@fugood/llama.node 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +9 -0
- package/README.md +1 -1
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +2 -1
- package/patches/llama.patch +22 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +14 -12
- package/src/llama.cpp/common/common.cpp +19 -5
- package/src/llama.cpp/common/common.h +2 -0
- package/src/llama.cpp/common/grammar-parser.cpp +9 -0
- package/src/llama.cpp/common/sampling.cpp +3 -3
- package/src/llama.cpp/common/sampling.h +1 -1
- package/src/llama.cpp/examples/CMakeLists.txt +3 -0
- package/src/llama.cpp/examples/embedding/embedding.cpp +10 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +56 -7
- package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +49 -0
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +26 -6
- package/src/llama.cpp/examples/main/main.cpp +5 -1
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +70 -0
- package/src/llama.cpp/examples/server/server.cpp +12 -16
- package/src/llama.cpp/examples/server/utils.hpp +1 -1
- package/src/llama.cpp/ggml-backend.c +2 -2
- package/src/llama.cpp/ggml-kompute.cpp +9 -3
- package/src/llama.cpp/ggml-quants.c +6 -0
- package/src/llama.cpp/ggml-rpc.cpp +1023 -0
- package/src/llama.cpp/ggml-rpc.h +24 -0
- package/src/llama.cpp/ggml-sycl.cpp +20 -143
- package/src/llama.cpp/ggml-vulkan.cpp +4 -2
- package/src/llama.cpp/ggml.c +116 -271
- package/src/llama.cpp/ggml.h +12 -15
- package/src/llama.cpp/llama.cpp +451 -265
- package/src/llama.cpp/llama.h +3 -0
- package/src/llama.cpp/requirements.txt +0 -1
- package/src/llama.cpp/tests/CMakeLists.txt +1 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +16 -19
- package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
package/src/llama.cpp/ggml.c
CHANGED
@@ -4,7 +4,6 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
-#include "sgemm.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -37,6 +36,10 @@
 #undef GGML_USE_LLAMAFILE
 #endif
 
+#ifdef GGML_USE_LLAMAFILE
+#include "sgemm.h"
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -1949,6 +1952,7 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
 // TODO: optimize performance
 inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
@@ -2185,7 +2189,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "SOFT_MAX_BACK",
     "ROPE",
     "ROPE_BACK",
-    "ALIBI",
     "CLAMP",
     "CONV_TRANSPOSE_1D",
     "IM2COL",
@@ -2227,7 +2230,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -2276,7 +2279,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "soft_max_back(x)",
     "rope(x)",
     "rope_back(x)",
-    "alibi(x)",
     "clamp(x)",
     "conv_transpose_1d(x)",
     "im2col(x)",
@@ -2318,7 +2320,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -2331,6 +2333,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "TANH",
     "ELU",
     "RELU",
+    "SIGMOID",
     "GELU",
     "GELU_QUICK",
     "SILU",
@@ -2338,7 +2341,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "HARDSIGMOID",
 };
 
-static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
+static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");
 
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -4563,6 +4566,20 @@ struct ggml_tensor * ggml_leaky_relu(
     return result;
 }
 
+// ggml_sigmoid
+
+struct ggml_tensor * ggml_sigmoid(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
+}
+
+struct ggml_tensor * ggml_sigmoid_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
+}
+
 // ggml_gelu
 
 struct ggml_tensor * ggml_gelu(
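The new sigmoid entry points follow ggml's usual unary-op pattern: `ggml_sigmoid` records a `GGML_UNARY_OP_SIGMOID` node, and the CPU path evaluates it with the `ggml_vec_sigmoid_f32` helper added above. A minimal usage sketch, assuming an already-initialized CPU context (the function and tensor names below are illustrative, not part of this package):

    #include "ggml.h"

    // Build a one-op graph: y = sigmoid(x) for an 8-element F32 tensor.
    static struct ggml_tensor * build_sigmoid(struct ggml_context * ctx) {
        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        // y[i] = 1 / (1 + expf(-x[i])) once the graph is computed
        return ggml_sigmoid(ctx, x);
    }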
@@ -5646,7 +5663,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * mask,
-        struct ggml_tensor  * pos,
         float                 scale,
         float                 max_bias,
         bool                  inplace) {
@@ -5660,18 +5676,8 @@ static struct ggml_tensor * ggml_soft_max_impl(
         GGML_ASSERT(mask->ne[1] >= a->ne[1]);
     }
 
-    if (pos) {
-        GGML_ASSERT(ggml_is_vector(pos));
-        GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
-        GGML_ASSERT(pos->ne[0] == a->ne[0]);
-    }
-
-    if (pos && mask) {
-        GGML_ASSERT(pos->type == mask->type);
-    }
-
     if (max_bias > 0.0f) {
-        GGML_ASSERT(pos);
+        GGML_ASSERT(mask);
     }
 
     bool is_node = false;
@@ -5689,7 +5695,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = mask;
-    result->src[2] = pos;
 
     return result;
 }
@@ -5697,23 +5702,22 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
 }
 
 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
 }
 
 struct ggml_tensor * ggml_soft_max_ext(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * mask,
-        struct ggml_tensor  * pos,
         float                 scale,
         float                 max_bias) {
-    return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
+    return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
 }
 
 // ggml_soft_max_back
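With `pos` removed, ALiBi is expressed entirely through `mask` plus `max_bias`: when `max_bias > 0.0f`, the op applies each head's ALiBi slope to the mask values internally, so no separate position tensor is needed. A sketch of a call site against the new signature (all names here are illustrative assumptions, not code from this package):

    #include <math.h>
    #include "ggml.h"

    // Attention scores -> probabilities under the new (mask, scale, max_bias)
    // interface; mask may be F16 or F32, or NULL when max_bias == 0.0f.
    static struct ggml_tensor * softmax_with_alibi(
            struct ggml_context * ctx,
            struct ggml_tensor  * kq,       // attention scores
            struct ggml_tensor  * kq_mask,  // additive mask
            int                   n_embd_head,
            float                 max_alibi_bias) {
        const float scale = 1.0f/sqrtf((float) n_embd_head);
        return ggml_soft_max_ext(ctx, kq, kq_mask, scale, max_alibi_bias);
    }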
@@ -5928,37 +5932,6 @@ struct ggml_tensor * ggml_rope_back(
     return result;
 }
 
-// ggml_alibi
-
-struct ggml_tensor * ggml_alibi(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past,
-        int                   n_head,
-        float                 bias_max) {
-    GGML_ASSERT(n_past >= 0);
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    int32_t op_params[3] = { n_past, n_head };
-    memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result, op_params, sizeof(op_params));
-
-    result->op   = GGML_OP_ALIBI;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 // ggml_clamp
 
 struct ggml_tensor * ggml_clamp(
@@ -6486,9 +6459,11 @@ struct ggml_tensor * ggml_flash_attn_ext(
         struct ggml_tensor  * k,
         struct ggml_tensor  * v,
         struct ggml_tensor  * mask,
-        float                 scale) {
+        float                 scale,
+        float                 max_bias) {
     GGML_ASSERT(ggml_can_mul_mat(k, q));
     // TODO: check if vT can be multiplied by (k*qT)
+
     if (mask) {
         GGML_ASSERT(ggml_is_contiguous(mask));
         GGML_ASSERT(mask->ne[2] == 1);
@@ -6498,6 +6473,10 @@ struct ggml_tensor * ggml_flash_attn_ext(
         //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
     }
 
+    if (max_bias > 0.0f) {
+        GGML_ASSERT(mask);
+    }
+
     bool is_node = false;
 
     if (q->grad || k->grad || v->grad) {
@@ -6508,7 +6487,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
     int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    float params[] = { scale };
+    float params[] = { scale, max_bias };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_FLASH_ATTN_EXT;
@@ -6528,7 +6507,7 @@ void ggml_flash_attn_ext_set_prec(
 
     const int32_t prec_i32 = (int32_t) prec;
 
-    ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos
+    ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
 }
 
 // ggml_flash_ff
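`ggml_flash_attn_ext` now packs two floats into `op_params` (`scale` at index 0, `max_bias` at index 1), which is why `ggml_flash_attn_ext_set_prec` writes the precision flag at index 2. A hedged sketch of the updated call pattern, with `q`, `k`, `v`, and the other names assumed to be built elsewhere:

    // Flash attention with ALiBi: max_bias rides along next to scale.
    struct ggml_tensor * cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask,
            kq_scale, max_alibi_bias);

    // The precision flag now lives at op_params[2], after scale and max_bias.
    ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);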
@@ -10892,6 +10871,52 @@ static void ggml_compute_forward_relu(
     }
 }
 
+// ggml_compute_forward_sigmoid
+
+static void ggml_compute_forward_sigmoid_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sigmoid_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sigmoid(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sigmoid_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_gelu
 
 static void ggml_compute_forward_gelu_f32(
@@ -13333,7 +13358,6 @@ static void ggml_compute_forward_soft_max_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src2 = dst->src[2];
 
     assert(ggml_is_contiguous(dst));
     assert(ggml_are_same_shape(src0, dst));
@@ -13359,8 +13383,8 @@ static void ggml_compute_forward_soft_max_f32(
 
     // TODO: is this supposed to be ceil instead of floor?
     // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
-    const uint32_t n_head_kv   = ne02;
-    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
+    const uint32_t n_head      = ne02;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
 
     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
@@ -13377,13 +13401,13 @@ static void ggml_compute_forward_soft_max_f32(
 
     float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
 
-
-    ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
-    float       * pos_f32 = src2 ? (float       *) src2->data : src0->data;
-
-    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
 
     for (int i1 = ir0; i1 < ir1; i1++) {
+        // ALiBi
+        const uint32_t h = (i1/ne01)%ne02; // head
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
         float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
         float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
 
@@ -13396,27 +13420,11 @@ static void ggml_compute_forward_soft_max_f32(
         if (mp_f32) {
             if (use_f16) {
                 for (int i = 0; i < nc; ++i) {
-                    wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
-                }
-            } else {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += mp_f32[i];
-                }
-            }
-        }
-
-        // ALiBi bias
-        if (max_bias > 0.0f) {
-            const uint32_t h = (i1/ne01)%ne02; // head
-            const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
-
-            if (use_f16) {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
+                    wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
                 }
             } else {
                 for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*pos_f32[i];
+                    wp[i] += slope*mp_f32[i];
                 }
             }
         }
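The per-head `slope` above is the standard ALiBi geometric schedule, now computed inside soft-max rather than in the removed `GGML_OP_ALIBI` pass. A self-contained sketch (not from this package) that reproduces the schedule for hand-checking:

    #include <math.h>
    #include <stdio.h>

    // Replica of the slope schedule: with max_bias = 8.0 and n_head = 8,
    // head h gets slope 0.5^(h+1), i.e. 1/2, 1/4, ..., 1/256.
    int main(void) {
        const float    max_bias    = 8.0f;
        const unsigned n_head      = 8;
        const unsigned n_head_log2 = 1u << (unsigned) floor(log2(n_head));

        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

        for (unsigned h = 0; h < n_head; ++h) {
            const float slope = h < n_head_log2
                    ? powf(m0, h + 1)
                    : powf(m1, 2*(h - n_head_log2) + 1);
            printf("head %u: slope %g\n", h, slope);
        }
        return 0;
    }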
@@ -13578,178 +13586,6 @@ static void ggml_compute_forward_soft_max_back(
     }
 }
 
-// ggml_compute_forward_alibi
-
-static void ggml_compute_forward_alibi_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
-    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
-    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int64_t n       = ggml_nrows(src0);
-    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
-
-    const size_t nb0 = src0->nb[0];
-    const size_t nb1 = src0->nb[1];
-    const size_t nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int64_t k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
-
-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
-
-        for (int64_t i = 0; i < ne0; i++) {
-            for (int64_t j = 0; j < ne1; j++) {
-                float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-                pdst[0] = i * m_k + src[0];
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int n       = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
-
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
-    //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
-
-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
-
-        for (int i = 0; i < ne0; i++) {
-            for (int j = 0; j < ne1; j++) {
-                ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float *            pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
-
-                // we return F32
-                pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_alibi_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_alibi_f32(params, dst);
-            } break;
-        case GGML_TYPE_BF16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q8_K:
-        case GGML_TYPE_I8:
-        case GGML_TYPE_I16:
-        case GGML_TYPE_I32:
-        case GGML_TYPE_I64:
-        case GGML_TYPE_F64:
-        case GGML_TYPE_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_clamp
 
 static void ggml_compute_forward_clamp_f32(
@@ -15763,8 +15599,17 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
 
-    float scale = 1.0f;
-    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+    const uint32_t n_head      = neq2;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
@@ -15773,6 +15618,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         const int iq2 = (ir - iq3*neq2*neq1)/neq1;
         const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
 
+        const uint32_t h = iq2; // head
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
         float S = 0.0f;
         float M = -INFINITY;
 
@@ -15796,7 +15644,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         // loop over n_kv and n_head_kv
        // ref: https://arxiv.org/pdf/2112.05682.pdf
         for (int64_t ic = 0; ic < nek1; ++ic) {
-            const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+            const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
             if (mv == -INFINITY) {
                 continue;
             }
@@ -15867,7 +15715,7 @@ static void ggml_compute_forward_flash_attn_ext(
         const struct ggml_tensor * v,
         const struct ggml_tensor * mask,
         struct ggml_tensor * dst) {
-    switch (dst->op_params[1]) {
+    switch (dst->op_params[2]) {
         case GGML_PREC_DEFAULT:
         case GGML_PREC_F32:
             {
@@ -16834,6 +16682,10 @@ static void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_relu(params, dst);
             } break;
+        case GGML_UNARY_OP_SIGMOID:
+            {
+                ggml_compute_forward_sigmoid(params, dst);
+            } break;
         case GGML_UNARY_OP_GELU:
             {
                 ggml_compute_forward_gelu(params, dst);
@@ -17630,10 +17482,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rope_back(params, tensor);
             } break;
-        case GGML_OP_ALIBI:
-            {
-                ggml_compute_forward_alibi(params, tensor);
-            } break;
         case GGML_OP_CLAMP:
             {
                 ggml_compute_forward_clamp(params, tensor);
@@ -18652,10 +18500,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
-        case GGML_OP_ALIBI:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
         case GGML_OP_CLAMP:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -18826,6 +18670,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 zero_table);
                 }
             } break;
+            case GGML_UNARY_OP_SIGMOID:
+                {
+                    GGML_ASSERT(false); // TODO: not implemented
+                } break;
             case GGML_UNARY_OP_GELU:
                 {
                     GGML_ASSERT(false); // TODO: not implemented
@@ -19355,6 +19203,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
         case GGML_UNARY_OP_TANH:
         case GGML_UNARY_OP_ELU:
         case GGML_UNARY_OP_RELU:
+        case GGML_UNARY_OP_SIGMOID:
        case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
         case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
             {
@@ -19428,10 +19277,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             {
                 n_tasks = n_threads;
             } break;
-        case GGML_OP_ALIBI:
-            {
-                n_tasks = 1; //TODO
-            } break;
         case GGML_OP_CLAMP:
             {
                 n_tasks = 1; //TODO