@fugood/llama.node 0.3.14 → 0.3.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/llama.cpp/.github/workflows/build.yml +30 -1
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/arg.cpp +20 -2
- package/src/llama.cpp/common/common.cpp +6 -3
- package/src/llama.cpp/common/speculative.cpp +4 -4
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +4 -4
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +6 -6
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/run.cpp +91 -46
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +37 -15
- package/src/llama.cpp/examples/server/utils.hpp +3 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/tts/tts.cpp +20 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml.h +24 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -28
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +1493 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +150 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +284 -29
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +95 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +35 -12
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +93 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +12 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +109 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +19 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +398 -158
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +7 -2
- package/src/llama.cpp/ggml/src/ggml.c +85 -2
- package/src/llama.cpp/include/llama.h +86 -22
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +103 -16
- package/src/llama.cpp/src/llama-arch.h +18 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -110
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-model.cpp +8244 -173
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama.cpp +51 -9984
- package/src/llama.cpp/tests/test-backend-ops.cpp +145 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
Selected hunks from the vendored llama.cpp sources:

package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

@@ -426,14 +426,16 @@ void process_shaders() {
     }
     }

-    string_to_spv("
-    string_to_spv("
+    string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}});
+    string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
+    string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});

     // Norms
     string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("l2_norm_f32", "l2_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));

     string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});

@@ -444,6 +446,7 @@ void process_shaders() {

     for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
         string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+        string_to_spv("cpy_f32_" + t + "_rte", "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
         string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
     }

@@ -528,6 +531,8 @@ void process_shaders() {

     string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));

+    string_to_spv("rwkv_wkv7_f32", "wkv7.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
+
     string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));

     for (auto &c : compiles) {
package/src/llama.cpp/ggml/src/ggml.c

@@ -929,6 +929,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "RMS_NORM",
     "RMS_NORM_BACK",
     "GROUP_NORM",
+    "L2_NORM",

     "MUL_MAT",
     "MUL_MAT_ID",

@@ -977,6 +978,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ADD_REL_POS",
     "RWKV_WKV6",
     "GATED_LINEAR_ATTN",
+    "RWKV_WKV7",

     "UNARY",

@@ -996,7 +998,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };

-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",

@@ -1026,6 +1028,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rms_norm(x)",
     "rms_norm_back(x)",
     "group_norm(x)",
+    "l2_norm(x)",

     "X*Y",
     "X[i]*Y",

@@ -1074,6 +1077,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "add_rel_pos(x)",
     "rwkv_wkv6(k, v, r, tf, td, s)",
     "gated_linear_attn(k, v, q, gate, s)",
+    "rwkv_wkv7(r, w, k, v, a, b, s)",

     "unary(x)",

@@ -1093,7 +1097,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };

-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -2686,6 +2690,37 @@ struct ggml_tensor * ggml_group_norm_inplace(
     return ggml_group_norm_impl(ctx, a, n_groups, eps, true);
 }

+// ggml_l2_norm
+
+static struct ggml_tensor * ggml_l2_norm_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        float eps,
+        bool inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params_f32(result, 0, eps);
+
+    result->op     = GGML_OP_L2_NORM;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_l2_norm(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        float eps) {
+    return ggml_l2_norm_impl(ctx, a, eps, false);
+}
+
+struct ggml_tensor * ggml_l2_norm_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        float eps) {
+    return ggml_l2_norm_impl(ctx, a, eps, true);
+}
+
 // ggml_mul_mat

 static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
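The new GGML_OP_L2_NORM op adds an L2 normalization alongside the existing norm/rms_norm variants. A minimal usage sketch, not part of the package: it assumes the vendored ggml is built and linked with its CPU backend, and that ggml-cpu.h provides ggml_graph_compute_with_ctx as in this llama.cpp revision.

    /* Hedged sketch: normalize the rows of a small F32 tensor with the new
       ggml_l2_norm() op and run the graph on the CPU backend. */
    #include "ggml.h"
    #include "ggml-cpu.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,   // scratch for tensors + graph
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // 2 rows of 4 floats
        struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);
        for (int i = 0; i < 8; ++i) {
            ((float *) x->data)[i] = (float) (i + 1);
        }

        // eps (stored as an op param, see ggml_set_op_params_f32 above)
        // guards against division by zero for all-zero rows
        struct ggml_tensor * y = ggml_l2_norm(ctx, x, 1e-12f);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);

        printf("first normalized element: %f\n", ((float *) y->data)[0]);

        ggml_free(ctx);
        return 0;
    }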
@@ -4720,6 +4755,54 @@ struct ggml_tensor * ggml_gated_linear_attn(
|
|
|
4720
4755
|
return result;
|
|
4721
4756
|
}
|
|
4722
4757
|
|
|
4758
|
+
// ggml_rwkv_wkv7
|
|
4759
|
+
|
|
4760
|
+
struct ggml_tensor * ggml_rwkv_wkv7(
|
|
4761
|
+
struct ggml_context * ctx,
|
|
4762
|
+
struct ggml_tensor * r,
|
|
4763
|
+
struct ggml_tensor * w,
|
|
4764
|
+
struct ggml_tensor * k,
|
|
4765
|
+
struct ggml_tensor * v,
|
|
4766
|
+
struct ggml_tensor * a,
|
|
4767
|
+
struct ggml_tensor * b,
|
|
4768
|
+
struct ggml_tensor * state) {
|
|
4769
|
+
GGML_ASSERT(ggml_is_contiguous(r));
|
|
4770
|
+
GGML_ASSERT(ggml_is_contiguous(w));
|
|
4771
|
+
GGML_ASSERT(ggml_is_contiguous(k));
|
|
4772
|
+
GGML_ASSERT(ggml_is_contiguous(v));
|
|
4773
|
+
GGML_ASSERT(ggml_is_contiguous(a));
|
|
4774
|
+
GGML_ASSERT(ggml_is_contiguous(b));
|
|
4775
|
+
GGML_ASSERT(ggml_is_contiguous(state));
|
|
4776
|
+
|
|
4777
|
+
const int64_t S = k->ne[0];
|
|
4778
|
+
const int64_t H = k->ne[1];
|
|
4779
|
+
const int64_t n_tokens = k->ne[2];
|
|
4780
|
+
const int64_t n_seqs = state->ne[1];
|
|
4781
|
+
{
|
|
4782
|
+
GGML_ASSERT(w->ne[0] == S && w->ne[1] == H && w->ne[2] == n_tokens);
|
|
4783
|
+
GGML_ASSERT(k->ne[0] == S && k->ne[1] == H && k->ne[2] == n_tokens);
|
|
4784
|
+
GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
|
|
4785
|
+
GGML_ASSERT(a->ne[0] == S && a->ne[1] == H && a->ne[2] == n_tokens);
|
|
4786
|
+
GGML_ASSERT(b->ne[0] == S && b->ne[1] == H && b->ne[2] == n_tokens);
|
|
4787
|
+
GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
|
|
4788
|
+
}
|
|
4789
|
+
|
|
4790
|
+
// concat output and new_state
|
|
4791
|
+
const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
|
|
4792
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
|
4793
|
+
|
|
4794
|
+
result->op = GGML_OP_RWKV_WKV7;
|
|
4795
|
+
result->src[0] = r;
|
|
4796
|
+
result->src[1] = w;
|
|
4797
|
+
result->src[2] = k;
|
|
4798
|
+
result->src[3] = v;
|
|
4799
|
+
result->src[4] = a;
|
|
4800
|
+
result->src[5] = b;
|
|
4801
|
+
result->src[6] = state;
|
|
4802
|
+
|
|
4803
|
+
return result;
|
|
4804
|
+
}
|
|
4805
|
+
|
|
4723
4806
|
// ggml_unary
|
|
4724
4807
|
|
|
4725
4808
|
static struct ggml_tensor * ggml_unary_impl(
|
|
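Per the assertions above, the wkv7 result packs the per-token output and the updated recurrent state into one flat F32 buffer with ne = { S*H, n_tokens + S*n_seqs, 1, 1 }. A hedged sketch of how a caller might split that buffer back apart; the helper name is illustrative and not part of the package:

    /* Hedged sketch: split the concatenated ggml_rwkv_wkv7() result into the
       attention output (S*H per token) and the new state (S*S*H per sequence). */
    #include "ggml.h"

    static void split_wkv7_result(struct ggml_context * ctx, struct ggml_tensor * result,
                                  int64_t S, int64_t H, int64_t n_tokens, int64_t n_seqs,
                                  struct ggml_tensor ** out, struct ggml_tensor ** state) {
        // first S*H*n_tokens elements: per-token output
        *out = ggml_view_1d(ctx, result, S*H*n_tokens, 0);
        // remaining S*S*H*n_seqs elements: updated recurrent state
        *state = ggml_view_1d(ctx, result, S*S*H*n_seqs,
                              S*H*n_tokens*ggml_element_size(result));
    }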
package/src/llama.cpp/include/llama.h

@@ -60,6 +60,7 @@ extern "C" {
     struct llama_model;
     struct llama_context;
     struct llama_sampler;
+    struct llama_kv_cache;

     typedef int32_t llama_pos;
     typedef int32_t llama_token;

@@ -469,7 +470,8 @@ extern "C" {
     DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");

     LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
-    LLAMA_API
+    LLAMA_API struct llama_kv_cache * llama_get_kv_self (struct llama_context * ctx);
+    LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type

     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);

@@ -586,7 +588,7 @@ extern "C" {
     // KV cache
     //

-    // TODO:
+    // TODO: start using struct llama_kv_cache

     // Information associated with an individual cell in the KV cache view.
     struct llama_kv_cache_view_cell {

@@ -641,13 +643,19 @@ extern "C" {

     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t
+    LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
+        "use llama_kv_self_n_tokens instead");

     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t
+    LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
+        "use llama_kv_self_used_cells instead");

     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void
+    LLAMA_API void llama_kv_self_clear(
             struct llama_context * ctx);

     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)

@@ -655,7 +663,7 @@ extern "C" {
     // seq_id < 0 : match any sequence
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API bool
+    LLAMA_API bool llama_kv_self_seq_rm(
             struct llama_context * ctx,
             llama_seq_id seq_id,
             llama_pos p0,

@@ -665,7 +673,7 @@ extern "C" {
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void
+    LLAMA_API void llama_kv_self_seq_cp(
             struct llama_context * ctx,
             llama_seq_id seq_id_src,
             llama_seq_id seq_id_dst,

@@ -673,17 +681,17 @@ extern "C" {
             llama_pos p1);

     // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void
+    LLAMA_API void llama_kv_self_seq_keep(
             struct llama_context * ctx,
             llama_seq_id seq_id);

     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with
+    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void
+    LLAMA_API void llama_kv_self_seq_add(
             struct llama_context * ctx,
             llama_seq_id seq_id,
             llama_pos p0,

@@ -693,10 +701,10 @@ extern "C" {
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with
+    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void
+    LLAMA_API void llama_kv_self_seq_div(
             struct llama_context * ctx,
             llama_seq_id seq_id,
             llama_pos p0,

@@ -704,24 +712,76 @@ extern "C" {
             int d);

     // Returns the largest position present in the KV cache for the specified sequence
-    LLAMA_API llama_pos
+    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
             struct llama_context * ctx,
-
-
-    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
-    // how to avoid this?
+            llama_seq_id seq_id);

     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
-    //   - explicitly with
-    LLAMA_API void
+    //   - explicitly with llama_kv_self_update()
+    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+
+    // Check if the context supports KV cache shifting
+    LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);

     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void
+    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx),
+        "use llama_kv_self_clear instead");
+
+    DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1),
+        "use llama_kv_self_seq_rm instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
+            struct llama_context * ctx,
+            llama_seq_id seq_id_src,
+            llama_seq_id seq_id_dst,
+            llama_pos p0,
+            llama_pos p1),
+        "use llama_kv_self_seq_cp instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
+            struct llama_context * ctx,
+            llama_seq_id seq_id),
+        "use llama_kv_self_seq_keep instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            llama_pos delta),
+        "use llama_kv_self_seq_add instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            int d),
+        "use llama_kv_self_seq_div instead");
+
+    DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+            struct llama_context * ctx,
+            llama_seq_id seq_id),
+        "use llama_kv_self_seq_pos_max instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
+        "use llama_kv_self_defrag instead");
+
+    DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
+        "use llama_kv_self_can_shift instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
+        "use llama_kv_self_update instead");

-    // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);

     //
     // State / sessions
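The net effect of this hunk is a rename of the per-context KV-cache calls from llama_kv_cache_* to llama_kv_self_*, with the old names kept as deprecated wrappers. A hedged migration sketch; the wrapper function is illustrative, only the llama_kv_self_* calls come from the header above:

    #include "llama.h"

    /* Hedged sketch: drop one sequence from the KV cache and compact it with
       the renamed 0.3.16 API; the comments show the deprecated spellings. */
    static void reset_sequence(struct llama_context * ctx, llama_seq_id seq) {
        // before: llama_kv_cache_seq_rm(ctx, seq, -1, -1);
        llama_kv_self_seq_rm(ctx, seq, /*p0=*/ -1, /*p1=*/ -1);

        // before: llama_kv_cache_defrag(ctx); llama_kv_cache_update(ctx);
        llama_kv_self_defrag(ctx);   // scheduled lazily ...
        llama_kv_self_update(ctx);   // ... and applied explicitly here
    }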
@@ -885,6 +945,10 @@ extern "C" {
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);

+    // Set whether the model is in warmup mode or not
+    // If true, all model tensors are activated during llama_decode() to load and cache their weights.
+    LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);

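A hedged sketch of how a caller might use the new warmup flag; the wrapper function and the batch argument are illustrative, not part of the header:

    #include "llama.h"

    /* Hedged sketch: run one throwaway decode in warmup mode so all model
       tensors are touched and their weights get loaded/cached, then switch back. */
    static void warmup_once(struct llama_context * ctx, struct llama_batch batch) {
        llama_set_warmup(ctx, true);
        llama_decode(ctx, batch);     // return value ignored in this sketch
        llama_set_warmup(ctx, false);
    }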
package/src/llama.cpp/src/CMakeLists.txt

@@ -15,18 +15,21 @@ add_library(llama
             llama-chat.cpp
             llama-context.cpp
             llama-grammar.cpp
+            llama-graph.cpp
             llama-hparams.cpp
             llama-impl.cpp
+            llama-io.cpp
             llama-kv-cache.cpp
+            llama-memory.cpp
             llama-mmap.cpp
             llama-model-loader.cpp
             llama-model.cpp
             llama-quant.cpp
             llama-sampling.cpp
             llama-vocab.cpp
-            unicode.h
-            unicode.cpp
             unicode-data.cpp
+            unicode.cpp
+            unicode.h
             )

 target_include_directories(llama PUBLIC . ../include ../common)
package/src/llama.cpp/src/llama-adapter.cpp

@@ -4,14 +4,13 @@
 #include "llama-mmap.h"
 #include "llama-model.h"

-#include <algorithm>
 #include <map>
 #include <cassert>
 #include <stdexcept>

 // vec

-
+ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }

@@ -19,7 +18,7 @@ struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     return tensors[il];
 }

-
+ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);

@@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
-
+            ggml_init_params params = {
                 /*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc =*/ true,

@@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     return true;
 }

-
+bool llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,

@@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
         // disable the current control vector (but leave allocated for later)
         layer_start = -1;
         layer_end   = -1;
-        return
+        return true;
     }

     if (n_embd != (int) hparams.n_embd) {
         LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
-        return
+        return false;
     }

     if (tensors.empty()) {
         if (!init(model)) {
-            return
+            return false;
         }
     }

@@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
         }
     }

-    return
+    return true;
 }

 // lora

-llama_adapter_lora_weight * llama_adapter_lora::get_weight(
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
     const std::string name(w->name);

     const auto pos = ab_map.find(name);

@@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor *
     return nullptr;
 }

-static void llama_adapter_lora_init_impl(
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

     ggml_context * ctx_init;
-
+    gguf_init_params meta_gguf_params = {
         /* .no_alloc = */ true,
         /* .ctx = */ &ctx_init,
     };

@@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             // add a new context
-
+            ggml_init_params params = {
                 /*.mem_size =*/ n_tensors*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc =*/ true,

@@ -264,7 +263,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }

-
+        ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
         if (is_token_embd) {
             // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()

@@ -281,8 +280,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         }

         // save tensor to adapter
-
-
+        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
         adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);

@@ -308,7 +307,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     {
         llama_file gguf_file(path_lora, "rb");
         std::vector<uint8_t> read_buf;
-        auto set_tensor = [&](
+        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
             size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
             size_t size = ggml_nbytes(orig);
             read_buf.resize(size);

@@ -327,8 +326,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }

-
-
+llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+    llama_adapter_lora * adapter = new llama_adapter_lora();

     try {
         llama_adapter_lora_init_impl(*model, path_lora, *adapter);

@@ -342,6 +341,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
         return nullptr;
     }

-void llama_adapter_lora_free(
+void llama_adapter_lora_free(llama_adapter_lora * adapter) {
     delete adapter;
 }
package/src/llama.cpp/src/llama-adapter.h

@@ -15,11 +15,11 @@
 //

 struct llama_adapter_cvec {
-
+    ggml_tensor * tensor_for(int il) const;

-
+    ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const;

-
+    bool apply(
             const llama_model & model,
             const float * data,
             size_t len,

@@ -36,7 +36,7 @@ private:
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;

-    std::vector<
+    std::vector<ggml_tensor *> tensors; // per layer
 };

 //

@@ -44,8 +44,8 @@ private:
 //

 struct llama_adapter_lora_weight {
-
-
+    ggml_tensor * a = nullptr;
+    ggml_tensor * b = nullptr;

     // get actual scale based on rank and alpha
     float get_scale(float alpha, float adapter_scale) const {

@@ -55,12 +55,12 @@ struct llama_adapter_lora_weight {
     }

     llama_adapter_lora_weight() = default;
-    llama_adapter_lora_weight(
+    llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {}
 };

 struct llama_adapter_lora {
     // map tensor name to lora_a_b
-    std::unordered_map<std::string,
+    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;

@@ -70,5 +70,7 @@ struct llama_adapter_lora {
     llama_adapter_lora() = default;
     ~llama_adapter_lora() = default;

-    llama_adapter_lora_weight * get_weight(
+    llama_adapter_lora_weight * get_weight(ggml_tensor * w);
 };
+
+using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;