@fugood/llama.node 0.3.13 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +89 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +132 -13
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +33 -174
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +10 -9
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +79 -34
- package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +196 -108
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +113 -101
- package/src/llama.cpp/examples/server/utils.hpp +94 -105
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +263 -151
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
- package/src/llama.cpp/ggml/src/ggml.c +93 -5
- package/src/llama.cpp/include/llama.h +105 -27
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +123 -16
- package/src/llama.cpp/src/llama-arch.h +19 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -109
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +8230 -122
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +51 -9837
- package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

@@ -325,11 +325,17 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool
     string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
 
     for (const auto& tname : type_names) {
+        std::string load_vec_quant = "2";
+        if ((tname == "q4_0") || (tname == "q4_1"))
+            load_vec_quant = "8";
+        else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq4_nl"))
+            load_vec_quant = "4";
+
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
         // For unaligned, load one at a time for f32/f16, or two at a time for quants
-        std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : "2";
+        std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : load_vec_quant;
         // For aligned matmul loads
-        std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : "2";
+        std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : load_vec_quant;
 
         // don't generate f32 variants for coopmat2
         if (!coopmat2) {
@@ -396,7 +402,7 @@ void process_shaders() {
     for (const auto& tname : type_names) {
         // mul mat vec
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
-        std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
+        std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
 
         string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
         string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
@@ -427,6 +433,8 @@ void process_shaders() {
     string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("l2_norm_f32", "l2_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
 
     string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
@@ -477,14 +485,17 @@ void process_shaders() {
     string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
     string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
     string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_back_f32", "soft_max_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
 
     string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
@@ -518,6 +529,8 @@ void process_shaders() {
 
     string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
 
+    string_to_spv("rwkv_wkv7_f32", "wkv7.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
+
     string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
 
     for (auto &c : compiles) {
package/src/llama.cpp/ggml/src/ggml.c

@@ -240,7 +240,11 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
 
 
 void * ggml_aligned_malloc(size_t size) {
+#if defined(__s390x__)
+    const int alignment = 256;
+#else
     const int alignment = 64;
+#endif
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
     return _aligned_malloc(size, alignment);
@@ -561,9 +565,9 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
 #endif
 
 }
-static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
-static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
-static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
+static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
+static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 
 static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
@@ -925,6 +929,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "RMS_NORM",
     "RMS_NORM_BACK",
     "GROUP_NORM",
+    "L2_NORM",
 
     "MUL_MAT",
     "MUL_MAT_ID",
@@ -973,6 +978,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ADD_REL_POS",
     "RWKV_WKV6",
     "GATED_LINEAR_ATTN",
+    "RWKV_WKV7",
 
     "UNARY",
 
@@ -992,7 +998,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1022,6 +1028,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rms_norm(x)",
     "rms_norm_back(x)",
     "group_norm(x)",
+    "l2_norm(x)",
 
     "X*Y",
     "X[i]*Y",
@@ -1070,6 +1077,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "add_rel_pos(x)",
     "rwkv_wkv6(k, v, r, tf, td, s)",
     "gated_linear_attn(k, v, q, gate, s)",
+    "rwkv_wkv7(r, w, k, v, a, b, s)",
 
     "unary(x)",
 
@@ -1089,7 +1097,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -2328,6 +2336,7 @@ struct ggml_tensor * ggml_concat(
         struct ggml_tensor * b,
         int dim) {
     GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
+    GGML_ASSERT(a->type == b->type);
 
     int64_t ne[GGML_MAX_DIMS];
     for (int d = 0; d < GGML_MAX_DIMS; ++d) {
@@ -2681,6 +2690,37 @@ struct ggml_tensor * ggml_group_norm_inplace(
     return ggml_group_norm_impl(ctx, a, n_groups, eps, true);
 }
 
+// ggml_l2_norm
+
+static struct ggml_tensor * ggml_l2_norm_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 eps,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params_f32(result, 0, eps);
+
+    result->op     = GGML_OP_L2_NORM;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_l2_norm(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 eps) {
+    return ggml_l2_norm_impl(ctx, a, eps, false);
+}
+
+struct ggml_tensor * ggml_l2_norm_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 eps) {
+    return ggml_l2_norm_impl(ctx, a, eps, true);
+}
+
 // ggml_mul_mat
 
 static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
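The new ggml_l2_norm operator (registered above as GGML_OP_L2_NORM, with a matching Vulkan shader earlier in this diff) normalizes each row by its Euclidean norm, with eps guarding against division by zero. A minimal sketch of driving it on the CPU backend — the graph helpers are ggml's public API, and the values are illustrative, not taken from this diff:

#include "ggml.h"
#include "ggml-cpu.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // one row {3, 4, 0, 0}; its L2 norm is 5
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    float * da = (float *) a->data;
    da[0] = 3.0f; da[1] = 4.0f; da[2] = 0.0f; da[3] = 0.0f;

    struct ggml_tensor * out = ggml_l2_norm(ctx, a, 1e-12f);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    // out->data now holds {0.6, 0.8, 0.0, 0.0}
    ggml_free(ctx);
    return 0;
}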
@@ -4715,6 +4755,54 @@ struct ggml_tensor * ggml_gated_linear_attn(
     return result;
 }
 
+// ggml_rwkv_wkv7
+
+struct ggml_tensor * ggml_rwkv_wkv7(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * r,
+        struct ggml_tensor  * w,
+        struct ggml_tensor  * k,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * state) {
+    GGML_ASSERT(ggml_is_contiguous(r));
+    GGML_ASSERT(ggml_is_contiguous(w));
+    GGML_ASSERT(ggml_is_contiguous(k));
+    GGML_ASSERT(ggml_is_contiguous(v));
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_is_contiguous(b));
+    GGML_ASSERT(ggml_is_contiguous(state));
+
+    const int64_t S = k->ne[0];
+    const int64_t H = k->ne[1];
+    const int64_t n_tokens = k->ne[2];
+    const int64_t n_seqs = state->ne[1];
+    {
+        GGML_ASSERT(w->ne[0] == S && w->ne[1] == H && w->ne[2] == n_tokens);
+        GGML_ASSERT(k->ne[0] == S && k->ne[1] == H && k->ne[2] == n_tokens);
+        GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
+        GGML_ASSERT(a->ne[0] == S && a->ne[1] == H && a->ne[2] == n_tokens);
+        GGML_ASSERT(b->ne[0] == S && b->ne[1] == H && b->ne[2] == n_tokens);
+        GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
+    }
+
+    // concat output and new_state
+    const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    result->op     = GGML_OP_RWKV_WKV7;
+    result->src[0] = r;
+    result->src[1] = w;
+    result->src[2] = k;
+    result->src[3] = v;
+    result->src[4] = a;
+    result->src[5] = b;
+    result->src[6] = state;
+
+    return result;
+}
+
 // ggml_unary
 
 static struct ggml_tensor * ggml_unary_impl(
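ggml_rwkv_wkv7 fuses the per-token attention output and the updated recurrent state into one result tensor of ne = { S * H, n_tokens + S * n_seqs, 1, 1 }, as the assertions above spell out. A hypothetical helper (not part of the diff) that splits that result back into its two pieces with views:

#include "ggml.h"

// Split the fused wkv7 result: the first n_tokens rows are the per-token
// output, the remaining S * n_seqs rows are the new recurrent state.
static void split_wkv7_result(struct ggml_context * ctx, struct ggml_tensor * wkv,
        int64_t S, int64_t H, int64_t n_tokens, int64_t n_seqs,
        struct ggml_tensor ** out, struct ggml_tensor ** new_state) {
    *out       = ggml_view_2d(ctx, wkv, S * H, n_tokens,   wkv->nb[1], 0);
    *new_state = ggml_view_2d(ctx, wkv, S * H, S * n_seqs, wkv->nb[1],
                              n_tokens * wkv->nb[1]);
}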
package/src/llama.cpp/include/llama.h

@@ -60,6 +60,7 @@ extern "C" {
     struct llama_model;
     struct llama_context;
     struct llama_sampler;
+    struct llama_kv_cache;
 
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
@@ -105,6 +106,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
         LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
+        LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
     };
 
     enum llama_rope_type {
@@ -468,7 +470,8 @@ extern "C" {
     DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
 
     LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
+    LLAMA_API struct llama_kv_cache *    llama_get_kv_self (      struct llama_context * ctx);
+    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
 
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
@@ -477,6 +480,7 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_embd   (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_layer  (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head   (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_head_kv(const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
@@ -584,7 +588,7 @@ extern "C" {
     // KV cache
     //
 
-    // TODO:
+    // TODO: start using struct llama_kv_cache
 
     // Information associated with an individual cell in the KV cache view.
     struct llama_kv_cache_view_cell {
@@ -639,13 +643,19 @@ extern "C" {
 
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "use llama_kv_self_n_tokens instead");
 
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
+            "use llama_kv_self_used_cells instead");
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_cache_clear(
+    LLAMA_API void llama_kv_self_clear(
             struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
@@ -653,7 +663,7 @@ extern "C" {
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0,  p1]
     // p1 < 0     : [p0, inf)
-    LLAMA_API bool llama_kv_cache_seq_rm(
+    LLAMA_API bool llama_kv_self_seq_rm(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
@@ -663,7 +673,7 @@ extern "C" {
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_cp(
+    LLAMA_API void llama_kv_self_seq_cp(
             struct llama_context * ctx,
                     llama_seq_id   seq_id_src,
                     llama_seq_id   seq_id_dst,
@@ -671,17 +681,17 @@ extern "C" {
                        llama_pos   p1);
 
     // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void llama_kv_cache_seq_keep(
+    LLAMA_API void llama_kv_self_seq_keep(
             struct llama_context * ctx,
                     llama_seq_id   seq_id);
 
     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_add(
+    LLAMA_API void llama_kv_self_seq_add(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
@@ -691,10 +701,10 @@ extern "C" {
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_div(
+    LLAMA_API void llama_kv_self_seq_div(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
@@ -702,24 +712,76 @@ extern "C" {
                              int   d);
 
     // Returns the largest position present in the KV cache for the specified sequence
-    LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
             struct llama_context * ctx,
-                    llama_seq_id   seq_id);
-
-    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
-    // how to avoid this?
+                    llama_seq_id   seq_id);
 
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
-    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
+    //   - explicitly with llama_kv_self_update()
+    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+
+    // Check if the context supports KV cache shifting
+    LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
 
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
+    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_clear(
+                struct llama_context * ctx),
+            "use llama_kv_self_clear instead");
+
+    DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
+                struct llama_context * ctx,
+                        llama_seq_id   seq_id,
+                           llama_pos   p0,
+                           llama_pos   p1),
+            "use llama_kv_self_seq_rm instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
+                struct llama_context * ctx,
+                        llama_seq_id   seq_id_src,
+                        llama_seq_id   seq_id_dst,
+                           llama_pos   p0,
+                           llama_pos   p1),
+            "use llama_kv_self_seq_cp instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
+                struct llama_context * ctx,
+                        llama_seq_id   seq_id),
+            "use llama_kv_self_seq_keep instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
+                struct llama_context * ctx,
+                        llama_seq_id   seq_id,
+                           llama_pos   p0,
+                           llama_pos   p1,
+                           llama_pos   delta),
+            "use llama_kv_self_seq_add instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
+                struct llama_context * ctx,
+                        llama_seq_id   seq_id,
+                           llama_pos   p0,
+                           llama_pos   p1,
+                                 int   d),
+            "use llama_kv_self_seq_div instead");
+
+    DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+                struct llama_context * ctx,
+                        llama_seq_id   seq_id),
+            "use llama_kv_self_seq_pos_max instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
+            "use llama_kv_self_defrag instead");
+
+    DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
+            "use llama_kv_self_can_shift instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
+            "use llama_kv_self_update instead");
 
-    // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
 
     //
     // State / sessions
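Every llama_kv_cache_* entry point survives as a DEPRECATED wrapper, so existing callers keep compiling (with warnings) while new code moves to llama_kv_self_*. A migration sketch using only the functions declared above:

#include "llama.h"

static void prune_and_compact(struct llama_context * ctx) {
    // before 0.3.15: llama_kv_cache_seq_rm(ctx, 0, 0, 32);
    llama_kv_self_seq_rm(ctx, /*seq_id =*/ 0, /*p0 =*/ 0, /*p1 =*/ 32);

    // before 0.3.15: llama_kv_cache_defrag(ctx); llama_kv_cache_update(ctx);
    llama_kv_self_defrag(ctx);
    llama_kv_self_update(ctx);  // applies pending K-shifts / defragmentation
}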
@@ -883,6 +945,10 @@ extern "C" {
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
 
+    // Set whether the model is in warmup mode or not
+    // If true, all model tensors are activated during llama_decode() to load and cache their weights.
+    LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 
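A sketch of how the new llama_set_warmup() could be used — an assumption modeled on the warmup pass in llama.cpp's common helpers (vendored in this package), not code from the diff: decode a single BOS token with warmup enabled so every tensor is touched and cached, then reset the KV cache for real generation.

#include "llama.h"

static void warmup(struct llama_context * ctx, const struct llama_vocab * vocab) {
    llama_set_warmup(ctx, true);

    // one dummy decode pages in all model weights
    llama_token bos = llama_vocab_bos(vocab);
    llama_batch batch = llama_batch_get_one(&bos, 1);
    llama_decode(ctx, batch);

    llama_kv_self_clear(ctx);
    llama_set_warmup(ctx, false);
}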
@@ -1203,17 +1269,29 @@ extern "C" {
             const char * grammar_str,
             const char * grammar_root);
 
-
-    /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future.
-    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
-    LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
+    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
             const struct llama_vocab * vocab,
             const char * grammar_str,
             const char * grammar_root,
             const char ** trigger_words,
             size_t num_trigger_words,
             const llama_token * trigger_tokens,
-            size_t num_trigger_tokens);
+            size_t num_trigger_tokens),
+        "use llama_sampler_init_grammar_lazy_patterns instead");
+
+
+    /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
+    /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group.
+    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included.
+    LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+            const struct llama_vocab * vocab,
+            const char * grammar_str,
+            const char * grammar_root,
+            const char ** trigger_patterns,
+            size_t num_trigger_patterns,
+            const llama_token * trigger_tokens,
+            size_t num_trigger_tokens);
+
 
     /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
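A sketch of calling the replacement sampler; the grammar string, root rule name, and trigger pattern here are illustrative placeholders — only the signature comes from the declaration above:

#include "llama.h"

static struct llama_sampler * make_lazy_grammar(const struct llama_vocab * vocab,
        const char * grammar_str) {
    // fire the grammar once the model opens a <tool_call> block, feeding the
    // sampler from the first match group onward
    const char * trigger_patterns[] = { "(<tool_call>[\\s\\S]*)" };
    return llama_sampler_init_grammar_lazy_patterns(
            vocab, grammar_str, /*grammar_root =*/ "root",
            trigger_patterns, /*num_trigger_patterns =*/ 1,
            /*trigger_tokens     =*/ NULL,
            /*num_trigger_tokens =*/ 0);
}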
package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp

@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+Hello
+__ggml_vocab_test__
+(
+__ggml_vocab_test__
+
+=
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+discards
+__ggml_vocab_test__
+
+
+
+
+
+
+
+
+
+
+
+🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out

@@ -0,0 +1,46 @@
+1165 220 19 220 27124 5503
+37 19194 259
+
+220
+256
+271
+197
+198
+279
+2499
+2775
+13225 2375
+32949 2375
+13225 5922
+32949 5922
+32949 5922 0
+13225 11 2375 0
+32949 11 2375 0
+495 382 9552 99 247 13 17159
+86 45404 220 22 10191 2852 22924 4750 6916
+3907 53641 1235 185386 8118
+11400 107516 15867 20804 22851 134178 77431 32010 104312 37984 16329 27751 89335
+112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 350 7393 74471 484 853 1617 2316 6602 8
+13225
+32949
+220 32949
+256 32949
+271 32949
+271 32949 198 271 32949
+350
+198 314
+6 6837
+13225 11 342 70653 0 3253 553 481 22861 223 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208
+147475
+18
+2546
+15517
+15517 18
+15517 2546
+15517 15517
+15517 15517 18
+15517 15517 2546
+15517 15517 15517
+34 60213 53904
+2960 3098
+126470 25980 160432 16609 2775 4066 172261 19432 112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 9552 99 247 4103 99 247 220 18 220 2546 220 15517 220 15517 18 220 15517 2546 220 15517 15517 220 15517 15517 18 220 15517 15517 2546 220 18 13 18 220 18 485 18 220 18 1008 18 44735 107516 15867 20804 22851 134178 77431 32010 104312 156437 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208 105024 106657 1967 53641 1235 185386 8118 22434 39336 26178 26178 168394 194663 27271 147475 25883 6961 9790 1339 461 83 1280 19016 1354 11 461 1099 481 3239 30 461 44 625 3239 17291 1520 480 11 461 35 481 1299 1236 17966 30 1416 6 27493 261 54602 43