@fugood/llama.node 0.3.12 → 0.3.14
This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/src/llama-model.cpp:

```diff
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstring>
+#include <cmath>
 #include <functional>
 #include <map>
 #include <sstream>
@@ -864,6 +865,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_1B; break;
+                    case 34: type = LLM_TYPE_4B; break;
+                    case 48: type = LLM_TYPE_12B; break;
+                    case 62: type = LLM_TYPE_27B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                hparams.f_attention_scale = type == LLM_TYPE_27B
+                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
+                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
```
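Of note in the hunk above: for the 27B size the attention scale is derived from the mean head width (n_embd / n_head) rather than from the key head dimension, so the two branches genuinely differ whenever those values diverge. A minimal sketch of the arithmetic, with hypothetical dimensions (not values read from any real GGUF):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // hypothetical dimensions for illustration only
    const float n_embd        = 5376.0f; // model width
    const float n_head        = 32.0f;   // query heads
    const float n_embd_head_k = 128.0f;  // per-head key dim

    const float scale_27b    = 1.0f / std::sqrt(n_embd / n_head); // 1/sqrt(168)
    const float scale_others = 1.0f / std::sqrt(n_embd_head_k);   // 1/sqrt(128)
    std::printf("27B: %f, other sizes: %f\n", scale_27b, scale_others);
}
```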
```diff
@@ -1275,7 +1293,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

     const bool use_mmap_buffer = true;

-    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__,
+    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");

     // build a list of buffer types for the CPU and GPU devices
     pimpl->cpu_buft_list = make_cpu_buft_list(devices);
@@ -1424,6 +1442,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
        }

+        // skip unused tensors
+        if (info.op == GGML_OP_NONE) {
+            LLAMA_LOG_WARN("model has unused tensor %s -- ignoring\n", tn.str().c_str());
+            ml.n_created++;
+
+            return nullptr;
+        }
+
        // tensors with "bias" suffix are always used with GGML_OP_ADD
        ggml_op op;
        bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
@@ -2194,13 +2220,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                } break;
            case LLM_ARCH_PHI3:
                {
-                    const int64_t n_embd_head = n_embd / n_head;
-
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
@@ -2215,8 +2244,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);

-                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {
-                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {
+                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                    }
                } break;
            case LLM_ARCH_PHIMOE:
@@ -2443,6 +2472,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_GEMMA3:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
```
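The wk/wv shapes in the GEMMA3 case are the grouped-query-attention widths: key/value projections are sized by the KV-head count rather than the query-head count. A sketch of that shape arithmetic, assuming llama.cpp's usual convention that n_embd_k_gqa = n_embd_head_k * n_head_kv, with made-up head counts:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd_head_k = 128; // per-head key dim (assumed)
    const int64_t n_head        = 16;  // query heads (assumed)
    const int64_t n_head_kv     = 8;   // kv heads (assumed; < n_head under GQA)

    // wq spans all query heads; wk/wv shrink to the kv-head width
    const int64_t n_embd_k_gqa = n_embd_head_k * n_head_kv;
    std::printf("wq cols = %lld, wk cols = %lld\n",
                (long long)(n_embd_head_k * n_head), (long long)n_embd_k_gqa);
}
```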
```diff
@@ -3639,6 +3697,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
        LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
        LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
+        LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
        LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
        LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
@@ -3830,6 +3889,10 @@ int32_t llama_model_n_head(const struct llama_model * model) {
     return model->hparams.n_head();
 }

+int32_t llama_model_n_head_kv(const struct llama_model * model) {
+    return model->hparams.n_head_kv();
+}
+
 // deprecated
 int32_t llama_n_ctx_train(const struct llama_model * model) {
     return llama_model_n_ctx_train(model);
@@ -3908,6 +3971,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHIMOE:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
+        case LLM_ARCH_GEMMA3:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
```
package/src/llama.cpp/src/llama-sampling.cpp:

```diff
@@ -316,6 +316,13 @@ static uint32_t get_rng_seed(uint32_t seed) {

 // llama_sampler API

+struct llama_sampler * llama_sampler_init(const struct llama_sampler_i * iface, llama_sampler_context_t ctx) {
+    return new llama_sampler {
+        /* .iface = */ iface,
+        /* .ctx = */ ctx,
+    };
+}
+
 const char * llama_sampler_name(const struct llama_sampler * smpl) {
     if (!smpl->iface) {
         return "(null)";
```
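The llama.h changes in this release appear to export this constructor, giving external code one sanctioned way to build a llama_sampler, presumably so that objects later handed to llama_sampler_free were allocated by the library itself. A hedged sketch of a custom sampler built on it; the llama_sampler_i field order follows the top-n-sigma sampler added further down, and the sampler logic itself is a toy:

```cpp
#include <math.h>

#include "llama.h"

static const char * my_sampler_name(const struct llama_sampler * /*smpl*/) {
    return "my-sampler";
}

// toy apply step: force greedy behavior by masking every candidate
// except the first one
static void my_sampler_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
    for (size_t i = 1; i < cur_p->size; ++i) {
        cur_p->data[i].logit = -INFINITY;
    }
}

static struct llama_sampler_i my_sampler_i = {
    /* .name   = */ my_sampler_name,
    /* .accept = */ nullptr,
    /* .apply  = */ my_sampler_apply,
    /* .reset  = */ nullptr,
    /* .clone  = */ nullptr,
    /* .free   = */ nullptr, // no ctx to release
};

struct llama_sampler * my_sampler_init() {
    return llama_sampler_init(&my_sampler_i, /* ctx = */ nullptr);
}
```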
```diff
@@ -347,10 +354,10 @@ struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
     }

     if (smpl->ctx == nullptr) {
-        return new llama_sampler {
+        return llama_sampler_init(
             /* .iface = */ smpl->iface,
-            /* .ctx = */ nullptr,
-        };
+            /* .ctx = */ nullptr
+        );
     }

     GGML_ABORT("the sampler does not support cloning");
@@ -472,15 +479,15 @@ static struct llama_sampler_i llama_sampler_chain_i = {
 };

 struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_chain_i,
         /* .ctx = */ new llama_sampler_chain {
             /* .params = */ params,
             /* .samplers = */ {},
             /* .t_sample_us = */ 0,
             /* .n_sample = */ 0,
-        },
-    };
+        }
+    );
 }

 void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
```
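Behavior of these constructors is unchanged; only the allocation now funnels through llama_sampler_init. For reference, a typical consumer builds them into a chain through the public API (a sketch; the parameter values are arbitrary):

```cpp
#include "llama.h"

struct llama_sampler * make_default_chain() {
    llama_sampler_chain_params params = llama_sampler_chain_default_params();
    struct llama_sampler * chain = llama_sampler_chain_init(params);

    // arbitrary example settings
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    return chain; // release with llama_sampler_free(chain)
}
```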
```diff
@@ -546,10 +553,10 @@ static struct llama_sampler_i llama_sampler_greedy_i = {
 };

 struct llama_sampler * llama_sampler_init_greedy() {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_greedy_i,
-        /* .ctx = */ nullptr,
-    };
+        /* .ctx = */ nullptr
+    );
 }

 // dist
@@ -608,14 +615,14 @@ static struct llama_sampler_i llama_sampler_dist_i = {

 struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
     auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_dist_i,
         /* .ctx = */ new llama_sampler_dist {
             /* .seed = */ seed,
             /* .seed_cur = */ seed_cur,
             /* .rng = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }

 // softmax
@@ -638,10 +645,10 @@ static struct llama_sampler_i llama_sampler_softmax_i = {
 };

 struct llama_sampler * llama_sampler_init_softmax() {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_softmax_i,
-        /* .ctx = */ nullptr,
-    };
+        /* .ctx = */ nullptr
+    );
 }

 // top-k
@@ -678,12 +685,12 @@ static struct llama_sampler_i llama_sampler_top_k_i = {
 };

 struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_k_i,
         /* .ctx = */ new llama_sampler_top_k {
             /* .k = */ k,
-        },
-    };
+        }
+    );
 }

 // top-p
@@ -744,13 +751,13 @@ static struct llama_sampler_i llama_sampler_top_p_i = {
 };

 struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_p_i,
         /* .ctx = */ new llama_sampler_top_p {
             /* .p = */ p,
             /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }

 // min-p
@@ -840,13 +847,13 @@ static struct llama_sampler_i llama_sampler_min_p_i = {
 };

 struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_min_p_i,
         /* .ctx = */ new llama_sampler_min_p {
             /* .p = */ p,
             /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }

 // typical
@@ -939,13 +946,13 @@ static struct llama_sampler_i llama_sampler_typical_i = {
 };

 struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_typical_i,
         /* .ctx = */ new llama_sampler_typical {
             /* .p = */ p,
             /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }

 // temp
@@ -983,12 +990,12 @@ static struct llama_sampler_i llama_sampler_temp_i = {
 };

 struct llama_sampler * llama_sampler_init_temp(float temp) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_temp_i,
         /* .ctx = */ new llama_sampler_temp {
             /*.temp = */ temp,
-        },
-    };
+        }
+    );
 }

 // temp-ext
@@ -1093,14 +1100,14 @@ static struct llama_sampler_i llama_sampler_temp_ext_i = {
 };

 struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_temp_ext_i,
         /* .ctx = */ new llama_sampler_temp_ext {
             /* .temp = */ temp,
             /* .delta = */ delta,
             /* .exponent = */ exponent,
-        },
-    };
+        }
+    );
 }

 // xtc
@@ -1185,7 +1192,7 @@ static struct llama_sampler_i llama_sampler_xtc_i = {

 struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
     auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_xtc_i,
         /* .ctx = */ new llama_sampler_xtc {
             /* .probability = */ p,
@@ -1194,8 +1201,8 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep,
             /* .seed = */ seed,
             /* .seed_cur = */ seed_cur,
             /* .rng = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }

 // mirostat
@@ -1292,7 +1299,7 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {

 struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
     auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_mirostat_i,
         /* .ctx = */ new llama_sampler_mirostat {
             /* .n_vocab = */ n_vocab,
@@ -1303,8 +1310,8 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
             /* .m = */ m,
             /* .mu = */ 2.0f*tau,
             /* .rng = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }

 // mirostat v2
@@ -1391,7 +1398,7 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {

 struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
     auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_mirostat_v2_i,
         /* .ctx = */ new llama_sampler_mirostat_v2 {
             /* .seed = */ seed,
@@ -1400,8 +1407,8 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau,
             /* .eta = */ eta,
             /* .mu = */ 2.0f*tau,
             /* .rng = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }

 // grammar
@@ -1442,7 +1449,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         const char ** trigger_words,
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
-        size_t num_trigger_tokens);
+        size_t num_trigger_tokens,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns);

 static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_grammar *) smpl->ctx;
@@ -1450,12 +1459,14 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
         return;
     }

-    std::vector<const char *> trigger_words;
-    for (auto & word : ctx->grammar->trigger_words) {
-        trigger_words.push_back(word.c_str());
+    std::vector<const char *> trigger_patterns_c;
+    trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size());
+    for (auto & trigger_pattern : ctx->grammar->trigger_patterns) {
+        trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
     }
+
     auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
-            ctx->grammar->lazy,
+            ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
             ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());

     llama_grammar_free_impl(ctx->grammar);
@@ -1465,7 +1476,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;

-    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
+    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);

     // copy the state
     {
@@ -1509,15 +1520,33 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         const char ** trigger_words,
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
-        size_t num_trigger_tokens) {
+        size_t num_trigger_tokens,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns) {
     auto * ctx = new llama_sampler_grammar;

     if (grammar_str != nullptr && grammar_str[0] != '\0') {
+        // TODO: remove trigger_words support.
+        if (trigger_words != nullptr && num_trigger_words > 0) {
+            GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
+            std::string trigger_pattern("[\\s\\S]*?(");
+            for (size_t i = 0; i < num_trigger_words; ++i) {
+                static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+                if (i > 0) {
+                    trigger_pattern += "|";
+                }
+                trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
+            }
+            trigger_pattern += ")[\\s\\S]*";
+            auto trigger_pattern_c = trigger_pattern.c_str();
+            trigger_patterns = &trigger_pattern_c;
+            num_trigger_patterns = 1;
+        }
        *ctx = {
            /* .vocab = */ vocab,
            /* .grammar_str = */ grammar_str,
            /* .grammar_root = */ grammar_root,
-            /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy,
+            /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
        };
    } else {
        *ctx = {
@@ -1528,17 +1557,17 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
        };
    }

-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_grammar_i,
-        /* .ctx = */ ctx,
-    };
+        /* .ctx = */ ctx
+    );
 }

 struct llama_sampler * llama_sampler_init_grammar(
         const struct llama_vocab * vocab,
         const char * grammar_str,
         const char * grammar_root) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0);
 }

 struct llama_sampler * llama_sampler_init_grammar_lazy(
@@ -1549,7 +1578,18 @@ struct llama_sampler * llama_sampler_init_grammar_lazy(
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
         size_t num_trigger_tokens) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens, nullptr, 0);
+}
+
+struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns);
 }

 // penalties
```
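The block above folds the legacy trigger_words into a single lazy-grammar trigger pattern: each word is regex-escaped, the words are OR-ed together, and the result is wrapped so that it matches an entire output once any trigger appears. A standalone sketch of that conversion with hypothetical trigger words (the `$&` whole-match backreference is used here for the escaping step):

```cpp
#include <cstdio>
#include <regex>
#include <string>

int main() {
    // hypothetical trigger words, for illustration only
    const char * trigger_words[] = { "<tool_call>", "TOOL:" };
    std::string pattern("[\\s\\S]*?(");
    const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    for (size_t i = 0; i < 2; ++i) {
        if (i > 0) {
            pattern += "|";
        }
        // prefix every regex metacharacter with a backslash
        pattern += std::regex_replace(trigger_words[i], special_chars, "\\$&");
    }
    pattern += ")[\\s\\S]*";
    std::printf("%s\n", pattern.c_str());
    // -> [\s\S]*?(<tool_call>|TOOL:)[\s\S]*
}
```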
```diff
@@ -1678,7 +1718,7 @@ struct llama_sampler * llama_sampler_init_penalties(
         float penalty_present) {
     penalty_last_n = std::max(penalty_last_n, 0);

-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx = */ new llama_sampler_penalties {
             /* .penalty_last_n = */ penalty_last_n,
@@ -1687,8 +1727,75 @@ struct llama_sampler * llama_sampler_init_penalties(
             /* .penalty_present = */ penalty_present,
             /* .prev = */ ring_buffer<llama_token>(penalty_last_n),
             /* .token_count = */ {},
-        },
-    };
+        }
+    );
+}
+
+// top-n-sigma
+
+struct llama_sampler_top_n_sigma {
+    const float n;
+};
+
+static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler * /*smpl*/) {
+    return "top-n-sigma";
+}
+
+static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
+
+    // find max logit and calculate mean
+    float max = cur_p->data[0].logit;
+    float logits_sum = 0;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (cur_p->data[i].logit > max) {
+            max = cur_p->data[i].logit;
+        }
+        logits_sum += cur_p->data[i].logit;
+    }
+    float mean = logits_sum/cur_p->size;
+
+    // calculate standard deviation
+    float acc = 0;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        acc += pow(cur_p->data[i].logit - mean, 2);
+    }
+    float std = sqrt(acc/cur_p->size);
+
+    //apply mask
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (cur_p->data[i].logit < max - (ctx->n * std)) {
+            cur_p->data[i].logit = -INFINITY;
+        }
+    }
+    llama_sampler_softmax_impl(cur_p);
+}
+
+static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_top_n_sigma *) smpl->ctx;
+    return llama_sampler_init_top_n_sigma(ctx->n);
+}
+
+static void llama_sampler_top_n_sigma_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_top_n_sigma *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_top_n_sigma_i = {
+    /* .name = */ llama_sampler_top_n_sigma_name,
+    /* .accept = */ nullptr,
+    /* .apply = */ llama_sampler_top_n_sigma_apply,
+    /* .reset = */ nullptr,
+    /* .clone = */ llama_sampler_top_n_sigma_clone,
+    /* .free = */ llama_sampler_top_n_sigma_free,
+};
+
+struct llama_sampler * llama_sampler_init_top_n_sigma(float n) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_top_n_sigma_i,
+        /* .ctx = */ new llama_sampler_top_n_sigma {
+            /* .n = */ n,
+        }
+    );
 }

 // DRY
```
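A quick way to sanity-check the masking rule added above: a candidate survives only while its logit is within n standard deviations of the maximum. With the made-up logits below and n = 1, the three strongest candidates are kept and the two outliers are masked:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const float logits[] = { 8.0f, 7.5f, 6.0f, 2.0f, 1.0f }; // made-up values
    const int   count    = 5;
    const float n        = 1.0f; // sigma multiplier

    float max = logits[0], sum = 0.0f;
    for (int i = 0; i < count; ++i) {
        if (logits[i] > max) max = logits[i];
        sum += logits[i];
    }
    const float mean = sum / count;

    float acc = 0.0f;
    for (int i = 0; i < count; ++i) {
        acc += (logits[i] - mean) * (logits[i] - mean);
    }
    const float stddev = std::sqrt(acc / count);

    // keep rule mirrors the sampler: logit >= max - n * stddev
    for (int i = 0; i < count; ++i) {
        std::printf("%.1f %s\n", logits[i],
                    logits[i] < max - n * stddev ? "masked" : "kept");
    }
}
```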
```diff
@@ -2041,7 +2148,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
         }
     }

-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_dry_i,
         /* .ctx = */ new llama_sampler_dry {
             /* .total_context_size = */ context_size,
@@ -2053,8 +2160,8 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
             /* .dry_repeat_count = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
             /* .dry_max_token_repeat = */ {},
             /* .last_tokens = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
-        },
-    };
+        }
+    );
 }

 // wrapper for test-sampling.cpp
@@ -2155,14 +2262,14 @@ struct llama_sampler * llama_sampler_init_logit_bias(
         int32_t n_vocab,
         int32_t n_logit_bias,
         const llama_logit_bias * logit_bias) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_logit_bias_i,
         /* .ctx = */ new llama_sampler_logit_bias {
             /* .n_vocab = */ n_vocab,
             /* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
             /* .to_search = */ {},
-        },
-    };
+        }
+    );
 }

 // infill
@@ -2377,14 +2484,14 @@ static struct llama_sampler_i llama_sampler_infill_i = {
 };

 struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_infill_i,
         /* .ctx = */ new llama_sampler_infill {
             /* .vocab = */ vocab,
             /* .buf0 = */ std::vector<char>(512),
             /* .buf1 = */ std::vector<char>(512),
-        },
-    };
+        }
+    );
 }

 // utils
```