@fugood/llama.node 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/src/llama-vocab.cpp

@@ -16,6 +16,7 @@
 #include <queue>
 #include <set>
 #include <unordered_map>
+#include <cctype>

 //
 // helpers
@@ -392,6 +393,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1592,6 +1600,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             } else if (
                 tokenizer_pre == "megrez") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else if (
+                tokenizer_pre == "gpt-4o") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
+                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
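The three hunks above register a new "gpt-4o" pre-tokenizer: models whose GGUF metadata sets tokenizer.ggml.pre = "gpt-4o" get a dedicated regex set and keep spaces intact (clean_spaces = false). The following standalone sketch mirrors that dispatch in simplified form; the enum, struct, and abridged patterns are illustrative stand-ins, not the actual llama.cpp types.

```cpp
// Standalone illustration only: simplified stand-ins for the llama.cpp vocab types.
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

enum pre_type { PRE_TYPE_DEFAULT, PRE_TYPE_QWEN2, PRE_TYPE_GPT4O };

struct vocab_cfg {
    pre_type pre = PRE_TYPE_DEFAULT;
    bool clean_spaces = true;              // "gpt-4o" turns this off
    std::vector<std::string> regex_exprs;  // pre-tokenization split patterns
};

// maps the GGUF tokenizer.ggml.pre string to a pre-tokenizer configuration
static vocab_cfg load_pre(const std::string & tokenizer_pre) {
    vocab_cfg cfg;
    if (tokenizer_pre == "megrez") {
        cfg.pre = PRE_TYPE_QWEN2;
    } else if (tokenizer_pre == "gpt-4o") {
        cfg.pre          = PRE_TYPE_GPT4O;
        cfg.clean_spaces = false;
        cfg.regex_exprs  = { "\\p{N}{1,3}", "\\s+(?!\\S)" };  // abridged patterns
    } else {
        throw std::runtime_error("unknown pre-tokenizer type: '" + tokenizer_pre + "'");
    }
    return cfg;
}

int main() {
    const vocab_cfg cfg = load_pre("gpt-4o");
    std::printf("pre=%d clean_spaces=%d patterns=%zu\n",
                (int) cfg.pre, (int) cfg.clean_spaces, cfg.regex_exprs.size());
    return 0;
}
```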
package/src/llama.cpp/src/llama.cpp

@@ -4978,6 +4978,149 @@ struct llm_build_context {
         return gf;
     }

+    struct ggml_cgraph * build_gemma3() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+        if (ubatch.token) {
+            inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+            cb(inpL, "inp_scaled", -1);
+        }
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        // gemma3 requires different mask for layers using sliding window (SWA)
+        struct ggml_tensor * KQ_mask     = build_inp_KQ_mask(true);
+        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true);
+
+        // "5-to-1 interleaved attention"
+        // 5 layers of local attention followed by 1 layer of global attention
+        static const int sliding_window_pattern = 6;
+
+        for (int il = 0; il < n_layer; ++il) {
+            const bool is_sliding = (il + 1) % sliding_window_pattern;
+            const float freq_base_l  = is_sliding ? 10000.0f : freq_base;
+            const float freq_scale_l = is_sliding ? 1.0f : freq_scale;
+            struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens);
+                Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                        model.layers[il].attn_q_norm,
+                        NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens);
+                Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                        model.layers[il].attn_k_norm,
+                        NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, hparams.f_attention_scale, cb, il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }

     struct ggml_cgraph * build_starcoder2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
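The new build_gemma3() graph interleaves sliding-window and global attention ("5-to-1 interleaved attention"): a non-zero (il + 1) % 6 marks a local (SWA) layer, and every sixth layer attends globally with the regular RoPE frequency base. A standalone sketch of that schedule, with a made-up layer count:

```cpp
// Standalone illustration only: the layer schedule implied by the diff above.
#include <cstdio>

int main() {
    const int n_layer = 12;                // made-up layer count for the demo
    const int sliding_window_pattern = 6;  // constant from build_gemma3()

    for (int il = 0; il < n_layer; ++il) {
        // non-zero remainder => local / sliding-window layer; every 6th layer is global
        const bool is_sliding = ((il + 1) % sliding_window_pattern) != 0;
        std::printf("layer %2d: %s\n", il, is_sliding ? "sliding-window (SWA)" : "global");
    }
    return 0;
}
```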
@@ -8298,6 +8441,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gemma2();
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+                result = llm.build_gemma3();
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 result = llm.build_starcoder2();
@@ -8801,12 +8948,14 @@ static int llama_decode_impl(
     //llama_synchronize(&lctx);

     // decide if we need to defrag the kv cache
-    if (cparams.causal_attn && cparams.defrag_thold
-
+    if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
+        // - do not defrag small contexts (i.e. < 2048 tokens)
+        // - count the padding towards the number of used tokens
+        const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f;

         // queue defragmentation for next llama_kv_cache_update
         if (fragmentation > cparams.defrag_thold) {
-
+            LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);

             llama_kv_cache_defrag(kv_self);
         }
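The second hunk tightens the KV-cache defrag heuristic: contexts smaller than 2048 cells are never defragmented, padding now counts towards the used cells, and the check only runs for a strictly positive threshold. A standalone sketch of the formula with made-up numbers (the real values come from the KV cache state and the caller-supplied defrag threshold):

```cpp
// Standalone illustration only: made-up cache numbers, not values read from llama.cpp.
#include <algorithm>
#include <cstdio>

int main() {
    const int   n     = 4096;   // cells currently considered by the KV cache
    const int   used  = 3000;   // cells actually occupied
    const int   pad   = 32;     // padding counted towards "used" by the new code
    const float thold = 0.10f;  // defrag threshold supplied by the caller

    // contexts smaller than 2048 cells are never defragmented
    const float fragmentation = n >= 2048
        ? std::max(0.0f, 1.0f - float(used + pad) / float(n))
        : 0.0f;

    std::printf("fragmentation = %.3f -> %s\n", fragmentation,
                fragmentation > thold ? "request defrag" : "leave cache as is");
    return 0;
}
```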
@@ -9428,8 +9577,6 @@ static struct llama_model * llama_model_load_from_file_impl(
         struct llama_model_params params) {
     ggml_time_init();

-    llama_model * model = new llama_model(params);
-
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
@@ -9447,6 +9594,8 @@ static struct llama_model * llama_model_load_from_file_impl(
         };
     }

+    llama_model * model = new llama_model(params);
+
     // create list of devices to use with this model
     if (params.devices) {
         for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
package/src/llama.cpp/src/unicode.cpp

@@ -618,7 +618,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     result.reserve(utf8.size());
     size_t offset = 0;
     while (offset < utf8.size()) {
-        result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        try {
+            result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        }
+        catch (const std::invalid_argument & /*ex*/) {
+            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
+            ++offset;
+            result.emplace_back(0xFFFD); // replacement character
+        }
     }
     return result;
 }
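This hunk makes tokenization tolerant of malformed UTF-8: instead of letting std::invalid_argument escape llama_tokenize, the offending byte is skipped and U+FFFD is emitted. A standalone sketch of that recovery loop, using decode_cpt() as a simplified stand-in for unicode_cpt_from_utf8():

```cpp
// Standalone illustration only: decode_cpt() is a toy decoder, not the llama.cpp helper.
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

// toy stand-in: accepts ASCII, throws on anything else (the real helper decodes full UTF-8)
static uint32_t decode_cpt(const std::string & s, size_t & offset) {
    const unsigned char c = (unsigned char) s[offset];
    if (c < 0x80) { ++offset; return c; }
    throw std::invalid_argument("invalid UTF-8 sequence");
}

int main() {
    const std::string input = "ab\xFF" "z";  // 0xFF is not valid UTF-8
    std::vector<uint32_t> cpts;
    size_t offset = 0;
    while (offset < input.size()) {
        try {
            cpts.push_back(decode_cpt(input, offset));
        } catch (const std::invalid_argument &) {
            ++offset;                // skip the offending byte
            cpts.push_back(0xFFFD);  // emit the replacement character instead of throwing
        }
    }
    for (const uint32_t cpt : cpts) {
        std::printf("U+%04X ", cpt); // prints: U+0061 U+0062 U+FFFD U+007A
    }
    std::printf("\n");
    return 0;
}
```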
@@ -701,7 +708,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
     const auto cpts = unicode_cpts_from_utf8(text);

     // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
-    // ref: https://github.com/
+    // ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
     std::string text_collapsed;
     if (need_collapse) {
         // collapse all unicode categories