@fugood/llama.node 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

@@ -55,6 +55,8 @@ const std::vector<std::string> type_names = {
     "q4_k",
     "q5_k",
     "q6_k",
+    "iq1_s",
+    "iq1_m",
     "iq2_xxs",
     "iq2_xs",
     "iq2_s",
@@ -182,6 +184,13 @@ std::string to_uppercase(const std::string& input) {
     return result;
 }
 
+bool string_starts_with(const std::string& str, const std::string& prefix) {
+    if (prefix.size() > str.size()) {
+        return false;
+    }
+    return std::equal(prefix.begin(), prefix.end(), str.begin());
+}
+
 bool string_ends_with(const std::string& str, const std::string& suffix) {
     if (suffix.size() > str.size()) {
         return false;
@@ -316,11 +325,17 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool
     string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
 
     for (const auto& tname : type_names) {
+        std::string load_vec_quant = "2";
+        if ((tname == "q4_0") || (tname == "q4_1"))
+            load_vec_quant = "8";
+        else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq4_nl"))
+            load_vec_quant = "4";
+
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
         // For unaligned, load one at a time for f32/f16, or two at a time for quants
-        std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : …
+        std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : load_vec_quant;
         // For aligned matmul loads
-        std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : …
+        std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : load_vec_quant;
 
         // don't generate f32 variants for coopmat2
         if (!coopmat2) {
@@ -387,7 +402,7 @@ void process_shaders() {
     for (const auto& tname : type_names) {
         // mul mat vec
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
-        std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
+        std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
 
         string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
         string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
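Taken together, the two hunks above add `string_starts_with` and use it to route the new i-quant types to dedicated `mul_mat_vec_*` shaders. A self-contained sketch of the resulting routing rule (helper bodies restated from the hunks; the `main` is illustrative, not package code):

```cpp
#include <algorithm>
#include <cassert>
#include <string>

// Restated from the hunks above: prefix/suffix tests on type names.
static bool string_starts_with(const std::string& str, const std::string& prefix) {
    return prefix.size() <= str.size() && std::equal(prefix.begin(), prefix.end(), str.begin());
}
static bool string_ends_with(const std::string& str, const std::string& suffix) {
    return suffix.size() <= str.size() && std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
}

// Same selection rule as the changed line: k-quants and the iq1_/iq2_/iq3_
// families get a per-type shader, everything else uses the generic one.
static std::string pick_mul_mat_vec_shader(const std::string& tname) {
    const bool dedicated = string_ends_with(tname, "_k")     ||
                           string_starts_with(tname, "iq1_") ||
                           string_starts_with(tname, "iq2_") ||
                           string_starts_with(tname, "iq3_");
    return dedicated ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
}

int main() {
    assert(pick_mul_mat_vec_shader("q6_k")  == "mul_mat_vec_q6_k.comp");
    assert(pick_mul_mat_vec_shader("iq1_s") == "mul_mat_vec_iq1_s.comp");
    assert(pick_mul_mat_vec_shader("q8_0")  == "mul_mat_vec.comp");
    return 0;
}
```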
@@ -418,6 +433,7 @@ void process_shaders() {
     string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
 
     string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
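`merge_maps` itself does not appear in this diff; for readers following the calls above, a minimal assumed equivalent is sketched below: it layers shader-specific defines over the shared `base_dict`, with the second map winning on duplicate keys.

```cpp
#include <map>
#include <string>

// Assumed sketch of a merge_maps helper: copy a, then let b override.
static std::map<std::string, std::string> merge_maps(
        const std::map<std::string, std::string>& a,
        const std::map<std::string, std::string>& b) {
    std::map<std::string, std::string> merged = a;
    for (const auto& kv : b) {
        merged[kv.first] = kv.second; // b wins on duplicate keys
    }
    return merged;
}
```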
@@ -434,6 +450,8 @@ void process_shaders() {
     string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
     string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
 
+    string_to_spv("sub_f32", "sub.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
     string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
     string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
@@ -443,6 +461,7 @@ void process_shaders() {
     string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
     string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("repeat_back_f32", "repeat_back.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
     string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
@@ -465,14 +484,17 @@ void process_shaders() {
     string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
     string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
     string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_back_f32", "soft_max_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
 
     string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
@@ -482,9 +504,19 @@ void process_shaders() {
     string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
     string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
 
+    string_to_spv("rope_multi_f32", "rope_multi.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("rope_multi_f16", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("rope_multi_f16_rte", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
+
+    string_to_spv("rope_vision_f32", "rope_vision.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("rope_vision_f16", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("rope_vision_f16_rte", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
+
     string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
 
+    string_to_spv("argmax_f32", "argmax.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "int"}}));
     string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}));
 
     string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
@@ -496,6 +528,8 @@ void process_shaders() {
 
     string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
 
+    string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
+
     for (auto &c : compiles) {
         c.wait();
     }
package/src/llama.cpp/ggml/src/ggml.c

@@ -240,7 +240,11 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
 
 
 void * ggml_aligned_malloc(size_t size) {
+#if defined(__s390x__)
+    const int alignment = 256;
+#else
     const int alignment = 64;
+#endif
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
     return _aligned_malloc(size, alignment);
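The s390x branch raises the allocation alignment to 256 bytes, matching that platform's 256-byte cache lines; 64 bytes remains enough elsewhere. A hedged, standalone C++17 sketch of the same idea (the package's function dispatches to platform allocators such as `_aligned_malloc`, as the context lines show):

```cpp
#include <cstddef>
#include <new>

// Sketch only: cache-line-sized alignment via C++17 aligned operator new.
void * aligned_malloc_sketch(std::size_t size) {
#if defined(__s390x__)
    constexpr std::align_val_t alignment{256}; // s390x cache line
#else
    constexpr std::align_val_t alignment{64};  // common x86/ARM cache line
#endif
    // Must be released with ::operator delete(ptr, alignment).
    return ::operator new(size, alignment);
}
```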
@@ -561,9 +565,9 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
 #endif
 
 }
-static void ggml_vec_dot_f32(int n, float * …
-static void ggml_vec_dot_f16(int n, float * …
-static void ggml_vec_dot_bf16(int n, float * …
+static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
+static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
+static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 
 static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
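The rewritten forward declarations spell the aliasing qualifier through a `GGML_RESTRICT` macro. ggml's exact definition is not part of this hunk; a typical portability macro and a reference loop showing why it matters are sketched below (assumptions, not package code):

```cpp
// Sketch of a portability macro in the spirit of GGML_RESTRICT: C99 has the
// restrict keyword, MSVC spells it __restrict, most C++ compilers __restrict__.
#if defined(_MSC_VER)
#    define GGML_RESTRICT_SKETCH __restrict
#elif defined(__cplusplus)
#    define GGML_RESTRICT_SKETCH __restrict__
#else
#    define GGML_RESTRICT_SKETCH restrict
#endif

// With the qualifier, the compiler may assume s, x and y never alias, which
// lets it vectorize the accumulation loop freely.
static void vec_dot_f32_ref(int n, float * GGML_RESTRICT_SKETCH s,
                            const float * GGML_RESTRICT_SKETCH x,
                            const float * GGML_RESTRICT_SKETCH y) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += x[i] * y[i];
    }
    *s = sum;
}
```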
@@ -1379,7 +1383,7 @@ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tenso
         (t0->nb[3] == t1->nb[3]);
 }
 
-// check if t1 can be represented as a …
+// check if t1 can be represented as a repetition of t0
 bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
@@ -2328,6 +2332,7 @@ struct ggml_tensor * ggml_concat(
         struct ggml_tensor * b,
         int dim) {
     GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
+    GGML_ASSERT(a->type == b->type);
 
     int64_t ne[GGML_MAX_DIMS];
     for (int d = 0; d < GGML_MAX_DIMS; ++d) {
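The new assertion turns mixed-type concatenation into a hard failure instead of silent misreads. A hedged usage sketch against the public ggml API:

```cpp
#include "ggml.h"

// Sketch: both inputs must share a type; dim selects the concatenation axis.
void concat_example() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    struct ggml_tensor * c = ggml_concat(ctx, a, b, 1); // OK: 4x5 result
    (void) c;

    // Concatenating an F32 tensor with an F16 tensor would now trip
    // GGML_ASSERT(a->type == b->type) rather than silently misreading data.

    ggml_free(ctx);
}
```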
package/src/llama.cpp/include/llama.h

@@ -105,6 +105,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
         LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
+        LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
     };
 
     enum llama_rope_type {
@@ -213,7 +214,7 @@ extern "C" {
         LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
     };
 
-    // TODO: simplify (https://github.com/…
+    // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
     typedef struct llama_token_data {
         llama_token id;    // token id
         float logit;       // log-odds of the token
@@ -307,7 +308,7 @@ extern "C" {
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
-    // https://github.com/…
+    // https://github.com/ggml-org/llama.cpp/pull/7544
     struct llama_context_params {
         uint32_t n_ctx;   // text context, 0 = from model
         uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
@@ -320,7 +321,7 @@ extern "C" {
         enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
         enum llama_attention_type attention_type; // attention type to use for embeddings
 
-        // ref: https://github.com/…
+        // ref: https://github.com/ggml-org/llama.cpp/pull/2054
         float rope_freq_base;  // RoPE base frequency, 0 = from model
         float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
         float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
@@ -385,7 +386,7 @@ extern "C" {
     struct llama_adapter_lora;
 
     // Helpers for getting default parameters
-    // TODO: update API to start accepting pointers to params structs (https://github.com/…
+    // TODO: update API to start accepting pointers to params structs (https://github.com/ggml-org/llama.cpp/discussions/9172)
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
     LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
@@ -477,6 +478,7 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_embd    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_layer   (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head    (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
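One practical use of the new `llama_model_n_head_kv` getter is estimating the KV-cache footprint of grouped-query-attention models. A rough sketch, assuming an f16 cache and a uniform head size of `n_embd / n_head` (both assumptions, though they hold for common architectures):

```cpp
#include <cstddef>
#include "llama.h"

// Rough estimate: 2 caches (K and V) * layers * KV heads * head size * 2 bytes.
size_t kv_cache_bytes_estimate(const struct llama_model * model, uint32_t n_ctx) {
    const int32_t n_layer   = llama_model_n_layer(model);
    const int32_t n_embd    = llama_model_n_embd(model);
    const int32_t n_head    = llama_model_n_head(model);
    const int32_t n_head_kv = llama_model_n_head_kv(model);

    const int32_t head_dim  = n_embd / n_head; // assumption: uniform head size
    const size_t  per_token = 2ull * n_layer * n_head_kv * head_dim
                            * 2ull; // sizeof an f16 element
    return per_token * n_ctx;
}
```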
@@ -1040,7 +1042,7 @@ extern "C" {
 
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/…
+    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
     /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model's default chat template will be used instead.
     /// @param chat Pointer to a list of multiple llama_chat_message
     /// @param n_msg Number of llama_chat_message in this chat
@@ -1114,11 +1116,12 @@ extern "C" {
     };
 
     struct llama_sampler {
-        struct llama_sampler_i …
-        llama_sampler_context_t …
+        const struct llama_sampler_i * iface;
+        llama_sampler_context_t        ctx;
     };
 
     // mirror of llama_sampler_i:
+    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
     LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
     LLAMA_API void                   llama_sampler_accept(struct llama_sampler * smpl, llama_token token);
     LLAMA_API void                   llama_sampler_apply (struct llama_sampler * smpl, llama_token_data_array * cur_p);
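The new `llama_sampler_init` constructor pairs a user-supplied `llama_sampler_i` vtable with a context pointer. A toy sketch follows; the hook names (`name`/`accept`/`apply`/`reset`/`clone`/`free`) reflect `llama_sampler_i` as of this version and unused hooks are left null, but treat the exact field set as an assumption rather than a stable contract:

```cpp
#include "llama.h"

// Toy sampler: always keeps only the first candidate token.
static const char * pick_first_name(const struct llama_sampler * /*smpl*/) {
    return "pick-first";
}

static void pick_first_apply(struct llama_sampler * /*smpl*/,
                             llama_token_data_array * cur_p) {
    cur_p->size     = 1;
    cur_p->selected = 0;
}

static struct llama_sampler_i pick_first_iface = {
    /*.name   =*/ pick_first_name,
    /*.accept =*/ nullptr,
    /*.apply  =*/ pick_first_apply,
    /*.reset  =*/ nullptr,
    /*.clone  =*/ nullptr,
    /*.free   =*/ nullptr,
};

struct llama_sampler * make_pick_first_sampler() {
    return llama_sampler_init(&pick_first_iface, /*ctx =*/ nullptr);
}
```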
@@ -1148,7 +1151,7 @@ extern "C" {
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
     DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
-        "will be removed in the future (see https://github.com/…
+        "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1156,7 +1159,7 @@ extern "C" {
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep);
 
-    /// @details Minimum P sampling as described in https://github.com/…
+    /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
     LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
 
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
|
|
1171
1174
|
/// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
|
|
1172
1175
|
LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
|
|
1173
1176
|
|
|
1177
|
+
/// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
|
|
1178
|
+
LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n);
|
|
1179
|
+
|
|
1174
1180
|
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
|
1175
1181
|
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
|
1176
1182
|
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
|
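For orientation: top-nσ keeps only tokens whose logit lies within `n` standard deviations of the maximum logit, which makes the cutoff insensitive to temperature scaling. A standalone sketch of that selection rule from the cited paper (not the library's implementation):

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Return the indices that survive the top-n-sigma cutoff.
std::vector<size_t> top_n_sigma_keep(const std::vector<float>& logits, float n) {
    float max_logit = logits[0];
    float mean      = 0.0f;
    for (float l : logits) {
        max_logit = std::max(max_logit, l);
        mean     += l;
    }
    mean /= (float) logits.size();

    float var = 0.0f;
    for (float l : logits) {
        var += (l - mean) * (l - mean);
    }
    const float sigma = std::sqrt(var / (float) logits.size());

    std::vector<size_t> keep;
    for (size_t i = 0; i < logits.size(); ++i) {
        if (logits[i] >= max_logit - n * sigma) {
            keep.push_back(i);
        }
    }
    return keep;
}
```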
@@ -1199,17 +1205,29 @@ extern "C" {
             const char * grammar_str,
             const char * grammar_root);
 
-
-    /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future.
-    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
-    LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
+    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
             const struct llama_vocab * vocab,
             const char * grammar_str,
             const char * grammar_root,
             const char ** trigger_words,
             size_t num_trigger_words,
             const llama_token * trigger_tokens,
-            size_t num_trigger_tokens);
+            size_t num_trigger_tokens),
+        "use llama_sampler_init_grammar_lazy_patterns instead");
+
+
+    /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
+    /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group.
+    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included.
+    LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+            const struct llama_vocab * vocab,
+            const char * grammar_str,
+            const char * grammar_root,
+            const char ** trigger_patterns,
+            size_t num_trigger_patterns,
+            const llama_token * trigger_tokens,
+            size_t num_trigger_tokens);
+
 
     /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
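A hedged usage sketch for the pattern-triggered replacement; the grammar string and trigger pattern below are illustrative placeholders, not values from the package. Per the new doc comments, the pattern is matched from the start of the generated output and the grammar is fed from its first match group:

```cpp
#include "llama.h"

// Sketch: constrain output to a brace-delimited blob, but only once the model
// actually starts emitting one.
struct llama_sampler * make_lazy_brace_sampler(const struct llama_vocab * vocab) {
    const char * grammar    = "root ::= \"{\" [^}]* \"}\"";  // placeholder grammar
    const char * patterns[] = { "(\\{[\\s\\S]*)" };          // trigger at first '{'

    return llama_sampler_init_grammar_lazy_patterns(
            vocab, grammar, "root",
            patterns, /*num_trigger_patterns =*/ 1,
            /*trigger_tokens =*/ nullptr, /*num_trigger_tokens =*/ 0);
}
```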
package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp (new file; whitespace-only test inputs were lost by the diff viewer and appear as blank lines)

@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+Hello
+__ggml_vocab_test__
+(
+__ggml_vocab_test__
+
+=
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+discards
+__ggml_vocab_test__
+
+
+
+
+
+
+
+
+
+
+
+🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out (new file)

@@ -0,0 +1,46 @@
+1165 220 19 220 27124 5503
+37 19194 259
+
+220
+256
+271
+197
+198
+279
+2499
+2775
+13225 2375
+32949 2375
+13225 5922
+32949 5922
+32949 5922 0
+13225 11 2375 0
+32949 11 2375 0
+495 382 9552 99 247 13 17159
+86 45404 220 22 10191 2852 22924 4750 6916
+3907 53641 1235 185386 8118
+11400 107516 15867 20804 22851 134178 77431 32010 104312 37984 16329 27751 89335
+112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 350 7393 74471 484 853 1617 2316 6602 8
+13225
+32949
+220 32949
+256 32949
+271 32949
+271 32949 198 271 32949
+350
+198 314
+6 6837
+13225 11 342 70653 0 3253 553 481 22861 223 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208
+147475
+18
+2546
+15517
+15517 18
+15517 2546
+15517 15517
+15517 15517 18
+15517 15517 2546
+15517 15517 15517
+34 60213 53904
+2960 3098
+126470 25980 160432 16609 2775 4066 172261 19432 112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 9552 99 247 4103 99 247 220 18 220 2546 220 15517 220 15517 18 220 15517 2546 220 15517 15517 220 15517 15517 18 220 15517 15517 2546 220 18 13 18 220 18 485 18 220 18 1008 18 44735 107516 15867 20804 22851 134178 77431 32010 104312 156437 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208 105024 106657 1967 53641 1235 185386 8118 22434 39336 26178 26178 168394 194663 27271 147475 25883 6961 9790 1339 461 83 1280 19016 1354 11 461 1099 481 3239 30 461 44 625 3239 17291 1520 480 11 461 35 481 1299 1236 17966 30 1416 6 27493 261 54602 43
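The two new model files form a tokenizer regression fixture: the `.inp` file holds test strings separated by `__ggml_vocab_test__` lines, and line i of the `.out` file holds the space-separated token ids expected for input i, 46 inputs in total. A sketch of a reader for that layout (inferred from the files above, not code from the repo):

```cpp
#include <fstream>
#include <string>
#include <vector>

// Split a ggml-vocab-*.gguf.inp file into its test inputs.
std::vector<std::string> read_vocab_inputs(const std::string& path) {
    std::ifstream file(path);
    std::vector<std::string> inputs;
    std::string line, current;
    bool have_line = false;
    while (std::getline(file, line)) {
        if (line == "__ggml_vocab_test__") {
            inputs.push_back(current); // separator closes the current input
            current.clear();
            have_line = false;
        } else {
            if (have_line) {
                current += '\n';       // multi-line inputs keep their newlines
            }
            current += line;
            have_line = true;
        }
    }
    return inputs;
}
```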
package/src/llama.cpp/src/llama-arch.cpp

@@ -36,6 +36,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MINICPM3, "minicpm3" },
     { LLM_ARCH_GEMMA, "gemma" },
     { LLM_ARCH_GEMMA2, "gemma2" },
+    { LLM_ARCH_GEMMA3, "gemma3" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_XVERSE, "xverse" },
@@ -766,6 +767,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GEMMA3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {