@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
package/src/llama.cpp/src/llama-model.h:

@@ -36,14 +36,18 @@ enum llm_type {
     LLM_TYPE_335M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
+    LLM_TYPE_475M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
+    LLM_TYPE_0_6B,
     LLM_TYPE_1B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
     LLM_TYPE_1_6B,
+    LLM_TYPE_1_7B,
+    LLM_TYPE_1_8B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
     LLM_TYPE_2_9B,
@@ -61,6 +65,7 @@ enum llm_type {
     LLM_TYPE_15B,
     LLM_TYPE_16B,
     LLM_TYPE_20B,
+    LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
     LLM_TYPE_34B,
@@ -69,7 +74,9 @@ enum llm_type {
     LLM_TYPE_65B,
     LLM_TYPE_70B,
     LLM_TYPE_236B,
+    LLM_TYPE_290B,
     LLM_TYPE_314B,
+    LLM_TYPE_405B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
     LLM_TYPE_MEDIUM,
@@ -83,9 +90,14 @@ enum llm_type {
     LLM_TYPE_16x3_8B,
     LLM_TYPE_10B_128x3_66B,
     LLM_TYPE_57B_A14B,
-    LLM_TYPE_27B,
+    LLM_TYPE_17B_16E, // llama4 Scout
+    LLM_TYPE_17B_128E, // llama4 Maverick
+    LLM_TYPE_30B_A3B,
+    LLM_TYPE_235B_A22B,
 };
 
+std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
+
 struct llama_layer_posnet {
     // resnet
     struct ggml_tensor * norm1 = nullptr;
@@ -167,6 +179,8 @@ struct llama_layer {
     struct ggml_tensor * wq_b = nullptr;
     struct ggml_tensor * wkv_a_mqa = nullptr;
     struct ggml_tensor * wkv_b = nullptr;
+    struct ggml_tensor * wk_b = nullptr;
+    struct ggml_tensor * wv_b = nullptr;
     struct ggml_tensor * wq_cross = nullptr;
     struct ggml_tensor * wk_cross = nullptr;
     struct ggml_tensor * wv_cross = nullptr;
@@ -380,10 +394,15 @@ struct llama_model {
 
     ggml_backend_buffer_type_t select_buft(int il) const;
 
+    bool has_tensor_overrides() const;
+
     const struct ggml_tensor * get_tensor(const char * name) const;
 
+    ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+
+    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory(
+    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
 
     // TODO: move this to new llm_arch_model_i interface
     llm_graph_result_ptr build_graph(
package/src/llama.cpp/src/llama-quant.cpp:

@@ -10,9 +10,16 @@
 #include <cinttypes>
 #include <fstream>
 #include <mutex>
+#include <regex>
 #include <thread>
 #include <unordered_map>
 
+// Quantization types. Changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -48,7 +55,7 @@ struct quantize_state_impl {
 };
 
 static void llama_tensor_dequantize_impl(
-    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
     if (output.size() < nelements) {
@@ -512,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         nthread = std::thread::hardware_concurrency();
     }
 
-    // mmap consistently increases speed Linux, and also increases speed on Windows with
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
     // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
 #if defined(__linux__) || defined(_WIN32)
     constexpr bool use_mmap = true;
@@ -522,12 +529,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     llama_model_kv_override * kv_overrides = nullptr;
     if (params->kv_overrides) {
-        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
 
     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
@@ -536,7 +543,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     model.load_hparams(ml);
     model.load_stats (ml);
 
-    struct quantize_state_impl qs(model, params);
+    quantize_state_impl qs(model, params);
 
     if (params->only_copy) {
         ftype = ml.ftype;
@@ -661,7 +668,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // populate the original tensors so we get an initial meta data
     for (const auto * it : tensors) {
         uint16_t i_split = params->keep_split ? it->idx : 0;
-        struct ggml_tensor * tensor = it->tensor;
+        ggml_tensor * tensor = it->tensor;
         if (!ctx_outs[i_split]) {
             ctx_outs[i_split].reset(gguf_init_empty());
         }
@@ -710,7 +717,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     new_ofstream(0);
     for (const auto * it : tensors) {
         const auto & weight = *it;
-        struct ggml_tensor * tensor = weight.tensor;
+        ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();
             new_ofstream(weight.idx);
@@ -776,7 +783,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
-        enum ggml_type new_type;
+        ggml_type new_type;
         void * new_data;
         size_t new_size;
 
@@ -786,7 +793,22 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             // get more optimal quantization type based on the tensor shape, layer, etc.
             if (!params->pure && ggml_is_quantized(default_type)) {
                 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+                // unless the user specifies a type
+                if (params->tensor_types) {
+                    const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                    const std::string tensor_name(tensor->name);
+                    for (const auto & [tname, qtype] : tensor_types) {
+                        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
+                            if (qtype != new_type) {
+                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+                                new_type = qtype;
+                                break; // if two or more types are specified for the tensor, first match wins
+                            }
+                        }
+                    }
+                }
             }
+
             if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                 new_type = params->token_embedding_type;
             }
@@ -910,8 +932,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 // interface implementation
 //
 
-struct llama_model_quantize_params llama_model_quantize_default_params() {
-    struct llama_model_quantize_params result = {
+llama_model_quantize_params llama_model_quantize_default_params() {
+    llama_model_quantize_params result = {
         /*.nthread =*/ 0,
         /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.output_tensor_type =*/ GGML_TYPE_COUNT,
@@ -923,6 +945,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.keep_split =*/ false,
        /*.imatrix =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
+        /*.tensor_type =*/ nullptr,
     };
 
     return result;
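The `@@ -786,7 +793,22 @@` hunk above wires user-supplied per-tensor type overrides into quantization: each `tensor_quantization` entry holds a name pattern, tensor names are matched with `std::regex_search`, and the first matching override wins. A minimal standalone sketch of that matching idea; the override patterns, tensor names, and the string stand-in for `ggml_type` here are illustrative only, not taken from the package:

```cpp
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// Mirrors the idea of the tensor_quantization struct added in llama-quant.cpp:
// a name pattern plus the quantization type it should force.
struct tensor_quant_override {
    std::string pattern; // regex matched against the tensor name
    std::string quant;   // stand-in for ggml_type in this sketch
};

// Return the quant for the first matching pattern, or fall back to def.
static std::string pick_quant(const std::string & tensor_name,
                              const std::vector<tensor_quant_override> & overrides,
                              const std::string & def) {
    for (const auto & o : overrides) {
        if (std::regex_search(tensor_name, std::regex(o.pattern))) {
            return o.quant; // first match wins, as in the diff
        }
    }
    return def;
}

int main() {
    // Hypothetical overrides: keep embeddings at high precision, squeeze FFN weights down.
    std::vector<tensor_quant_override> overrides = {
        { "token_embd",   "Q8_0" },
        { "ffn_.*weight", "Q4_K" },
    };
    std::printf("%s\n", pick_quant("blk.0.ffn_down.weight", overrides, "Q5_K").c_str()); // Q4_K
    std::printf("%s\n", pick_quant("blk.0.attn_q.weight",   overrides, "Q5_K").c_str()); // Q5_K
    return 0;
}
```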
package/src/llama.cpp/src/llama-sampling.cpp:

@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }
 
     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }
 
     k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }
         cur_p->sorted = true;
     }
+
     cur_p->size = k;
 }
 
@@ -1477,6 +1478,7 @@ static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sam
     const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
 
     auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);
+    GGML_ASSERT(result);
 
     // copy the state
     {
@@ -1548,6 +1550,10 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
             /* .grammar_root = */ grammar_root,
             /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
         };
+        if (!ctx->grammar) {
+            delete ctx;
+            return nullptr;
+        }
     } else {
         *ctx = {
             /* .vocab = */ vocab,
@@ -1744,23 +1750,35 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
 static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
 
+    if (ctx->n <= 0.0f || cur_p->size <= 1) {
+        return;
+    }
+
     // find max logit and calculate mean
     float max = cur_p->data[0].logit;
     float logits_sum = 0;
+    size_t valid_count = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
-        if (cur_p->data[i].logit > max) {
-            max = cur_p->data[i].logit;
+        // Only count non-negative infinity values
+        if (cur_p->data[i].logit != -INFINITY) {
+            if (cur_p->data[i].logit > max) {
+                max = cur_p->data[i].logit;
+            }
+            logits_sum += cur_p->data[i].logit;
+            valid_count++;
         }
-        logits_sum += cur_p->data[i].logit;
     }
-    float mean = logits_sum/cur_p->size;
+    float mean = valid_count > 0 ? logits_sum/valid_count : 0;
 
     // calculate standard deviation
     float acc = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
-        acc += pow(cur_p->data[i].logit - mean, 2);
+        // Skip -infinity in std calculation
+        if (cur_p->data[i].logit != -INFINITY) {
+            acc += pow(cur_p->data[i].logit - mean, 2);
+        }
     }
-    float std = sqrt(acc/cur_p->size);
+    float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
 
     //apply mask
     for (size_t i = 0; i < cur_p->size; ++i) {
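The `llama_sampler_top_n_sigma_apply` change above computes the mean and standard deviation only over candidates whose logit is not `-INFINITY`, so tokens that were already masked out no longer distort the statistics. A small self-contained sketch of that computation, using made-up logits purely for illustration:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Mean and standard deviation over finite logits only, mirroring the fix in
// llama_sampler_top_n_sigma_apply: -INFINITY entries are skipped and the
// divisor is the count of valid entries, not the full array size.
static void mean_std_finite(const std::vector<float> & logits, float & mean, float & std_dev) {
    double sum = 0.0;
    size_t  n  = 0;
    for (float l : logits) {
        if (l != -INFINITY) {
            sum += l;
            n++;
        }
    }
    mean = n > 0 ? (float)(sum / n) : 0.0f;

    double acc = 0.0;
    for (float l : logits) {
        if (l != -INFINITY) {
            acc += (l - mean) * (l - mean);
        }
    }
    std_dev = n > 0 ? (float) std::sqrt(acc / n) : 0.0f;
}

int main() {
    // Illustrative logits where two candidates were already masked out.
    std::vector<float> logits = { 2.0f, -INFINITY, 1.0f, -INFINITY, 3.0f };
    float mean = 0.0f, std_dev = 0.0f;
    mean_std_finite(logits, mean, std_dev);
    std::printf("mean=%.3f std=%.3f\n", mean, std_dev); // mean=2.000, std ~0.816
    return 0;
}
```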
package/src/llama.cpp/src/llama-vocab.cpp:

@@ -1,5 +1,7 @@
 #include "llama-vocab.h"
 
+#include "ggml.h"
+#include "gguf.h"
 #include "llama-impl.h"
 #include "llama-model-loader.h"
 
@@ -342,6 +344,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_MPT:
             case LLAMA_VOCAB_PRE_TYPE_OLMO:
             case LLAMA_VOCAB_PRE_TYPE_JAIS:
+            case LLAMA_VOCAB_PRE_TYPE_TRILLION:
                 regex_exprs = {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
@@ -400,6 +403,27 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
+                regex_exprs = {
+                    "\\p{N}+",
+                    "(?=(\\d{3})+(?!\\d))",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                    // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
+                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1212,6 +1236,9 @@ struct fragment_buffer_variant {
 struct llama_vocab::impl {
     uint32_t n_token_types = 0; // for BERT-style token types
 
+    std::string tokenizer_model;
+    std::string tokenizer_pre;
+
     enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
@@ -1347,9 +1374,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
     // determine vocab type
     {
-        std::string tokenizer_model;
-        std::string tokenizer_pre;
-
         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
         ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
@@ -1444,7 +1468,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
         const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
         if (precompiled_charsmap_keyidx != -1) {
-            size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+            const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
+            GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
+
+            const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
             const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
             precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
 #ifdef IS_BIG_ENDIAN
@@ -1491,7 +1518,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "llama3" ||
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe"||
-                tokenizer_pre == "falcon3") {
+                tokenizer_pre == "falcon3" ||
+                tokenizer_pre == "pixtral") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                 ignore_merges = true;
                 add_bos = true;
@@ -1557,6 +1585,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
                 clean_spaces = false;
             } else if (
+                tokenizer_pre == "glm4" ||
                 tokenizer_pre == "chatglm-bpe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
                 special_bos_id = LLAMA_TOKEN_NULL;
@@ -1601,9 +1630,26 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "megrez") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
             } else if (
-                tokenizer_pre == "gpt-4o") {
+                tokenizer_pre == "gpt-4o" ||
+                tokenizer_pre == "llama4") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "superbpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "trillion") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "bailingmoe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "seed-coder") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
+                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -1781,6 +1827,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 || t.first == "<end_of_turn>"
                 || t.first == "<|endoftext|>"
                 || t.first == "<EOT>"
+                || t.first == "_<EOT>"
                 || t.first == "<|end▁of▁sentence|>" // DeepSeek
             ) {
                 special_eot_id = t.second;
@@ -1811,8 +1858,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                 || t.first == "<|fim_prefix|>" // Qwen
                 || t.first == "<fim-prefix>"
+                || t.first == "<fim_prefix>" // Granite
                 || t.first == "<|fim▁begin|>" // DeepSeek
                 || t.first == "<PRE>"
+                || t.first == "▁<PRE>" // CodeLlama
             ) {
                 special_fim_pre_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1828,8 +1877,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                 || t.first == "<|fim_suffix|>" // Qwen
                 || t.first == "<fim-suffix>"
+                || t.first == "<fim_suffix>" // Granite
                 || t.first == "<|fim▁hole|>" // DeepSeek
                 || t.first == "<SUF>"
+                || t.first == "▁<SUF>" // CodeLlama
             ) {
                 special_fim_suf_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1845,8 +1896,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                 || t.first == "<|fim_middle|>" // Qwen
                 || t.first == "<fim-middle>"
+                || t.first == "<fim_middle>" // Granite
                 || t.first == "<|fim▁end|>" // DeepSeek
                 || t.first == "<MID>"
+                || t.first == "▁<MID>" // CodeLlama
             ) {
                 special_fim_mid_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1862,6 +1915,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                 || t.first == "<|fim_pad|>" // Qwen
                 || t.first == "<fim-pad>"
+                || t.first == "<fim_pad>" // Granite
                 || t.first == "<PAD>"
             ) {
                 special_fim_pad_id = t.second;
@@ -1880,6 +1934,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 || t.first == "<|repo_name|>"
                 || t.first == "<fim-repo>"
                 || t.first == "<REPO>"
+                || t.first == "<reponame>" // Granite
             ) {
                 special_fim_rep_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1931,6 +1986,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 || t.first == "<|endoftext|>"
                 || t.first == "<|eom_id|>"
                 || t.first == "<EOT>"
+                || t.first == "_<EOT>"
             ) {
                 special_eog_ids.insert(t.second);
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2189,14 +2245,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer
                 // find the first occurrence of a given special token in this fragment
                 // passing offset argument only limit the "search area" but match coordinates
                 // are still relative to the source full raw_text
-                auto match = raw_text.find(text, raw_text_base_offset);
+                // string_view begins at pos 0 for the same reason
+                auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);
 
                 // no occurrences found, stop processing this fragment for a given special token
                 if (match == std::string::npos) break;
 
-                // check if match is within bounds of offset <-> length
-                if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
-
 #ifdef PRETOKENIZERDEBUG
                 LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
@@ -2740,6 +2794,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     pimpl->load(ml, kv);
 }
 
+std::string llama_vocab::get_tokenizer_model() const {
+    return pimpl->tokenizer_model;
+}
+
+std::string llama_vocab::get_tokenizer_pre() const {
+    return pimpl->tokenizer_pre;
+}
+
 enum llama_vocab_type llama_vocab::get_type() const {
     return pimpl->type;
 }
@@ -2962,6 +3024,20 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
     return it->second;
 }
 
+std::vector<std::string> llama_vocab::get_bpe_merges() const {
+    std::vector<std::string> result(pimpl->bpe_ranks.size());
+
+    for (const auto & pair : pimpl->bpe_ranks) {
+        result[pair.second] = pair.first.first + " " + pair.first.second;
+    }
+
+    return result;
+}
+
+std::vector<char> llama_vocab::get_precompiled_charsmap() const {
+    return pimpl->precompiled_charsmap;
+}
+
 int32_t llama_vocab::tokenize(
         const char * text,
         int32_t text_len,
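The new `llama_vocab::get_bpe_merges()` above rebuilds the textual "left right" merge list from the internal pair-to-rank map, indexing the output by rank so the merges come back in training order. A short sketch of the same idea; the merge pairs here are made up for illustration:

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Rebuild "left right" merge strings from a (pair -> rank) map, ordered by rank,
// mirroring what get_bpe_merges() does with pimpl->bpe_ranks.
int main() {
    std::map<std::pair<std::string, std::string>, int> bpe_ranks = {
        {{"t", "h"},  0},
        {{"th", "e"}, 1},
        {{"i", "n"},  2},
    };

    std::vector<std::string> merges(bpe_ranks.size());
    for (const auto & pair : bpe_ranks) {
        merges[pair.second] = pair.first.first + " " + pair.first.second; // index by rank
    }

    for (const auto & m : merges) {
        std::printf("%s\n", m.c_str()); // "t h", "th e", "i n"
    }
    return 0;
}
```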
package/src/llama.cpp/src/llama-vocab.h:

@@ -21,6 +21,9 @@ struct llama_vocab {
 
     void load(llama_model_loader & ml, const LLM_KV & kv);
 
+    std::string get_tokenizer_model() const;
+    std::string get_tokenizer_pre() const;
+
     enum llama_vocab_type get_type() const;
     enum llama_vocab_pre_type get_pre_type() const;
 
@@ -80,6 +83,9 @@ struct llama_vocab {
     int max_token_len() const;
 
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+    std::vector<std::string> get_bpe_merges() const;
+
+    std::vector<char> get_precompiled_charsmap() const;
 
     int32_t tokenize(
         const char * text,
package/src/llama.cpp/src/llama.cpp:

@@ -4,6 +4,7 @@
 #include "llama-mmap.h"
 #include "llama-vocab.h"
 #include "llama-model-loader.h"
+#include "llama-model-saver.h"
 #include "llama-model.h"
 
 #include "ggml.h"
@@ -92,7 +93,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
 
@@ -139,6 +140,11 @@ static struct llama_model * llama_model_load_from_file_impl(
         struct llama_model_params params) {
     ggml_time_init();
 
+    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
+        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
+        return nullptr;
+    }
+
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
@@ -253,6 +259,13 @@ struct llama_model * llama_model_load_from_splits(
     return llama_model_load_from_file_impl(splits.front(), splits, params);
 }
 
+void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
+    llama_model_saver ms(*model);
+    ms.add_kv_from_model();
+    ms.add_tensors_from_model();
+    ms.save(path_model);
+}
+
 //
 // chat templates
 //
@@ -338,3 +351,4 @@ const char * llama_print_system_info(void) {
 
     return s.c_str();
 }
+
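Two practical consequences of the src/llama.cpp changes above: model loading now fails early with an error unless a ggml backend has been loaded, and a loaded model can be written back out via the new `llama_model_save_to_file()`. A rough usage sketch under those assumptions; the file paths are placeholders and error handling is kept minimal:

```cpp
#include "llama.h"

#include <cstdio>

int main() {
    // The new guard in llama_model_load_from_file_impl requires a registered
    // backend, so load the available backends (CPU, CUDA, ...) first.
    ggml_backend_load_all();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // New in this diff: write the loaded model back out as a GGUF file via
    // the llama-model-saver added in this release.
    llama_model_save_to_file(model, "model-copy.gguf");

    llama_model_free(model);
    return 0;
}
```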