@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
package/src/llama.cpp/src/llama-model-loader.cpp (+34 -20):

@@ -301,12 +301,12 @@ namespace GGUFMeta {
         GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

     switch (arr_info.gt) {
-        case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
-        case GGUF_TYPE_INT32:   GGML_ASSERT(
-                                        (std::is_same<T,  int32_t>::value) ||
-                                        (std::is_same<T, uint32_t>::value));  break;
+        case GGUF_TYPE_UINT32:
+        case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
+                                            (std::is_same<T, uint32_t>::value)); break;
+        case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
         default:
-            throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
+            throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
     }

     result.resize(arr_info.length);

@@ -330,12 +330,12 @@ namespace GGUFMeta {
         GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

     switch (arr_info.gt) {
-        case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
-        case GGUF_TYPE_INT32:   GGML_ASSERT(
-                                        (std::is_same<T,  int32_t>::value) ||
-                                        (std::is_same<T, uint32_t>::value));  break;
+        case GGUF_TYPE_UINT32:
+        case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
+                                            (std::is_same<T, uint32_t>::value)); break;
+        case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
         default:
-            throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
+            throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
     }

     if (arr_info.length > N_MAX) {
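The change in both array getters is the same: `GGUF_TYPE_UINT32` now falls through to the `GGUF_TYPE_INT32` branch, so a uint32 array in the file satisfies the same `int32_t`/`uint32_t` assertion (both are 4-byte values in GGUF, so the raw copy that follows is layout-compatible). A minimal, self-contained sketch of that fall-through check; the enum and helper below are stand-ins for illustration, not the package's API:

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>
#include <type_traits>

// Stand-in for gguf's array element tag; values mirror the hunk above.
enum arr_type { ARR_UINT32, ARR_INT32, ARR_FLOAT32 };

// Accept the array only if the destination element type T matches:
// uint32 falls through to the int32 branch, as in the new loader code.
template <typename T>
void check_arr(arr_type gt, const std::string & key) {
    switch (gt) {
        case ARR_UINT32: // new in the vendored llama.cpp: treated like int32
        case ARR_INT32:
            if (!std::is_same<T, int32_t>::value && !std::is_same<T, uint32_t>::value) {
                throw std::runtime_error(key + " is not a float32/uint32/int32 array");
            }
            break;
        case ARR_FLOAT32:
            if (!std::is_same<T, float>::value) {
                throw std::runtime_error(key + " is not a float32/uint32/int32 array");
            }
            break;
    }
}

int main() {
    check_arr<uint32_t>(ARR_UINT32, "tokenizer.ggml.token_type"); // ok now
    check_arr<int32_t>(ARR_UINT32, "tokenizer.ggml.token_type");  // also ok
    // check_arr<float>(ARR_UINT32, "...") would throw
    return 0;
}
```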
@@ -445,7 +445,8 @@ llama_model_loader::llama_model_loader(
         std::vector<std::string> & splits,
         bool use_mmap,
         bool check_tensors,
-        const llama_model_kv_override * param_overrides_p) {
+        const llama_model_kv_override * param_overrides_p,
+        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
     int trace = 0;
     if (getenv("LLAMA_TRACE")) {
         trace = atoi(getenv("LLAMA_TRACE"));

@@ -457,6 +458,8 @@ llama_model_loader::llama_model_loader(
         }
     }

+    tensor_buft_overrides = param_tensor_buft_overrides_p;
+
     // Load the main GGUF
     struct ggml_context * ctx = NULL;
     struct gguf_init_params params = {

@@ -466,7 +469,7 @@ llama_model_loader::llama_model_loader(

     meta.reset(gguf_init_from_file(fname.c_str(), params));
     if (!meta) {
-        throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
+        throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
     }

     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);

@@ -525,7 +528,7 @@ llama_model_loader::llama_model_loader(
             };
             gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
             if (!ctx_gguf) {
-                throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
+                throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
             }

             // check idx

@@ -600,7 +603,9 @@ llama_model_loader::llama_model_loader(

         if (trace > 0) {
             const uint16_t sid = w.idx;
-            LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
+            LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__,
+                    sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(),
+                    ggml_nbytes(tensor)/1024.0f/1024.0f);
         }
     }

@@ -640,9 +645,9 @@ llama_model_loader::llama_model_loader(
     ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

     {
-        const int kid = gguf_find_key(meta.get(), "general.file_type"); // TODO: use LLM_KV
-        if (kid >= 0) {
-            ftype = (llama_ftype) gguf_get_val_u32(meta.get(), kid);
+        uint32_t ftype_val = 0;
+        if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) {
+            ftype = (llama_ftype) ftype_val;
         }
     }

@@ -817,9 +822,18 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
     mappings.reserve(files.size());
     mmaps_used.reserve(files.size());
     for (const auto & file : files) {
-        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
-        auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
-        std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
+        bool is_numa = false;
+
+        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (dev) {
+            auto * reg = ggml_backend_dev_backend_reg(dev);
+            auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+            if (is_numa_fn) {
+                is_numa = is_numa_fn();
+            }
+        }
+
+        std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa);
         mmaps_used.emplace_back(mapping->size(), 0);
         if (mlock_mmaps) {
             std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
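The rewritten mapping loop is defensive where the old code assumed a CPU backend always exists: the device lookup, the registry, and the `ggml_backend_cpu_is_numa` proc address can each be absent, and NUMA detection then simply defaults to `false`. A hedged sketch of the same optional-capability pattern, written against the ggml backend-registry API as vendored here:

```cpp
#include "ggml-backend.h"

// Query an optional CPU-backend capability by name and degrade gracefully.
// Mirrors the init_mappings() hunk above; returns false if the build has no
// CPU backend or the symbol is not exported by its registry.
static bool cpu_is_numa_or_false(void) {
    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!dev) {
        return false; // build has no CPU backend at all
    }
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
    typedef bool (*is_numa_fn_t)(void);
    is_numa_fn_t is_numa_fn = (is_numa_fn_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
    return is_numa_fn ? is_numa_fn() : false;
}
```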
package/src/llama.cpp/src/llama-model-loader.h (+5 -3):

@@ -77,8 +77,9 @@ struct llama_model_loader {

    llama_mmaps mappings;

-   std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
-   std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
+   std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
+   std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
+   const llama_model_tensor_buft_override * tensor_buft_overrides;

    gguf_context_ptr meta;
    std::vector<ggml_context_ptr> contexts;

@@ -95,7 +96,8 @@ struct llama_model_loader {
        std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
        bool use_mmap,
        bool check_tensors,
-       const llama_model_kv_override * param_overrides_p);
+       const llama_model_kv_override * param_overrides_p,
+       const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);

    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
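These loader members are the receiving end of the new tensor buffer-type override feature; the public surface lives in the updated `include/llama.h` (+82 -22 above), which adds a `llama_model_tensor_buft_override` of `{ pattern, buft }` pairs to `llama_model_params` (surfaced on the CLI side by `common/arg.cpp`'s new `--override-tensor` handling). A hedged usage sketch against that upstream API; the field names and the null-terminated-array convention are taken from llama.cpp and should be verified against the vendored headers:

```cpp
#include "llama.h"
#include "ggml-cpu.h" // for ggml_backend_cpu_buffer_type(); header location may differ per build

int main() {
    // Keep tensors whose names match the regex in host (CPU) memory instead
    // of the default backend buffer -- e.g. MoE expert weights.
    llama_model_tensor_buft_override overrides[] = {
        { "ffn_.*_exps", ggml_backend_cpu_buffer_type() },
        { nullptr,       nullptr }, // array is terminated by a null pattern
    };

    llama_model_params mparams = llama_model_default_params();
    mparams.tensor_buft_overrides = overrides;

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model) {
        llama_model_free(model);
    }
    return model ? 0 : 1;
}
```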
package/src/llama.cpp/src/llama-model-saver.cpp (+281 -0, new file):

@@ -0,0 +1,281 @@
+#include "llama-model-saver.h"
+
+#include "gguf.h"
+
+#include "llama.h"
+#include "llama-hparams.h"
+#include "llama-model.h"
+#include "llama-vocab.h"
+
+#include <string>
+
+llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
+    gguf_ctx = gguf_init_empty();
+}
+
+llama_model_saver::~llama_model_saver() {
+    gguf_free(gguf_ctx);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
+    gguf_set_val_u32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const int32_t value) {
+    gguf_set_val_i32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const float value) {
+    gguf_set_val_f32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const bool value) {
+    gguf_set_val_bool(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const char * value) {
+    gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+[[noreturn]]
+void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
+    GGML_UNUSED(key);
+    GGML_UNUSED(value);
+    GGML_ABORT("fatal error"); // this should never be called, only needed to make the template below compile
+}
+
+template <typename Container>
+void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
+    const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
+    GGML_ASSERT(n_values <= value.size());
+
+    if (n_values == 0) {
+        return;
+    }
+
+    if (per_layer) {
+        bool all_values_the_same = true;
+        for (size_t i = 1; i < n_values; ++i) {
+            if (value[i] != value[0]) {
+                all_values_the_same = false;
+                break;
+            }
+        }
+        if (all_values_the_same) {
+            add_kv(key, value[0]);
+            return;
+        }
+    }
+
+    if (std::is_same<typename Container::value_type, uint8_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT8, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, int8_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, uint32_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, int32_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, float>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_FLOAT32, value.data(), n_values);
+    } else if (std::is_same<Container, std::string>::value) {
+        gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), reinterpret_cast<const char *>(value.data()));
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
+    std::vector<const char *> tmp(value.size());
+    for (size_t i = 0; i < value.size(); ++i) {
+        tmp[i] = value[i].c_str();
+    }
+    gguf_set_arr_str(gguf_ctx, llm_kv(key).c_str(), tmp.data(), tmp.size());
+}
+
+void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
+    if (!tensor) {
+        return;
+    }
+    if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) {
+        GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME
+        return;
+    }
+    gguf_add_tensor(gguf_ctx, tensor);
+}
+
+void llama_model_saver::add_kv_from_model() {
+    const llama_hparams & hparams = model.hparams;
+    const llama_vocab   & vocab   = model.vocab;
+
+    const int32_t n_vocab = vocab.n_tokens();
+    std::vector<std::string> tokens(n_vocab);
+    std::vector<float>       scores(n_vocab);
+    std::vector<int32_t>     token_types(n_vocab);
+
+    for (int32_t id = 0; id < n_vocab; ++id) {
+        const llama_vocab::token_data & token_data = vocab.get_token_data(id);
+
+        tokens[id] = token_data.text;
+        scores[id] = token_data.score;
+
+        switch(token_data.attr) {
+            case LLAMA_TOKEN_ATTR_UNKNOWN:      token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN;      break;
+            case LLAMA_TOKEN_ATTR_UNUSED:       token_types[id] = LLAMA_TOKEN_TYPE_UNUSED;       break;
+            case LLAMA_TOKEN_ATTR_NORMAL:       token_types[id] = LLAMA_TOKEN_TYPE_NORMAL;       break;
+            case LLAMA_TOKEN_ATTR_CONTROL:      token_types[id] = LLAMA_TOKEN_TYPE_CONTROL;      break;
+            case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
+            case LLAMA_TOKEN_ATTR_BYTE:         token_types[id] = LLAMA_TOKEN_TYPE_BYTE;         break;
+            case LLAMA_TOKEN_ATTR_UNDEFINED:
+            default:                            token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED;    break;
+        }
+    }
+
+    // add_kv(LLM_KV_GENERAL_TYPE, ???);
+    add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
+    // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
+    // add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
+    add_kv(LLM_KV_GENERAL_NAME, model.name);
+    // add_kv(LLM_KV_GENERAL_AUTHOR, ???);
+    // add_kv(LLM_KV_GENERAL_VERSION, ???);
+    // add_kv(LLM_KV_GENERAL_URL, ???);
+    // add_kv(LLM_KV_GENERAL_DESCRIPTION, ???);
+    // add_kv(LLM_KV_GENERAL_LICENSE, ???);
+    // add_kv(LLM_KV_GENERAL_SOURCE_URL, ???);
+    // add_kv(LLM_KV_GENERAL_SOURCE_HF_REPO, ???);
+
+    add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
+    add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
+    add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+    add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+    add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+    add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
+    add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+    add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+    add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+    // add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???);
+    add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert);
+    add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
+    add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+    add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+    add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type));
+    add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+    add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id);
+    add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping);
+    add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping);
+    add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm);
+    add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers);
+    add_kv(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+    add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
+    add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+    add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+
+    add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
+    add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
+    add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+    add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
+    add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
+    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
+    add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+    add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+    add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+    add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+    add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+    add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+
+    const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
+
+    add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
+    add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
+    // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
+    add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
+    add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor);
+    add_kv(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor);
+    add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn);
+    add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned);
+    add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+
+    // TODO: implement split file support
+    // add_kv(LLM_KV_SPLIT_NO, ???);
+    // add_kv(LLM_KV_SPLIT_COUNT, ???);
+    // add_kv(LLM_KV_SPLIT_TENSORS_COUNT, ???);
+
+    add_kv(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+    add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+    add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+    add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms);
+
+    add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+
+    add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model());
+    add_kv(LLM_KV_TOKENIZER_PRE, vocab.get_tokenizer_pre());
+    add_kv(LLM_KV_TOKENIZER_LIST, tokens);
+    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE, token_types);
+    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, vocab.n_token_types());
+    add_kv(LLM_KV_TOKENIZER_SCORES, scores);
+    add_kv(LLM_KV_TOKENIZER_MERGES, vocab.get_bpe_merges());
+    // FIXME llama_token is type i32 but when reading in a GGUF file u32 is expected, not an issue for writing though
+    add_kv(LLM_KV_TOKENIZER_BOS_ID, uint32_t(vocab.token_bos()));
+    add_kv(LLM_KV_TOKENIZER_EOS_ID, uint32_t(vocab.token_eos()));
+    add_kv(LLM_KV_TOKENIZER_EOT_ID, uint32_t(vocab.token_eot()));
+    add_kv(LLM_KV_TOKENIZER_EOM_ID, uint32_t(vocab.token_eom()));
+    add_kv(LLM_KV_TOKENIZER_UNK_ID, uint32_t(vocab.token_unk()));
+    add_kv(LLM_KV_TOKENIZER_SEP_ID, uint32_t(vocab.token_sep()));
+    add_kv(LLM_KV_TOKENIZER_PAD_ID, uint32_t(vocab.token_pad()));
+    // add_kv(LLM_KV_TOKENIZER_CLS_ID, uint32_t(vocab.token_bos())); // deprecated
+    // add_kv(LLM_KV_TOKENIZER_MASK_ID, ???);
+    add_kv(LLM_KV_TOKENIZER_ADD_BOS, vocab.get_add_bos());
+    add_kv(LLM_KV_TOKENIZER_ADD_EOS, vocab.get_add_eos());
+    add_kv(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.get_add_space_prefix());
+    add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.get_remove_extra_whitespaces());
+    add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, vocab.get_precompiled_charsmap());
+    // add_kv(LLM_KV_TOKENIZER_HF_JSON, ???);
+    // add_kv(LLM_KV_TOKENIZER_RWKV, ???);
+    add_kv(LLM_KV_TOKENIZER_FIM_PRE_ID, uint32_t(vocab.token_fim_pre()));
+    add_kv(LLM_KV_TOKENIZER_FIM_SUF_ID, uint32_t(vocab.token_fim_suf()));
+    add_kv(LLM_KV_TOKENIZER_FIM_MID_ID, uint32_t(vocab.token_fim_mid()));
+    add_kv(LLM_KV_TOKENIZER_FIM_PAD_ID, uint32_t(vocab.token_fim_pad()));
+    add_kv(LLM_KV_TOKENIZER_FIM_REP_ID, uint32_t(vocab.token_fim_rep()));
+    add_kv(LLM_KV_TOKENIZER_FIM_SEP_ID, uint32_t(vocab.token_fim_sep()));
+
+    // TODO: implement LoRA support
+    // add_kv(LLM_KV_ADAPTER_TYPE, ???);
+    // add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???);
+
+    // deprecated
+    // add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???);
+    // add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???);
+    // add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???);
+}
+
+void llama_model_saver::add_tensors_from_model() {
+    if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
+        add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
+    }
+    add_tensor(model.type_embd);
+    add_tensor(model.pos_embd);
+    add_tensor(model.tok_norm);
+    add_tensor(model.tok_norm_b);
+    add_tensor(model.output_norm);
+    add_tensor(model.output_norm_b);
+    add_tensor(model.output);
+    add_tensor(model.output_b);
+    add_tensor(model.output_norm_enc);
+    add_tensor(model.cls);
+    add_tensor(model.cls_b);
+    add_tensor(model.cls_out);
+    add_tensor(model.cls_out_b);
+
+    for (const struct llama_layer & layer : model.layers) {
+        for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
+            add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
+        }
+    }
+}
+
+void llama_model_saver::save(const std::string & path_model) {
+    gguf_write_to_file(gguf_ctx, path_model.c_str(), false);
+}
+
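One detail of `add_tensors_from_model` worth calling out: the per-layer loop never names fields. It reinterprets each `llama_layer` as a flat array of `ggml_tensor *` and walks `sizeof(layer)/sizeof(ggml_tensor *)` slots, which is only sound because that struct contains nothing but tensor pointers (null entries are skipped by `add_tensor`). A toy illustration of that layout assumption, with invented types:

```cpp
#include <cstddef>
#include <cstdio>

// Stand-in for ggml_tensor; the walk only needs pointer identity.
struct tensor { const char * name; };

// Like llama_layer: every member is a tensor pointer, nothing else.
// Adding any non-pointer field would silently break the loop below.
struct layer {
    tensor * attn_q;
    tensor * attn_k;
    tensor * ffn_up;
};

int main() {
    tensor q = {"attn_q.weight"}, k = {"attn_k.weight"};
    layer l = {&q, &k, nullptr}; // ffn_up absent in this toy "model"

    // same pattern as add_tensors_from_model:
    for (size_t i = 0; i < sizeof(l)/sizeof(tensor *); ++i) {
        const tensor * t = reinterpret_cast<tensor * const *>(&l)[i];
        if (t) { // add_tensor() performs the same null check
            std::printf("would add: %s\n", t->name);
        }
    }
    return 0;
}
```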
package/src/llama.cpp/src/llama-model-saver.h (+37 -0, new file):

@@ -0,0 +1,37 @@
+#pragma once
+
+#include "llama.h"
+#include "llama-arch.h"
+
+#include <vector>
+
+struct llama_model_saver {
+    struct gguf_context * gguf_ctx = nullptr;
+    const struct llama_model & model;
+    const struct LLM_KV llm_kv;
+
+    llama_model_saver(const struct llama_model & model);
+    ~llama_model_saver();
+
+    void add_kv(enum llm_kv key, uint32_t value);
+    void add_kv(enum llm_kv key, int32_t value);
+    void add_kv(enum llm_kv key, float value);
+    void add_kv(enum llm_kv key, bool value);
+    void add_kv(enum llm_kv key, const char * value);
+
+    [[noreturn]]
+    void add_kv(enum llm_kv key, char value); // needed to make the template below compile
+
+    template <typename Container>
+    void add_kv(enum llm_kv key, const Container & value, bool per_layer = false);
+
+    void add_kv(enum llm_kv key, const std::vector<std::string> & value);
+
+    void add_tensor(const struct ggml_tensor * tensor);
+
+    void add_kv_from_model();
+
+    void add_tensors_from_model();
+
+    void save(const std::string & path_model);
+};
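Taken together, the saver's intended call sequence is short: build the metadata, register the tensors, write the file. A minimal sketch of driving it (this is internal llama.cpp API, so it only compiles inside the vendored `src/` tree, and obtaining the `llama_model` reference is elided; judging by the new `examples/training/finetune.cpp`, the public entry point wrapping this is `llama_model_save_to_file`, but treat that name as an assumption):

```cpp
#include "llama-model-saver.h"

#include <string>

// Write an in-memory model back out as a single GGUF file.
// `model` would come from an already-loaded llama_model.
static void save_model_to_gguf(const llama_model & model, const std::string & path) {
    llama_model_saver saver(model);  // gguf_init_empty() under the hood
    saver.add_kv_from_model();       // architecture, hparams, tokenizer metadata
    saver.add_tensors_from_model();  // embeddings, norms, output, per-layer weights
    saver.save(path);                // gguf_write_to_file(..., /*only_meta=*/false)
}
```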