@novastera-oss/llamarn 0.2.5 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -2
- package/android/CMakeLists.txt +6 -3
- package/android/src/main/cpp/include/llama.h +140 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +48 -67
- package/cpp/LlamaCppModel.h +8 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +33 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +15 -28
- package/cpp/llama.cpp/common/arg.cpp +38 -12
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +9 -3
- package/cpp/llama.cpp/common/chat-parser.h +4 -1
- package/cpp/llama.cpp/common/chat.cpp +16 -13
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +52 -40
- package/cpp/llama.cpp/common/common.h +5 -2
- package/cpp/llama.cpp/common/json-partial.cpp +5 -4
- package/cpp/llama.cpp/common/json-partial.h +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +128 -84
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +49 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +33 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +6 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +25 -16
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -46
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -248
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +9 -8
- package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
- package/cpp/llama.cpp/include/llama.h +140 -38
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +4 -1
- package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
- package/cpp/llama.cpp/src/llama-arch.h +7 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +289 -31
- package/cpp/llama.cpp/src/llama-batch.h +47 -17
- package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +488 -313
- package/cpp/llama.cpp/src/llama-context.h +38 -17
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +275 -152
- package/cpp/llama.cpp/src/llama-graph.h +109 -52
- package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
- package/cpp/llama.cpp/src/llama-hparams.h +8 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +281 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +133 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1835 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +308 -0
- package/cpp/llama.cpp/src/llama-kv-cells.h +53 -17
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +1116 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +188 -0
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +89 -4
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model.cpp +735 -143
- package/cpp/llama.cpp/src/llama-model.h +4 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
- package/cpp/llama.cpp/src/llama-vocab.cpp +39 -25
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
- package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
- package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
- package/cpp/rn-completion.cpp +65 -10
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +8 -1
- package/ios/include/chat.h +1 -1
- package/ios/include/common/minja/chat-template.hpp +1 -1
- package/ios/include/common/minja/minja.hpp +1 -1
- package/ios/include/common.h +5 -2
- package/ios/include/json-schema-to-grammar.h +4 -4
- package/ios/include/llama.h +140 -38
- package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4617
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3557
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3559
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4616
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4637
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3556
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4653
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4674
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3587
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2747
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -502
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -61,7 +61,10 @@ extern "C" {
|
|
|
61
61
|
struct llama_model;
|
|
62
62
|
struct llama_context;
|
|
63
63
|
struct llama_sampler;
|
|
64
|
-
|
|
64
|
+
|
|
65
|
+
typedef struct llama_memory_i * llama_memory_t;
|
|
66
|
+
|
|
67
|
+
struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
|
|
65
68
|
|
|
66
69
|
typedef int32_t llama_pos;
|
|
67
70
|
typedef int32_t llama_token;
|
|
@@ -240,18 +243,21 @@ extern "C" {
|
|
|
240
243
|
|
|
241
244
|
typedef bool (*llama_progress_callback)(float progress, void * user_data);
|
|
242
245
|
|
|
243
|
-
// Input data for llama_decode
|
|
246
|
+
// Input data for llama_encode/llama_decode
|
|
244
247
|
// A llama_batch object can contain input about one or many sequences
|
|
245
248
|
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
|
|
246
249
|
//
|
|
247
250
|
// - token : the token ids of the input (used when embd is NULL)
|
|
248
251
|
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
|
|
249
252
|
// - pos : the positions of the respective token in the sequence
|
|
250
|
-
// (if set to NULL, the token position will be tracked automatically by llama_decode)
|
|
253
|
+
// (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
|
|
251
254
|
// - seq_id : the sequence to which the respective token belongs
|
|
252
255
|
// (if set to NULL, the sequence ID will be assumed to be 0)
|
|
253
256
|
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
|
|
254
|
-
// (if set to NULL
|
|
257
|
+
// (if set to NULL:
|
|
258
|
+
// - if embeddings: all tokens are output
|
|
259
|
+
// - if not: only the last token is output
|
|
260
|
+
// )
|
|
255
261
|
//
|
|
256
262
|
typedef struct llama_batch {
|
|
257
263
|
int32_t n_tokens;
|
|
@@ -261,7 +267,7 @@ extern "C" {
|
|
|
261
267
|
llama_pos * pos;
|
|
262
268
|
int32_t * n_seq_id;
|
|
263
269
|
llama_seq_id ** seq_id;
|
|
264
|
-
int8_t * logits;
|
|
270
|
+
int8_t * logits; // TODO: rename this to "output"
|
|
265
271
|
} llama_batch;
|
|
266
272
|
|
|
267
273
|
enum llama_model_kv_override_type {
|
|
@@ -366,6 +372,8 @@ extern "C" {
|
|
|
366
372
|
bool no_perf; // measure performance timings
|
|
367
373
|
bool op_offload; // offload host tensor operations to device
|
|
368
374
|
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
|
375
|
+
// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
|
|
376
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
|
|
369
377
|
};
|
|
370
378
|
|
|
371
379
|
// model quantization parameters
|
|
@@ -491,9 +499,11 @@ extern "C" {
|
|
|
491
499
|
DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
|
|
492
500
|
|
|
493
501
|
LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
|
|
494
|
-
LLAMA_API
|
|
502
|
+
LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
|
|
495
503
|
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
|
|
496
504
|
|
|
505
|
+
DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
|
|
506
|
+
|
|
497
507
|
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
|
|
498
508
|
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
|
|
499
509
|
|
|
@@ -502,10 +512,18 @@ extern "C" {
|
|
|
502
512
|
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
|
|
503
513
|
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
|
|
504
514
|
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
|
|
515
|
+
LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
|
|
505
516
|
|
|
506
517
|
// Get the model's RoPE frequency scaling factor
|
|
507
518
|
LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
|
|
508
519
|
|
|
520
|
+
// Returns the number of classifier outputs (only valid for classifier models)
|
|
521
|
+
// Undefined behavior for non-classifier models
|
|
522
|
+
LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
|
|
523
|
+
|
|
524
|
+
// Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
|
|
525
|
+
LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
|
|
526
|
+
|
|
509
527
|
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
|
|
510
528
|
|
|
511
529
|
LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
|
|
@@ -606,7 +624,81 @@ extern "C" {
|
|
|
606
624
|
int32_t il_end);
|
|
607
625
|
|
|
608
626
|
//
|
|
609
|
-
//
|
|
627
|
+
// Memory
|
|
628
|
+
//
|
|
629
|
+
|
|
630
|
+
// Clear the memory contents
|
|
631
|
+
// If data == true, the data buffers will also be cleared together with the metadata
|
|
632
|
+
LLAMA_API void llama_memory_clear(
|
|
633
|
+
llama_memory_t mem,
|
|
634
|
+
bool data);
|
|
635
|
+
|
|
636
|
+
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
637
|
+
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
|
|
638
|
+
// seq_id < 0 : match any sequence
|
|
639
|
+
// p0 < 0 : [0, p1]
|
|
640
|
+
// p1 < 0 : [p0, inf)
|
|
641
|
+
LLAMA_API bool llama_memory_seq_rm(
|
|
642
|
+
llama_memory_t mem,
|
|
643
|
+
llama_seq_id seq_id,
|
|
644
|
+
llama_pos p0,
|
|
645
|
+
llama_pos p1);
|
|
646
|
+
|
|
647
|
+
// Copy all tokens that belong to the specified sequence to another sequence
|
|
648
|
+
// p0 < 0 : [0, p1]
|
|
649
|
+
// p1 < 0 : [p0, inf)
|
|
650
|
+
LLAMA_API void llama_memory_seq_cp(
|
|
651
|
+
llama_memory_t mem,
|
|
652
|
+
llama_seq_id seq_id_src,
|
|
653
|
+
llama_seq_id seq_id_dst,
|
|
654
|
+
llama_pos p0,
|
|
655
|
+
llama_pos p1);
|
|
656
|
+
|
|
657
|
+
// Removes all tokens that do not belong to the specified sequence
|
|
658
|
+
LLAMA_API void llama_memory_seq_keep(
|
|
659
|
+
llama_memory_t mem,
|
|
660
|
+
llama_seq_id seq_id);
|
|
661
|
+
|
|
662
|
+
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
663
|
+
// p0 < 0 : [0, p1]
|
|
664
|
+
// p1 < 0 : [p0, inf)
|
|
665
|
+
LLAMA_API void llama_memory_seq_add(
|
|
666
|
+
llama_memory_t mem,
|
|
667
|
+
llama_seq_id seq_id,
|
|
668
|
+
llama_pos p0,
|
|
669
|
+
llama_pos p1,
|
|
670
|
+
llama_pos delta);
|
|
671
|
+
|
|
672
|
+
// Integer division of the positions by factor of `d > 1`
|
|
673
|
+
// p0 < 0 : [0, p1]
|
|
674
|
+
// p1 < 0 : [p0, inf)
|
|
675
|
+
LLAMA_API void llama_memory_seq_div(
|
|
676
|
+
llama_memory_t mem,
|
|
677
|
+
llama_seq_id seq_id,
|
|
678
|
+
llama_pos p0,
|
|
679
|
+
llama_pos p1,
|
|
680
|
+
int d);
|
|
681
|
+
|
|
682
|
+
// Returns the smallest position present in the memory for the specified sequence
|
|
683
|
+
// This is typically non-zero only for SWA caches
|
|
684
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
|
|
685
|
+
// Return -1 if the sequence is empty
|
|
686
|
+
LLAMA_API llama_pos llama_memory_seq_pos_min(
|
|
687
|
+
llama_memory_t mem,
|
|
688
|
+
llama_seq_id seq_id);
|
|
689
|
+
|
|
690
|
+
// Returns the largest position present in the memory for the specified sequence
|
|
691
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
|
|
692
|
+
// Return -1 if the sequence is empty
|
|
693
|
+
LLAMA_API llama_pos llama_memory_seq_pos_max(
|
|
694
|
+
llama_memory_t mem,
|
|
695
|
+
llama_seq_id seq_id);
|
|
696
|
+
|
|
697
|
+
// Check if the memory supports shifting
|
|
698
|
+
LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
|
|
699
|
+
|
|
700
|
+
//
|
|
701
|
+
// KV cache for self-attention (TODO: deprecate in favor of llama_memory)
|
|
610
702
|
//
|
|
611
703
|
|
|
612
704
|
// Returns the number of tokens in the KV cache (slow, use only for debug)
|
|
@@ -619,93 +711,103 @@ extern "C" {
|
|
|
619
711
|
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
|
|
620
712
|
|
|
621
713
|
// Clear the KV cache - both cell info is erased and KV data is zeroed
|
|
622
|
-
LLAMA_API void llama_kv_self_clear(
|
|
623
|
-
|
|
714
|
+
DEPRECATED(LLAMA_API void llama_kv_self_clear(
|
|
715
|
+
struct llama_context * ctx),
|
|
716
|
+
"Use llama_memory_clear() instead");
|
|
624
717
|
|
|
625
718
|
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
626
719
|
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
|
|
627
720
|
// seq_id < 0 : match any sequence
|
|
628
721
|
// p0 < 0 : [0, p1]
|
|
629
722
|
// p1 < 0 : [p0, inf)
|
|
630
|
-
LLAMA_API bool llama_kv_self_seq_rm(
|
|
723
|
+
DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
|
|
631
724
|
struct llama_context * ctx,
|
|
632
725
|
llama_seq_id seq_id,
|
|
633
726
|
llama_pos p0,
|
|
634
|
-
llama_pos p1)
|
|
727
|
+
llama_pos p1),
|
|
728
|
+
"Use llama_memory_seq_rm() instead");
|
|
635
729
|
|
|
636
730
|
// Copy all tokens that belong to the specified sequence to another sequence
|
|
637
731
|
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
|
|
638
732
|
// p0 < 0 : [0, p1]
|
|
639
733
|
// p1 < 0 : [p0, inf)
|
|
640
|
-
LLAMA_API void llama_kv_self_seq_cp(
|
|
734
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
|
|
641
735
|
struct llama_context * ctx,
|
|
642
736
|
llama_seq_id seq_id_src,
|
|
643
737
|
llama_seq_id seq_id_dst,
|
|
644
738
|
llama_pos p0,
|
|
645
|
-
llama_pos p1)
|
|
739
|
+
llama_pos p1),
|
|
740
|
+
"Use llama_memory_seq_cp() instead");
|
|
646
741
|
|
|
647
742
|
// Removes all tokens that do not belong to the specified sequence
|
|
648
|
-
LLAMA_API void llama_kv_self_seq_keep(
|
|
743
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
|
|
649
744
|
struct llama_context * ctx,
|
|
650
|
-
llama_seq_id seq_id)
|
|
745
|
+
llama_seq_id seq_id),
|
|
746
|
+
"Use llama_memory_seq_keep() instead");
|
|
651
747
|
|
|
652
748
|
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
653
749
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
654
750
|
// - lazily on next llama_decode()
|
|
655
|
-
// - explicitly with llama_kv_self_update()
|
|
656
751
|
// p0 < 0 : [0, p1]
|
|
657
752
|
// p1 < 0 : [p0, inf)
|
|
658
|
-
LLAMA_API void llama_kv_self_seq_add(
|
|
753
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
|
|
659
754
|
struct llama_context * ctx,
|
|
660
755
|
llama_seq_id seq_id,
|
|
661
756
|
llama_pos p0,
|
|
662
757
|
llama_pos p1,
|
|
663
|
-
llama_pos delta)
|
|
758
|
+
llama_pos delta),
|
|
759
|
+
"Use llama_memory_seq_add() instead");
|
|
664
760
|
|
|
665
761
|
// Integer division of the positions by factor of `d > 1`
|
|
666
762
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
667
763
|
// - lazily on next llama_decode()
|
|
668
|
-
// - explicitly with llama_kv_self_update()
|
|
669
764
|
// p0 < 0 : [0, p1]
|
|
670
765
|
// p1 < 0 : [p0, inf)
|
|
671
|
-
|
|
766
|
+
DEPRECATED(void llama_kv_self_seq_div(
|
|
672
767
|
struct llama_context * ctx,
|
|
673
768
|
llama_seq_id seq_id,
|
|
674
769
|
llama_pos p0,
|
|
675
770
|
llama_pos p1,
|
|
676
|
-
int d)
|
|
771
|
+
int d),
|
|
772
|
+
"Use llama_memory_seq_div() instead");
|
|
677
773
|
|
|
678
774
|
// Returns the smallest position present in the KV cache for the specified sequence
|
|
679
775
|
// This is typically non-zero only for SWA caches
|
|
776
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
680
777
|
// Return -1 if the sequence is empty
|
|
681
|
-
LLAMA_API llama_pos llama_kv_self_seq_pos_min(
|
|
778
|
+
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
|
|
682
779
|
struct llama_context * ctx,
|
|
683
|
-
llama_seq_id seq_id)
|
|
780
|
+
llama_seq_id seq_id),
|
|
781
|
+
"Use llama_memory_seq_pos_min() instead");
|
|
684
782
|
|
|
685
783
|
// Returns the largest position present in the KV cache for the specified sequence
|
|
784
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
686
785
|
// Return -1 if the sequence is empty
|
|
687
|
-
LLAMA_API llama_pos llama_kv_self_seq_pos_max(
|
|
786
|
+
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
|
|
688
787
|
struct llama_context * ctx,
|
|
689
|
-
llama_seq_id seq_id)
|
|
788
|
+
llama_seq_id seq_id),
|
|
789
|
+
"Use llama_memory_seq_pos_max() instead");
|
|
690
790
|
|
|
691
791
|
// Defragment the KV cache
|
|
692
792
|
// This will be applied:
|
|
693
793
|
// - lazily on next llama_decode()
|
|
694
|
-
|
|
695
|
-
|
|
794
|
+
DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
|
|
795
|
+
"simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
|
|
696
796
|
|
|
697
797
|
// Check if the context supports KV cache shifting
|
|
698
|
-
LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx)
|
|
798
|
+
DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
|
|
799
|
+
"use llama_memory_can_shift() instead");
|
|
699
800
|
|
|
700
801
|
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
|
701
|
-
LLAMA_API void llama_kv_self_update(struct llama_context * ctx)
|
|
802
|
+
DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
|
|
803
|
+
"simply remove this call, updates are applied lazily on the next llama_decode()");
|
|
702
804
|
|
|
703
805
|
//
|
|
704
806
|
// State / sessions
|
|
705
807
|
//
|
|
706
808
|
|
|
707
809
|
// Returns the *actual* size in bytes of the state
|
|
708
|
-
// (logits, embedding and
|
|
810
|
+
// (logits, embedding and memory)
|
|
709
811
|
// Only use when saving the state, not when restoring it, otherwise the size may be too small.
|
|
710
812
|
LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
|
|
711
813
|
LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
|
|
@@ -761,12 +863,12 @@ extern "C" {
|
|
|
761
863
|
size_t n_token_count),
|
|
762
864
|
"use llama_state_save_file instead");
|
|
763
865
|
|
|
764
|
-
// Get the exact size needed to copy the
|
|
866
|
+
// Get the exact size needed to copy the state of a single sequence
|
|
765
867
|
LLAMA_API size_t llama_state_seq_get_size(
|
|
766
868
|
struct llama_context * ctx,
|
|
767
869
|
llama_seq_id seq_id);
|
|
768
870
|
|
|
769
|
-
// Copy the
|
|
871
|
+
// Copy the state of a single sequence into the specified buffer
|
|
770
872
|
LLAMA_API size_t llama_state_seq_get_data(
|
|
771
873
|
struct llama_context * ctx,
|
|
772
874
|
uint8_t * dst,
|
|
@@ -832,16 +934,16 @@ extern "C" {
|
|
|
832
934
|
// For encode-decoder contexts, processes the batch using the encoder.
|
|
833
935
|
// Can store the encoder output internally for later use by the decoder's cross-attention layers.
|
|
834
936
|
// 0 - success
|
|
835
|
-
// < 0 - error. the
|
|
937
|
+
// < 0 - error. the memory state is restored to the state before this call
|
|
836
938
|
LLAMA_API int32_t llama_encode(
|
|
837
939
|
struct llama_context * ctx,
|
|
838
940
|
struct llama_batch batch);
|
|
839
941
|
|
|
840
942
|
// Process a batch of tokens.
|
|
841
|
-
// Requires
|
|
943
|
+
// Requires the context to have a memory.
|
|
842
944
|
// For encode-decoder contexts, processes the batch using the decoder.
|
|
843
945
|
// Positive return values does not mean a fatal error, but rather a warning.
|
|
844
|
-
// Upon non-zero return values, the
|
|
946
|
+
// Upon non-zero return values, the memory state is restored to the state before this call
|
|
845
947
|
// 0 - success
|
|
846
948
|
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
|
847
949
|
// 2 - aborted
|
|
@@ -862,8 +964,8 @@ extern "C" {
|
|
|
862
964
|
// Get the number of threads used for prompt and batch processing (multiple token).
|
|
863
965
|
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
|
|
864
966
|
|
|
865
|
-
// Set whether the
|
|
866
|
-
//
|
|
967
|
+
// Set whether the context outputs embeddings or not
|
|
968
|
+
// TODO: rename to avoid confusion with llama_get_embeddings()
|
|
867
969
|
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
|
|
868
970
|
|
|
869
971
|
// Set whether to use causal attention or not
|
|
@@ -912,7 +1014,7 @@ extern "C" {
|
|
|
912
1014
|
|
|
913
1015
|
// Get the embeddings for a sequence id
|
|
914
1016
|
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
|
|
915
|
-
// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[
|
|
1017
|
+
// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
|
|
916
1018
|
// otherwise: float[n_embd] (1-dimensional)
|
|
917
1019
|
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
|
|
918
1020
|
|
|
@@ -20,8 +20,11 @@ add_library(llama
|
|
|
20
20
|
llama-hparams.cpp
|
|
21
21
|
llama-impl.cpp
|
|
22
22
|
llama-io.cpp
|
|
23
|
-
llama-kv-cache.cpp
|
|
23
|
+
llama-kv-cache-unified.cpp
|
|
24
|
+
llama-kv-cache-unified-iswa.cpp
|
|
24
25
|
llama-memory.cpp
|
|
26
|
+
llama-memory-hybrid.cpp
|
|
27
|
+
llama-memory-recurrent.cpp
|
|
25
28
|
llama-mmap.cpp
|
|
26
29
|
llama-model-loader.cpp
|
|
27
30
|
llama-model-saver.cpp
|
|
@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
|
20
20
|
{ LLM_ARCH_BERT, "bert" },
|
|
21
21
|
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
|
22
22
|
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
|
|
23
|
+
{ LLM_ARCH_NEO_BERT, "neo-bert" },
|
|
23
24
|
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
|
|
24
25
|
{ LLM_ARCH_BLOOM, "bloom" },
|
|
25
26
|
{ LLM_ARCH_STABLELM, "stablelm" },
|
|
@@ -72,6 +73,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
|
72
73
|
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
|
73
74
|
{ LLM_ARCH_PLM, "plm" },
|
|
74
75
|
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
|
76
|
+
{ LLM_ARCH_DOTS1, "dots1" },
|
|
77
|
+
{ LLM_ARCH_ARCEE, "arcee" },
|
|
75
78
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
|
76
79
|
};
|
|
77
80
|
|
|
@@ -144,6 +147,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
|
144
147
|
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
|
145
148
|
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
|
146
149
|
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
|
150
|
+
{ LLM_KV_ATTENTION_LAYER_INDICES, "%s.attention.layer_indices" },
|
|
147
151
|
|
|
148
152
|
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
|
149
153
|
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
|
|
@@ -200,7 +204,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
|
200
204
|
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
|
201
205
|
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
|
202
206
|
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
|
|
203
|
-
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE_N, "tokenizer.chat_template.%s" },
|
|
204
207
|
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
|
|
205
208
|
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
|
|
206
209
|
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
|
|
@@ -244,6 +247,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
|
244
247
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
|
245
248
|
},
|
|
246
249
|
},
|
|
250
|
+
{
|
|
251
|
+
LLM_ARCH_ARCEE,
|
|
252
|
+
{
|
|
253
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
254
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
255
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
|
256
|
+
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
|
257
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
258
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
259
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
260
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
261
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
262
|
+
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
|
263
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
264
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
265
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
266
|
+
},
|
|
267
|
+
},
|
|
247
268
|
{
|
|
248
269
|
LLM_ARCH_LLAMA4,
|
|
249
270
|
{
|
|
@@ -495,6 +516,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
|
495
516
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
|
496
517
|
},
|
|
497
518
|
},
|
|
519
|
+
{
|
|
520
|
+
LLM_ARCH_NEO_BERT,
|
|
521
|
+
{
|
|
522
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
523
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
524
|
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
|
525
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
526
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
527
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
528
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
529
|
+
{ LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
|
|
530
|
+
{ LLM_TENSOR_CLS, "cls" },
|
|
531
|
+
{ LLM_TENSOR_CLS_OUT, "cls.output" },
|
|
532
|
+
},
|
|
533
|
+
},
|
|
498
534
|
{
|
|
499
535
|
LLM_ARCH_JINA_BERT_V2,
|
|
500
536
|
{
|
|
@@ -1556,6 +1592,34 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
|
1556
1592
|
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
|
1557
1593
|
},
|
|
1558
1594
|
},
|
|
1595
|
+
{
|
|
1596
|
+
LLM_ARCH_DOTS1,
|
|
1597
|
+
{
|
|
1598
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
1599
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
1600
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
|
1601
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
1602
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
1603
|
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
|
1604
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
1605
|
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
|
1606
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
1607
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
1608
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
1609
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
|
1610
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
1611
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
1612
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
|
1613
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
|
1614
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
|
1615
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
|
1616
|
+
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
|
1617
|
+
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
|
1618
|
+
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
|
1619
|
+
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
|
1620
|
+
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
|
1621
|
+
}
|
|
1622
|
+
},
|
|
1559
1623
|
{
|
|
1560
1624
|
LLM_ARCH_UNKNOWN,
|
|
1561
1625
|
{
|
|
@@ -1707,8 +1771,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
|
|
1707
1771
|
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
|
|
1708
1772
|
|
|
1709
1773
|
std::string LLM_KV::operator()(llm_kv kv) const {
|
|
1710
|
-
|
|
1711
|
-
|
|
1774
|
+
std::string name = ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
|
|
1775
|
+
|
|
1776
|
+
if (suffix != nullptr) {
|
|
1777
|
+
name += ".";
|
|
1778
|
+
name += suffix;
|
|
1779
|
+
}
|
|
1780
|
+
|
|
1781
|
+
return name;
|
|
1712
1782
|
}
|
|
1713
1783
|
|
|
1714
1784
|
std::string LLM_TN_IMPL::str() const {
|
|
@@ -1747,3 +1817,25 @@ llm_arch llm_arch_from_string(const std::string & name) {
|
|
|
1747
1817
|
const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
|
|
1748
1818
|
return LLM_TENSOR_INFOS.at(tensor);
|
|
1749
1819
|
}
|
|
1820
|
+
|
|
1821
|
+
bool llm_arch_is_recurrent(const llm_arch & arch) {
|
|
1822
|
+
switch (arch) {
|
|
1823
|
+
case LLM_ARCH_MAMBA:
|
|
1824
|
+
case LLM_ARCH_RWKV6:
|
|
1825
|
+
case LLM_ARCH_RWKV6QWEN2:
|
|
1826
|
+
case LLM_ARCH_RWKV7:
|
|
1827
|
+
case LLM_ARCH_ARWKV7:
|
|
1828
|
+
return true;
|
|
1829
|
+
default:
|
|
1830
|
+
return false;
|
|
1831
|
+
}
|
|
1832
|
+
}
|
|
1833
|
+
|
|
1834
|
+
bool llm_arch_is_hybrid(const llm_arch & arch) {
|
|
1835
|
+
// TODO: There are currently no hybrid models! Once there are, this will be
|
|
1836
|
+
// the place to identify them
|
|
1837
|
+
switch (arch) {
|
|
1838
|
+
default:
|
|
1839
|
+
return false;
|
|
1840
|
+
}
|
|
1841
|
+
}
|
|
@@ -24,6 +24,7 @@ enum llm_arch {
|
|
|
24
24
|
LLM_ARCH_BERT,
|
|
25
25
|
LLM_ARCH_NOMIC_BERT,
|
|
26
26
|
LLM_ARCH_NOMIC_BERT_MOE,
|
|
27
|
+
LLM_ARCH_NEO_BERT,
|
|
27
28
|
LLM_ARCH_JINA_BERT_V2,
|
|
28
29
|
LLM_ARCH_BLOOM,
|
|
29
30
|
LLM_ARCH_STABLELM,
|
|
@@ -76,6 +77,8 @@ enum llm_arch {
|
|
|
76
77
|
LLM_ARCH_WAVTOKENIZER_DEC,
|
|
77
78
|
LLM_ARCH_PLM,
|
|
78
79
|
LLM_ARCH_BAILINGMOE,
|
|
80
|
+
LLM_ARCH_DOTS1,
|
|
81
|
+
LLM_ARCH_ARCEE,
|
|
79
82
|
LLM_ARCH_UNKNOWN,
|
|
80
83
|
};
|
|
81
84
|
|
|
@@ -148,6 +151,7 @@ enum llm_kv {
|
|
|
148
151
|
LLM_KV_ATTENTION_SCALE,
|
|
149
152
|
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
|
150
153
|
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
|
154
|
+
LLM_KV_ATTENTION_LAYER_INDICES,
|
|
151
155
|
|
|
152
156
|
LLM_KV_ROPE_DIMENSION_COUNT,
|
|
153
157
|
LLM_KV_ROPE_DIMENSION_SECTIONS,
|
|
@@ -196,7 +200,6 @@ enum llm_kv {
|
|
|
196
200
|
LLM_KV_TOKENIZER_HF_JSON,
|
|
197
201
|
LLM_KV_TOKENIZER_RWKV,
|
|
198
202
|
LLM_KV_TOKENIZER_CHAT_TEMPLATE,
|
|
199
|
-
LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
|
|
200
203
|
LLM_KV_TOKENIZER_FIM_PRE_ID,
|
|
201
204
|
LLM_KV_TOKENIZER_FIM_SUF_ID,
|
|
202
205
|
LLM_KV_TOKENIZER_FIM_MID_ID,
|
|
@@ -437,3 +440,6 @@ const char * llm_arch_name(llm_arch arch);
|
|
|
437
440
|
llm_arch llm_arch_from_string(const std::string & name);
|
|
438
441
|
|
|
439
442
|
const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
|
|
443
|
+
|
|
444
|
+
bool llm_arch_is_recurrent(const llm_arch & arch);
|
|
445
|
+
bool llm_arch_is_hybrid (const llm_arch & arch);
|