@novastera-oss/llamarn 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/common/arg.cpp +7 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +1 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
- package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -3
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
- package/cpp/llama.cpp/src/llama-batch.h +98 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
- package/cpp/llama.cpp/src/llama-graph.h +44 -32
- package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-hparams.h +8 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
- package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.h +18 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
- package/cpp/llama.cpp/src/llama-model.h +22 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/common.h +1 -0
- package/ios/include/llama.h +8 -3
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
|
@@ -61,9 +61,6 @@
|
|
|
61
61
|
#define m512i(p) (__m512i)(p)
|
|
62
62
|
#endif
|
|
63
63
|
|
|
64
|
-
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
|
|
65
|
-
float ggml_table_f32_f16[1 << 16];
|
|
66
|
-
|
|
67
64
|
#if defined(__linux__) || \
|
|
68
65
|
defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
|
|
69
66
|
(defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH)
|
|
@@ -936,6 +933,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
|
936
933
|
"TRANSPOSE",
|
|
937
934
|
"GET_ROWS",
|
|
938
935
|
"GET_ROWS_BACK",
|
|
936
|
+
"SET_ROWS",
|
|
939
937
|
"DIAG",
|
|
940
938
|
"DIAG_MASK_INF",
|
|
941
939
|
"DIAG_MASK_ZERO",
|
|
@@ -955,6 +953,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
|
955
953
|
"UPSCALE",
|
|
956
954
|
"PAD",
|
|
957
955
|
"PAD_REFLECT_1D",
|
|
956
|
+
"ROLL",
|
|
958
957
|
"ARANGE",
|
|
959
958
|
"TIMESTEP_EMBEDDING",
|
|
960
959
|
"ARGSORT",
|
|
@@ -985,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
|
985
984
|
"OPT_STEP_ADAMW",
|
|
986
985
|
};
|
|
987
986
|
|
|
988
|
-
static_assert(GGML_OP_COUNT ==
|
|
987
|
+
static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
|
|
989
988
|
|
|
990
989
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
991
990
|
"none",
|
|
@@ -1031,6 +1030,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
|
1031
1030
|
"transpose(x)",
|
|
1032
1031
|
"get_rows(x)",
|
|
1033
1032
|
"get_rows_back(x)",
|
|
1033
|
+
"set_rows(x)",
|
|
1034
1034
|
"diag(x)",
|
|
1035
1035
|
"diag_mask_inf(x)",
|
|
1036
1036
|
"diag_mask_zero(x)",
|
|
@@ -1050,6 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
|
1050
1050
|
"upscale(x)",
|
|
1051
1051
|
"pad(x)",
|
|
1052
1052
|
"pad_reflect_1d(x)",
|
|
1053
|
+
"roll(x)",
|
|
1053
1054
|
"arange(start, stop, step)",
|
|
1054
1055
|
"timestep_embedding(timesteps, dim, max_period)",
|
|
1055
1056
|
"argsort(x)",
|
|
@@ -1080,7 +1081,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
|
1080
1081
|
"adamw(x)",
|
|
1081
1082
|
};
|
|
1082
1083
|
|
|
1083
|
-
static_assert(GGML_OP_COUNT ==
|
|
1084
|
+
static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
|
|
1084
1085
|
|
|
1085
1086
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
|
1086
1087
|
|
|
@@ -1349,6 +1350,12 @@ bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
|
|
|
1349
1350
|
tensor->nb[2] == ggml_type_size(tensor->type);
|
|
1350
1351
|
}
|
|
1351
1352
|
|
|
1353
|
+
bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor) {
|
|
1354
|
+
return
|
|
1355
|
+
tensor->ne[0] == ggml_blck_size(tensor->type) ||
|
|
1356
|
+
tensor->nb[0] == ggml_type_size(tensor->type);
|
|
1357
|
+
}
|
|
1358
|
+
|
|
1352
1359
|
static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
|
|
1353
1360
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
|
1354
1361
|
|
|
@@ -1420,14 +1427,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
|
1420
1427
|
// initialize time system (required on Windows)
|
|
1421
1428
|
ggml_time_init();
|
|
1422
1429
|
|
|
1423
|
-
for (int i = 0; i < (1 << 16); ++i) {
|
|
1424
|
-
union {
|
|
1425
|
-
uint16_t u16;
|
|
1426
|
-
ggml_fp16_t fp16;
|
|
1427
|
-
} u = {i};
|
|
1428
|
-
ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
|
|
1429
|
-
}
|
|
1430
|
-
|
|
1431
1430
|
is_first_call = false;
|
|
1432
1431
|
}
|
|
1433
1432
|
|
|
@@ -3393,6 +3392,35 @@ struct ggml_tensor * ggml_get_rows_back(
|
|
|
3393
3392
|
return result;
|
|
3394
3393
|
}
|
|
3395
3394
|
|
|
3395
|
+
// ggml_set_rows
|
|
3396
|
+
|
|
3397
|
+
struct ggml_tensor * ggml_set_rows(
|
|
3398
|
+
struct ggml_context * ctx,
|
|
3399
|
+
struct ggml_tensor * a,
|
|
3400
|
+
struct ggml_tensor * b,
|
|
3401
|
+
struct ggml_tensor * c) {
|
|
3402
|
+
GGML_ASSERT(a->ne[0] == b->ne[0]);
|
|
3403
|
+
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
|
3404
|
+
GGML_ASSERT(a->ne[3] == b->ne[3]);
|
|
3405
|
+
GGML_ASSERT(b->ne[1] == c->ne[0]);
|
|
3406
|
+
GGML_ASSERT(b->ne[2] % c->ne[1] == 0);
|
|
3407
|
+
GGML_ASSERT(b->ne[3] % c->ne[2] == 0);
|
|
3408
|
+
GGML_ASSERT(c->ne[3] == 1);
|
|
3409
|
+
GGML_ASSERT(b->type == GGML_TYPE_F32);
|
|
3410
|
+
GGML_ASSERT(c->type == GGML_TYPE_I64);
|
|
3411
|
+
|
|
3412
|
+
GGML_ASSERT(ggml_is_contiguous_rows(a));
|
|
3413
|
+
GGML_ASSERT(ggml_is_contiguous_rows(b));
|
|
3414
|
+
|
|
3415
|
+
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
|
3416
|
+
|
|
3417
|
+
result->op = GGML_OP_SET_ROWS;
|
|
3418
|
+
result->src[0] = b;
|
|
3419
|
+
result->src[1] = c;
|
|
3420
|
+
|
|
3421
|
+
return result;
|
|
3422
|
+
}
|
|
3423
|
+
|
|
3396
3424
|
// ggml_diag
|
|
3397
3425
|
|
|
3398
3426
|
struct ggml_tensor * ggml_diag(
|
|
@@ -4341,6 +4369,34 @@ struct ggml_tensor * ggml_pad_reflect_1d(
|
|
|
4341
4369
|
return result;
|
|
4342
4370
|
}
|
|
4343
4371
|
|
|
4372
|
+
// ggml_roll
|
|
4373
|
+
|
|
4374
|
+
struct ggml_tensor * ggml_roll(
|
|
4375
|
+
struct ggml_context * ctx,
|
|
4376
|
+
struct ggml_tensor * a,
|
|
4377
|
+
int shift0,
|
|
4378
|
+
int shift1,
|
|
4379
|
+
int shift2,
|
|
4380
|
+
int shift3) {
|
|
4381
|
+
GGML_ASSERT(a->nb[0] == ggml_type_size(a->type));
|
|
4382
|
+
GGML_ASSERT(abs(shift0) < a->ne[0]);
|
|
4383
|
+
GGML_ASSERT(abs(shift1) < a->ne[1]);
|
|
4384
|
+
GGML_ASSERT(abs(shift2) < a->ne[2]);
|
|
4385
|
+
GGML_ASSERT(abs(shift3) < a->ne[3]);
|
|
4386
|
+
|
|
4387
|
+
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
|
4388
|
+
|
|
4389
|
+
ggml_set_op_params_i32(result, 0, shift0);
|
|
4390
|
+
ggml_set_op_params_i32(result, 1, shift1);
|
|
4391
|
+
ggml_set_op_params_i32(result, 2, shift2);
|
|
4392
|
+
ggml_set_op_params_i32(result, 3, shift3);
|
|
4393
|
+
|
|
4394
|
+
result->op = GGML_OP_ROLL;
|
|
4395
|
+
result->src[0] = a;
|
|
4396
|
+
|
|
4397
|
+
return result;
|
|
4398
|
+
}
|
|
4399
|
+
|
|
4344
4400
|
// ggml_arange
|
|
4345
4401
|
|
|
4346
4402
|
struct ggml_tensor * ggml_arange(
|
|
@@ -335,7 +335,11 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
|
|
|
335
335
|
|
|
336
336
|
for (uint32_t i = 0; i < magic.size(); i++) {
|
|
337
337
|
if (magic[i] != GGUF_MAGIC[i]) {
|
|
338
|
-
|
|
338
|
+
char c0 = isprint(magic[0]) ? magic[0] : '?';
|
|
339
|
+
char c1 = isprint(magic[1]) ? magic[1] : '?';
|
|
340
|
+
char c2 = isprint(magic[2]) ? magic[2] : '?';
|
|
341
|
+
char c3 = isprint(magic[3]) ? magic[3] : '?';
|
|
342
|
+
GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, c0, c1, c2, c3);
|
|
339
343
|
gguf_free(ctx);
|
|
340
344
|
return nullptr;
|
|
341
345
|
}
|
|
@@ -118,6 +118,10 @@ class Keys:
|
|
|
118
118
|
EMBEDDING_SCALE = "{arch}.embedding_scale"
|
|
119
119
|
TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"
|
|
120
120
|
INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
|
|
121
|
+
ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale"
|
|
122
|
+
ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
|
|
123
|
+
ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
|
|
124
|
+
EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
|
|
121
125
|
|
|
122
126
|
class Attention:
|
|
123
127
|
HEAD_COUNT = "{arch}.attention.head_count"
|
|
@@ -142,6 +146,8 @@ class Keys:
|
|
|
142
146
|
SCALE = "{arch}.attention.scale"
|
|
143
147
|
KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
|
|
144
148
|
VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
|
|
149
|
+
SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
|
|
150
|
+
SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
|
|
145
151
|
|
|
146
152
|
class Rope:
|
|
147
153
|
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
|
@@ -198,6 +204,7 @@ class Keys:
|
|
|
198
204
|
MASK_ID = "tokenizer.ggml.mask_token_id"
|
|
199
205
|
ADD_BOS = "tokenizer.ggml.add_bos_token"
|
|
200
206
|
ADD_EOS = "tokenizer.ggml.add_eos_token"
|
|
207
|
+
ADD_SEP = "tokenizer.ggml.add_sep_token"
|
|
201
208
|
ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
|
|
202
209
|
REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
|
|
203
210
|
PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
|
|
@@ -313,6 +320,7 @@ class MODEL_ARCH(IntEnum):
|
|
|
313
320
|
GEMMA = auto()
|
|
314
321
|
GEMMA2 = auto()
|
|
315
322
|
GEMMA3 = auto()
|
|
323
|
+
GEMMA3N = auto()
|
|
316
324
|
STARCODER2 = auto()
|
|
317
325
|
RWKV6 = auto()
|
|
318
326
|
RWKV6QWEN2 = auto()
|
|
@@ -398,6 +406,22 @@ class MODEL_TENSOR(IntEnum):
|
|
|
398
406
|
ATTN_Q_NORM = auto()
|
|
399
407
|
ATTN_K_NORM = auto()
|
|
400
408
|
LAYER_OUT_NORM = auto()
|
|
409
|
+
PER_LAYER_TOKEN_EMBD = auto() # gemma3n
|
|
410
|
+
PER_LAYER_MODEL_PROJ = auto() # gemma3n
|
|
411
|
+
PER_LAYER_INP_GATE = auto() # gemma3n
|
|
412
|
+
PER_LAYER_PROJ = auto() # gemma3n
|
|
413
|
+
PER_LAYER_PROJ_NORM = auto() # gemma3n
|
|
414
|
+
PER_LAYER_POST_NORM = auto() # gemma3n
|
|
415
|
+
ALTUP_PROJ = auto() # gemma3n
|
|
416
|
+
ALTUP_UNEMBD_PROJ = auto() # gemma3n
|
|
417
|
+
ALTUP_CORRECT_COEF = auto() # gemma3n
|
|
418
|
+
ALTUP_CORRECT_SCALE = auto() # gemma3n
|
|
419
|
+
ALTUP_PREDICT_COEF = auto() # gemma3n
|
|
420
|
+
ALTUP_ROUTER = auto() # gemma3n
|
|
421
|
+
ALTUP_ROUTER_NORM = auto() # gemma3n
|
|
422
|
+
LAUREL_L = auto() # gemma3n
|
|
423
|
+
LAUREL_R = auto() # gemma3n
|
|
424
|
+
LAUREL_POST_NORM = auto() # gemma3n
|
|
401
425
|
SSM_IN = auto()
|
|
402
426
|
SSM_CONV1D = auto()
|
|
403
427
|
SSM_X = auto()
|
|
@@ -596,6 +620,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|
|
596
620
|
MODEL_ARCH.GEMMA: "gemma",
|
|
597
621
|
MODEL_ARCH.GEMMA2: "gemma2",
|
|
598
622
|
MODEL_ARCH.GEMMA3: "gemma3",
|
|
623
|
+
MODEL_ARCH.GEMMA3N: "gemma3n",
|
|
599
624
|
MODEL_ARCH.STARCODER2: "starcoder2",
|
|
600
625
|
MODEL_ARCH.RWKV6: "rwkv6",
|
|
601
626
|
MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2",
|
|
@@ -681,6 +706,22 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|
|
681
706
|
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
|
|
682
707
|
MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
|
|
683
708
|
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
|
|
709
|
+
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n
|
|
710
|
+
MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n
|
|
711
|
+
MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n
|
|
712
|
+
MODEL_TENSOR.ALTUP_UNEMBD_PROJ: "altup_unembd_proj", # gemma3n
|
|
713
|
+
MODEL_TENSOR.ALTUP_PROJ: "altup_proj", # gemma3n
|
|
714
|
+
MODEL_TENSOR.PER_LAYER_INP_GATE: "blk.{bid}.inp_gate", # gemma3n
|
|
715
|
+
MODEL_TENSOR.PER_LAYER_PROJ: "blk.{bid}.proj", # gemma3n
|
|
716
|
+
MODEL_TENSOR.PER_LAYER_POST_NORM: "blk.{bid}.post_norm", # gemma3n
|
|
717
|
+
MODEL_TENSOR.ALTUP_CORRECT_COEF: "blk.{bid}.altup_correct_coef", # gemma3n
|
|
718
|
+
MODEL_TENSOR.ALTUP_CORRECT_SCALE: "blk.{bid}.altup_correct_scale", # gemma3n
|
|
719
|
+
MODEL_TENSOR.ALTUP_PREDICT_COEF: "blk.{bid}.altup_predict_coef", # gemma3n
|
|
720
|
+
MODEL_TENSOR.ALTUP_ROUTER: "blk.{bid}.altup_router", # gemma3n
|
|
721
|
+
MODEL_TENSOR.ALTUP_ROUTER_NORM: "blk.{bid}.altup_router_norm", # gemma3n
|
|
722
|
+
MODEL_TENSOR.LAUREL_L: "blk.{bid}.laurel_l", # gemma3n
|
|
723
|
+
MODEL_TENSOR.LAUREL_R: "blk.{bid}.laurel_r", # gemma3n
|
|
724
|
+
MODEL_TENSOR.LAUREL_POST_NORM: "blk.{bid}.laurel_post_norm", # gemma3n
|
|
684
725
|
MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
|
|
685
726
|
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
|
|
686
727
|
MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
|
|
@@ -1485,6 +1526,41 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
|
1485
1526
|
MODEL_TENSOR.FFN_PRE_NORM,
|
|
1486
1527
|
MODEL_TENSOR.FFN_POST_NORM,
|
|
1487
1528
|
],
|
|
1529
|
+
MODEL_ARCH.GEMMA3N: [
|
|
1530
|
+
MODEL_TENSOR.TOKEN_EMBD,
|
|
1531
|
+
MODEL_TENSOR.OUTPUT,
|
|
1532
|
+
MODEL_TENSOR.OUTPUT_NORM,
|
|
1533
|
+
MODEL_TENSOR.ATTN_Q,
|
|
1534
|
+
MODEL_TENSOR.ATTN_Q_NORM,
|
|
1535
|
+
MODEL_TENSOR.ATTN_K,
|
|
1536
|
+
MODEL_TENSOR.ATTN_K_NORM,
|
|
1537
|
+
MODEL_TENSOR.ATTN_V,
|
|
1538
|
+
MODEL_TENSOR.ATTN_OUT,
|
|
1539
|
+
MODEL_TENSOR.FFN_GATE,
|
|
1540
|
+
MODEL_TENSOR.FFN_DOWN,
|
|
1541
|
+
MODEL_TENSOR.FFN_UP,
|
|
1542
|
+
MODEL_TENSOR.ATTN_NORM,
|
|
1543
|
+
MODEL_TENSOR.ATTN_POST_NORM,
|
|
1544
|
+
MODEL_TENSOR.FFN_PRE_NORM,
|
|
1545
|
+
MODEL_TENSOR.FFN_POST_NORM,
|
|
1546
|
+
# altup / laurel
|
|
1547
|
+
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
|
|
1548
|
+
MODEL_TENSOR.PER_LAYER_MODEL_PROJ,
|
|
1549
|
+
MODEL_TENSOR.PER_LAYER_INP_GATE,
|
|
1550
|
+
MODEL_TENSOR.PER_LAYER_PROJ,
|
|
1551
|
+
MODEL_TENSOR.PER_LAYER_PROJ_NORM,
|
|
1552
|
+
MODEL_TENSOR.PER_LAYER_POST_NORM,
|
|
1553
|
+
MODEL_TENSOR.ALTUP_PROJ,
|
|
1554
|
+
MODEL_TENSOR.ALTUP_UNEMBD_PROJ,
|
|
1555
|
+
MODEL_TENSOR.ALTUP_CORRECT_COEF,
|
|
1556
|
+
MODEL_TENSOR.ALTUP_CORRECT_SCALE,
|
|
1557
|
+
MODEL_TENSOR.ALTUP_PREDICT_COEF,
|
|
1558
|
+
MODEL_TENSOR.ALTUP_ROUTER,
|
|
1559
|
+
MODEL_TENSOR.ALTUP_ROUTER_NORM,
|
|
1560
|
+
MODEL_TENSOR.LAUREL_L,
|
|
1561
|
+
MODEL_TENSOR.LAUREL_R,
|
|
1562
|
+
MODEL_TENSOR.LAUREL_POST_NORM,
|
|
1563
|
+
],
|
|
1488
1564
|
MODEL_ARCH.STARCODER2: [
|
|
1489
1565
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
1490
1566
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
@@ -672,6 +672,18 @@ class GGUFWriter:
|
|
|
672
672
|
def add_decoder_start_token_id(self, id: int) -> None:
|
|
673
673
|
self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)
|
|
674
674
|
|
|
675
|
+
def add_embedding_length_per_layer_input(self, value: int) -> None:
|
|
676
|
+
self.add_uint32(Keys.LLM.EMBD_LENGTH_PER_LAYER_INP.format(arch=self.arch), value)
|
|
677
|
+
|
|
678
|
+
def add_altup_active_idx(self, val: int) -> None:
|
|
679
|
+
self.add_uint32(Keys.LLM.ALTUP_ACTIVE_IDX.format(arch=self.arch), val)
|
|
680
|
+
|
|
681
|
+
def add_altup_num_inputs(self, val: int) -> None:
|
|
682
|
+
self.add_uint32(Keys.LLM.ALTUP_NUM_INPUTS.format(arch=self.arch), val)
|
|
683
|
+
|
|
684
|
+
def add_activation_sparsity_scale(self, values: Sequence[float]) -> None:
|
|
685
|
+
self.add_array(Keys.LLM.ACTIVATION_SPARSITY_SCALE.format(arch=self.arch), values)
|
|
686
|
+
|
|
675
687
|
def add_head_count(self, count: int | Sequence[int]) -> None:
|
|
676
688
|
if isinstance(count, int):
|
|
677
689
|
self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
|
|
@@ -702,6 +714,12 @@ class GGUFWriter:
|
|
|
702
714
|
def add_clamp_kqv(self, value: float) -> None:
|
|
703
715
|
self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)
|
|
704
716
|
|
|
717
|
+
def add_shared_kv_layers(self, value: float) -> None:
|
|
718
|
+
self.add_float32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)
|
|
719
|
+
|
|
720
|
+
def add_sliding_window_pattern(self, value: Sequence[bool]) -> None:
|
|
721
|
+
self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value)
|
|
722
|
+
|
|
705
723
|
def add_logit_scale(self, value: float) -> None:
|
|
706
724
|
self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
|
|
707
725
|
|
|
@@ -891,6 +909,9 @@ class GGUFWriter:
|
|
|
891
909
|
def add_add_eos_token(self, value: bool) -> None:
|
|
892
910
|
self.add_bool(Keys.Tokenizer.ADD_EOS, value)
|
|
893
911
|
|
|
912
|
+
def add_add_sep_token(self, value: bool) -> None:
|
|
913
|
+
self.add_bool(Keys.Tokenizer.ADD_SEP, value)
|
|
914
|
+
|
|
894
915
|
def add_add_space_prefix(self, value: bool) -> None:
|
|
895
916
|
self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
|
|
896
917
|
|
|
@@ -480,6 +480,70 @@ class TensorNameMap:
|
|
|
480
480
|
"encoder.layer.{bid}.layer_norm_2" # jina-v2-code
|
|
481
481
|
),
|
|
482
482
|
|
|
483
|
+
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: (
|
|
484
|
+
"model.embed_tokens_per_layer", # gemma3n
|
|
485
|
+
),
|
|
486
|
+
|
|
487
|
+
MODEL_TENSOR.PER_LAYER_MODEL_PROJ: (
|
|
488
|
+
"model.per_layer_model_projection", # gemma3n
|
|
489
|
+
),
|
|
490
|
+
|
|
491
|
+
MODEL_TENSOR.PER_LAYER_PROJ_NORM: (
|
|
492
|
+
"model.per_layer_projection_norm", # gemma3n
|
|
493
|
+
),
|
|
494
|
+
|
|
495
|
+
MODEL_TENSOR.ALTUP_PROJ: (
|
|
496
|
+
"model.altup_projections", # gemma3n
|
|
497
|
+
),
|
|
498
|
+
|
|
499
|
+
MODEL_TENSOR.ALTUP_UNEMBD_PROJ: (
|
|
500
|
+
"model.altup_unembed_projections", # gemma3n
|
|
501
|
+
),
|
|
502
|
+
|
|
503
|
+
MODEL_TENSOR.PER_LAYER_INP_GATE: (
|
|
504
|
+
"model.layers.{bid}.per_layer_input_gate", # gemma3n
|
|
505
|
+
),
|
|
506
|
+
|
|
507
|
+
MODEL_TENSOR.PER_LAYER_PROJ: (
|
|
508
|
+
"model.layers.{bid}.per_layer_projection", # gemma3n
|
|
509
|
+
),
|
|
510
|
+
|
|
511
|
+
MODEL_TENSOR.PER_LAYER_POST_NORM: (
|
|
512
|
+
"model.layers.{bid}.post_per_layer_input_norm", # gemma3n
|
|
513
|
+
),
|
|
514
|
+
|
|
515
|
+
MODEL_TENSOR.ALTUP_CORRECT_COEF: (
|
|
516
|
+
"model.layers.{bid}.altup.correction_coefs", # gemma3n
|
|
517
|
+
),
|
|
518
|
+
|
|
519
|
+
MODEL_TENSOR.ALTUP_CORRECT_SCALE: (
|
|
520
|
+
"model.layers.{bid}.altup.correct_output_scale", # gemma3n
|
|
521
|
+
),
|
|
522
|
+
|
|
523
|
+
MODEL_TENSOR.ALTUP_PREDICT_COEF: (
|
|
524
|
+
"model.layers.{bid}.altup.prediction_coefs", # gemma3n
|
|
525
|
+
),
|
|
526
|
+
|
|
527
|
+
MODEL_TENSOR.ALTUP_ROUTER: (
|
|
528
|
+
"model.layers.{bid}.altup.modality_router", # gemma3n
|
|
529
|
+
),
|
|
530
|
+
|
|
531
|
+
MODEL_TENSOR.ALTUP_ROUTER_NORM: (
|
|
532
|
+
"model.layers.{bid}.altup.router_norm", # gemma3n
|
|
533
|
+
),
|
|
534
|
+
|
|
535
|
+
MODEL_TENSOR.LAUREL_L: (
|
|
536
|
+
"model.layers.{bid}.laurel.linear_left", # gemma3n
|
|
537
|
+
),
|
|
538
|
+
|
|
539
|
+
MODEL_TENSOR.LAUREL_R: (
|
|
540
|
+
"model.layers.{bid}.laurel.linear_right", # gemma3n
|
|
541
|
+
),
|
|
542
|
+
|
|
543
|
+
MODEL_TENSOR.LAUREL_POST_NORM: (
|
|
544
|
+
"model.layers.{bid}.laurel.post_laurel_norm", # gemma3n
|
|
545
|
+
),
|
|
546
|
+
|
|
483
547
|
MODEL_TENSOR.SSM_IN: (
|
|
484
548
|
"model.layers.{bid}.in_proj",
|
|
485
549
|
"backbone.layers.{bid}.mixer.in_proj",
|
|
@@ -7,7 +7,10 @@ import os
|
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
try:
|
|
11
|
+
from sentencepiece import SentencePieceProcessor
|
|
12
|
+
except ImportError:
|
|
13
|
+
SentencePieceProcessor = None
|
|
11
14
|
|
|
12
15
|
import gguf
|
|
13
16
|
|
|
@@ -116,6 +119,7 @@ class SpecialVocab:
|
|
|
116
119
|
logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
|
|
117
120
|
|
|
118
121
|
def _try_load_from_tokenizer_json(self, path: Path) -> bool:
|
|
122
|
+
tokenizer = None
|
|
119
123
|
tokenizer_file = path / 'tokenizer.json'
|
|
120
124
|
if tokenizer_file.is_file():
|
|
121
125
|
with open(tokenizer_file, encoding = 'utf-8') as f:
|
|
@@ -149,11 +153,97 @@ class SpecialVocab:
|
|
|
149
153
|
added_tokens = tokenizer.get('added_tokens', {})
|
|
150
154
|
else:
|
|
151
155
|
added_tokens = {}
|
|
156
|
+
tokenizer_config = None
|
|
152
157
|
tokenizer_config_file = path / 'tokenizer_config.json'
|
|
153
|
-
if
|
|
158
|
+
if tokenizer_config_file.is_file():
|
|
159
|
+
with open(tokenizer_config_file, encoding = 'utf-8') as f:
|
|
160
|
+
tokenizer_config = json.load(f)
|
|
161
|
+
if tokenizer:
|
|
162
|
+
special_bos = (tokenizer_config or {}).get('bos_token')
|
|
163
|
+
special_cls = (tokenizer_config or {}).get('cls_token')
|
|
164
|
+
special_eos = (tokenizer_config or {}).get('eos_token')
|
|
165
|
+
special_sep = (tokenizer_config or {}).get('sep_token')
|
|
166
|
+
if not special_bos and special_cls and tokenizer_config:
|
|
167
|
+
tokenizer_config['bos_token'] = special_bos = special_cls
|
|
168
|
+
if not special_eos and special_sep and tokenizer_config:
|
|
169
|
+
tokenizer_config['eos_token'] = special_eos = special_sep
|
|
170
|
+
if post_processor := tokenizer.get('post_processor'):
|
|
171
|
+
for processor in post_processor.get('processors', [post_processor]):
|
|
172
|
+
if processor.get('type') == 'RobertaProcessing':
|
|
173
|
+
self.add_special_token['bos'] = True
|
|
174
|
+
self.add_special_token['eos'] = True
|
|
175
|
+
self.add_special_token['sep'] = True
|
|
176
|
+
if not special_cls and tokenizer_config:
|
|
177
|
+
special_cls = processor.get('cls', [special_bos])[0]
|
|
178
|
+
tokenizer_config['cls_token'] = special_cls
|
|
179
|
+
if not special_sep and tokenizer_config:
|
|
180
|
+
special_sep = processor.get('sep', [special_eos])[0]
|
|
181
|
+
tokenizer_config['sep_token'] = special_sep
|
|
182
|
+
continue
|
|
183
|
+
# Crude parsing of TemplateProcessing to determine if BOS/SEP/EOS should be added
|
|
184
|
+
# Only works with simple templates, **will** get it wrong on unusual sequences
|
|
185
|
+
if processor.get('type') == 'TemplateProcessing':
|
|
186
|
+
tmpl_single = processor.get('single', [])
|
|
187
|
+
tmpl_pair = processor.get('pair', [])
|
|
188
|
+
special_first = None
|
|
189
|
+
special_last = None
|
|
190
|
+
if len(tmpl_single) > 1:
|
|
191
|
+
if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'):
|
|
192
|
+
if not tokenizer_config:
|
|
193
|
+
special_bos = special_first
|
|
194
|
+
self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False
|
|
195
|
+
if special_first not in (special_bos, special_cls):
|
|
196
|
+
logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing<single>')
|
|
197
|
+
if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'):
|
|
198
|
+
if not tokenizer_config:
|
|
199
|
+
special_eos = special_last
|
|
200
|
+
elif special_last != special_eos:
|
|
201
|
+
if 'eot' not in self.special_token_types:
|
|
202
|
+
self.special_token_types = tuple(self.special_token_types) + ('eot', )
|
|
203
|
+
tokenizer_config['eot_token'] = special_eos
|
|
204
|
+
elif 'eom' not in self.special_token_types:
|
|
205
|
+
self.special_token_types = tuple(self.special_token_types) + ('eom', )
|
|
206
|
+
tokenizer_config['eom_token'] = special_eos
|
|
207
|
+
else:
|
|
208
|
+
logger.warning(f'Overriding EOS token {special_eos!r} with {special_last!r} without EOT/EOM fallback!')
|
|
209
|
+
tokenizer_config['eos_token'] = special_eos = special_last
|
|
210
|
+
self.add_special_token['eos'] = True if special_last == special_eos else False
|
|
211
|
+
if special_last != special_eos:
|
|
212
|
+
logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing<single>')
|
|
213
|
+
if tmpl_pair:
|
|
214
|
+
seq_start = 1 if special_first and tmpl_pair[0].get('SpecialToken', {}).get('id') == special_first else 0
|
|
215
|
+
seq_stop = -1 if special_last and tmpl_pair[-1].get('SpecialToken', {}).get('id') == special_last else None
|
|
216
|
+
if (special_first and seq_start == 0) or (special_last and seq_stop is None):
|
|
217
|
+
logger.warning('TemplateProcessing<single> leading/trailing special tokens do not match TemplateProcessing<pair>')
|
|
218
|
+
if tmpl_pair := tmpl_pair[slice(seq_start, seq_stop)]:
|
|
219
|
+
tmpl_a = tmpl_pair[0].get('Sequence', {}).get('id')
|
|
220
|
+
tmpl_b = tmpl_pair[-1].get('Sequence', {}).get('id')
|
|
221
|
+
if tmpl_a != 'A' or tmpl_b != 'B':
|
|
222
|
+
logger.warning(f'Unknown sequence {tmpl_a}...{tmpl_b} in TemplateProcessing<pair>')
|
|
223
|
+
# A [sep] [eos] B
|
|
224
|
+
if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair[1:-1]):
|
|
225
|
+
add_sep = False
|
|
226
|
+
if special_entry := tmpl_pair[0].get('SpecialToken', {}).get('id'):
|
|
227
|
+
if special_entry in (special_sep, special_eos) and not special_last:
|
|
228
|
+
add_sep = True
|
|
229
|
+
if special_entry not in (special_sep, special_eos):
|
|
230
|
+
logger.warning(f'Unknown separator token {special_entry!r} in TemplateProcessing<pair>')
|
|
231
|
+
else:
|
|
232
|
+
logger.warning(f'Unknown middle sequence {tmpl_pair[0]!r} in TemplateProcessing<pair>')
|
|
233
|
+
if len(tmpl_pair) == 2:
|
|
234
|
+
if special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id'):
|
|
235
|
+
if special_entry in (special_sep, special_eos):
|
|
236
|
+
add_sep = True
|
|
237
|
+
if special_entry not in (special_sep, special_eos):
|
|
238
|
+
logger.warning(f'Unknown second separator token {special_entry!r} in TemplateProcessing<pair>')
|
|
239
|
+
else:
|
|
240
|
+
logger.warning(f'Unknown second middle sequence {tmpl_pair[1]!r} in TemplateProcessing<pair>')
|
|
241
|
+
self.add_special_token['sep'] = add_sep
|
|
242
|
+
if add_sep and not special_sep and tokenizer_config:
|
|
243
|
+
tokenizer_config['sep_token'] = special_eos
|
|
244
|
+
continue
|
|
245
|
+
if not tokenizer_config:
|
|
154
246
|
return True
|
|
155
|
-
with open(tokenizer_config_file, encoding = 'utf-8') as f:
|
|
156
|
-
tokenizer_config = json.load(f)
|
|
157
247
|
chat_template_alt = None
|
|
158
248
|
chat_template_file = path / 'chat_template.json'
|
|
159
249
|
if chat_template_file.is_file():
|
|
@@ -302,6 +392,9 @@ class SentencePieceVocab(Vocab):
|
|
|
302
392
|
name = "spm"
|
|
303
393
|
|
|
304
394
|
def __init__(self, base_path: Path):
|
|
395
|
+
if SentencePieceProcessor is None:
|
|
396
|
+
raise RuntimeError("sentencepiece is not installed")
|
|
397
|
+
|
|
305
398
|
added_tokens: dict[str, int] = {}
|
|
306
399
|
if (fname_tokenizer := base_path / 'tokenizer.model').exists():
|
|
307
400
|
# normal location
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "gguf"
|
|
3
|
-
version = "0.17.
|
|
3
|
+
version = "0.17.1"
|
|
4
4
|
description = "Read and write ML models in GGUF for GGML"
|
|
5
5
|
authors = ["GGML <ggml@ggml.ai>"]
|
|
6
6
|
packages = [
|
|
@@ -22,7 +22,7 @@ python = ">=3.8"
|
|
|
22
22
|
numpy = ">=1.17"
|
|
23
23
|
tqdm = ">=4.27"
|
|
24
24
|
pyyaml = ">=5.1"
|
|
25
|
-
sentencepiece = ">=0.1.98,<=0.2.0"
|
|
25
|
+
sentencepiece = { version = ">=0.1.98,<=0.2.0", optional = true }
|
|
26
26
|
PySide6 = { version = "^6.9", python = ">=3.9,<3.14", optional = true }
|
|
27
27
|
|
|
28
28
|
[tool.poetry.dev-dependencies]
|
|
@@ -390,6 +390,7 @@ extern "C" {
|
|
|
390
390
|
void * imatrix; // pointer to importance matrix data
|
|
391
391
|
void * kv_overrides; // pointer to vector containing overrides
|
|
392
392
|
void * tensor_types; // pointer to vector containing tensor types
|
|
393
|
+
void * prune_layers; // pointer to vector containing layer indices to prune
|
|
393
394
|
} llama_model_quantize_params;
|
|
394
395
|
|
|
395
396
|
typedef struct llama_logit_bias {
|
|
@@ -943,12 +944,14 @@ extern "C" {
|
|
|
943
944
|
// Requires the context to have a memory.
|
|
944
945
|
// For encode-decoder contexts, processes the batch using the decoder.
|
|
945
946
|
// A positive return value does not mean a fatal error, but rather a warning.
|
|
946
|
-
// Upon
|
|
947
|
+
// Upon fatal-error or abort, the ubatches that managed to be processed will remain in the memory state of the context
|
|
948
|
+
// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
|
|
949
|
+
// Upon other return values, the memory state is restored to the state before this call
|
|
947
950
|
// 0 - success
|
|
948
951
|
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
|
949
|
-
// 2 - aborted
|
|
952
|
+
// 2 - aborted (processed ubatches will remain in the context's memory)
|
|
950
953
|
// -1 - invalid input batch
|
|
951
|
-
// < -1 - error
|
|
954
|
+
// < -1 - fatal error (processed ubatches will remain in the context's memory)
|
|
952
955
|
LLAMA_API int32_t llama_decode(
|
|
953
956
|
struct llama_context * ctx,
|
|
954
957
|
struct llama_batch batch);
|
|
@@ -1044,6 +1047,7 @@ extern "C" {
|
|
|
1044
1047
|
|
|
1045
1048
|
LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
|
|
1046
1049
|
LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
|
|
1050
|
+
LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
|
|
1047
1051
|
|
|
1048
1052
|
LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
|
|
1049
1053
|
LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
|
|
@@ -1087,6 +1091,7 @@ extern "C" {
|
|
|
1087
1091
|
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
|
|
1088
1092
|
/// @return Returns the number of tokens on success, no more than n_tokens_max
|
|
1089
1093
|
/// @return Returns a negative number on failure - the number of tokens that would have been returned
|
|
1094
|
+
/// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
|
|
1090
1095
|
/// @param add_special Allow adding BOS and EOS tokens if the model is configured to do so.
|
|
1091
1096
|
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
|
|
1092
1097
|
/// as plaintext. Does not insert a leading space.
|