@novastera-oss/llamarn 0.2.5 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -2
- package/android/CMakeLists.txt +6 -3
- package/android/src/main/cpp/include/llama.h +140 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +48 -67
- package/cpp/LlamaCppModel.h +8 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +33 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +15 -28
- package/cpp/llama.cpp/common/arg.cpp +38 -12
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +9 -3
- package/cpp/llama.cpp/common/chat-parser.h +4 -1
- package/cpp/llama.cpp/common/chat.cpp +16 -13
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +52 -40
- package/cpp/llama.cpp/common/common.h +5 -2
- package/cpp/llama.cpp/common/json-partial.cpp +5 -4
- package/cpp/llama.cpp/common/json-partial.h +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +128 -84
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +49 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +33 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +6 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +25 -16
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -46
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -248
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +9 -8
- package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
- package/cpp/llama.cpp/include/llama.h +140 -38
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +4 -1
- package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
- package/cpp/llama.cpp/src/llama-arch.h +7 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +289 -31
- package/cpp/llama.cpp/src/llama-batch.h +47 -17
- package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +488 -313
- package/cpp/llama.cpp/src/llama-context.h +38 -17
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +275 -152
- package/cpp/llama.cpp/src/llama-graph.h +109 -52
- package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
- package/cpp/llama.cpp/src/llama-hparams.h +8 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +281 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +133 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1835 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +308 -0
- package/cpp/llama.cpp/src/llama-kv-cells.h +53 -17
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +1116 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +188 -0
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +89 -4
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model.cpp +735 -143
- package/cpp/llama.cpp/src/llama-model.h +4 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
- package/cpp/llama.cpp/src/llama-vocab.cpp +39 -25
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
- package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
- package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
- package/cpp/rn-completion.cpp +65 -10
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +8 -1
- package/ios/include/chat.h +1 -1
- package/ios/include/common/minja/chat-template.hpp +1 -1
- package/ios/include/common/minja/minja.hpp +1 -1
- package/ios/include/common.h +5 -2
- package/ios/include/json-schema-to-grammar.h +4 -4
- package/ios/include/llama.h +140 -38
- package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4617
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3557
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3559
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4616
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4637
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3556
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4653
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4674
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3587
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2747
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -502
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -519,7 +519,7 @@ class TextModel(ModelBase):
|
|
|
519
519
|
def set_gguf_parameters(self):
|
|
520
520
|
self.gguf_writer.add_block_count(self.block_count)
|
|
521
521
|
|
|
522
|
-
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
|
|
522
|
+
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
|
|
523
523
|
self.gguf_writer.add_context_length(n_ctx)
|
|
524
524
|
logger.info(f"gguf: context length = {n_ctx}")
|
|
525
525
|
|
|
@@ -1047,6 +1047,10 @@ class TextModel(ModelBase):
|
|
|
1047
1047
|
special_vocab.chat_template = "rwkv-world"
|
|
1048
1048
|
# hack: Add '\n\n' as the EOT token to make it chat normally
|
|
1049
1049
|
special_vocab._set_special_token("eot", 261)
|
|
1050
|
+
# hack: Override these as they have already been set (incorrectly)
|
|
1051
|
+
special_vocab.special_token_ids["bos"] = 0
|
|
1052
|
+
special_vocab.special_token_ids["eos"] = 0
|
|
1053
|
+
|
|
1050
1054
|
special_vocab.add_to_gguf(self.gguf_writer)
|
|
1051
1055
|
|
|
1052
1056
|
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
|
|
@@ -1894,9 +1898,7 @@ class LlamaModel(TextModel):
|
|
|
1894
1898
|
hparams = self.hparams
|
|
1895
1899
|
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
|
1896
1900
|
|
|
1897
|
-
if "head_dim"
|
|
1898
|
-
rope_dim = hparams["head_dim"]
|
|
1899
|
-
else:
|
|
1901
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
1900
1902
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
1901
1903
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
1902
1904
|
|
|
@@ -1978,7 +1980,8 @@ class LlamaModel(TextModel):
|
|
|
1978
1980
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
|
1979
1981
|
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
|
1980
1982
|
base = self.hparams.get("rope_theta", 10000.0)
|
|
1981
|
-
dim
|
|
1983
|
+
if (dim := self.hparams.get("head_dim")) is None:
|
|
1984
|
+
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
|
1982
1985
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
|
1983
1986
|
|
|
1984
1987
|
factor = rope_scaling.get("factor", 8.0)
|
|
@@ -2013,6 +2016,20 @@ class LlamaModel(TextModel):
|
|
|
2013
2016
|
raise ValueError(f"Unprocessed experts: {experts}")
|
|
2014
2017
|
|
|
2015
2018
|
|
|
2019
|
+
@ModelBase.register("ArceeForCausalLM")
|
|
2020
|
+
class ArceeModel(LlamaModel):
|
|
2021
|
+
model_arch = gguf.MODEL_ARCH.ARCEE
|
|
2022
|
+
|
|
2023
|
+
def set_gguf_parameters(self):
|
|
2024
|
+
super().set_gguf_parameters()
|
|
2025
|
+
self._try_set_pooling_type()
|
|
2026
|
+
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
2027
|
+
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
2028
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
2029
|
+
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
2030
|
+
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
2031
|
+
|
|
2032
|
+
|
|
2016
2033
|
@ModelBase.register(
|
|
2017
2034
|
"LlavaForConditionalGeneration", # pixtral
|
|
2018
2035
|
"Mistral3ForConditionalGeneration", # mistral small 3.1
|
|
@@ -2300,9 +2317,7 @@ class DeciModel(TextModel):
|
|
|
2300
2317
|
hparams = self.hparams
|
|
2301
2318
|
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
|
2302
2319
|
|
|
2303
|
-
if "head_dim"
|
|
2304
|
-
rope_dim = hparams["head_dim"]
|
|
2305
|
-
else:
|
|
2320
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
2306
2321
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
2307
2322
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
2308
2323
|
|
|
@@ -2342,7 +2357,8 @@ class DeciModel(TextModel):
|
|
|
2342
2357
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
|
2343
2358
|
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
|
2344
2359
|
base = self.hparams.get("rope_theta", 10000.0)
|
|
2345
|
-
dim
|
|
2360
|
+
if (dim := self.hparams.get("head_dim")) is None:
|
|
2361
|
+
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
|
2346
2362
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
|
2347
2363
|
|
|
2348
2364
|
factor = rope_scaling.get("factor", 8.0)
|
|
@@ -3660,9 +3676,7 @@ class InternLM3Model(TextModel):
|
|
|
3660
3676
|
hparams = self.hparams
|
|
3661
3677
|
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
|
3662
3678
|
|
|
3663
|
-
if "head_dim"
|
|
3664
|
-
rope_dim = hparams["head_dim"]
|
|
3665
|
-
else:
|
|
3679
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
3666
3680
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
3667
3681
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
3668
3682
|
|
|
@@ -3705,8 +3719,7 @@ class BertModel(TextModel):
|
|
|
3705
3719
|
self._try_set_pooling_type()
|
|
3706
3720
|
|
|
3707
3721
|
if self.cls_out_labels:
|
|
3708
|
-
|
|
3709
|
-
self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])
|
|
3722
|
+
self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])
|
|
3710
3723
|
|
|
3711
3724
|
def set_vocab(self):
|
|
3712
3725
|
tokens, toktypes, tokpre = self.get_vocab_base()
|
|
@@ -3810,7 +3823,7 @@ class BertModel(TextModel):
|
|
|
3810
3823
|
remove_whitespaces = tokenizer.clean_up_tokenization_spaces
|
|
3811
3824
|
precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
|
|
3812
3825
|
|
|
3813
|
-
vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
|
|
3826
|
+
vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
|
|
3814
3827
|
else:
|
|
3815
3828
|
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
|
|
3816
3829
|
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
|
@@ -3823,7 +3836,7 @@ class BertModel(TextModel):
|
|
|
3823
3836
|
tokenizer = SentencePieceProcessor()
|
|
3824
3837
|
tokenizer.LoadFromFile(str(tokenizer_path))
|
|
3825
3838
|
|
|
3826
|
-
vocab_size = self.hparams.get(
|
|
3839
|
+
vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
|
|
3827
3840
|
|
|
3828
3841
|
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
|
3829
3842
|
scores: list[float] = [-10000.0] * vocab_size
|
|
@@ -3853,33 +3866,26 @@ class BertModel(TextModel):
|
|
|
3853
3866
|
unk_token = tokenizer_config_json.get("unk_token")
|
|
3854
3867
|
unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
|
|
3855
3868
|
|
|
3856
|
-
for token_id in range(vocab_size):
|
|
3869
|
+
for token_id in range(tokenizer.vocab_size):
|
|
3857
3870
|
piece = tokenizer._convert_id_to_token(token_id)
|
|
3858
|
-
|
|
3859
|
-
|
|
3860
|
-
|
|
3861
|
-
|
|
3862
|
-
|
|
3863
|
-
|
|
3864
|
-
|
|
3865
|
-
|
|
3866
|
-
|
|
3867
|
-
|
|
3868
|
-
|
|
3869
|
-
|
|
3870
|
-
|
|
3871
|
-
|
|
3872
|
-
|
|
3873
|
-
|
|
3874
|
-
|
|
3875
|
-
|
|
3876
|
-
if vocab_size > len(tokens):
|
|
3877
|
-
pad_count = vocab_size - len(tokens)
|
|
3878
|
-
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
|
|
3879
|
-
for i in range(1, pad_count + 1):
|
|
3880
|
-
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
|
|
3881
|
-
scores.append(-1000.0)
|
|
3882
|
-
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
|
3871
|
+
if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
|
|
3872
|
+
text = piece.encode("utf-8")
|
|
3873
|
+
score = tokenizer_json["model"]["vocab"][token_id][1]
|
|
3874
|
+
|
|
3875
|
+
toktype = SentencePieceTokenTypes.NORMAL
|
|
3876
|
+
if token_id == unk_token_id:
|
|
3877
|
+
toktype = SentencePieceTokenTypes.UNKNOWN
|
|
3878
|
+
elif token_id in tokenizer.all_special_ids:
|
|
3879
|
+
toktype = SentencePieceTokenTypes.CONTROL
|
|
3880
|
+
elif token_id in added_vocab.values():
|
|
3881
|
+
toktype = SentencePieceTokenTypes.USER_DEFINED
|
|
3882
|
+
# No reliable way to detect this, but jina doesn't have any
|
|
3883
|
+
# elif tokenizer.IsByte(token_id):
|
|
3884
|
+
# toktype = SentencePieceTokenTypes.BYTE
|
|
3885
|
+
|
|
3886
|
+
tokens[token_id] = text
|
|
3887
|
+
scores[token_id] = score
|
|
3888
|
+
toktypes[token_id] = toktype
|
|
3883
3889
|
|
|
3884
3890
|
if isinstance(tokenizer, SentencePieceProcessor):
|
|
3885
3891
|
# realign tokens (see HF tokenizer code)
|
|
@@ -3892,6 +3898,12 @@ class BertModel(TextModel):
|
|
|
3892
3898
|
SentencePieceTokenTypes.UNKNOWN,
|
|
3893
3899
|
] + toktypes[3:-1]
|
|
3894
3900
|
|
|
3901
|
+
if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
|
|
3902
|
+
# Add mask token missing from sentencepiece.bpe.model
|
|
3903
|
+
tokens[250001] = b'<mask>'
|
|
3904
|
+
scores[250001] = 0.0
|
|
3905
|
+
toktypes[250001] = SentencePieceTokenTypes.CONTROL
|
|
3906
|
+
|
|
3895
3907
|
self.gguf_writer.add_tokenizer_model("t5")
|
|
3896
3908
|
self.gguf_writer.add_tokenizer_pre("default")
|
|
3897
3909
|
self.gguf_writer.add_token_list(tokens)
|
|
@@ -4057,6 +4069,34 @@ class NomicBertModel(BertModel):
|
|
|
4057
4069
|
raise ValueError(f"unknown tokenizer: {toktyp}")
|
|
4058
4070
|
|
|
4059
4071
|
|
|
4072
|
+
@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
|
|
4073
|
+
class NeoBert(BertModel):
|
|
4074
|
+
model_arch = gguf.MODEL_ARCH.NEO_BERT
|
|
4075
|
+
|
|
4076
|
+
def set_gguf_parameters(self):
|
|
4077
|
+
super().set_gguf_parameters()
|
|
4078
|
+
|
|
4079
|
+
# NeoBERT uses 2/3 of the intermediate size as feed forward length
|
|
4080
|
+
self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
|
|
4081
|
+
self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT
|
|
4082
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
|
4083
|
+
|
|
4084
|
+
f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT
|
|
4085
|
+
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
|
|
4086
|
+
logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
|
|
4087
|
+
|
|
4088
|
+
self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
|
|
4089
|
+
|
|
4090
|
+
def modify_tensors(self, data_torch, name, bid):
|
|
4091
|
+
if name.startswith("decoder."):
|
|
4092
|
+
return []
|
|
4093
|
+
|
|
4094
|
+
if name.startswith("model."):
|
|
4095
|
+
name = name[6:]
|
|
4096
|
+
|
|
4097
|
+
return super().modify_tensors(data_torch, name, bid)
|
|
4098
|
+
|
|
4099
|
+
|
|
4060
4100
|
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
|
|
4061
4101
|
class XLMRobertaModel(BertModel):
|
|
4062
4102
|
model_arch = gguf.MODEL_ARCH.BERT
|
|
@@ -4796,25 +4836,6 @@ class OlmoeModel(TextModel):
|
|
|
4796
4836
|
class JinaBertV2Model(BertModel):
|
|
4797
4837
|
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
|
|
4798
4838
|
|
|
4799
|
-
def __init__(self, *args, **kwargs):
|
|
4800
|
-
super().__init__(*args, **kwargs)
|
|
4801
|
-
self.intermediate_size = self.hparams["intermediate_size"]
|
|
4802
|
-
|
|
4803
|
-
def get_tensors(self):
|
|
4804
|
-
for name, data in super().get_tensors():
|
|
4805
|
-
if 'gated_layer' in name:
|
|
4806
|
-
d1 = data[:self.intermediate_size, :]
|
|
4807
|
-
name1 = name.replace('gated_layers', 'gated_layers_w')
|
|
4808
|
-
name1 = name1.replace('up_gated_layer', 'gated_layers_v')
|
|
4809
|
-
d2 = data[self.intermediate_size:, :]
|
|
4810
|
-
name2 = name.replace('gated_layers', 'gated_layers_v')
|
|
4811
|
-
name2 = name2.replace('up_gated_layer', 'gated_layers_w')
|
|
4812
|
-
yield name1, d1
|
|
4813
|
-
yield name2, d2
|
|
4814
|
-
continue
|
|
4815
|
-
|
|
4816
|
-
yield name, data
|
|
4817
|
-
|
|
4818
4839
|
def set_vocab(self):
|
|
4819
4840
|
tokenizer_class = 'BertTokenizer'
|
|
4820
4841
|
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
|
@@ -4830,14 +4851,6 @@ class JinaBertV2Model(BertModel):
|
|
|
4830
4851
|
self.gguf_writer.add_add_bos_token(True)
|
|
4831
4852
|
self.gguf_writer.add_add_eos_token(True)
|
|
4832
4853
|
|
|
4833
|
-
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
4834
|
-
# if name starts with "bert.", remove the prefix
|
|
4835
|
-
# e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
|
4836
|
-
if name.startswith("bert."):
|
|
4837
|
-
name = name[5:]
|
|
4838
|
-
|
|
4839
|
-
return super().modify_tensors(data_torch, name, bid)
|
|
4840
|
-
|
|
4841
4854
|
|
|
4842
4855
|
@ModelBase.register("OpenELMForCausalLM")
|
|
4843
4856
|
class OpenELMModel(TextModel):
|
|
@@ -5078,9 +5091,7 @@ class DeepseekModel(TextModel):
|
|
|
5078
5091
|
def set_gguf_parameters(self):
|
|
5079
5092
|
super().set_gguf_parameters()
|
|
5080
5093
|
hparams = self.hparams
|
|
5081
|
-
if "head_dim"
|
|
5082
|
-
rope_dim = hparams["head_dim"]
|
|
5083
|
-
else:
|
|
5094
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
5084
5095
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
5085
5096
|
|
|
5086
5097
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
@@ -5284,6 +5295,34 @@ class DeepseekV2Model(TextModel):
|
|
|
5284
5295
|
raise ValueError(f"Unprocessed experts: {experts}")
|
|
5285
5296
|
|
|
5286
5297
|
|
|
5298
|
+
@ModelBase.register("Dots1ForCausalLM")
|
|
5299
|
+
class Dots1Model(Qwen2MoeModel):
|
|
5300
|
+
model_arch = gguf.MODEL_ARCH.DOTS1
|
|
5301
|
+
|
|
5302
|
+
def __init__(self, *args, **kwargs):
|
|
5303
|
+
super().__init__(*args, **kwargs)
|
|
5304
|
+
self.hparams["num_experts"] = self.hparams["n_routed_experts"]
|
|
5305
|
+
|
|
5306
|
+
def set_gguf_parameters(self):
|
|
5307
|
+
super().set_gguf_parameters()
|
|
5308
|
+
self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
|
|
5309
|
+
self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
|
|
5310
|
+
self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
|
|
5311
|
+
self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
|
|
5312
|
+
|
|
5313
|
+
if self.hparams["scoring_func"] == "noaux_tc":
|
|
5314
|
+
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
|
|
5315
|
+
else:
|
|
5316
|
+
raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
|
|
5317
|
+
|
|
5318
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
|
5319
|
+
if name.endswith("e_score_correction_bias"):
|
|
5320
|
+
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
|
|
5321
|
+
if "shared_experts" in name:
|
|
5322
|
+
return [(self.map_tensor_name(name), data_torch)]
|
|
5323
|
+
return super().modify_tensors(data_torch, name, bid)
|
|
5324
|
+
|
|
5325
|
+
|
|
5287
5326
|
@ModelBase.register("PLMForCausalLM")
|
|
5288
5327
|
class PLMModel(TextModel):
|
|
5289
5328
|
model_arch = gguf.MODEL_ARCH.PLM
|
|
@@ -5942,7 +5981,8 @@ class ExaoneModel(TextModel):
|
|
|
5942
5981
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
|
5943
5982
|
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
|
5944
5983
|
base = self.hparams.get("rope_theta", 10000.0)
|
|
5945
|
-
dim
|
|
5984
|
+
if (dim := self.hparams.get("head_dim")) is None:
|
|
5985
|
+
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
|
5946
5986
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
|
5947
5987
|
|
|
5948
5988
|
factor = rope_scaling.get("factor", 8.0)
|
|
@@ -6054,7 +6094,8 @@ class BailingMoeModel(TextModel):
|
|
|
6054
6094
|
def set_gguf_parameters(self):
|
|
6055
6095
|
super().set_gguf_parameters()
|
|
6056
6096
|
hparams = self.hparams
|
|
6057
|
-
rope_dim
|
|
6097
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
6098
|
+
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
6058
6099
|
|
|
6059
6100
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
6060
6101
|
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
@@ -6086,7 +6127,8 @@ class BailingMoeModel(TextModel):
|
|
|
6086
6127
|
n_head = self.hparams["num_attention_heads"]
|
|
6087
6128
|
n_kv_head = self.hparams.get("num_key_value_heads")
|
|
6088
6129
|
n_embd = self.hparams["hidden_size"]
|
|
6089
|
-
head_dim
|
|
6130
|
+
if (head_dim := self.hparams.get("head_dim")) is None:
|
|
6131
|
+
head_dim = n_embd // n_head
|
|
6090
6132
|
|
|
6091
6133
|
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
|
|
6092
6134
|
|
|
@@ -6347,8 +6389,8 @@ def parse_args() -> argparse.Namespace:
|
|
|
6347
6389
|
help="model is executed on big endian machine",
|
|
6348
6390
|
)
|
|
6349
6391
|
parser.add_argument(
|
|
6350
|
-
"model", type=
|
|
6351
|
-
help="directory containing model file",
|
|
6392
|
+
"model", type=str,
|
|
6393
|
+
help="directory containing model file or huggingface repository ID (if --remote)",
|
|
6352
6394
|
nargs="?",
|
|
6353
6395
|
)
|
|
6354
6396
|
parser.add_argument(
|
|
@@ -6451,18 +6493,20 @@ def main() -> None:
|
|
|
6451
6493
|
else:
|
|
6452
6494
|
logging.basicConfig(level=logging.INFO)
|
|
6453
6495
|
|
|
6454
|
-
dir_model = args.model
|
|
6455
|
-
|
|
6456
6496
|
if args.remote:
|
|
6497
|
+
hf_repo_id = args.model
|
|
6457
6498
|
from huggingface_hub import snapshot_download
|
|
6458
6499
|
local_dir = snapshot_download(
|
|
6459
|
-
repo_id=
|
|
6500
|
+
repo_id=hf_repo_id,
|
|
6460
6501
|
allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
|
|
6461
6502
|
dir_model = Path(local_dir)
|
|
6462
6503
|
logger.info(f"Downloaded config and tokenizer to {local_dir}")
|
|
6504
|
+
else:
|
|
6505
|
+
hf_repo_id = None
|
|
6506
|
+
dir_model = Path(args.model)
|
|
6463
6507
|
|
|
6464
6508
|
if not dir_model.is_dir():
|
|
6465
|
-
logger.error(f'Error: {
|
|
6509
|
+
logger.error(f'Error: {dir_model} is not a directory')
|
|
6466
6510
|
sys.exit(1)
|
|
6467
6511
|
|
|
6468
6512
|
ftype_map: dict[str, gguf.LlamaFileType] = {
|
|
@@ -6482,9 +6526,9 @@ def main() -> None:
|
|
|
6482
6526
|
|
|
6483
6527
|
if args.outfile is not None:
|
|
6484
6528
|
fname_out = args.outfile
|
|
6485
|
-
elif
|
|
6529
|
+
elif hf_repo_id:
|
|
6486
6530
|
# if remote, use the model ID as the output file name
|
|
6487
|
-
fname_out = Path("./" +
|
|
6531
|
+
fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
|
|
6488
6532
|
else:
|
|
6489
6533
|
fname_out = dir_model
|
|
6490
6534
|
|
|
@@ -6513,7 +6557,7 @@ def main() -> None:
|
|
|
6513
6557
|
split_max_tensors=args.split_max_tensors,
|
|
6514
6558
|
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
|
|
6515
6559
|
small_first_shard=args.no_tensor_first_split,
|
|
6516
|
-
remote_hf_model_id=
|
|
6560
|
+
remote_hf_model_id=hf_repo_id)
|
|
6517
6561
|
|
|
6518
6562
|
if args.vocab_only:
|
|
6519
6563
|
logger.info("Exporting model vocab...")
|
|
@@ -105,7 +105,7 @@ message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
|
|
|
105
105
|
message(DEBUG "INS_ENB : ${INS_ENB}")
|
|
106
106
|
|
|
107
107
|
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
|
108
|
-
option(
|
|
108
|
+
option(GGML_CPU_REPACK "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
|
109
109
|
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
|
|
110
110
|
option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
|
|
111
111
|
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
|
@@ -137,7 +137,7 @@ set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
|
|
137
137
|
set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
|
|
138
138
|
|
|
139
139
|
|
|
140
|
-
if (
|
|
140
|
+
if (MINGW)
|
|
141
141
|
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
|
|
142
142
|
endif()
|
|
143
143
|
|
|
@@ -172,6 +172,7 @@ option(GGML_HIP "ggml: use HIP"
|
|
|
172
172
|
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
|
|
173
173
|
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
|
174
174
|
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
|
175
|
+
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
|
|
175
176
|
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
|
176
177
|
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
|
177
178
|
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
|
@@ -367,6 +368,8 @@ if (MSVC)
|
|
|
367
368
|
/wd4005 # Macro redefinition
|
|
368
369
|
/wd4244 # Conversion from one type to another type, possible loss of data
|
|
369
370
|
/wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
|
|
371
|
+
/wd4305 # Conversion from 'type1' to 'type2', possible loss of data
|
|
372
|
+
/wd4566 # Conversion from 'char' to 'wchar_t', possible loss of data
|
|
370
373
|
/wd4996 # Disable POSIX deprecation warnings
|
|
371
374
|
/wd4702 # Unreachable code warnings
|
|
372
375
|
)
|
|
@@ -386,4 +389,46 @@ if (MSVC)
|
|
|
386
389
|
disable_msvc_warnings(ggml-cpu-skylakex)
|
|
387
390
|
disable_msvc_warnings(ggml-cpu-icelake)
|
|
388
391
|
disable_msvc_warnings(ggml-cpu-alderlake)
|
|
392
|
+
|
|
393
|
+
if (GGML_BUILD_EXAMPLES)
|
|
394
|
+
disable_msvc_warnings(common-ggml)
|
|
395
|
+
disable_msvc_warnings(common)
|
|
396
|
+
|
|
397
|
+
disable_msvc_warnings(mnist-common)
|
|
398
|
+
disable_msvc_warnings(mnist-eval)
|
|
399
|
+
disable_msvc_warnings(mnist-train)
|
|
400
|
+
|
|
401
|
+
disable_msvc_warnings(gpt-2-ctx)
|
|
402
|
+
disable_msvc_warnings(gpt-2-alloc)
|
|
403
|
+
disable_msvc_warnings(gpt-2-backend)
|
|
404
|
+
disable_msvc_warnings(gpt-2-sched)
|
|
405
|
+
disable_msvc_warnings(gpt-2-quantize)
|
|
406
|
+
disable_msvc_warnings(gpt-2-batched)
|
|
407
|
+
|
|
408
|
+
disable_msvc_warnings(gpt-j)
|
|
409
|
+
disable_msvc_warnings(gpt-j-quantize)
|
|
410
|
+
|
|
411
|
+
disable_msvc_warnings(magika)
|
|
412
|
+
disable_msvc_warnings(yolov3-tiny)
|
|
413
|
+
disable_msvc_warnings(sam)
|
|
414
|
+
|
|
415
|
+
disable_msvc_warnings(simple-ctx)
|
|
416
|
+
disable_msvc_warnings(simple-backend)
|
|
417
|
+
endif()
|
|
418
|
+
|
|
419
|
+
if (GGML_BUILD_TESTS)
|
|
420
|
+
disable_msvc_warnings(test-mul-mat)
|
|
421
|
+
disable_msvc_warnings(test-arange)
|
|
422
|
+
disable_msvc_warnings(test-backend-ops)
|
|
423
|
+
disable_msvc_warnings(test-cont)
|
|
424
|
+
disable_msvc_warnings(test-conv-transpose)
|
|
425
|
+
disable_msvc_warnings(test-conv-transpose-1d)
|
|
426
|
+
disable_msvc_warnings(test-conv1d)
|
|
427
|
+
disable_msvc_warnings(test-conv2d)
|
|
428
|
+
disable_msvc_warnings(test-conv2d-dw)
|
|
429
|
+
disable_msvc_warnings(test-customop)
|
|
430
|
+
disable_msvc_warnings(test-dup)
|
|
431
|
+
disable_msvc_warnings(test-opt)
|
|
432
|
+
disable_msvc_warnings(test-pool)
|
|
433
|
+
endif ()
|
|
389
434
|
endif()
|
|
@@ -36,8 +36,7 @@ function(ggml_get_system_arch)
|
|
|
36
36
|
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
|
37
37
|
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
|
|
38
38
|
set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
|
|
39
|
-
elseif (
|
|
40
|
-
"${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
|
|
39
|
+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
|
|
41
40
|
set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
|
|
42
41
|
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
|
43
42
|
set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
|
|
@@ -2095,9 +2095,6 @@ extern "C" {
|
|
|
2095
2095
|
GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
|
|
2096
2096
|
GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
|
|
2097
2097
|
|
|
2098
|
-
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
|
2099
|
-
GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
|
|
2100
|
-
|
|
2101
2098
|
// print info and performance information for the graph
|
|
2102
2099
|
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
|
2103
2100
|
|
|
@@ -2181,6 +2178,7 @@ extern "C" {
|
|
|
2181
2178
|
|
|
2182
2179
|
// scheduling priorities
|
|
2183
2180
|
enum ggml_sched_priority {
|
|
2181
|
+
GGML_SCHED_PRIO_LOW = -1,
|
|
2184
2182
|
GGML_SCHED_PRIO_NORMAL,
|
|
2185
2183
|
GGML_SCHED_PRIO_MEDIUM,
|
|
2186
2184
|
GGML_SCHED_PRIO_HIGH,
|
|
@@ -125,7 +125,6 @@ if (NOT MSVC)
|
|
|
125
125
|
endif()
|
|
126
126
|
|
|
127
127
|
if (MINGW)
|
|
128
|
-
# Target Windows 8 for PrefetchVirtualMemory
|
|
129
128
|
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
|
|
130
129
|
endif()
|
|
131
130
|
|
|
@@ -196,6 +195,7 @@ add_library(ggml-base
|
|
|
196
195
|
../include/ggml-opt.h
|
|
197
196
|
../include/gguf.h
|
|
198
197
|
ggml.c
|
|
198
|
+
ggml.cpp
|
|
199
199
|
ggml-alloc.c
|
|
200
200
|
ggml-backend.cpp
|
|
201
201
|
ggml-opt.cpp
|
|
@@ -212,6 +212,7 @@ endif()
|
|
|
212
212
|
|
|
213
213
|
add_library(ggml
|
|
214
214
|
ggml-backend-reg.cpp)
|
|
215
|
+
add_library(ggml::ggml ALIAS ggml)
|
|
215
216
|
|
|
216
217
|
target_link_libraries(ggml PUBLIC ggml-base)
|
|
217
218
|
|
|
@@ -226,6 +227,7 @@ function(ggml_add_backend_library backend)
|
|
|
226
227
|
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
|
227
228
|
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
|
|
228
229
|
add_dependencies(ggml ${backend})
|
|
230
|
+
install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
|
|
229
231
|
else()
|
|
230
232
|
add_library(${backend} ${ARGN})
|
|
231
233
|
target_link_libraries(ggml PUBLIC ${backend})
|
|
@@ -268,17 +270,23 @@ endfunction()
|
|
|
268
270
|
function(ggml_add_cpu_backend_variant tag_name)
|
|
269
271
|
set(GGML_CPU_TAG_NAME ${tag_name})
|
|
270
272
|
# other: OPENMP LLAMAFILE CPU_HBM
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
273
|
+
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
|
274
|
+
foreach (feat NATIVE
|
|
275
|
+
SSE42
|
|
276
|
+
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
|
277
|
+
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
|
278
|
+
AMX_TILE AMX_INT8 AMX_BF16)
|
|
279
|
+
set(GGML_${feat} OFF)
|
|
280
|
+
endforeach()
|
|
281
|
+
|
|
282
|
+
foreach (feat ${ARGN})
|
|
283
|
+
set(GGML_${feat} ON)
|
|
284
|
+
endforeach()
|
|
285
|
+
elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
|
|
286
|
+
foreach (feat ${ARGN})
|
|
287
|
+
set(GGML_INTERNAL_${feat} ON)
|
|
288
|
+
endforeach()
|
|
289
|
+
endif()
|
|
282
290
|
|
|
283
291
|
ggml_add_cpu_backend_variant_impl(${tag_name})
|
|
284
292
|
endfunction()
|
|
@@ -288,6 +296,8 @@ ggml_add_backend(CPU)
|
|
|
288
296
|
if (GGML_CPU_ALL_VARIANTS)
|
|
289
297
|
if (NOT GGML_BACKEND_DL)
|
|
290
298
|
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
|
|
299
|
+
elseif (GGML_CPU_ARM_ARCH)
|
|
300
|
+
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
|
|
291
301
|
endif()
|
|
292
302
|
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
|
293
303
|
ggml_add_cpu_backend_variant(x64)
|
|
@@ -301,8 +311,34 @@ if (GGML_CPU_ALL_VARIANTS)
|
|
|
301
311
|
# MSVC doesn't support AMX
|
|
302
312
|
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
|
303
313
|
endif()
|
|
314
|
+
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
|
|
315
|
+
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
|
316
|
+
# Many of these features are optional so we build versions with popular
|
|
317
|
+
# combinations and name the backends based on the version they were
|
|
318
|
+
# first released with
|
|
319
|
+
ggml_add_cpu_backend_variant(armv8.0_1)
|
|
320
|
+
ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
|
|
321
|
+
ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
|
322
|
+
ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
|
|
323
|
+
ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
|
|
324
|
+
ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
|
|
325
|
+
ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
|
|
326
|
+
ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
|
|
327
|
+
elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
|
|
328
|
+
# Android-specific backends with SoC-compatible feature sets
|
|
329
|
+
ggml_add_cpu_backend_variant(android_armv8.0_1)
|
|
330
|
+
ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
|
|
331
|
+
ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
|
332
|
+
ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
|
|
333
|
+
elseif (APPLE)
|
|
334
|
+
ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
|
|
335
|
+
ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
|
|
336
|
+
ggml_add_cpu_backend_variant(apple_m4 DOTPROD MATMUL_INT8 NOSVE SME)
|
|
337
|
+
else()
|
|
338
|
+
message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
|
|
339
|
+
endif()
|
|
304
340
|
else()
|
|
305
|
-
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported
|
|
341
|
+
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
|
|
306
342
|
endif()
|
|
307
343
|
elseif (GGML_CPU)
|
|
308
344
|
ggml_add_cpu_backend_variant_impl("")
|
|
@@ -69,6 +69,9 @@
|
|
|
69
69
|
#if defined(__clang__)
|
|
70
70
|
# pragma clang diagnostic push
|
|
71
71
|
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
|
72
|
+
#elif defined(__GNUC__)
|
|
73
|
+
# pragma GCC diagnostic push
|
|
74
|
+
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
|
72
75
|
#endif
|
|
73
76
|
|
|
74
77
|
namespace fs = std::filesystem;
|
|
@@ -91,6 +94,8 @@ static std::string path_str(const fs::path & path) {
|
|
|
91
94
|
|
|
92
95
|
#if defined(__clang__)
|
|
93
96
|
# pragma clang diagnostic pop
|
|
97
|
+
#elif defined(__GNUC__)
|
|
98
|
+
# pragma GCC diagnostic pop
|
|
94
99
|
#endif
|
|
95
100
|
|
|
96
101
|
#ifdef _WIN32
|