@novastera-oss/llamarn 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +134 -36
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -2
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +30 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +50 -40
- package/cpp/llama.cpp/common/common.h +5 -2
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +97 -56
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +47 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +5 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +6 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -38
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +431 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +0 -6
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
- package/cpp/llama.cpp/include/llama.h +134 -36
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
- package/cpp/llama.cpp/src/llama-arch.h +7 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +270 -19
- package/cpp/llama.cpp/src/llama-batch.h +36 -11
- package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +313 -213
- package/cpp/llama.cpp/src/llama-context.h +16 -12
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +249 -129
- package/cpp/llama.cpp/src/llama-graph.h +90 -34
- package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
- package/cpp/llama.cpp/src/llama-hparams.h +8 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +82 -50
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +292 -174
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +68 -38
- package/cpp/llama.cpp/src/llama-kv-cells.h +18 -13
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +266 -282
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +54 -57
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +64 -23
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model.cpp +726 -141
- package/cpp/llama.cpp/src/llama-model.h +4 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
- package/cpp/llama.cpp/src/llama-vocab.cpp +32 -23
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +5 -2
- package/ios/include/llama.h +134 -36
- package/ios/libs/llama.xcframework/Info.plist +18 -18
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/{rn-utils.hpp → rn-utils.h} +0 -0
|
@@ -519,7 +519,7 @@ class TextModel(ModelBase):
|
|
|
519
519
|
def set_gguf_parameters(self):
|
|
520
520
|
self.gguf_writer.add_block_count(self.block_count)
|
|
521
521
|
|
|
522
|
-
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
|
|
522
|
+
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
|
|
523
523
|
self.gguf_writer.add_context_length(n_ctx)
|
|
524
524
|
logger.info(f"gguf: context length = {n_ctx}")
|
|
525
525
|
|
|
@@ -1898,9 +1898,7 @@ class LlamaModel(TextModel):
|
|
|
1898
1898
|
hparams = self.hparams
|
|
1899
1899
|
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
|
1900
1900
|
|
|
1901
|
-
if "head_dim"
|
|
1902
|
-
rope_dim = hparams["head_dim"]
|
|
1903
|
-
else:
|
|
1901
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
1904
1902
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
1905
1903
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
1906
1904
|
|
|
@@ -1982,7 +1980,8 @@ class LlamaModel(TextModel):
|
|
|
1982
1980
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
|
1983
1981
|
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
|
1984
1982
|
base = self.hparams.get("rope_theta", 10000.0)
|
|
1985
|
-
dim
|
|
1983
|
+
if (dim := self.hparams.get("head_dim")) is None:
|
|
1984
|
+
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
|
1986
1985
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
|
1987
1986
|
|
|
1988
1987
|
factor = rope_scaling.get("factor", 8.0)
|
|
@@ -2017,6 +2016,20 @@ class LlamaModel(TextModel):
|
|
|
2017
2016
|
raise ValueError(f"Unprocessed experts: {experts}")
|
|
2018
2017
|
|
|
2019
2018
|
|
|
2019
|
+
@ModelBase.register("ArceeForCausalLM")
|
|
2020
|
+
class ArceeModel(LlamaModel):
|
|
2021
|
+
model_arch = gguf.MODEL_ARCH.ARCEE
|
|
2022
|
+
|
|
2023
|
+
def set_gguf_parameters(self):
|
|
2024
|
+
super().set_gguf_parameters()
|
|
2025
|
+
self._try_set_pooling_type()
|
|
2026
|
+
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
2027
|
+
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
2028
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
2029
|
+
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
2030
|
+
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
2031
|
+
|
|
2032
|
+
|
|
2020
2033
|
@ModelBase.register(
|
|
2021
2034
|
"LlavaForConditionalGeneration", # pixtral
|
|
2022
2035
|
"Mistral3ForConditionalGeneration", # mistral small 3.1
|
|
@@ -2304,9 +2317,7 @@ class DeciModel(TextModel):
|
|
|
2304
2317
|
hparams = self.hparams
|
|
2305
2318
|
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
|
2306
2319
|
|
|
2307
|
-
if "head_dim"
|
|
2308
|
-
rope_dim = hparams["head_dim"]
|
|
2309
|
-
else:
|
|
2320
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
2310
2321
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
2311
2322
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
2312
2323
|
|
|
@@ -2346,7 +2357,8 @@ class DeciModel(TextModel):
|
|
|
2346
2357
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
|
2347
2358
|
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
|
2348
2359
|
base = self.hparams.get("rope_theta", 10000.0)
|
|
2349
|
-
dim
|
|
2360
|
+
if (dim := self.hparams.get("head_dim")) is None:
|
|
2361
|
+
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
|
2350
2362
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
|
2351
2363
|
|
|
2352
2364
|
factor = rope_scaling.get("factor", 8.0)
|
|
@@ -3664,9 +3676,7 @@ class InternLM3Model(TextModel):
|
|
|
3664
3676
|
hparams = self.hparams
|
|
3665
3677
|
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
|
3666
3678
|
|
|
3667
|
-
if "head_dim"
|
|
3668
|
-
rope_dim = hparams["head_dim"]
|
|
3669
|
-
else:
|
|
3679
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
3670
3680
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
3671
3681
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
3672
3682
|
|
|
@@ -3709,8 +3719,7 @@ class BertModel(TextModel):
|
|
|
3709
3719
|
self._try_set_pooling_type()
|
|
3710
3720
|
|
|
3711
3721
|
if self.cls_out_labels:
|
|
3712
|
-
|
|
3713
|
-
self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])
|
|
3722
|
+
self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])
|
|
3714
3723
|
|
|
3715
3724
|
def set_vocab(self):
|
|
3716
3725
|
tokens, toktypes, tokpre = self.get_vocab_base()
|
|
@@ -4060,6 +4069,34 @@ class NomicBertModel(BertModel):
|
|
|
4060
4069
|
raise ValueError(f"unknown tokenizer: {toktyp}")
|
|
4061
4070
|
|
|
4062
4071
|
|
|
4072
|
+
@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
|
|
4073
|
+
class NeoBert(BertModel):
|
|
4074
|
+
model_arch = gguf.MODEL_ARCH.NEO_BERT
|
|
4075
|
+
|
|
4076
|
+
def set_gguf_parameters(self):
|
|
4077
|
+
super().set_gguf_parameters()
|
|
4078
|
+
|
|
4079
|
+
# NeoBERT uses 2/3 of the intermediate size as feed forward length
|
|
4080
|
+
self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
|
|
4081
|
+
self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT
|
|
4082
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
|
4083
|
+
|
|
4084
|
+
f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT
|
|
4085
|
+
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
|
|
4086
|
+
logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
|
|
4087
|
+
|
|
4088
|
+
self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
|
|
4089
|
+
|
|
4090
|
+
def modify_tensors(self, data_torch, name, bid):
|
|
4091
|
+
if name.startswith("decoder."):
|
|
4092
|
+
return []
|
|
4093
|
+
|
|
4094
|
+
if name.startswith("model."):
|
|
4095
|
+
name = name[6:]
|
|
4096
|
+
|
|
4097
|
+
return super().modify_tensors(data_torch, name, bid)
|
|
4098
|
+
|
|
4099
|
+
|
|
4063
4100
|
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
|
|
4064
4101
|
class XLMRobertaModel(BertModel):
|
|
4065
4102
|
model_arch = gguf.MODEL_ARCH.BERT
|
|
@@ -4799,25 +4836,6 @@ class OlmoeModel(TextModel):
|
|
|
4799
4836
|
class JinaBertV2Model(BertModel):
|
|
4800
4837
|
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
|
|
4801
4838
|
|
|
4802
|
-
def __init__(self, *args, **kwargs):
|
|
4803
|
-
super().__init__(*args, **kwargs)
|
|
4804
|
-
self.intermediate_size = self.hparams["intermediate_size"]
|
|
4805
|
-
|
|
4806
|
-
def get_tensors(self):
|
|
4807
|
-
for name, data in super().get_tensors():
|
|
4808
|
-
if 'gated_layer' in name:
|
|
4809
|
-
d1 = data[:self.intermediate_size, :]
|
|
4810
|
-
name1 = name.replace('gated_layers', 'gated_layers_w')
|
|
4811
|
-
name1 = name1.replace('up_gated_layer', 'gated_layers_v')
|
|
4812
|
-
d2 = data[self.intermediate_size:, :]
|
|
4813
|
-
name2 = name.replace('gated_layers', 'gated_layers_v')
|
|
4814
|
-
name2 = name2.replace('up_gated_layer', 'gated_layers_w')
|
|
4815
|
-
yield name1, d1
|
|
4816
|
-
yield name2, d2
|
|
4817
|
-
continue
|
|
4818
|
-
|
|
4819
|
-
yield name, data
|
|
4820
|
-
|
|
4821
4839
|
def set_vocab(self):
|
|
4822
4840
|
tokenizer_class = 'BertTokenizer'
|
|
4823
4841
|
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
|
@@ -4833,14 +4851,6 @@ class JinaBertV2Model(BertModel):
|
|
|
4833
4851
|
self.gguf_writer.add_add_bos_token(True)
|
|
4834
4852
|
self.gguf_writer.add_add_eos_token(True)
|
|
4835
4853
|
|
|
4836
|
-
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
4837
|
-
# if name starts with "bert.", remove the prefix
|
|
4838
|
-
# e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
|
4839
|
-
if name.startswith("bert."):
|
|
4840
|
-
name = name[5:]
|
|
4841
|
-
|
|
4842
|
-
return super().modify_tensors(data_torch, name, bid)
|
|
4843
|
-
|
|
4844
4854
|
|
|
4845
4855
|
@ModelBase.register("OpenELMForCausalLM")
|
|
4846
4856
|
class OpenELMModel(TextModel):
|
|
@@ -5081,9 +5091,7 @@ class DeepseekModel(TextModel):
|
|
|
5081
5091
|
def set_gguf_parameters(self):
|
|
5082
5092
|
super().set_gguf_parameters()
|
|
5083
5093
|
hparams = self.hparams
|
|
5084
|
-
if "head_dim"
|
|
5085
|
-
rope_dim = hparams["head_dim"]
|
|
5086
|
-
else:
|
|
5094
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
5087
5095
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
5088
5096
|
|
|
5089
5097
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
@@ -5287,6 +5295,34 @@ class DeepseekV2Model(TextModel):
|
|
|
5287
5295
|
raise ValueError(f"Unprocessed experts: {experts}")
|
|
5288
5296
|
|
|
5289
5297
|
|
|
5298
|
+
@ModelBase.register("Dots1ForCausalLM")
|
|
5299
|
+
class Dots1Model(Qwen2MoeModel):
|
|
5300
|
+
model_arch = gguf.MODEL_ARCH.DOTS1
|
|
5301
|
+
|
|
5302
|
+
def __init__(self, *args, **kwargs):
|
|
5303
|
+
super().__init__(*args, **kwargs)
|
|
5304
|
+
self.hparams["num_experts"] = self.hparams["n_routed_experts"]
|
|
5305
|
+
|
|
5306
|
+
def set_gguf_parameters(self):
|
|
5307
|
+
super().set_gguf_parameters()
|
|
5308
|
+
self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
|
|
5309
|
+
self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
|
|
5310
|
+
self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
|
|
5311
|
+
self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
|
|
5312
|
+
|
|
5313
|
+
if self.hparams["scoring_func"] == "noaux_tc":
|
|
5314
|
+
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
|
|
5315
|
+
else:
|
|
5316
|
+
raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
|
|
5317
|
+
|
|
5318
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
|
5319
|
+
if name.endswith("e_score_correction_bias"):
|
|
5320
|
+
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
|
|
5321
|
+
if "shared_experts" in name:
|
|
5322
|
+
return [(self.map_tensor_name(name), data_torch)]
|
|
5323
|
+
return super().modify_tensors(data_torch, name, bid)
|
|
5324
|
+
|
|
5325
|
+
|
|
5290
5326
|
@ModelBase.register("PLMForCausalLM")
|
|
5291
5327
|
class PLMModel(TextModel):
|
|
5292
5328
|
model_arch = gguf.MODEL_ARCH.PLM
|
|
@@ -5945,7 +5981,8 @@ class ExaoneModel(TextModel):
|
|
|
5945
5981
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
|
5946
5982
|
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
|
5947
5983
|
base = self.hparams.get("rope_theta", 10000.0)
|
|
5948
|
-
dim
|
|
5984
|
+
if (dim := self.hparams.get("head_dim")) is None:
|
|
5985
|
+
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
|
5949
5986
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
|
5950
5987
|
|
|
5951
5988
|
factor = rope_scaling.get("factor", 8.0)
|
|
@@ -6057,7 +6094,8 @@ class BailingMoeModel(TextModel):
|
|
|
6057
6094
|
def set_gguf_parameters(self):
|
|
6058
6095
|
super().set_gguf_parameters()
|
|
6059
6096
|
hparams = self.hparams
|
|
6060
|
-
rope_dim
|
|
6097
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
6098
|
+
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
6061
6099
|
|
|
6062
6100
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
6063
6101
|
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
@@ -6089,7 +6127,8 @@ class BailingMoeModel(TextModel):
|
|
|
6089
6127
|
n_head = self.hparams["num_attention_heads"]
|
|
6090
6128
|
n_kv_head = self.hparams.get("num_key_value_heads")
|
|
6091
6129
|
n_embd = self.hparams["hidden_size"]
|
|
6092
|
-
head_dim
|
|
6130
|
+
if (head_dim := self.hparams.get("head_dim")) is None:
|
|
6131
|
+
head_dim = n_embd // n_head
|
|
6093
6132
|
|
|
6094
6133
|
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
|
|
6095
6134
|
|
|
@@ -6350,8 +6389,8 @@ def parse_args() -> argparse.Namespace:
|
|
|
6350
6389
|
help="model is executed on big endian machine",
|
|
6351
6390
|
)
|
|
6352
6391
|
parser.add_argument(
|
|
6353
|
-
"model", type=
|
|
6354
|
-
help="directory containing model file",
|
|
6392
|
+
"model", type=str,
|
|
6393
|
+
help="directory containing model file or huggingface repository ID (if --remote)",
|
|
6355
6394
|
nargs="?",
|
|
6356
6395
|
)
|
|
6357
6396
|
parser.add_argument(
|
|
@@ -6454,18 +6493,20 @@ def main() -> None:
|
|
|
6454
6493
|
else:
|
|
6455
6494
|
logging.basicConfig(level=logging.INFO)
|
|
6456
6495
|
|
|
6457
|
-
dir_model = args.model
|
|
6458
|
-
|
|
6459
6496
|
if args.remote:
|
|
6497
|
+
hf_repo_id = args.model
|
|
6460
6498
|
from huggingface_hub import snapshot_download
|
|
6461
6499
|
local_dir = snapshot_download(
|
|
6462
|
-
repo_id=
|
|
6500
|
+
repo_id=hf_repo_id,
|
|
6463
6501
|
allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
|
|
6464
6502
|
dir_model = Path(local_dir)
|
|
6465
6503
|
logger.info(f"Downloaded config and tokenizer to {local_dir}")
|
|
6504
|
+
else:
|
|
6505
|
+
hf_repo_id = None
|
|
6506
|
+
dir_model = Path(args.model)
|
|
6466
6507
|
|
|
6467
6508
|
if not dir_model.is_dir():
|
|
6468
|
-
logger.error(f'Error: {
|
|
6509
|
+
logger.error(f'Error: {dir_model} is not a directory')
|
|
6469
6510
|
sys.exit(1)
|
|
6470
6511
|
|
|
6471
6512
|
ftype_map: dict[str, gguf.LlamaFileType] = {
|
|
@@ -6485,9 +6526,9 @@ def main() -> None:
|
|
|
6485
6526
|
|
|
6486
6527
|
if args.outfile is not None:
|
|
6487
6528
|
fname_out = args.outfile
|
|
6488
|
-
elif
|
|
6529
|
+
elif hf_repo_id:
|
|
6489
6530
|
# if remote, use the model ID as the output file name
|
|
6490
|
-
fname_out = Path("./" +
|
|
6531
|
+
fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
|
|
6491
6532
|
else:
|
|
6492
6533
|
fname_out = dir_model
|
|
6493
6534
|
|
|
@@ -6516,7 +6557,7 @@ def main() -> None:
|
|
|
6516
6557
|
split_max_tensors=args.split_max_tensors,
|
|
6517
6558
|
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
|
|
6518
6559
|
small_first_shard=args.no_tensor_first_split,
|
|
6519
|
-
remote_hf_model_id=
|
|
6560
|
+
remote_hf_model_id=hf_repo_id)
|
|
6520
6561
|
|
|
6521
6562
|
if args.vocab_only:
|
|
6522
6563
|
logger.info("Exporting model vocab...")
|
|
@@ -105,7 +105,7 @@ message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
|
|
|
105
105
|
message(DEBUG "INS_ENB : ${INS_ENB}")
|
|
106
106
|
|
|
107
107
|
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
|
108
|
-
option(
|
|
108
|
+
option(GGML_CPU_REPACK "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
|
109
109
|
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
|
|
110
110
|
option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
|
|
111
111
|
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
|
@@ -137,7 +137,7 @@ set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
|
|
137
137
|
set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
|
|
138
138
|
|
|
139
139
|
|
|
140
|
-
if (
|
|
140
|
+
if (MINGW)
|
|
141
141
|
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
|
|
142
142
|
endif()
|
|
143
143
|
|
|
@@ -172,6 +172,7 @@ option(GGML_HIP "ggml: use HIP"
|
|
|
172
172
|
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
|
|
173
173
|
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
|
174
174
|
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
|
175
|
+
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
|
|
175
176
|
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
|
176
177
|
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
|
177
178
|
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
|
@@ -367,6 +368,8 @@ if (MSVC)
|
|
|
367
368
|
/wd4005 # Macro redefinition
|
|
368
369
|
/wd4244 # Conversion from one type to another type, possible loss of data
|
|
369
370
|
/wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
|
|
371
|
+
/wd4305 # Conversion from 'type1' to 'type2', possible loss of data
|
|
372
|
+
/wd4566 # Conversion from 'char' to 'wchar_t', possible loss of data
|
|
370
373
|
/wd4996 # Disable POSIX deprecation warnings
|
|
371
374
|
/wd4702 # Unreachable code warnings
|
|
372
375
|
)
|
|
@@ -386,4 +389,46 @@ if (MSVC)
|
|
|
386
389
|
disable_msvc_warnings(ggml-cpu-skylakex)
|
|
387
390
|
disable_msvc_warnings(ggml-cpu-icelake)
|
|
388
391
|
disable_msvc_warnings(ggml-cpu-alderlake)
|
|
392
|
+
|
|
393
|
+
if (GGML_BUILD_EXAMPLES)
|
|
394
|
+
disable_msvc_warnings(common-ggml)
|
|
395
|
+
disable_msvc_warnings(common)
|
|
396
|
+
|
|
397
|
+
disable_msvc_warnings(mnist-common)
|
|
398
|
+
disable_msvc_warnings(mnist-eval)
|
|
399
|
+
disable_msvc_warnings(mnist-train)
|
|
400
|
+
|
|
401
|
+
disable_msvc_warnings(gpt-2-ctx)
|
|
402
|
+
disable_msvc_warnings(gpt-2-alloc)
|
|
403
|
+
disable_msvc_warnings(gpt-2-backend)
|
|
404
|
+
disable_msvc_warnings(gpt-2-sched)
|
|
405
|
+
disable_msvc_warnings(gpt-2-quantize)
|
|
406
|
+
disable_msvc_warnings(gpt-2-batched)
|
|
407
|
+
|
|
408
|
+
disable_msvc_warnings(gpt-j)
|
|
409
|
+
disable_msvc_warnings(gpt-j-quantize)
|
|
410
|
+
|
|
411
|
+
disable_msvc_warnings(magika)
|
|
412
|
+
disable_msvc_warnings(yolov3-tiny)
|
|
413
|
+
disable_msvc_warnings(sam)
|
|
414
|
+
|
|
415
|
+
disable_msvc_warnings(simple-ctx)
|
|
416
|
+
disable_msvc_warnings(simple-backend)
|
|
417
|
+
endif()
|
|
418
|
+
|
|
419
|
+
if (GGML_BUILD_TESTS)
|
|
420
|
+
disable_msvc_warnings(test-mul-mat)
|
|
421
|
+
disable_msvc_warnings(test-arange)
|
|
422
|
+
disable_msvc_warnings(test-backend-ops)
|
|
423
|
+
disable_msvc_warnings(test-cont)
|
|
424
|
+
disable_msvc_warnings(test-conv-transpose)
|
|
425
|
+
disable_msvc_warnings(test-conv-transpose-1d)
|
|
426
|
+
disable_msvc_warnings(test-conv1d)
|
|
427
|
+
disable_msvc_warnings(test-conv2d)
|
|
428
|
+
disable_msvc_warnings(test-conv2d-dw)
|
|
429
|
+
disable_msvc_warnings(test-customop)
|
|
430
|
+
disable_msvc_warnings(test-dup)
|
|
431
|
+
disable_msvc_warnings(test-opt)
|
|
432
|
+
disable_msvc_warnings(test-pool)
|
|
433
|
+
endif ()
|
|
389
434
|
endif()
|
|
@@ -36,8 +36,7 @@ function(ggml_get_system_arch)
|
|
|
36
36
|
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
|
37
37
|
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
|
|
38
38
|
set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
|
|
39
|
-
elseif (
|
|
40
|
-
"${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
|
|
39
|
+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
|
|
41
40
|
set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
|
|
42
41
|
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
|
43
42
|
set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
|
|
@@ -125,7 +125,6 @@ if (NOT MSVC)
|
|
|
125
125
|
endif()
|
|
126
126
|
|
|
127
127
|
if (MINGW)
|
|
128
|
-
# Target Windows 8 for PrefetchVirtualMemory
|
|
129
128
|
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
|
|
130
129
|
endif()
|
|
131
130
|
|
|
@@ -213,6 +212,7 @@ endif()
|
|
|
213
212
|
|
|
214
213
|
add_library(ggml
|
|
215
214
|
ggml-backend-reg.cpp)
|
|
215
|
+
add_library(ggml::ggml ALIAS ggml)
|
|
216
216
|
|
|
217
217
|
target_link_libraries(ggml PUBLIC ggml-base)
|
|
218
218
|
|
|
@@ -270,17 +270,23 @@ endfunction()
|
|
|
270
270
|
function(ggml_add_cpu_backend_variant tag_name)
|
|
271
271
|
set(GGML_CPU_TAG_NAME ${tag_name})
|
|
272
272
|
# other: OPENMP LLAMAFILE CPU_HBM
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
273
|
+
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
|
274
|
+
foreach (feat NATIVE
|
|
275
|
+
SSE42
|
|
276
|
+
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
|
277
|
+
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
|
278
|
+
AMX_TILE AMX_INT8 AMX_BF16)
|
|
279
|
+
set(GGML_${feat} OFF)
|
|
280
|
+
endforeach()
|
|
281
|
+
|
|
282
|
+
foreach (feat ${ARGN})
|
|
283
|
+
set(GGML_${feat} ON)
|
|
284
|
+
endforeach()
|
|
285
|
+
elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
|
|
286
|
+
foreach (feat ${ARGN})
|
|
287
|
+
set(GGML_INTERNAL_${feat} ON)
|
|
288
|
+
endforeach()
|
|
289
|
+
endif()
|
|
284
290
|
|
|
285
291
|
ggml_add_cpu_backend_variant_impl(${tag_name})
|
|
286
292
|
endfunction()
|
|
@@ -290,6 +296,8 @@ ggml_add_backend(CPU)
|
|
|
290
296
|
if (GGML_CPU_ALL_VARIANTS)
|
|
291
297
|
if (NOT GGML_BACKEND_DL)
|
|
292
298
|
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
|
|
299
|
+
elseif (GGML_CPU_ARM_ARCH)
|
|
300
|
+
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
|
|
293
301
|
endif()
|
|
294
302
|
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
|
295
303
|
ggml_add_cpu_backend_variant(x64)
|
|
@@ -303,8 +311,34 @@ if (GGML_CPU_ALL_VARIANTS)
|
|
|
303
311
|
# MSVC doesn't support AMX
|
|
304
312
|
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
|
305
313
|
endif()
|
|
314
|
+
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
|
|
315
|
+
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
|
316
|
+
# Many of these features are optional so we build versions with popular
|
|
317
|
+
# combinations and name the backends based on the version they were
|
|
318
|
+
# first released with
|
|
319
|
+
ggml_add_cpu_backend_variant(armv8.0_1)
|
|
320
|
+
ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
|
|
321
|
+
ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
|
322
|
+
ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
|
|
323
|
+
ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
|
|
324
|
+
ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
|
|
325
|
+
ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
|
|
326
|
+
ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
|
|
327
|
+
elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
|
|
328
|
+
# Android-specific backends with SoC-compatible feature sets
|
|
329
|
+
ggml_add_cpu_backend_variant(android_armv8.0_1)
|
|
330
|
+
ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
|
|
331
|
+
ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
|
332
|
+
ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
|
|
333
|
+
elseif (APPLE)
|
|
334
|
+
ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
|
|
335
|
+
ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
|
|
336
|
+
ggml_add_cpu_backend_variant(apple_m4 DOTPROD MATMUL_INT8 NOSVE SME)
|
|
337
|
+
else()
|
|
338
|
+
message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
|
|
339
|
+
endif()
|
|
306
340
|
else()
|
|
307
|
-
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported
|
|
341
|
+
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
|
|
308
342
|
endif()
|
|
309
343
|
elseif (GGML_CPU)
|
|
310
344
|
ggml_add_cpu_backend_variant_impl("")
|
|
@@ -69,6 +69,9 @@
|
|
|
69
69
|
#if defined(__clang__)
|
|
70
70
|
# pragma clang diagnostic push
|
|
71
71
|
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
|
72
|
+
#elif defined(__GNUC__)
|
|
73
|
+
# pragma GCC diagnostic push
|
|
74
|
+
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
|
72
75
|
#endif
|
|
73
76
|
|
|
74
77
|
namespace fs = std::filesystem;
|
|
@@ -91,6 +94,8 @@ static std::string path_str(const fs::path & path) {
|
|
|
91
94
|
|
|
92
95
|
#if defined(__clang__)
|
|
93
96
|
# pragma clang diagnostic pop
|
|
97
|
+
#elif defined(__GNUC__)
|
|
98
|
+
# pragma GCC diagnostic pop
|
|
94
99
|
#endif
|
|
95
100
|
|
|
96
101
|
#ifdef _WIN32
|
|
@@ -37,6 +37,7 @@
|
|
|
37
37
|
#include <thread>
|
|
38
38
|
#include <unistd.h>
|
|
39
39
|
#include <functional>
|
|
40
|
+
#include <optional>
|
|
40
41
|
|
|
41
42
|
#include "../include/ggml-cann.h"
|
|
42
43
|
#include "../include/ggml.h"
|
|
@@ -103,6 +104,9 @@ const ggml_cann_device_info& ggml_cann_info();
|
|
|
103
104
|
void ggml_cann_set_device(int32_t device);
|
|
104
105
|
int32_t ggml_cann_get_device();
|
|
105
106
|
|
|
107
|
+
std::optional<std::string> get_env(const std::string& name);
|
|
108
|
+
bool parse_bool(const std::string& value);
|
|
109
|
+
|
|
106
110
|
/**
|
|
107
111
|
* @brief Abstract base class for memory pools used by CANN.
|
|
108
112
|
*/
|
|
@@ -354,7 +358,8 @@ struct ggml_backend_cann_context {
|
|
|
354
358
|
: device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
|
|
355
359
|
ggml_cann_set_device(device);
|
|
356
360
|
description = aclrtGetSocName();
|
|
357
|
-
|
|
361
|
+
|
|
362
|
+
bool async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
|
|
358
363
|
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
|
|
359
364
|
device, async_mode ? "ON" : "OFF");
|
|
360
365
|
}
|
|
@@ -31,6 +31,8 @@
|
|
|
31
31
|
#include <mutex>
|
|
32
32
|
#include <queue>
|
|
33
33
|
#include <chrono>
|
|
34
|
+
#include <unordered_set>
|
|
35
|
+
#include <optional>
|
|
34
36
|
|
|
35
37
|
#include "ggml-impl.h"
|
|
36
38
|
#include "ggml-backend-impl.h"
|
|
@@ -93,6 +95,26 @@ int32_t ggml_cann_get_device() {
|
|
|
93
95
|
return id;
|
|
94
96
|
}
|
|
95
97
|
|
|
98
|
+
/**
 * @brief Read an environment variable and return its value in lower case.
 *
 * @param name Name of the environment variable to look up.
 * @return The variable's value with every character passed through tolower(),
 *         or std::nullopt when the variable is not set. A variable that is set
 *         to the empty string yields an empty std::string, not std::nullopt.
 */
std::optional<std::string> get_env(const std::string& name) {
    const char* val = std::getenv(name.c_str());
    if (!val) return std::nullopt;
    std::string res = std::string(val);
    // tolower() has undefined behavior for negative arguments other than EOF,
    // so convert each byte to unsigned char before the call.
    std::transform(res.begin(), res.end(), res.begin(),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });
    return res;
}
|
|
109
|
+
|
|
110
|
+
/**
 * @brief Decide whether a string spells an enabled/true setting.
 *
 * The comparison is exact and case-sensitive against the lower-case spellings
 * "on", "1", "yes", "y", "enable" and "true"; any other input, including the
 * empty string, is treated as false.
 *
 * @param value Text to classify (expected to be lower-cased by the caller).
 * @return true if value is one of the accepted spellings, false otherwise.
 */
bool parse_bool(const std::string& value) {
    // Built once on first use instead of on every call.
    static const std::unordered_set<std::string> truthy_values = {
        "on", "1", "yes", "y", "enable", "true"};
    return truthy_values.find(value) != truthy_values.end();
}
|
|
117
|
+
|
|
96
118
|
/**
|
|
97
119
|
* @brief Initialize the CANN device information.
|
|
98
120
|
*
|
|
@@ -214,7 +236,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
214
236
|
* @param device The device ID to associate with this buffer pool.
|
|
215
237
|
*/
|
|
216
238
|
explicit ggml_cann_pool_buf_prio(int device) : device(device) {
|
|
217
|
-
disable_clean =
|
|
239
|
+
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
|
218
240
|
}
|
|
219
241
|
|
|
220
242
|
/**
|
|
@@ -410,7 +432,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
410
432
|
* @param device The device ID to associate with this buffer pool.
|
|
411
433
|
*/
|
|
412
434
|
explicit ggml_cann_pool_buf(int device) : device(device) {
|
|
413
|
-
disable_clean =
|
|
435
|
+
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
|
414
436
|
}
|
|
415
437
|
|
|
416
438
|
/**
|
|
@@ -731,16 +753,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
731
753
|
*/
|
|
732
754
|
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
|
733
755
|
int device) {
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
|
738
|
-
}
|
|
739
|
-
bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
|
|
740
|
-
if (enable_buf_prio) {
|
|
756
|
+
std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
|
|
757
|
+
|
|
758
|
+
if (mem_pool_type == "prio") {
|
|
741
759
|
GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
|
|
742
760
|
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
|
|
743
761
|
}
|
|
762
|
+
|
|
763
|
+
if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
|
|
764
|
+
GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
|
|
765
|
+
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
|
766
|
+
}
|
|
767
|
+
|
|
744
768
|
GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
|
|
745
769
|
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
|
|
746
770
|
}
|
|
@@ -1074,6 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
|
|
|
1074
1074
|
0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
|
|
1075
1075
|
GGML_TABLE_END()
|
|
1076
1076
|
|
|
1077
|
+
GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
|
|
1078
|
+
-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
|
|
1079
|
+
GGML_TABLE_END()
|
|
1080
|
+
|
|
1077
1081
|
#define NGRID_IQ1S 2048
|
|
1078
1082
|
#define IQ1S_DELTA 0.125f
|
|
1079
1083
|
#define IQ1M_DELTA 0.125f
|