@novastera-oss/llamarn 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +141 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +58 -24
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +37 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +53 -40
- package/cpp/llama.cpp/common/common.h +6 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
- package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +141 -38
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
- package/cpp/llama.cpp/src/llama-arch.h +25 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
- package/cpp/llama.cpp/src/llama-batch.h +110 -57
- package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +360 -266
- package/cpp/llama.cpp/src/llama-context.h +27 -23
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
- package/cpp/llama.cpp/src/llama-graph.h +126 -58
- package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
- package/cpp/llama.cpp/src/llama-hparams.h +16 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
- package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +73 -36
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
- package/cpp/llama.cpp/src/llama-model.h +26 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
- package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +6 -2
- package/ios/include/llama.h +141 -38
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -310,6 +310,8 @@ class ModelBase:
|
|
|
310
310
|
gguf.MODEL_TENSOR.POSNET_NORM2,
|
|
311
311
|
gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
|
|
312
312
|
gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
|
|
313
|
+
gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
|
|
314
|
+
gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
|
|
313
315
|
)
|
|
314
316
|
)
|
|
315
317
|
or not new_name.endswith(".weight")
|
|
@@ -320,7 +322,11 @@ class ModelBase:
|
|
|
320
322
|
self.match_model_tensor_name(new_name, key, bid)
|
|
321
323
|
for key in (
|
|
322
324
|
gguf.MODEL_TENSOR.TOKEN_EMBD,
|
|
325
|
+
gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
|
|
323
326
|
gguf.MODEL_TENSOR.OUTPUT,
|
|
327
|
+
gguf.MODEL_TENSOR.ALTUP_ROUTER,
|
|
328
|
+
gguf.MODEL_TENSOR.LAUREL_L,
|
|
329
|
+
gguf.MODEL_TENSOR.LAUREL_R,
|
|
324
330
|
)
|
|
325
331
|
):
|
|
326
332
|
if self.ftype in (
|
|
@@ -519,7 +525,7 @@ class TextModel(ModelBase):
|
|
|
519
525
|
def set_gguf_parameters(self):
|
|
520
526
|
self.gguf_writer.add_block_count(self.block_count)
|
|
521
527
|
|
|
522
|
-
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
|
|
528
|
+
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
|
|
523
529
|
self.gguf_writer.add_context_length(n_ctx)
|
|
524
530
|
logger.info(f"gguf: context length = {n_ctx}")
|
|
525
531
|
|
|
@@ -921,13 +927,20 @@ class TextModel(ModelBase):
|
|
|
921
927
|
tokenizer = SentencePieceProcessor()
|
|
922
928
|
tokenizer.LoadFromFile(str(tokenizer_path))
|
|
923
929
|
|
|
924
|
-
vocab_size = self.
|
|
930
|
+
vocab_size = self.find_hparam([
|
|
931
|
+
"vocab_size_per_layer_input", # gemma3n
|
|
932
|
+
"vocab_size",
|
|
933
|
+
], optional=True) or tokenizer.vocab_size()
|
|
925
934
|
|
|
926
935
|
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
|
927
936
|
scores: list[float] = [-10000.0] * vocab_size
|
|
928
937
|
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
|
929
938
|
|
|
930
939
|
for token_id in range(tokenizer.vocab_size()):
|
|
940
|
+
if token_id >= vocab_size:
|
|
941
|
+
logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
|
|
942
|
+
break
|
|
943
|
+
|
|
931
944
|
piece = tokenizer.IdToPiece(token_id)
|
|
932
945
|
text = piece.encode("utf-8")
|
|
933
946
|
score = tokenizer.GetScore(token_id)
|
|
@@ -1898,9 +1911,7 @@ class LlamaModel(TextModel):
|
|
|
1898
1911
|
hparams = self.hparams
|
|
1899
1912
|
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
|
1900
1913
|
|
|
1901
|
-
if "head_dim" in hparams:
|
|
1902
|
-
rope_dim = hparams["head_dim"]
|
|
1903
|
-
else:
|
|
1914
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
1904
1915
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
1905
1916
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
1906
1917
|
|
|
@@ -1982,7 +1993,8 @@ class LlamaModel(TextModel):
|
|
|
1982
1993
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
|
1983
1994
|
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
|
1984
1995
|
base = self.hparams.get("rope_theta", 10000.0)
|
|
1985
|
-
dim
|
|
1996
|
+
if (dim := self.hparams.get("head_dim")) is None:
|
|
1997
|
+
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
|
1986
1998
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
|
1987
1999
|
|
|
1988
2000
|
factor = rope_scaling.get("factor", 8.0)
|
|
@@ -2017,6 +2029,20 @@ class LlamaModel(TextModel):
|
|
|
2017
2029
|
raise ValueError(f"Unprocessed experts: {experts}")
|
|
2018
2030
|
|
|
2019
2031
|
|
|
2032
|
+
@ModelBase.register("ArceeForCausalLM")
|
|
2033
|
+
class ArceeModel(LlamaModel):
|
|
2034
|
+
model_arch = gguf.MODEL_ARCH.ARCEE
|
|
2035
|
+
|
|
2036
|
+
def set_gguf_parameters(self):
|
|
2037
|
+
super().set_gguf_parameters()
|
|
2038
|
+
self._try_set_pooling_type()
|
|
2039
|
+
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
2040
|
+
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
2041
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
2042
|
+
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
2043
|
+
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
2044
|
+
|
|
2045
|
+
|
|
2020
2046
|
@ModelBase.register(
|
|
2021
2047
|
"LlavaForConditionalGeneration", # pixtral
|
|
2022
2048
|
"Mistral3ForConditionalGeneration", # mistral small 3.1
|
|
@@ -2132,7 +2158,6 @@ class Llama4Model(LlamaModel):
|
|
|
2132
2158
|
|
|
2133
2159
|
def set_vocab(self):
|
|
2134
2160
|
self._set_vocab_gpt2()
|
|
2135
|
-
self.gguf_writer.add_add_bos_token(True)
|
|
2136
2161
|
|
|
2137
2162
|
def set_gguf_parameters(self):
|
|
2138
2163
|
super().set_gguf_parameters()
|
|
@@ -2181,7 +2206,7 @@ class Llama4VisionModel(MmprojModel):
|
|
|
2181
2206
|
name += ".weight"
|
|
2182
2207
|
if "multi_modal_projector.linear_1" in name:
|
|
2183
2208
|
# despite the name with number postfix, this is a single fully connected layer
|
|
2184
|
-
return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
|
|
2209
|
+
return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)]
|
|
2185
2210
|
return [(self.map_tensor_name(name), data_torch)]
|
|
2186
2211
|
return []
|
|
2187
2212
|
|
|
@@ -2304,9 +2329,7 @@ class DeciModel(TextModel):
|
|
|
2304
2329
|
hparams = self.hparams
|
|
2305
2330
|
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
|
2306
2331
|
|
|
2307
|
-
if "head_dim" in hparams:
|
|
2308
|
-
rope_dim = hparams["head_dim"]
|
|
2309
|
-
else:
|
|
2332
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
2310
2333
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
2311
2334
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
2312
2335
|
|
|
@@ -2346,7 +2369,8 @@ class DeciModel(TextModel):
|
|
|
2346
2369
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
|
2347
2370
|
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
|
2348
2371
|
base = self.hparams.get("rope_theta", 10000.0)
|
|
2349
|
-
dim
|
|
2372
|
+
if (dim := self.hparams.get("head_dim")) is None:
|
|
2373
|
+
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
|
2350
2374
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
|
2351
2375
|
|
|
2352
2376
|
factor = rope_scaling.get("factor", 8.0)
|
|
@@ -3664,9 +3688,7 @@ class InternLM3Model(TextModel):
|
|
|
3664
3688
|
hparams = self.hparams
|
|
3665
3689
|
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
|
3666
3690
|
|
|
3667
|
-
if "head_dim" in hparams:
|
|
3668
|
-
rope_dim = hparams["head_dim"]
|
|
3669
|
-
else:
|
|
3691
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
3670
3692
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
3671
3693
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
3672
3694
|
|
|
@@ -3709,8 +3731,7 @@ class BertModel(TextModel):
|
|
|
3709
3731
|
self._try_set_pooling_type()
|
|
3710
3732
|
|
|
3711
3733
|
if self.cls_out_labels:
|
|
3712
|
-
|
|
3713
|
-
self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])
|
|
3734
|
+
self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])
|
|
3714
3735
|
|
|
3715
3736
|
def set_vocab(self):
|
|
3716
3737
|
tokens, toktypes, tokpre = self.get_vocab_base()
|
|
@@ -3909,9 +3930,6 @@ class BertModel(TextModel):
|
|
|
3909
3930
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
|
3910
3931
|
special_vocab.add_to_gguf(self.gguf_writer)
|
|
3911
3932
|
|
|
3912
|
-
self.gguf_writer.add_add_bos_token(True)
|
|
3913
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
3914
|
-
|
|
3915
3933
|
|
|
3916
3934
|
@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
|
|
3917
3935
|
class DistilBertModel(BertModel):
|
|
@@ -3953,8 +3971,6 @@ class RobertaModel(BertModel):
|
|
|
3953
3971
|
bpe_tok_path = self.dir_model / "tokenizer.json"
|
|
3954
3972
|
if bpe_tok_path.exists():
|
|
3955
3973
|
self._set_vocab_gpt2()
|
|
3956
|
-
self.gguf_writer.add_add_bos_token(True)
|
|
3957
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
3958
3974
|
|
|
3959
3975
|
# we need this to validate the size of the token_type embeddings
|
|
3960
3976
|
# though currently we are passing all zeros to the token_type embeddings
|
|
@@ -4060,6 +4076,34 @@ class NomicBertModel(BertModel):
|
|
|
4060
4076
|
raise ValueError(f"unknown tokenizer: {toktyp}")
|
|
4061
4077
|
|
|
4062
4078
|
|
|
4079
|
+
@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
|
|
4080
|
+
class NeoBert(BertModel):
|
|
4081
|
+
model_arch = gguf.MODEL_ARCH.NEO_BERT
|
|
4082
|
+
|
|
4083
|
+
def set_gguf_parameters(self):
|
|
4084
|
+
super().set_gguf_parameters()
|
|
4085
|
+
|
|
4086
|
+
# NeoBERT uses 2/3 of the intermediate size as feed forward length
|
|
4087
|
+
self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
|
|
4088
|
+
self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT
|
|
4089
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
|
4090
|
+
|
|
4091
|
+
f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT
|
|
4092
|
+
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
|
|
4093
|
+
logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
|
|
4094
|
+
|
|
4095
|
+
self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
|
|
4096
|
+
|
|
4097
|
+
def modify_tensors(self, data_torch, name, bid):
|
|
4098
|
+
if name.startswith("decoder."):
|
|
4099
|
+
return []
|
|
4100
|
+
|
|
4101
|
+
if name.startswith("model."):
|
|
4102
|
+
name = name[6:]
|
|
4103
|
+
|
|
4104
|
+
return super().modify_tensors(data_torch, name, bid)
|
|
4105
|
+
|
|
4106
|
+
|
|
4063
4107
|
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
|
|
4064
4108
|
class XLMRobertaModel(BertModel):
|
|
4065
4109
|
model_arch = gguf.MODEL_ARCH.BERT
|
|
@@ -4186,6 +4230,7 @@ class Gemma2Model(TextModel):
|
|
|
4186
4230
|
@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
|
|
4187
4231
|
class Gemma3Model(TextModel):
|
|
4188
4232
|
model_arch = gguf.MODEL_ARCH.GEMMA3
|
|
4233
|
+
norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value
|
|
4189
4234
|
|
|
4190
4235
|
def set_vocab(self):
|
|
4191
4236
|
self._set_vocab_sentencepiece()
|
|
@@ -4207,9 +4252,8 @@ class Gemma3Model(TextModel):
|
|
|
4207
4252
|
self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
|
|
4208
4253
|
self.gguf_writer.add_file_type(self.ftype)
|
|
4209
4254
|
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
|
|
4210
|
-
#
|
|
4255
|
+
# attn_logit_softcapping is removed in Gemma3
|
|
4211
4256
|
assert hparams.get("attn_logit_softcapping") is None
|
|
4212
|
-
assert hparams.get("final_logit_softcapping") is None
|
|
4213
4257
|
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
|
|
4214
4258
|
self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
|
|
4215
4259
|
if hparams.get("rope_scaling") is not None:
|
|
@@ -4221,7 +4265,7 @@ class Gemma3Model(TextModel):
|
|
|
4221
4265
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
4222
4266
|
del bid # unused
|
|
4223
4267
|
|
|
4224
|
-
if
|
|
4268
|
+
if "language_model." in name:
|
|
4225
4269
|
name = name.replace("language_model.", "")
|
|
4226
4270
|
|
|
4227
4271
|
elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
|
|
@@ -4236,8 +4280,9 @@ class Gemma3Model(TextModel):
|
|
|
4236
4280
|
|
|
4237
4281
|
# ref code in Gemma3RMSNorm
|
|
4238
4282
|
# output = output * (1.0 + self.weight.float())
|
|
4283
|
+
# note: this is not the case on gemma3n
|
|
4239
4284
|
if name.endswith("norm.weight"):
|
|
4240
|
-
data_torch = data_torch + 1
|
|
4285
|
+
data_torch = data_torch + self.norm_shift
|
|
4241
4286
|
|
|
4242
4287
|
return [(self.map_tensor_name(name), data_torch)]
|
|
4243
4288
|
|
|
@@ -4294,6 +4339,104 @@ class Gemma3VisionModel(MmprojModel):
|
|
|
4294
4339
|
return [] # skip other tensors
|
|
4295
4340
|
|
|
4296
4341
|
|
|
4342
|
+
@ModelBase.register("Gemma3nForConditionalGeneration")
|
|
4343
|
+
class Gemma3NModel(Gemma3Model):
|
|
4344
|
+
model_arch = gguf.MODEL_ARCH.GEMMA3N
|
|
4345
|
+
norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
|
|
4346
|
+
|
|
4347
|
+
_altup_proj: list[Tensor] = []
|
|
4348
|
+
_altup_unembd: list[Tensor] = []
|
|
4349
|
+
|
|
4350
|
+
def __init__(self, *args, **kwargs):
|
|
4351
|
+
super().__init__(*args, **kwargs)
|
|
4352
|
+
assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs"
|
|
4353
|
+
self._altup_proj = [
|
|
4354
|
+
torch.Tensor(), # to be replaced
|
|
4355
|
+
torch.Tensor(), # to be replaced
|
|
4356
|
+
torch.Tensor(), # to be replaced
|
|
4357
|
+
]
|
|
4358
|
+
self._altup_unembd = [
|
|
4359
|
+
torch.Tensor(), # to be replaced
|
|
4360
|
+
torch.Tensor(), # to be replaced
|
|
4361
|
+
torch.Tensor(), # to be replaced
|
|
4362
|
+
]
|
|
4363
|
+
|
|
4364
|
+
def set_vocab(self):
|
|
4365
|
+
with open(self.dir_model / "chat_template.jinja") as f:
|
|
4366
|
+
# quick hack to make sure chat template is added
|
|
4367
|
+
self.gguf_writer.add_chat_template(f.read())
|
|
4368
|
+
super().set_vocab()
|
|
4369
|
+
|
|
4370
|
+
def set_gguf_parameters(self):
|
|
4371
|
+
super().set_gguf_parameters()
|
|
4372
|
+
self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
|
|
4373
|
+
self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"])
|
|
4374
|
+
self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"])
|
|
4375
|
+
self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"])
|
|
4376
|
+
|
|
4377
|
+
activation_sparsity_scale = []
|
|
4378
|
+
for s in self.hparams["activation_sparsity_pattern"]:
|
|
4379
|
+
normal_dist = torch.distributions.normal.Normal(0, 1)
|
|
4380
|
+
std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32))
|
|
4381
|
+
activation_sparsity_scale.append(std_multiplier.item())
|
|
4382
|
+
self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale)
|
|
4383
|
+
|
|
4384
|
+
sliding_window_pattern = []
|
|
4385
|
+
for t in self.hparams["layer_types"]:
|
|
4386
|
+
sliding_window_pattern.append(t == "sliding_attention")
|
|
4387
|
+
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
|
|
4388
|
+
|
|
4389
|
+
def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None:
|
|
4390
|
+
has_all = all(m.numel() > 0 for m in matrices)
|
|
4391
|
+
if not has_all:
|
|
4392
|
+
return None
|
|
4393
|
+
else:
|
|
4394
|
+
return torch.stack(matrices, dim=0)
|
|
4395
|
+
|
|
4396
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
4397
|
+
if name.endswith("_scale"):
|
|
4398
|
+
name = name + ".weight"
|
|
4399
|
+
|
|
4400
|
+
# TODO: implement self.prediction_coefs.weight.clamp_(...)
|
|
4401
|
+
|
|
4402
|
+
if "language_model." not in name:
|
|
4403
|
+
return [] # skip non-language model tensors
|
|
4404
|
+
|
|
4405
|
+
if "altup_unembed_projections" in name:
|
|
4406
|
+
data_torch = data_torch.to(device="cpu")
|
|
4407
|
+
if ".0." in name:
|
|
4408
|
+
self._altup_unembd[0] = data_torch
|
|
4409
|
+
elif ".1." in name:
|
|
4410
|
+
self._altup_unembd[1] = data_torch
|
|
4411
|
+
elif ".2." in name:
|
|
4412
|
+
self._altup_unembd[2] = data_torch
|
|
4413
|
+
else:
|
|
4414
|
+
raise ValueError(f"Unknown name: {name}")
|
|
4415
|
+
out = self._stack_matrices(self._altup_unembd)
|
|
4416
|
+
if out is not None:
|
|
4417
|
+
return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)]
|
|
4418
|
+
else:
|
|
4419
|
+
return []
|
|
4420
|
+
|
|
4421
|
+
if "altup_projections" in name:
|
|
4422
|
+
data_torch = data_torch.to(device="cpu")
|
|
4423
|
+
if ".0." in name:
|
|
4424
|
+
self._altup_proj[0] = data_torch
|
|
4425
|
+
elif ".1." in name:
|
|
4426
|
+
self._altup_proj[1] = data_torch
|
|
4427
|
+
elif ".2." in name:
|
|
4428
|
+
self._altup_proj[2] = data_torch
|
|
4429
|
+
else:
|
|
4430
|
+
raise ValueError(f"Unknown name: {name}")
|
|
4431
|
+
out = self._stack_matrices(self._altup_proj)
|
|
4432
|
+
if out is not None:
|
|
4433
|
+
return [(self.map_tensor_name("model.altup_projections.weight"), out)]
|
|
4434
|
+
else:
|
|
4435
|
+
return []
|
|
4436
|
+
|
|
4437
|
+
return super().modify_tensors(data_torch, name, bid)
|
|
4438
|
+
|
|
4439
|
+
|
|
4297
4440
|
@ModelBase.register("Starcoder2ForCausalLM")
|
|
4298
4441
|
class StarCoder2Model(TextModel):
|
|
4299
4442
|
model_arch = gguf.MODEL_ARCH.STARCODER2
|
|
@@ -4799,25 +4942,6 @@ class OlmoeModel(TextModel):
|
|
|
4799
4942
|
class JinaBertV2Model(BertModel):
|
|
4800
4943
|
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
|
|
4801
4944
|
|
|
4802
|
-
def __init__(self, *args, **kwargs):
|
|
4803
|
-
super().__init__(*args, **kwargs)
|
|
4804
|
-
self.intermediate_size = self.hparams["intermediate_size"]
|
|
4805
|
-
|
|
4806
|
-
def get_tensors(self):
|
|
4807
|
-
for name, data in super().get_tensors():
|
|
4808
|
-
if 'gated_layer' in name:
|
|
4809
|
-
d1 = data[:self.intermediate_size, :]
|
|
4810
|
-
name1 = name.replace('gated_layers', 'gated_layers_w')
|
|
4811
|
-
name1 = name1.replace('up_gated_layer', 'gated_layers_v')
|
|
4812
|
-
d2 = data[self.intermediate_size:, :]
|
|
4813
|
-
name2 = name.replace('gated_layers', 'gated_layers_v')
|
|
4814
|
-
name2 = name2.replace('up_gated_layer', 'gated_layers_w')
|
|
4815
|
-
yield name1, d1
|
|
4816
|
-
yield name2, d2
|
|
4817
|
-
continue
|
|
4818
|
-
|
|
4819
|
-
yield name, data
|
|
4820
|
-
|
|
4821
4945
|
def set_vocab(self):
|
|
4822
4946
|
tokenizer_class = 'BertTokenizer'
|
|
4823
4947
|
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
|
@@ -4830,16 +4954,6 @@ class JinaBertV2Model(BertModel):
|
|
|
4830
4954
|
self.gguf_writer.add_token_type_count(2)
|
|
4831
4955
|
else:
|
|
4832
4956
|
raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
|
|
4833
|
-
self.gguf_writer.add_add_bos_token(True)
|
|
4834
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
4835
|
-
|
|
4836
|
-
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
4837
|
-
# if name starts with "bert.", remove the prefix
|
|
4838
|
-
# e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
|
4839
|
-
if name.startswith("bert."):
|
|
4840
|
-
name = name[5:]
|
|
4841
|
-
|
|
4842
|
-
return super().modify_tensors(data_torch, name, bid)
|
|
4843
4957
|
|
|
4844
4958
|
|
|
4845
4959
|
@ModelBase.register("OpenELMForCausalLM")
|
|
@@ -5081,9 +5195,7 @@ class DeepseekModel(TextModel):
|
|
|
5081
5195
|
def set_gguf_parameters(self):
|
|
5082
5196
|
super().set_gguf_parameters()
|
|
5083
5197
|
hparams = self.hparams
|
|
5084
|
-
if "head_dim"
|
|
5085
|
-
rope_dim = hparams["head_dim"]
|
|
5086
|
-
else:
|
|
5198
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
5087
5199
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
5088
5200
|
|
|
5089
5201
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
@@ -5287,6 +5399,34 @@ class DeepseekV2Model(TextModel):
|
|
|
5287
5399
|
raise ValueError(f"Unprocessed experts: {experts}")
|
|
5288
5400
|
|
|
5289
5401
|
|
|
5402
|
+
@ModelBase.register("Dots1ForCausalLM")
|
|
5403
|
+
class Dots1Model(Qwen2MoeModel):
|
|
5404
|
+
model_arch = gguf.MODEL_ARCH.DOTS1
|
|
5405
|
+
|
|
5406
|
+
def __init__(self, *args, **kwargs):
|
|
5407
|
+
super().__init__(*args, **kwargs)
|
|
5408
|
+
self.hparams["num_experts"] = self.hparams["n_routed_experts"]
|
|
5409
|
+
|
|
5410
|
+
def set_gguf_parameters(self):
|
|
5411
|
+
super().set_gguf_parameters()
|
|
5412
|
+
self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
|
|
5413
|
+
self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
|
|
5414
|
+
self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
|
|
5415
|
+
self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
|
|
5416
|
+
|
|
5417
|
+
if self.hparams["scoring_func"] == "noaux_tc":
|
|
5418
|
+
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
|
|
5419
|
+
else:
|
|
5420
|
+
raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
|
|
5421
|
+
|
|
5422
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
|
5423
|
+
if name.endswith("e_score_correction_bias"):
|
|
5424
|
+
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
|
|
5425
|
+
if "shared_experts" in name:
|
|
5426
|
+
return [(self.map_tensor_name(name), data_torch)]
|
|
5427
|
+
return super().modify_tensors(data_torch, name, bid)
|
|
5428
|
+
|
|
5429
|
+
|
|
5290
5430
|
@ModelBase.register("PLMForCausalLM")
|
|
5291
5431
|
class PLMModel(TextModel):
|
|
5292
5432
|
model_arch = gguf.MODEL_ARCH.PLM
|
|
@@ -5415,9 +5555,6 @@ class T5Model(TextModel):
|
|
|
5415
5555
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
|
5416
5556
|
special_vocab.add_to_gguf(self.gguf_writer)
|
|
5417
5557
|
|
|
5418
|
-
self.gguf_writer.add_add_bos_token(False)
|
|
5419
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
5420
|
-
|
|
5421
5558
|
def set_gguf_parameters(self):
|
|
5422
5559
|
if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
|
|
5423
5560
|
logger.warning("Couldn't find context length in config.json, assuming default value of 512")
|
|
@@ -5555,9 +5692,6 @@ class T5EncoderModel(TextModel):
|
|
|
5555
5692
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
|
5556
5693
|
special_vocab.add_to_gguf(self.gguf_writer)
|
|
5557
5694
|
|
|
5558
|
-
self.gguf_writer.add_add_bos_token(False)
|
|
5559
|
-
self.gguf_writer.add_add_eos_token(True)
|
|
5560
|
-
|
|
5561
5695
|
def set_gguf_parameters(self):
|
|
5562
5696
|
if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
|
|
5563
5697
|
logger.warning("Couldn't find context length in config.json, assuming default value of 512")
|
|
@@ -5945,7 +6079,8 @@ class ExaoneModel(TextModel):
|
|
|
5945
6079
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
|
5946
6080
|
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
|
5947
6081
|
base = self.hparams.get("rope_theta", 10000.0)
|
|
5948
|
-
dim
|
|
6082
|
+
if (dim := self.hparams.get("head_dim")) is None:
|
|
6083
|
+
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
|
5949
6084
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
|
5950
6085
|
|
|
5951
6086
|
factor = rope_scaling.get("factor", 8.0)
|
|
@@ -6057,7 +6192,8 @@ class BailingMoeModel(TextModel):
|
|
|
6057
6192
|
def set_gguf_parameters(self):
|
|
6058
6193
|
super().set_gguf_parameters()
|
|
6059
6194
|
hparams = self.hparams
|
|
6060
|
-
rope_dim
|
|
6195
|
+
if (rope_dim := hparams.get("head_dim")) is None:
|
|
6196
|
+
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
|
6061
6197
|
|
|
6062
6198
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
|
6063
6199
|
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
@@ -6089,7 +6225,8 @@ class BailingMoeModel(TextModel):
|
|
|
6089
6225
|
n_head = self.hparams["num_attention_heads"]
|
|
6090
6226
|
n_kv_head = self.hparams.get("num_key_value_heads")
|
|
6091
6227
|
n_embd = self.hparams["hidden_size"]
|
|
6092
|
-
head_dim
|
|
6228
|
+
if (head_dim := self.hparams.get("head_dim")) is None:
|
|
6229
|
+
head_dim = n_embd // n_head
|
|
6093
6230
|
|
|
6094
6231
|
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
|
|
6095
6232
|
|
|
@@ -6350,8 +6487,8 @@ def parse_args() -> argparse.Namespace:
|
|
|
6350
6487
|
help="model is executed on big endian machine",
|
|
6351
6488
|
)
|
|
6352
6489
|
parser.add_argument(
|
|
6353
|
-
"model", type=
|
|
6354
|
-
help="directory containing model file",
|
|
6490
|
+
"model", type=str,
|
|
6491
|
+
help="directory containing model file or huggingface repository ID (if --remote)",
|
|
6355
6492
|
nargs="?",
|
|
6356
6493
|
)
|
|
6357
6494
|
parser.add_argument(
|
|
@@ -6454,18 +6591,20 @@ def main() -> None:
|
|
|
6454
6591
|
else:
|
|
6455
6592
|
logging.basicConfig(level=logging.INFO)
|
|
6456
6593
|
|
|
6457
|
-
dir_model = args.model
|
|
6458
|
-
|
|
6459
6594
|
if args.remote:
|
|
6595
|
+
hf_repo_id = args.model
|
|
6460
6596
|
from huggingface_hub import snapshot_download
|
|
6461
6597
|
local_dir = snapshot_download(
|
|
6462
|
-
repo_id=
|
|
6598
|
+
repo_id=hf_repo_id,
|
|
6463
6599
|
allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
|
|
6464
6600
|
dir_model = Path(local_dir)
|
|
6465
6601
|
logger.info(f"Downloaded config and tokenizer to {local_dir}")
|
|
6602
|
+
else:
|
|
6603
|
+
hf_repo_id = None
|
|
6604
|
+
dir_model = Path(args.model)
|
|
6466
6605
|
|
|
6467
6606
|
if not dir_model.is_dir():
|
|
6468
|
-
logger.error(f'Error: {
|
|
6607
|
+
logger.error(f'Error: {dir_model} is not a directory')
|
|
6469
6608
|
sys.exit(1)
|
|
6470
6609
|
|
|
6471
6610
|
ftype_map: dict[str, gguf.LlamaFileType] = {
|
|
@@ -6485,9 +6624,9 @@ def main() -> None:
|
|
|
6485
6624
|
|
|
6486
6625
|
if args.outfile is not None:
|
|
6487
6626
|
fname_out = args.outfile
|
|
6488
|
-
elif
|
|
6627
|
+
elif hf_repo_id:
|
|
6489
6628
|
# if remote, use the model ID as the output file name
|
|
6490
|
-
fname_out = Path("./" +
|
|
6629
|
+
fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
|
|
6491
6630
|
else:
|
|
6492
6631
|
fname_out = dir_model
|
|
6493
6632
|
|
|
@@ -6516,7 +6655,7 @@ def main() -> None:
|
|
|
6516
6655
|
split_max_tensors=args.split_max_tensors,
|
|
6517
6656
|
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
|
|
6518
6657
|
small_first_shard=args.no_tensor_first_split,
|
|
6519
|
-
remote_hf_model_id=
|
|
6658
|
+
remote_hf_model_id=hf_repo_id)
|
|
6520
6659
|
|
|
6521
6660
|
if args.vocab_only:
|
|
6522
6661
|
logger.info("Exporting model vocab...")
|
|
@@ -105,7 +105,7 @@ message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
|
|
|
105
105
|
message(DEBUG "INS_ENB : ${INS_ENB}")
|
|
106
106
|
|
|
107
107
|
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
|
108
|
-
option(
|
|
108
|
+
option(GGML_CPU_REPACK "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
|
109
109
|
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
|
|
110
110
|
option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
|
|
111
111
|
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
|
@@ -131,13 +131,14 @@ option(GGML_RVV "ggml: enable rvv" ON)
|
|
|
131
131
|
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
|
|
132
132
|
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
|
|
133
133
|
option(GGML_VXE "ggml: enable vxe" ON)
|
|
134
|
+
option(GGML_NNPA "ggml: enable nnpa" ON)
|
|
134
135
|
|
|
135
136
|
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
|
136
137
|
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
|
137
138
|
set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
|
|
138
139
|
|
|
139
140
|
|
|
140
|
-
if (
|
|
141
|
+
if (MINGW)
|
|
141
142
|
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
|
|
142
143
|
endif()
|
|
143
144
|
|
|
@@ -172,6 +173,7 @@ option(GGML_HIP "ggml: use HIP"
|
|
|
172
173
|
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
|
|
173
174
|
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
|
174
175
|
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
|
176
|
+
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
|
|
175
177
|
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
|
176
178
|
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
|
177
179
|
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
|
@@ -367,6 +369,8 @@ if (MSVC)
|
|
|
367
369
|
/wd4005 # Macro redefinition
|
|
368
370
|
/wd4244 # Conversion from one type to another type, possible loss of data
|
|
369
371
|
/wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
|
|
372
|
+
/wd4305 # Conversion from 'type1' to 'type2', possible loss of data
|
|
373
|
+
/wd4566 # Conversion from 'char' to 'wchar_t', possible loss of data
|
|
370
374
|
/wd4996 # Disable POSIX deprecation warnings
|
|
371
375
|
/wd4702 # Unreachable code warnings
|
|
372
376
|
)
|
|
@@ -386,4 +390,46 @@ if (MSVC)
|
|
|
386
390
|
disable_msvc_warnings(ggml-cpu-skylakex)
|
|
387
391
|
disable_msvc_warnings(ggml-cpu-icelake)
|
|
388
392
|
disable_msvc_warnings(ggml-cpu-alderlake)
|
|
393
|
+
|
|
394
|
+
if (GGML_BUILD_EXAMPLES)
|
|
395
|
+
disable_msvc_warnings(common-ggml)
|
|
396
|
+
disable_msvc_warnings(common)
|
|
397
|
+
|
|
398
|
+
disable_msvc_warnings(mnist-common)
|
|
399
|
+
disable_msvc_warnings(mnist-eval)
|
|
400
|
+
disable_msvc_warnings(mnist-train)
|
|
401
|
+
|
|
402
|
+
disable_msvc_warnings(gpt-2-ctx)
|
|
403
|
+
disable_msvc_warnings(gpt-2-alloc)
|
|
404
|
+
disable_msvc_warnings(gpt-2-backend)
|
|
405
|
+
disable_msvc_warnings(gpt-2-sched)
|
|
406
|
+
disable_msvc_warnings(gpt-2-quantize)
|
|
407
|
+
disable_msvc_warnings(gpt-2-batched)
|
|
408
|
+
|
|
409
|
+
disable_msvc_warnings(gpt-j)
|
|
410
|
+
disable_msvc_warnings(gpt-j-quantize)
|
|
411
|
+
|
|
412
|
+
disable_msvc_warnings(magika)
|
|
413
|
+
disable_msvc_warnings(yolov3-tiny)
|
|
414
|
+
disable_msvc_warnings(sam)
|
|
415
|
+
|
|
416
|
+
disable_msvc_warnings(simple-ctx)
|
|
417
|
+
disable_msvc_warnings(simple-backend)
|
|
418
|
+
endif()
|
|
419
|
+
|
|
420
|
+
if (GGML_BUILD_TESTS)
|
|
421
|
+
disable_msvc_warnings(test-mul-mat)
|
|
422
|
+
disable_msvc_warnings(test-arange)
|
|
423
|
+
disable_msvc_warnings(test-backend-ops)
|
|
424
|
+
disable_msvc_warnings(test-cont)
|
|
425
|
+
disable_msvc_warnings(test-conv-transpose)
|
|
426
|
+
disable_msvc_warnings(test-conv-transpose-1d)
|
|
427
|
+
disable_msvc_warnings(test-conv1d)
|
|
428
|
+
disable_msvc_warnings(test-conv2d)
|
|
429
|
+
disable_msvc_warnings(test-conv2d-dw)
|
|
430
|
+
disable_msvc_warnings(test-customop)
|
|
431
|
+
disable_msvc_warnings(test-dup)
|
|
432
|
+
disable_msvc_warnings(test-opt)
|
|
433
|
+
disable_msvc_warnings(test-pool)
|
|
434
|
+
endif ()
|
|
389
435
|
endif()
|
|
@@ -36,8 +36,7 @@ function(ggml_get_system_arch)
|
|
|
36
36
|
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
|
37
37
|
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
|
|
38
38
|
set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
|
|
39
|
-
elseif (
|
|
40
|
-
"${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
|
|
39
|
+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
|
|
41
40
|
set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
|
|
42
41
|
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
|
43
42
|
set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
|
|
@@ -101,6 +101,7 @@ extern "C" {
|
|
|
101
101
|
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
|
|
102
102
|
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
|
|
103
103
|
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
|
|
104
|
+
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
|
|
104
105
|
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
|
|
105
106
|
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
|
|
106
107
|
|
|
@@ -133,6 +134,7 @@ extern "C" {
|
|
|
133
134
|
|
|
134
135
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
|
135
136
|
|
|
137
|
+
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
|
|
136
138
|
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
|
|
137
139
|
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
|
|
138
140
|
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
|