@novastera-oss/llamarn 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +141 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +58 -24
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +37 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +53 -40
- package/cpp/llama.cpp/common/common.h +6 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
- package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +141 -38
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
- package/cpp/llama.cpp/src/llama-arch.h +25 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
- package/cpp/llama.cpp/src/llama-batch.h +110 -57
- package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +360 -266
- package/cpp/llama.cpp/src/llama-context.h +27 -23
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
- package/cpp/llama.cpp/src/llama-graph.h +126 -58
- package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
- package/cpp/llama.cpp/src/llama-hparams.h +16 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
- package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +73 -36
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
- package/cpp/llama.cpp/src/llama-model.h +26 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
- package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +6 -2
- package/ios/include/llama.h +141 -38
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt

@@ -19,21 +19,13 @@ if (GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
     add_compile_definitions(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
     message(STATUS "Enabling bfloat16 glslc support")
 endif()
+if (GGML_VULKAN_SHADER_DEBUG_INFO)
+    add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
+    message(STATUS "Enabling shader debug info")
+endif()

 set(TARGET vulkan-shaders-gen)
 add_executable(${TARGET} vulkan-shaders-gen.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
-
-# Configure output directories for MSVC builds
-if(MSVC)
-    # Get the main project's runtime output directory if possible
-    if(DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY)
-        foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
-            string(TOUPPER ${CONFIG} CONFIG)
-            set_target_properties(${TARGET} PROPERTIES
-                RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
-        endforeach()
-    endif()
-endif()
package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp (new file)

@@ -0,0 +1,98 @@
+#version 450
+
+#include "types.comp"
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; // src0 - kernel: [K, Cout, Cin]
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; // src1 - input: [L, Cin]
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; // dst - result [KL, Cout]
+
+layout(local_size_x = 128 , local_size_y = 1, local_size_z = 1) in;
+
+layout (push_constant) uniform parameter {
+    uint32_t Cout;
+    uint32_t Cin;
+    uint32_t K;
+    uint32_t L;
+    uint32_t KL;
+
+    uint32_t nb01;
+    uint32_t nb02;
+    uint32_t nb11;
+    uint32_t nb1;
+
+    int32_t s0;
+} p;
+
+
+uint32_t Cout_idx = gl_WorkGroupID.x;
+const uint32_t bs = gl_WorkGroupSize.x;
+uint32_t tid = gl_LocalInvocationID.x;
+// Code is more straightforward if we assume it is bs*s0+K instead of (bs-1)*s0+K.
+uint32_t tmp_len = bs*p.s0+p.K;
+shared D_TYPE tmp[4096];
+
+uint splitWork(uint workSize){
+    return (bs + workSize -1) / bs;
+}
+
+void main(){
+    for(uint32_t i = 0; i < splitWork(tmp_len); i++){
+        uint32_t idx = i*bs+tid;
+        if(idx < tmp_len){
+            tmp[idx] = 0.0;
+        }
+    }
+
+    uint32_t L_blocks = splitWork(p.L);
+    for(uint32_t L_block_id = 0; L_block_id < L_blocks; L_block_id++){
+        if(L_block_id > 0){
+            barrier();
+            // Shift values in tmp to the current processing window
+            for(int i = 0; i < splitWork(tmp_len); i++){
+                uint32_t idx = i*bs+tid;
+                if(idx >= bs*p.s0 && idx < tmp_len){
+                    tmp[idx-bs*p.s0] = tmp[idx];
+                    tmp[idx] = 0.0;
+                }else if(idx >= p.K && idx < bs*p.s0){
+                    tmp[idx] = 0.0;
+                }
+            }
+        }
+        barrier();
+
+        // Save contributions of the block to tmp
+        uint32_t L_idx = L_block_id*bs + tid;
+        for(uint32_t K_idx = 0; K_idx < p.K; K_idx++){
+            D_TYPE dp = 0.0;
+            for(uint32_t Cin_idx = 0; Cin_idx < p.Cin; Cin_idx++){
+                A_TYPE elemKrn = data_a[K_idx + Cout_idx * p.nb01 + Cin_idx * p.nb02];
+                if(L_idx < p.L){
+                    B_TYPE elemInp = data_b[L_idx + Cin_idx*p.nb11];
+                    dp = fma(elemKrn, elemInp, dp);
+                }
+            }
+            tmp[tid*p.s0 + K_idx] += dp;
+            barrier();
+        }
+
+        // Save the computed values except the last block that can have different size
+        uint32_t KLb_idx = L_block_id*bs*p.s0;
+        if(L_block_id < L_blocks-1){
+            for(uint32_t s0_idx = 0; s0_idx < p.s0; s0_idx++){
+                uint32_t sh_idx = p.s0*tid+s0_idx;
+                uint32_t KL_idx = KLb_idx+sh_idx;
+                if(KL_idx < p.KL){
+                    data_d[KL_idx + Cout_idx*p.nb1] = tmp[sh_idx];
+                }
+            }
+        }
+    }
+
+    for(uint32_t i = 0; i < splitWork(tmp_len); i++){
+        uint32_t idx = i*bs+tid;
+        uint32_t KL_idx = (L_blocks-1)*bs*p.s0+idx;
+        if(KL_idx < p.KL){
+            data_d[KL_idx + Cout_idx*p.nb1] = tmp[idx];
+        }
+    }
+}
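Note: the shader above is a workgroup-parallel 1-D transposed convolution. Each input position scatters a length-K kernel window into the output at stride s0, so each output channel has KL = (L-1)*s0 + K samples (the standard transposed-convolution length with no padding). As a reading aid, here is a minimal scalar C sketch of the same semantics; the function name and flat array layout are illustrative assumptions, not part of the package:

// Reference semantics of the conv_transpose_1d shader (scalar sketch).
// kernel layout [K, Cout, Cin], input [L, Cin], output [KL, Cout], KL = (L-1)*s0 + K.
void conv_transpose_1d_ref(const float * kernel, const float * input, float * out,
                           int K, int Cin, int Cout, int L, int s0) {
    const int KL = (L - 1) * s0 + K;
    for (int i = 0; i < Cout * KL; i++) {
        out[i] = 0.0f;
    }
    for (int co = 0; co < Cout; co++) {
        for (int ci = 0; ci < Cin; ci++) {
            for (int l = 0; l < L; l++) {
                for (int k = 0; k < K; k++) {
                    // each input sample scatters into K output positions at stride s0
                    out[co * KL + l * s0 + k] +=
                        kernel[k + co * K + ci * K * Cout] * input[l + ci * L];
                }
            }
        }
    }
}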
package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

@@ -622,6 +622,8 @@ void process_shaders() {

     string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));

+    string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+
     string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));

     string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
package/cpp/llama.cpp/ggml/src/ggml.c

@@ -61,9 +61,6 @@
 #define m512i(p) (__m512i)(p)
 #endif

-// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
-float ggml_table_f32_f16[1 << 16];
-
 #if defined(__linux__) || \
     defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
     (defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH)
@@ -888,12 +885,6 @@ struct ggml_context {
     struct ggml_object * objects_end;
 };

-struct ggml_context_container {
-    bool used;
-
-    struct ggml_context context;
-};
-
 //
 // data types
 //
@@ -942,6 +933,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "TRANSPOSE",
     "GET_ROWS",
     "GET_ROWS_BACK",
+    "SET_ROWS",
     "DIAG",
     "DIAG_MASK_INF",
     "DIAG_MASK_ZERO",
@@ -961,6 +953,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "UPSCALE",
     "PAD",
     "PAD_REFLECT_1D",
+    "ROLL",
     "ARANGE",
     "TIMESTEP_EMBEDDING",
     "ARGSORT",
@@ -991,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1037,6 +1030,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "transpose(x)",
     "get_rows(x)",
     "get_rows_back(x)",
+    "set_rows(x)",
     "diag(x)",
     "diag_mask_inf(x)",
     "diag_mask_zero(x)",
@@ -1056,6 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "upscale(x)",
     "pad(x)",
     "pad_reflect_1d(x)",
+    "roll(x)",
     "arange(start, stop, step)",
     "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
@@ -1086,7 +1081,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -1355,6 +1350,12 @@ bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
         tensor->nb[2] == ggml_type_size(tensor->type);
 }

+bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor) {
+    return
+        tensor->ne[0] == ggml_blck_size(tensor->type) ||
+        tensor->nb[0] == ggml_type_size(tensor->type);
+}
+
 static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

@@ -1426,14 +1427,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         // initialize time system (required on Windows)
         ggml_time_init();

-        for (int i = 0; i < (1 << 16); ++i) {
-            union {
-                uint16_t u16;
-                ggml_fp16_t fp16;
-            } u = {i};
-            ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
-        }
-
         is_first_call = false;
     }

@@ -3399,6 +3392,35 @@ struct ggml_tensor * ggml_get_rows_back(
     return result;
 }

+// ggml_set_rows
+
+struct ggml_tensor * ggml_set_rows(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c) {
+    GGML_ASSERT(a->ne[0] == b->ne[0]);
+    GGML_ASSERT(a->ne[2] == b->ne[2]);
+    GGML_ASSERT(a->ne[3] == b->ne[3]);
+    GGML_ASSERT(b->ne[1] == c->ne[0]);
+    GGML_ASSERT(b->ne[2] % c->ne[1] == 0);
+    GGML_ASSERT(b->ne[3] % c->ne[2] == 0);
+    GGML_ASSERT(c->ne[3] == 1);
+    GGML_ASSERT(b->type == GGML_TYPE_F32);
+    GGML_ASSERT(c->type == GGML_TYPE_I64);
+
+    GGML_ASSERT(ggml_is_contiguous_rows(a));
+    GGML_ASSERT(ggml_is_contiguous_rows(b));
+
+    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+
+    result->op     = GGML_OP_SET_ROWS;
+    result->src[0] = b;
+    result->src[1] = c;
+
+    return result;
+}
+
 // ggml_diag

 struct ggml_tensor * ggml_diag(
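ggml_set_rows is a new scatter op: it writes the F32 rows of b into a at the row indices held in the I64 tensor c and returns a view of a. Note the asserts only demand row-contiguity of a, so the destination may use a different (e.g. quantized) type than the source. A minimal graph-construction sketch, assuming an already-initialized ggml_context and illustrative shapes:

#include "ggml.h"

// Sketch: scatter 4 F32 rows from src into the rows of dst selected by idx.
static struct ggml_tensor * build_set_rows(struct ggml_context * ctx) {
    struct ggml_tensor * dst = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 128); // 128 rows of width 64
    struct ggml_tensor * src = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 4);   // 4 rows to write
    struct ggml_tensor * idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 4);       // destination row ids
    return ggml_set_rows(ctx, dst, src, idx); // view of dst with op = GGML_OP_SET_ROWS
}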
@@ -4347,6 +4369,34 @@ struct ggml_tensor * ggml_pad_reflect_1d(
     return result;
 }

+// ggml_roll
+
+struct ggml_tensor * ggml_roll(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int shift0,
+        int shift1,
+        int shift2,
+        int shift3) {
+    GGML_ASSERT(a->nb[0] == ggml_type_size(a->type));
+    GGML_ASSERT(abs(shift0) < a->ne[0]);
+    GGML_ASSERT(abs(shift1) < a->ne[1]);
+    GGML_ASSERT(abs(shift2) < a->ne[2]);
+    GGML_ASSERT(abs(shift3) < a->ne[3]);
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params_i32(result, 0, shift0);
+    ggml_set_op_params_i32(result, 1, shift1);
+    ggml_set_op_params_i32(result, 2, shift2);
+    ggml_set_op_params_i32(result, 3, shift3);
+
+    result->op     = GGML_OP_ROLL;
+    result->src[0] = a;
+
+    return result;
+}
+
 // ggml_arange

 struct ggml_tensor * ggml_arange(
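ggml_roll stores four per-dimension circular shifts in the op params and shapes the result like the input. A sketch of building the op under the same assumptions (the shift sign convention is left to the backend kernels):

#include "ggml.h"

// Sketch: circularly shift a 1-D tensor of 10 elements by 3 along dim 0.
static struct ggml_tensor * build_roll(struct ggml_context * ctx) {
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 10);
    return ggml_roll(ctx, t, 3, 0, 0, 0); // one shift per dim; asserts require |shift| < ne[dim]
}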
package/cpp/llama.cpp/ggml/src/gguf.cpp

@@ -335,7 +335,11 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

     for (uint32_t i = 0; i < magic.size(); i++) {
         if (magic[i] != GGUF_MAGIC[i]) {
-            GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
+            char c0 = isprint(magic[0]) ? magic[0] : '?';
+            char c1 = isprint(magic[1]) ? magic[1] : '?';
+            char c2 = isprint(magic[2]) ? magic[2] : '?';
+            char c3 = isprint(magic[3]) ? magic[3] : '?';
+            GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, c0, c1, c2, c3);
             gguf_free(ctx);
             return nullptr;
         }
package/cpp/llama.cpp/gguf-py/gguf/constants.py

@@ -118,6 +118,10 @@ class Keys:
         EMBEDDING_SCALE            = "{arch}.embedding_scale"
         TOKEN_SHIFT_COUNT          = "{arch}.token_shift_count"
         INTERLEAVE_MOE_LAYER_STEP  = "{arch}.interleave_moe_layer_step"
+        ACTIVATION_SPARSITY_SCALE  = "{arch}.activation_sparsity_scale"
+        ALTUP_ACTIVE_IDX           = "{arch}.altup.active_idx"
+        ALTUP_NUM_INPUTS           = "{arch}.altup.num_inputs"
+        EMBD_LENGTH_PER_LAYER_INP  = "{arch}.embedding_length_per_layer_input"

     class Attention:
         HEAD_COUNT        = "{arch}.attention.head_count"
@@ -142,6 +146,8 @@ class Keys:
         SCALE                  = "{arch}.attention.scale"
         KEY_LENGTH_MLA         = "{arch}.attention.key_length_mla"
         VALUE_LENGTH_MLA       = "{arch}.attention.value_length_mla"
+        SHARED_KV_LAYERS       = "{arch}.attention.shared_kv_layers"
+        SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"

     class Rope:
         DIMENSION_COUNT         = "{arch}.rope.dimension_count"
@@ -198,6 +204,7 @@ class Keys:
         MASK_ID              = "tokenizer.ggml.mask_token_id"
         ADD_BOS              = "tokenizer.ggml.add_bos_token"
         ADD_EOS              = "tokenizer.ggml.add_eos_token"
+        ADD_SEP              = "tokenizer.ggml.add_sep_token"
         ADD_PREFIX           = "tokenizer.ggml.add_space_prefix"
         REMOVE_EXTRA_WS      = "tokenizer.ggml.remove_extra_whitespaces"
         PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
@@ -291,6 +298,7 @@ class MODEL_ARCH(IntEnum):
     BERT             = auto()
     NOMIC_BERT       = auto()
     NOMIC_BERT_MOE   = auto()
+    NEO_BERT         = auto()
     JINA_BERT_V2     = auto()
     BLOOM            = auto()
     STABLELM         = auto()
@@ -312,6 +320,7 @@ class MODEL_ARCH(IntEnum):
     GEMMA            = auto()
     GEMMA2           = auto()
     GEMMA3           = auto()
+    GEMMA3N          = auto()
     STARCODER2       = auto()
     RWKV6            = auto()
     RWKV6QWEN2       = auto()
@@ -343,6 +352,8 @@ class MODEL_ARCH(IntEnum):
     WAVTOKENIZER_DEC = auto()
     PLM              = auto()
     BAILINGMOE       = auto()
+    DOTS1            = auto()
+    ARCEE            = auto()


 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -395,6 +406,22 @@ class MODEL_TENSOR(IntEnum):
     ATTN_Q_NORM          = auto()
     ATTN_K_NORM          = auto()
     LAYER_OUT_NORM       = auto()
+    PER_LAYER_TOKEN_EMBD = auto()  # gemma3n
+    PER_LAYER_MODEL_PROJ = auto()  # gemma3n
+    PER_LAYER_INP_GATE   = auto()  # gemma3n
+    PER_LAYER_PROJ       = auto()  # gemma3n
+    PER_LAYER_PROJ_NORM  = auto()  # gemma3n
+    PER_LAYER_POST_NORM  = auto()  # gemma3n
+    ALTUP_PROJ           = auto()  # gemma3n
+    ALTUP_UNEMBD_PROJ    = auto()  # gemma3n
+    ALTUP_CORRECT_COEF   = auto()  # gemma3n
+    ALTUP_CORRECT_SCALE  = auto()  # gemma3n
+    ALTUP_PREDICT_COEF   = auto()  # gemma3n
+    ALTUP_ROUTER         = auto()  # gemma3n
+    ALTUP_ROUTER_NORM    = auto()  # gemma3n
+    LAUREL_L             = auto()  # gemma3n
+    LAUREL_R             = auto()  # gemma3n
+    LAUREL_POST_NORM     = auto()  # gemma3n
     SSM_IN               = auto()
     SSM_CONV1D           = auto()
     SSM_X                = auto()
@@ -571,6 +598,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.BERT:             "bert",
    MODEL_ARCH.NOMIC_BERT:       "nomic-bert",
    MODEL_ARCH.NOMIC_BERT_MOE:   "nomic-bert-moe",
+   MODEL_ARCH.NEO_BERT:         "neo-bert",
    MODEL_ARCH.JINA_BERT_V2:     "jina-bert-v2",
    MODEL_ARCH.BLOOM:            "bloom",
    MODEL_ARCH.STABLELM:         "stablelm",
@@ -592,6 +620,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.GEMMA:            "gemma",
    MODEL_ARCH.GEMMA2:           "gemma2",
    MODEL_ARCH.GEMMA3:           "gemma3",
+   MODEL_ARCH.GEMMA3N:          "gemma3n",
    MODEL_ARCH.STARCODER2:       "starcoder2",
    MODEL_ARCH.RWKV6:            "rwkv6",
    MODEL_ARCH.RWKV6QWEN2:       "rwkv6qwen2",
@@ -623,6 +652,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
    MODEL_ARCH.PLM:              "plm",
    MODEL_ARCH.BAILINGMOE:       "bailingmoe",
+   MODEL_ARCH.DOTS1:            "dots1",
+   MODEL_ARCH.ARCEE:            "arcee",
 }

 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -675,6 +706,22 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.FFN_UP_EXP:            "blk.{bid}.ffn_up_exps",
    MODEL_TENSOR.FFN_EXP_PROBS_B:       "blk.{bid}.exp_probs_b",
    MODEL_TENSOR.LAYER_OUT_NORM:        "blk.{bid}.layer_output_norm",
+   MODEL_TENSOR.PER_LAYER_TOKEN_EMBD:  "per_layer_token_embd",  # gemma3n
+   MODEL_TENSOR.PER_LAYER_MODEL_PROJ:  "per_layer_model_proj",  # gemma3n
+   MODEL_TENSOR.PER_LAYER_PROJ_NORM:   "per_layer_proj_norm",  # gemma3n
+   MODEL_TENSOR.ALTUP_UNEMBD_PROJ:     "altup_unembd_proj",  # gemma3n
+   MODEL_TENSOR.ALTUP_PROJ:            "altup_proj",  # gemma3n
+   MODEL_TENSOR.PER_LAYER_INP_GATE:    "blk.{bid}.inp_gate",  # gemma3n
+   MODEL_TENSOR.PER_LAYER_PROJ:        "blk.{bid}.proj",  # gemma3n
+   MODEL_TENSOR.PER_LAYER_POST_NORM:   "blk.{bid}.post_norm",  # gemma3n
+   MODEL_TENSOR.ALTUP_CORRECT_COEF:    "blk.{bid}.altup_correct_coef",  # gemma3n
+   MODEL_TENSOR.ALTUP_CORRECT_SCALE:   "blk.{bid}.altup_correct_scale",  # gemma3n
+   MODEL_TENSOR.ALTUP_PREDICT_COEF:    "blk.{bid}.altup_predict_coef",  # gemma3n
+   MODEL_TENSOR.ALTUP_ROUTER:          "blk.{bid}.altup_router",  # gemma3n
+   MODEL_TENSOR.ALTUP_ROUTER_NORM:     "blk.{bid}.altup_router_norm",  # gemma3n
+   MODEL_TENSOR.LAUREL_L:              "blk.{bid}.laurel_l",  # gemma3n
+   MODEL_TENSOR.LAUREL_R:              "blk.{bid}.laurel_r",  # gemma3n
+   MODEL_TENSOR.LAUREL_POST_NORM:      "blk.{bid}.laurel_post_norm",  # gemma3n
    MODEL_TENSOR.SSM_IN:                "blk.{bid}.ssm_in",
    MODEL_TENSOR.SSM_CONV1D:            "blk.{bid}.ssm_conv1d",
    MODEL_TENSOR.SSM_X:                 "blk.{bid}.ssm_x",
@@ -1077,6 +1124,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP_EXP,
         MODEL_TENSOR.LAYER_OUT_NORM,
     ],
+    MODEL_ARCH.NEO_BERT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ENC_OUTPUT_NORM,
+        MODEL_TENSOR.CLS,
+        MODEL_TENSOR.CLS_OUT,
+    ],
     MODEL_ARCH.JINA_BERT_V2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
@@ -1467,6 +1526,41 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_PRE_NORM,
         MODEL_TENSOR.FFN_POST_NORM,
     ],
+    MODEL_ARCH.GEMMA3N: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.FFN_PRE_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
+        # altup / laurel
+        MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
+        MODEL_TENSOR.PER_LAYER_MODEL_PROJ,
+        MODEL_TENSOR.PER_LAYER_INP_GATE,
+        MODEL_TENSOR.PER_LAYER_PROJ,
+        MODEL_TENSOR.PER_LAYER_PROJ_NORM,
+        MODEL_TENSOR.PER_LAYER_POST_NORM,
+        MODEL_TENSOR.ALTUP_PROJ,
+        MODEL_TENSOR.ALTUP_UNEMBD_PROJ,
+        MODEL_TENSOR.ALTUP_CORRECT_COEF,
+        MODEL_TENSOR.ALTUP_CORRECT_SCALE,
+        MODEL_TENSOR.ALTUP_PREDICT_COEF,
+        MODEL_TENSOR.ALTUP_ROUTER,
+        MODEL_TENSOR.ALTUP_ROUTER_NORM,
+        MODEL_TENSOR.LAUREL_L,
+        MODEL_TENSOR.LAUREL_R,
+        MODEL_TENSOR.LAUREL_POST_NORM,
+    ],
     MODEL_ARCH.STARCODER2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -2044,6 +2138,45 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.DOTS1: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
+    MODEL_ARCH.ARCEE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }

package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py

@@ -271,7 +271,7 @@ class GGUFWriter:

     def add_key_value(self, key: str, val: Any, vtype: GGUFValueType, sub_type: GGUFValueType | None = None) -> None:
         if any(key in kv_data for kv_data in self.kv_data):
-            raise ValueError(f'Duplicated key name {key!r}')
+            logger.warning(f'Duplicated key name {key!r}, overwriting it with new value {val!r} of type {vtype.name}')

         self.kv_data[0][key] = GGUFValue(value=val, type=vtype, sub_type=sub_type)

@@ -672,6 +672,18 @@ class GGUFWriter:
     def add_decoder_start_token_id(self, id: int) -> None:
         self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)

+    def add_embedding_length_per_layer_input(self, value: int) -> None:
+        self.add_uint32(Keys.LLM.EMBD_LENGTH_PER_LAYER_INP.format(arch=self.arch), value)
+
+    def add_altup_active_idx(self, val: int) -> None:
+        self.add_uint32(Keys.LLM.ALTUP_ACTIVE_IDX.format(arch=self.arch), val)
+
+    def add_altup_num_inputs(self, val: int) -> None:
+        self.add_uint32(Keys.LLM.ALTUP_NUM_INPUTS.format(arch=self.arch), val)
+
+    def add_activation_sparsity_scale(self, values: Sequence[float]) -> None:
+        self.add_array(Keys.LLM.ACTIVATION_SPARSITY_SCALE.format(arch=self.arch), values)
+
     def add_head_count(self, count: int | Sequence[int]) -> None:
         if isinstance(count, int):
             self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
@@ -702,6 +714,12 @@ class GGUFWriter:
     def add_clamp_kqv(self, value: float) -> None:
         self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)

+    def add_shared_kv_layers(self, value: float) -> None:
+        self.add_float32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)
+
+    def add_sliding_window_pattern(self, value: Sequence[bool]) -> None:
+        self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value)
+
     def add_logit_scale(self, value: float) -> None:
         self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)

@@ -891,6 +909,9 @@ class GGUFWriter:
     def add_add_eos_token(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_EOS, value)

+    def add_add_sep_token(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.ADD_SEP, value)
+
     def add_add_space_prefix(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)

@@ -935,6 +956,9 @@ class GGUFWriter:
     def add_eom_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOM_ID, id)

+    def add_classifier_output_labels(self, labels: Sequence[str]) -> None:
+        self.add_array(Keys.Classifier.OUTPUT_LABELS.format(arch=self.arch), labels)
+
     # for vision models

     def add_clip_has_vision_encoder(self, value: bool) -> None: