@novastera-oss/llamarn 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -14
- package/RNLlamaCpp.podspec +10 -3
- package/android/CMakeLists.txt +8 -0
- package/android/src/main/cpp/include/llama.h +62 -125
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/PureCppImpl.cpp +9 -27
- package/cpp/SystemUtils.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +11 -3
- package/cpp/llama.cpp/build-xcframework.sh +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/common/arg.cpp +153 -113
- package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
- package/cpp/llama.cpp/common/chat-parser.h +117 -0
- package/cpp/llama.cpp/common/chat.cpp +847 -699
- package/cpp/llama.cpp/common/chat.h +73 -6
- package/cpp/llama.cpp/common/common.cpp +50 -82
- package/cpp/llama.cpp/common/common.h +21 -17
- package/cpp/llama.cpp/common/json-partial.cpp +255 -0
- package/cpp/llama.cpp/common/json-partial.h +37 -0
- package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
- package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
- package/cpp/llama.cpp/common/regex-partial.h +56 -0
- package/cpp/llama.cpp/common/sampling.cpp +7 -8
- package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
- package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
- package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
- package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
- package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
- package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
- package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
- package/cpp/llama.cpp/include/llama.h +62 -125
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
- package/cpp/llama.cpp/src/llama-arch.h +2 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
- package/cpp/llama.cpp/src/llama-context.cpp +340 -123
- package/cpp/llama.cpp/src/llama-context.h +30 -0
- package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-cparams.h +2 -0
- package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
- package/cpp/llama.cpp/src/llama-graph.h +52 -7
- package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
- package/cpp/llama.cpp/src/llama-hparams.h +37 -5
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
- package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
- package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
- package/cpp/llama.cpp/src/llama-memory.h +4 -3
- package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
- package/cpp/llama.cpp/src/llama-model.cpp +529 -172
- package/cpp/llama.cpp/src/llama-model.h +6 -1
- package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
- package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
- package/cpp/llama.cpp/src/llama-vocab.h +6 -0
- package/cpp/llama.cpp/src/llama.cpp +14 -0
- package/cpp/rn-completion.cpp +60 -5
- package/ios/include/chat.h +73 -6
- package/ios/include/common/minja/chat-template.hpp +9 -5
- package/ios/include/common/minja/minja.hpp +69 -36
- package/ios/include/common.h +21 -17
- package/ios/include/llama.h +62 -125
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/common/stb_image.h +0 -7988
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
|
@@ -117,6 +117,10 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
|
|
|
117
117
|
{ LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
|
|
118
118
|
};
|
|
119
119
|
|
|
120
|
+
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
|
|
121
|
+
return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
|
|
122
|
+
}
|
|
123
|
+
|
|
120
124
|
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
|
|
121
125
|
for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
|
|
122
126
|
if (kv.second == name) {
|
|
@@ -459,11 +463,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
459
463
|
GGML_ASSERT(hparams.n_expert_used == 0);
|
|
460
464
|
}
|
|
461
465
|
|
|
462
|
-
// zero-out the array hparams
|
|
463
466
|
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
|
464
467
|
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
|
465
468
|
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
|
466
469
|
|
|
470
|
+
std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
|
|
471
|
+
|
|
472
|
+
std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
|
|
473
|
+
|
|
467
474
|
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
|
|
468
475
|
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
|
|
469
476
|
|
|
@@ -567,9 +574,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
567
574
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
568
575
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
569
576
|
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
|
|
570
|
-
|
|
571
|
-
hparams.
|
|
572
|
-
hparams.n_swa
|
|
577
|
+
|
|
578
|
+
hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
|
|
579
|
+
hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
|
|
580
|
+
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
|
|
573
581
|
|
|
574
582
|
switch (hparams.n_expert) {
|
|
575
583
|
case 16: type = LLM_TYPE_17B_16E; break;
|
|
@@ -675,6 +683,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
675
683
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
676
684
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
677
685
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
|
686
|
+
ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);
|
|
678
687
|
|
|
679
688
|
switch (hparams.n_layer) {
|
|
680
689
|
case 3:
|
|
@@ -848,22 +857,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
848
857
|
default: type = LLM_TYPE_UNKNOWN;
|
|
849
858
|
}
|
|
850
859
|
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
//
|
|
858
|
-
hparams.
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
hparams.n_swa = 131072;
|
|
863
|
-
}
|
|
864
|
-
bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
|
865
|
-
if (!found_swa && hparams.n_swa == 0) {
|
|
866
|
-
throw std::runtime_error("invalid value for sliding_window");
|
|
860
|
+
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
|
861
|
+
|
|
862
|
+
if (found_swa && hparams.n_swa > 0) {
|
|
863
|
+
LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
|
|
864
|
+
__func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
|
|
865
|
+
|
|
866
|
+
// TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
|
|
867
|
+
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
|
868
|
+
|
|
869
|
+
hparams.n_swa = 0;
|
|
870
|
+
hparams.set_swa_pattern(1);
|
|
867
871
|
}
|
|
868
872
|
} break;
|
|
869
873
|
case LLM_ARCH_PHIMOE:
|
|
@@ -933,8 +937,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
933
937
|
} break;
|
|
934
938
|
case LLM_ARCH_GEMMA2:
|
|
935
939
|
{
|
|
940
|
+
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
936
941
|
hparams.n_swa = 4096; // default value of gemma 2
|
|
937
|
-
hparams.
|
|
942
|
+
hparams.set_swa_pattern(2);
|
|
938
943
|
hparams.attn_soft_cap = true;
|
|
939
944
|
|
|
940
945
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
|
@@ -951,7 +956,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
951
956
|
} break;
|
|
952
957
|
case LLM_ARCH_GEMMA3:
|
|
953
958
|
{
|
|
954
|
-
hparams.
|
|
959
|
+
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
960
|
+
hparams.set_swa_pattern(6);
|
|
955
961
|
|
|
956
962
|
hparams.rope_freq_base_train_swa = 10000.0f;
|
|
957
963
|
hparams.rope_freq_scale_train_swa = 1.0f;
|
|
@@ -1035,7 +1041,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1035
1041
|
} break;
|
|
1036
1042
|
case LLM_ARCH_COHERE2:
|
|
1037
1043
|
{
|
|
1038
|
-
hparams.
|
|
1044
|
+
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
1045
|
+
hparams.set_swa_pattern(4);
|
|
1039
1046
|
|
|
1040
1047
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
|
1041
1048
|
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
|
@@ -1385,6 +1392,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1385
1392
|
// Add additional layer/vocab/etc checks here for other model sizes
|
|
1386
1393
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1387
1394
|
}
|
|
1395
|
+
|
|
1396
|
+
// For Granite MoE Shared
|
|
1397
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
|
|
1388
1398
|
} break;
|
|
1389
1399
|
case LLM_ARCH_CHAMELEON:
|
|
1390
1400
|
{
|
|
@@ -1768,6 +1778,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1768
1778
|
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
|
|
1769
1779
|
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
|
1770
1780
|
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
1781
|
+
|
|
1782
|
+
// For Granite MoE Shared
|
|
1783
|
+
if (hparams.n_ff_shexp > 0) {
|
|
1784
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
|
1785
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
|
1786
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
|
|
1787
|
+
}
|
|
1771
1788
|
}
|
|
1772
1789
|
}
|
|
1773
1790
|
} break;
|
|
@@ -2097,7 +2114,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2097
2114
|
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
2098
2115
|
{
|
|
2099
2116
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2100
|
-
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types},
|
|
2117
|
+
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
|
|
2101
2118
|
|
|
2102
2119
|
if (arch == LLM_ARCH_BERT) {
|
|
2103
2120
|
pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
|
|
@@ -2105,8 +2122,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2105
2122
|
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
|
|
2106
2123
|
cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2107
2124
|
|
|
2108
|
-
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd,
|
|
2109
|
-
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {
|
|
2125
|
+
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
|
2126
|
+
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
|
2110
2127
|
}
|
|
2111
2128
|
|
|
2112
2129
|
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
|
|
@@ -2115,7 +2132,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2115
2132
|
for (int i = 0; i < n_layer; ++i) {
|
|
2116
2133
|
auto & layer = layers[i];
|
|
2117
2134
|
|
|
2118
|
-
|
|
2135
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
2136
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
2137
|
+
|
|
2138
|
+
if (!layer.wqkv) {
|
|
2119
2139
|
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
2120
2140
|
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
|
|
2121
2141
|
|
|
@@ -2124,12 +2144,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2124
2144
|
|
|
2125
2145
|
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2126
2146
|
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
|
|
2127
|
-
} else {
|
|
2128
|
-
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
2129
|
-
}
|
|
2130
|
-
|
|
2131
|
-
if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
2132
|
-
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
|
2133
2147
|
}
|
|
2134
2148
|
|
|
2135
2149
|
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
@@ -2473,7 +2487,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2473
2487
|
|
|
2474
2488
|
// output
|
|
2475
2489
|
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2476
|
-
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
|
2490
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2491
|
+
// if output is NULL, init from the input tok embed
|
|
2492
|
+
if (output == NULL) {
|
|
2493
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2494
|
+
}
|
|
2477
2495
|
|
|
2478
2496
|
for (int i = 0; i < n_layer; ++i) {
|
|
2479
2497
|
auto & layer = layers[i];
|
|
@@ -4264,7 +4282,7 @@ uint64_t llama_model::n_elements() const {
|
|
|
4264
4282
|
}
|
|
4265
4283
|
|
|
4266
4284
|
void llama_model::print_info() const {
|
|
4267
|
-
const
|
|
4285
|
+
const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
|
|
4268
4286
|
|
|
4269
4287
|
auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
|
|
4270
4288
|
bool is_var = false;
|
|
@@ -4307,7 +4325,7 @@ void llama_model::print_info() const {
|
|
|
4307
4325
|
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
|
|
4308
4326
|
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
|
|
4309
4327
|
LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
|
|
4310
|
-
LLAMA_LOG_INFO("%s:
|
|
4328
|
+
LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
|
|
4311
4329
|
LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
|
|
4312
4330
|
LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
|
|
4313
4331
|
LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
|
|
@@ -4325,7 +4343,7 @@ void llama_model::print_info() const {
|
|
|
4325
4343
|
LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
|
|
4326
4344
|
LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
|
|
4327
4345
|
LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
|
|
4328
|
-
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
|
4346
|
+
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
|
|
4329
4347
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
|
4330
4348
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
|
4331
4349
|
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
|
@@ -4381,10 +4399,13 @@ void llama_model::print_info() const {
|
|
|
4381
4399
|
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
4382
4400
|
}
|
|
4383
4401
|
|
|
4384
|
-
if (arch == LLM_ARCH_MINICPM ||
|
|
4402
|
+
if (arch == LLM_ARCH_MINICPM ||
|
|
4403
|
+
arch == LLM_ARCH_GRANITE ||
|
|
4404
|
+
arch == LLM_ARCH_GRANITE_MOE) {
|
|
4385
4405
|
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
|
4386
4406
|
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
|
4387
4407
|
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
|
4408
|
+
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
|
4388
4409
|
}
|
|
4389
4410
|
|
|
4390
4411
|
if (arch == LLM_ARCH_BAILINGMOE) {
|
|
@@ -4472,7 +4493,17 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
|
|
|
4472
4493
|
return it->second;
|
|
4473
4494
|
}
|
|
4474
4495
|
|
|
4475
|
-
|
|
4496
|
+
float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
|
|
4497
|
+
return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
|
|
4498
|
+
}
|
|
4499
|
+
|
|
4500
|
+
float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
|
|
4501
|
+
return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
|
|
4502
|
+
}
|
|
4503
|
+
|
|
4504
|
+
ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
|
|
4505
|
+
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
|
|
4506
|
+
|
|
4476
4507
|
// choose long/short freq factors based on the context size
|
|
4477
4508
|
if (layers[il].rope_freqs != nullptr) {
|
|
4478
4509
|
return layers[il].rope_freqs;
|
|
@@ -4500,21 +4531,174 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4500
4531
|
// inp_pos - contains the positions
|
|
4501
4532
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
4502
4533
|
|
|
4534
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
4535
|
+
|
|
4536
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
4537
|
+
|
|
4538
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
4539
|
+
ggml_tensor * inpSA = inpL;
|
|
4540
|
+
|
|
4541
|
+
// norm
|
|
4542
|
+
cur = build_norm(inpL,
|
|
4543
|
+
model.layers[il].attn_norm, NULL,
|
|
4544
|
+
LLM_NORM_RMS, il);
|
|
4545
|
+
cb(cur, "attn_norm", il);
|
|
4546
|
+
|
|
4547
|
+
// self-attention
|
|
4548
|
+
{
|
|
4549
|
+
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
4550
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
4551
|
+
|
|
4552
|
+
// compute Q and K and RoPE them
|
|
4553
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
4554
|
+
cb(Qcur, "Qcur", il);
|
|
4555
|
+
if (model.layers[il].bq) {
|
|
4556
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
4557
|
+
cb(Qcur, "Qcur", il);
|
|
4558
|
+
}
|
|
4559
|
+
|
|
4560
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
4561
|
+
cb(Kcur, "Kcur", il);
|
|
4562
|
+
if (model.layers[il].bk) {
|
|
4563
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
4564
|
+
cb(Kcur, "Kcur", il);
|
|
4565
|
+
}
|
|
4566
|
+
|
|
4567
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
4568
|
+
cb(Vcur, "Vcur", il);
|
|
4569
|
+
if (model.layers[il].bv) {
|
|
4570
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
4571
|
+
cb(Vcur, "Vcur", il);
|
|
4572
|
+
}
|
|
4573
|
+
|
|
4574
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
4575
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
4576
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
4577
|
+
|
|
4578
|
+
Qcur = ggml_rope_ext(
|
|
4579
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
4580
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
4581
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
4582
|
+
);
|
|
4583
|
+
|
|
4584
|
+
Kcur = ggml_rope_ext(
|
|
4585
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
4586
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
4587
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
4588
|
+
);
|
|
4589
|
+
|
|
4590
|
+
cb(Qcur, "Qcur", il);
|
|
4591
|
+
cb(Kcur, "Kcur", il);
|
|
4592
|
+
cb(Vcur, "Vcur", il);
|
|
4593
|
+
|
|
4594
|
+
cur = build_attn(inp_attn, gf,
|
|
4595
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
4596
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
4597
|
+
cb(cur, "attn_out", il);
|
|
4598
|
+
}
|
|
4599
|
+
|
|
4600
|
+
if (il == n_layer - 1) {
|
|
4601
|
+
// skip computing output for unused tokens
|
|
4602
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
4603
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
4604
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
4605
|
+
}
|
|
4606
|
+
|
|
4607
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
4608
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
4609
|
+
|
|
4610
|
+
// feed-forward network (non-MoE)
|
|
4611
|
+
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
4612
|
+
|
|
4613
|
+
cur = build_norm(ffn_inp,
|
|
4614
|
+
model.layers[il].ffn_norm, NULL,
|
|
4615
|
+
LLM_NORM_RMS, il);
|
|
4616
|
+
cb(cur, "ffn_norm", il);
|
|
4617
|
+
|
|
4618
|
+
cur = build_ffn(cur,
|
|
4619
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
4620
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
|
4621
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
4622
|
+
NULL,
|
|
4623
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
4624
|
+
cb(cur, "ffn_out", il);
|
|
4625
|
+
} else {
|
|
4626
|
+
// MoE branch
|
|
4627
|
+
cur = build_norm(ffn_inp,
|
|
4628
|
+
model.layers[il].ffn_norm, NULL,
|
|
4629
|
+
LLM_NORM_RMS, il);
|
|
4630
|
+
cb(cur, "ffn_norm", il);
|
|
4631
|
+
|
|
4632
|
+
cur = build_moe_ffn(cur,
|
|
4633
|
+
model.layers[il].ffn_gate_inp,
|
|
4634
|
+
model.layers[il].ffn_up_exps,
|
|
4635
|
+
model.layers[il].ffn_gate_exps,
|
|
4636
|
+
model.layers[il].ffn_down_exps,
|
|
4637
|
+
nullptr,
|
|
4638
|
+
n_expert, n_expert_used,
|
|
4639
|
+
LLM_FFN_SILU, true,
|
|
4640
|
+
false, 0.0,
|
|
4641
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
4642
|
+
il);
|
|
4643
|
+
cb(cur, "ffn_moe_out", il);
|
|
4644
|
+
}
|
|
4645
|
+
|
|
4646
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
4647
|
+
cb(cur, "ffn_out", il);
|
|
4648
|
+
|
|
4649
|
+
cur = build_cvec(cur, il);
|
|
4650
|
+
cb(cur, "l_out", il);
|
|
4651
|
+
|
|
4652
|
+
// input for next layer
|
|
4653
|
+
inpL = cur;
|
|
4654
|
+
}
|
|
4655
|
+
|
|
4656
|
+
cur = inpL;
|
|
4657
|
+
|
|
4658
|
+
cur = build_norm(cur,
|
|
4659
|
+
model.output_norm, NULL,
|
|
4660
|
+
LLM_NORM_RMS, -1);
|
|
4661
|
+
|
|
4662
|
+
cb(cur, "result_norm", -1);
|
|
4663
|
+
res->t_embd = cur;
|
|
4664
|
+
|
|
4665
|
+
// lm_head
|
|
4666
|
+
cur = build_lora_mm(model.output, cur);
|
|
4667
|
+
|
|
4668
|
+
cb(cur, "result_output", -1);
|
|
4669
|
+
res->t_logits = cur;
|
|
4670
|
+
|
|
4671
|
+
ggml_build_forward_expand(gf, cur);
|
|
4672
|
+
}
|
|
4673
|
+
};
|
|
4674
|
+
|
|
4675
|
+
struct llm_build_llama_iswa : public llm_graph_context {
|
|
4676
|
+
llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
4677
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4678
|
+
|
|
4679
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
4680
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
4681
|
+
|
|
4682
|
+
ggml_tensor * cur;
|
|
4683
|
+
ggml_tensor * inpL;
|
|
4684
|
+
|
|
4685
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
4686
|
+
|
|
4687
|
+
// inp_pos - contains the positions
|
|
4688
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
4689
|
+
|
|
4503
4690
|
// temperature tuning
|
|
4504
4691
|
ggml_tensor * inp_attn_scale = nullptr;
|
|
4505
|
-
|
|
4506
|
-
inp_attn_scale = build_inp_attn_scale();
|
|
4507
|
-
}
|
|
4692
|
+
inp_attn_scale = build_inp_attn_scale();
|
|
4508
4693
|
|
|
4509
|
-
auto * inp_attn =
|
|
4694
|
+
auto * inp_attn = build_attn_inp_kv_unified_iswa();
|
|
4510
4695
|
|
|
4511
4696
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
4697
|
+
|
|
4512
4698
|
for (int il = 0; il < n_layer; ++il) {
|
|
4513
4699
|
ggml_tensor * inpSA = inpL;
|
|
4514
4700
|
|
|
4515
|
-
bool use_rope =
|
|
4516
|
-
? (il + 1) % hparams.n_no_rope_layer_step != 0
|
|
4517
|
-
: true;
|
|
4701
|
+
const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
|
|
4518
4702
|
|
|
4519
4703
|
// norm
|
|
4520
4704
|
cur = build_norm(inpL,
|
|
@@ -4525,7 +4709,7 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4525
4709
|
// self-attention
|
|
4526
4710
|
{
|
|
4527
4711
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
4528
|
-
ggml_tensor * rope_factors = model.get_rope_factors(
|
|
4712
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
4529
4713
|
|
|
4530
4714
|
// compute Q and K and RoPE them
|
|
4531
4715
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
@@ -4573,7 +4757,7 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4573
4757
|
cb(Kcur, "Kcur", il);
|
|
4574
4758
|
cb(Vcur, "Vcur", il);
|
|
4575
4759
|
|
|
4576
|
-
if (
|
|
4760
|
+
if (use_rope && hparams.use_kq_norm) {
|
|
4577
4761
|
// Llama4TextL2Norm
|
|
4578
4762
|
Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
|
|
4579
4763
|
Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
|
|
@@ -4594,17 +4778,11 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4594
4778
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
4595
4779
|
}
|
|
4596
4780
|
|
|
4597
|
-
// For Granite architecture
|
|
4598
|
-
if (hparams.f_residual_scale) {
|
|
4599
|
-
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
4600
|
-
}
|
|
4601
|
-
|
|
4602
4781
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
4603
4782
|
cb(ffn_inp, "ffn_inp", il);
|
|
4604
4783
|
|
|
4605
4784
|
// feed-forward network (non-MoE)
|
|
4606
4785
|
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
4607
|
-
|
|
4608
4786
|
cur = build_norm(ffn_inp,
|
|
4609
4787
|
model.layers[il].ffn_norm, NULL,
|
|
4610
4788
|
LLM_NORM_RMS, il);
|
|
@@ -4617,9 +4795,7 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4617
4795
|
NULL,
|
|
4618
4796
|
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
4619
4797
|
cb(cur, "ffn_out", il);
|
|
4620
|
-
|
|
4621
|
-
} else if (arch == LLM_ARCH_LLAMA4) {
|
|
4622
|
-
// llama4 MoE
|
|
4798
|
+
} else {
|
|
4623
4799
|
ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
|
|
4624
4800
|
model.layers[il].ffn_norm, NULL,
|
|
4625
4801
|
LLM_NORM_RMS, il);
|
|
@@ -4648,31 +4824,6 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4648
4824
|
|
|
4649
4825
|
cur = ggml_add(ctx0, moe_out, shexp_out);
|
|
4650
4826
|
cb(cur, "ffn_moe_out_merged", il);
|
|
4651
|
-
|
|
4652
|
-
} else {
|
|
4653
|
-
// MoE branch
|
|
4654
|
-
cur = build_norm(ffn_inp,
|
|
4655
|
-
model.layers[il].ffn_norm, NULL,
|
|
4656
|
-
LLM_NORM_RMS, il);
|
|
4657
|
-
cb(cur, "ffn_norm", il);
|
|
4658
|
-
|
|
4659
|
-
cur = build_moe_ffn(cur,
|
|
4660
|
-
model.layers[il].ffn_gate_inp,
|
|
4661
|
-
model.layers[il].ffn_up_exps,
|
|
4662
|
-
model.layers[il].ffn_gate_exps,
|
|
4663
|
-
model.layers[il].ffn_down_exps,
|
|
4664
|
-
nullptr,
|
|
4665
|
-
n_expert, n_expert_used,
|
|
4666
|
-
LLM_FFN_SILU, true,
|
|
4667
|
-
false, 0.0,
|
|
4668
|
-
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
4669
|
-
il);
|
|
4670
|
-
cb(cur, "ffn_moe_out", il);
|
|
4671
|
-
}
|
|
4672
|
-
|
|
4673
|
-
// For Granite architecture
|
|
4674
|
-
if (hparams.f_residual_scale) {
|
|
4675
|
-
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
4676
4827
|
}
|
|
4677
4828
|
|
|
4678
4829
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
@@ -4697,11 +4848,6 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4697
4848
|
// lm_head
|
|
4698
4849
|
cur = build_lora_mm(model.output, cur);
|
|
4699
4850
|
|
|
4700
|
-
// For Granite architecture
|
|
4701
|
-
if (hparams.f_logit_scale) {
|
|
4702
|
-
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
|
|
4703
|
-
}
|
|
4704
|
-
|
|
4705
4851
|
cb(cur, "result_output", -1);
|
|
4706
4852
|
res->t_logits = cur;
|
|
4707
4853
|
|
|
@@ -4751,7 +4897,7 @@ struct llm_build_deci : public llm_graph_context {
|
|
|
4751
4897
|
} else if (n_head > 0) {
|
|
4752
4898
|
// self-attention
|
|
4753
4899
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
4754
|
-
ggml_tensor * rope_factors = model.get_rope_factors(
|
|
4900
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
4755
4901
|
|
|
4756
4902
|
// compute Q and K and RoPE them
|
|
4757
4903
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
@@ -4812,11 +4958,6 @@ struct llm_build_deci : public llm_graph_context {
|
|
|
4812
4958
|
continue;
|
|
4813
4959
|
}
|
|
4814
4960
|
|
|
4815
|
-
// For Granite architecture
|
|
4816
|
-
if (hparams.f_residual_scale) {
|
|
4817
|
-
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
4818
|
-
}
|
|
4819
|
-
|
|
4820
4961
|
// modified to support attention-free layer of Llama-3_1-Nemotron-51B
|
|
4821
4962
|
ggml_tensor * ffn_inp = cur;
|
|
4822
4963
|
if (n_head > 0) {
|
|
@@ -4840,11 +4981,6 @@ struct llm_build_deci : public llm_graph_context {
|
|
|
4840
4981
|
cb(cur, "ffn_out", il);
|
|
4841
4982
|
}
|
|
4842
4983
|
|
|
4843
|
-
// For Granite architecture
|
|
4844
|
-
if (hparams.f_residual_scale) {
|
|
4845
|
-
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
4846
|
-
}
|
|
4847
|
-
|
|
4848
4984
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
4849
4985
|
cb(cur, "ffn_out", il);
|
|
4850
4986
|
|
|
@@ -4867,11 +5003,6 @@ struct llm_build_deci : public llm_graph_context {
|
|
|
4867
5003
|
// lm_head
|
|
4868
5004
|
cur = build_lora_mm(model.output, cur);
|
|
4869
5005
|
|
|
4870
|
-
// For Granite architecture
|
|
4871
|
-
if (hparams.f_logit_scale) {
|
|
4872
|
-
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
|
|
4873
|
-
}
|
|
4874
|
-
|
|
4875
5006
|
cb(cur, "result_output", -1);
|
|
4876
5007
|
res->t_logits = cur;
|
|
4877
5008
|
|
|
@@ -5754,8 +5885,10 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
5754
5885
|
inpL = build_inp_embd(model.tok_embd);
|
|
5755
5886
|
|
|
5756
5887
|
// token types are hardcoded to zero ("Sentence A")
|
|
5757
|
-
|
|
5758
|
-
|
|
5888
|
+
if (model.type_embd) {
|
|
5889
|
+
ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
|
|
5890
|
+
inpL = ggml_add(ctx0, inpL, type_row0);
|
|
5891
|
+
}
|
|
5759
5892
|
if (model.arch == LLM_ARCH_BERT) {
|
|
5760
5893
|
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
|
|
5761
5894
|
}
|
|
@@ -5776,36 +5909,11 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
5776
5909
|
ggml_tensor * Vcur;
|
|
5777
5910
|
|
|
5778
5911
|
// self-attention
|
|
5779
|
-
if (model.
|
|
5780
|
-
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
|
|
5781
|
-
|
|
5782
|
-
if (model.layers[il].attn_q_norm) {
|
|
5783
|
-
Qcur = build_norm(Qcur,
|
|
5784
|
-
model.layers[il].attn_q_norm,
|
|
5785
|
-
model.layers[il].attn_q_norm_b,
|
|
5786
|
-
LLM_NORM, il);
|
|
5787
|
-
}
|
|
5788
|
-
|
|
5789
|
-
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
|
|
5790
|
-
|
|
5791
|
-
if (model.layers[il].attn_k_norm) {
|
|
5792
|
-
Kcur = build_norm(Kcur,
|
|
5793
|
-
model.layers[il].attn_k_norm,
|
|
5794
|
-
model.layers[il].attn_k_norm_b,
|
|
5795
|
-
LLM_NORM, il);
|
|
5796
|
-
}
|
|
5797
|
-
|
|
5798
|
-
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
|
|
5799
|
-
|
|
5800
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
5801
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
5802
|
-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
5803
|
-
} else {
|
|
5804
|
-
// compute Q and K and RoPE them
|
|
5912
|
+
if (model.layers[il].wqkv) {
|
|
5805
5913
|
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
5806
5914
|
cb(cur, "wqkv", il);
|
|
5807
5915
|
|
|
5808
|
-
if (model.
|
|
5916
|
+
if (model.layers[il].bqkv) {
|
|
5809
5917
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
5810
5918
|
cb(cur, "bqkv", il);
|
|
5811
5919
|
}
|
|
@@ -5813,11 +5921,32 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
5813
5921
|
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
5814
5922
|
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
5815
5923
|
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
5924
|
+
} else {
|
|
5925
|
+
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
|
|
5926
|
+
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
|
|
5927
|
+
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
|
|
5928
|
+
}
|
|
5816
5929
|
|
|
5817
|
-
|
|
5818
|
-
|
|
5819
|
-
|
|
5930
|
+
if (model.layers[il].attn_q_norm) {
|
|
5931
|
+
Qcur = build_norm(Qcur,
|
|
5932
|
+
model.layers[il].attn_q_norm,
|
|
5933
|
+
model.layers[il].attn_q_norm_b,
|
|
5934
|
+
LLM_NORM, il);
|
|
5935
|
+
}
|
|
5936
|
+
|
|
5937
|
+
if (model.layers[il].attn_k_norm) {
|
|
5938
|
+
Kcur = build_norm(Kcur,
|
|
5939
|
+
model.layers[il].attn_k_norm,
|
|
5940
|
+
model.layers[il].attn_k_norm_b,
|
|
5941
|
+
LLM_NORM, il);
|
|
5942
|
+
}
|
|
5820
5943
|
|
|
5944
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
5945
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
5946
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
5947
|
+
|
|
5948
|
+
// RoPE
|
|
5949
|
+
if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
5821
5950
|
Qcur = ggml_rope_ext(
|
|
5822
5951
|
ctx0, Qcur, inp_pos, nullptr,
|
|
5823
5952
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
@@ -7215,6 +7344,7 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
7215
7344
|
}
|
|
7216
7345
|
};
|
|
7217
7346
|
|
|
7347
|
+
template<bool iswa>
|
|
7218
7348
|
struct llm_build_phi3 : public llm_graph_context {
|
|
7219
7349
|
llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
7220
7350
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
@@ -7230,7 +7360,14 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
7230
7360
|
// inp_pos - contains the positions
|
|
7231
7361
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
7232
7362
|
|
|
7233
|
-
|
|
7363
|
+
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
|
|
7364
|
+
inp_attn_type * inp_attn = nullptr;
|
|
7365
|
+
|
|
7366
|
+
if constexpr (iswa) {
|
|
7367
|
+
inp_attn = build_attn_inp_kv_unified_iswa();
|
|
7368
|
+
} else {
|
|
7369
|
+
inp_attn = build_attn_inp_kv_unified();
|
|
7370
|
+
}
|
|
7234
7371
|
|
|
7235
7372
|
for (int il = 0; il < n_layer; ++il) {
|
|
7236
7373
|
auto * residual = inpL;
|
|
@@ -7238,7 +7375,7 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
7238
7375
|
// self-attention
|
|
7239
7376
|
{
|
|
7240
7377
|
// rope freq factors for 128k context
|
|
7241
|
-
ggml_tensor * rope_factors = model.get_rope_factors(
|
|
7378
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
7242
7379
|
|
|
7243
7380
|
ggml_tensor* attn_norm_output = build_norm(inpL,
|
|
7244
7381
|
model.layers[il].attn_norm,
|
|
@@ -7990,7 +8127,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
7990
8127
|
for (int il = 0; il < n_layer; ++il) {
|
|
7991
8128
|
ggml_tensor * inpSA = inpL;
|
|
7992
8129
|
|
|
7993
|
-
ggml_tensor * rope_factors = model.get_rope_factors(
|
|
8130
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
7994
8131
|
|
|
7995
8132
|
// norm
|
|
7996
8133
|
cur = build_norm(inpL,
|
|
@@ -8290,8 +8427,8 @@ struct llm_build_gemma : public llm_graph_context {
|
|
|
8290
8427
|
}
|
|
8291
8428
|
};
|
|
8292
8429
|
|
|
8293
|
-
struct
|
|
8294
|
-
|
|
8430
|
+
struct llm_build_gemma2_iswa : public llm_graph_context {
|
|
8431
|
+
llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
8295
8432
|
const int64_t n_embd_head = hparams.n_embd_head_k;
|
|
8296
8433
|
|
|
8297
8434
|
ggml_tensor * cur;
|
|
@@ -8305,7 +8442,7 @@ struct llm_build_gemma2 : public llm_graph_context {
|
|
|
8305
8442
|
// inp_pos - contains the positions
|
|
8306
8443
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
8307
8444
|
|
|
8308
|
-
auto * inp_attn =
|
|
8445
|
+
auto * inp_attn = build_attn_inp_kv_unified_iswa();
|
|
8309
8446
|
|
|
8310
8447
|
for (int il = 0; il < n_layer; ++il) {
|
|
8311
8448
|
// norm
|
|
@@ -8427,8 +8564,8 @@ struct llm_build_gemma2 : public llm_graph_context {
|
|
|
8427
8564
|
}
|
|
8428
8565
|
};
|
|
8429
8566
|
|
|
8430
|
-
struct
|
|
8431
|
-
|
|
8567
|
+
struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
8568
|
+
llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
8432
8569
|
const int64_t n_embd_head = hparams.n_embd_head_k;
|
|
8433
8570
|
|
|
8434
8571
|
ggml_tensor * cur;
|
|
@@ -8446,13 +8583,11 @@ struct llm_build_gemma3 : public llm_graph_context {
|
|
|
8446
8583
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
8447
8584
|
|
|
8448
8585
|
// TODO: is causal == true correct? might need some changes
|
|
8449
|
-
auto * inp_attn =
|
|
8586
|
+
auto * inp_attn = build_attn_inp_kv_unified_iswa();
|
|
8450
8587
|
|
|
8451
8588
|
for (int il = 0; il < n_layer; ++il) {
|
|
8452
|
-
const
|
|
8453
|
-
|
|
8454
|
-
const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
|
|
8455
|
-
const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
|
|
8589
|
+
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
|
8590
|
+
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
|
8456
8591
|
|
|
8457
8592
|
// norm
|
|
8458
8593
|
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
@@ -9029,8 +9164,8 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
9029
9164
|
}
|
|
9030
9165
|
};
|
|
9031
9166
|
|
|
9032
|
-
struct
|
|
9033
|
-
|
|
9167
|
+
struct llm_build_cohere2_iswa : public llm_graph_context {
|
|
9168
|
+
llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
9034
9169
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
9035
9170
|
|
|
9036
9171
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -9045,7 +9180,7 @@ struct llm_build_cohere2 : public llm_graph_context {
|
|
|
9045
9180
|
// inp_pos - contains the positions
|
|
9046
9181
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
9047
9182
|
|
|
9048
|
-
auto * inp_attn =
|
|
9183
|
+
auto * inp_attn = build_attn_inp_kv_unified_iswa();
|
|
9049
9184
|
|
|
9050
9185
|
for (int il = 0; il < n_layer; ++il) {
|
|
9051
9186
|
const bool is_swa = hparams.is_swa(il);
|
|
@@ -9058,7 +9193,7 @@ struct llm_build_cohere2 : public llm_graph_context {
|
|
|
9058
9193
|
// self-attention
|
|
9059
9194
|
{
|
|
9060
9195
|
// rope freq factors for 128k context
|
|
9061
|
-
ggml_tensor * rope_factors = model.get_rope_factors(
|
|
9196
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
9062
9197
|
|
|
9063
9198
|
// compute Q and K and RoPE them
|
|
9064
9199
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
@@ -9996,7 +10131,7 @@ struct llm_build_deepseek : public llm_graph_context {
|
|
|
9996
10131
|
// self-attention
|
|
9997
10132
|
{
|
|
9998
10133
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
9999
|
-
ggml_tensor * rope_factors = model.get_rope_factors(
|
|
10134
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
10000
10135
|
|
|
10001
10136
|
// compute Q and K and RoPE them
|
|
10002
10137
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
@@ -11360,7 +11495,7 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
11360
11495
|
// self-attention
|
|
11361
11496
|
{
|
|
11362
11497
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
11363
|
-
ggml_tensor * rope_factors = model.get_rope_factors(
|
|
11498
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
11364
11499
|
|
|
11365
11500
|
// compute Q and K and RoPE them
|
|
11366
11501
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
@@ -12210,6 +12345,194 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
12210
12345
|
}
|
|
12211
12346
|
};
|
|
12212
12347
|
|
|
12348
|
+
|
|
12349
|
+
struct llm_build_granite : public llm_graph_context {
|
|
12350
|
+
llm_build_granite(
|
|
12351
|
+
const llama_model & model,
|
|
12352
|
+
const llm_graph_params & params,
|
|
12353
|
+
ggml_cgraph * gf,
|
|
12354
|
+
const bool use_rope = true)
|
|
12355
|
+
: llm_graph_context(params) {
|
|
12356
|
+
|
|
12357
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
12358
|
+
|
|
12359
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
12360
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
12361
|
+
|
|
12362
|
+
ggml_tensor * cur;
|
|
12363
|
+
ggml_tensor * inpL;
|
|
12364
|
+
|
|
12365
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
12366
|
+
|
|
12367
|
+
// inp_pos - built only if rope enabled
|
|
12368
|
+
ggml_tensor * inp_pos = nullptr;
|
|
12369
|
+
if (use_rope) {
|
|
12370
|
+
inp_pos = build_inp_pos();
|
|
12371
|
+
}
|
|
12372
|
+
|
|
12373
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
12374
|
+
|
|
12375
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
12376
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
12377
|
+
ggml_tensor * inpSA = inpL;
|
|
12378
|
+
|
|
12379
|
+
// norm
|
|
12380
|
+
cur = build_norm(inpL,
|
|
12381
|
+
model.layers[il].attn_norm, NULL,
|
|
12382
|
+
LLM_NORM_RMS, il);
|
|
12383
|
+
cb(cur, "attn_norm", il);
|
|
12384
|
+
|
|
12385
|
+
// self-attention
|
|
12386
|
+
{
|
|
12387
|
+
// compute Q and K and (optionally) RoPE them
|
|
12388
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
12389
|
+
cb(Qcur, "Qcur", il);
|
|
12390
|
+
if (model.layers[il].bq) {
|
|
12391
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
12392
|
+
cb(Qcur, "Qcur", il);
|
|
12393
|
+
}
|
|
12394
|
+
|
|
12395
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
12396
|
+
cb(Kcur, "Kcur", il);
|
|
12397
|
+
if (model.layers[il].bk) {
|
|
12398
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
12399
|
+
cb(Kcur, "Kcur", il);
|
|
12400
|
+
}
|
|
12401
|
+
|
|
12402
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
12403
|
+
cb(Vcur, "Vcur", il);
|
|
12404
|
+
if (model.layers[il].bv) {
|
|
12405
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
12406
|
+
cb(Vcur, "Vcur", il);
|
|
12407
|
+
}
|
|
12408
|
+
|
|
12409
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
12410
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
12411
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
12412
|
+
|
|
12413
|
+
if (use_rope) {
|
|
12414
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
12415
|
+
Qcur = ggml_rope_ext(
|
|
12416
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
12417
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
12418
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
12419
|
+
);
|
|
12420
|
+
|
|
12421
|
+
Kcur = ggml_rope_ext(
|
|
12422
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
12423
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
12424
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
12425
|
+
);
|
|
12426
|
+
}
|
|
12427
|
+
|
|
12428
|
+
cb(Qcur, "Qcur", il);
|
|
12429
|
+
cb(Kcur, "Kcur", il);
|
|
12430
|
+
cb(Vcur, "Vcur", il);
|
|
12431
|
+
|
|
12432
|
+
cur = build_attn(inp_attn, gf,
|
|
12433
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
12434
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
12435
|
+
cb(cur, "attn_out", il);
|
|
12436
|
+
}
|
|
12437
|
+
|
|
12438
|
+
if (il == n_layer - 1) {
|
|
12439
|
+
// skip computing output for unused tokens
|
|
12440
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12441
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
12442
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
12443
|
+
}
|
|
12444
|
+
|
|
12445
|
+
// For Granite architectures - scale residual
|
|
12446
|
+
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
12447
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
12448
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
12449
|
+
|
|
12450
|
+
// feed-forward network (non-MoE)
|
|
12451
|
+
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
12452
|
+
|
|
12453
|
+
cur = build_norm(ffn_inp,
|
|
12454
|
+
model.layers[il].ffn_norm, NULL,
|
|
12455
|
+
LLM_NORM_RMS, il);
|
|
12456
|
+
cb(cur, "ffn_norm", il);
|
|
12457
|
+
|
|
12458
|
+
cur = build_ffn(cur,
|
|
12459
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
12460
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
|
12461
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
12462
|
+
NULL,
|
|
12463
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
12464
|
+
cb(cur, "ffn_out", il);
|
|
12465
|
+
|
|
12466
|
+
} else {
|
|
12467
|
+
// MoE branch
|
|
12468
|
+
cur = build_norm(ffn_inp,
|
|
12469
|
+
model.layers[il].ffn_norm, NULL,
|
|
12470
|
+
LLM_NORM_RMS, il);
|
|
12471
|
+
cb(cur, "ffn_norm", il);
|
|
12472
|
+
|
|
12473
|
+
ggml_tensor * moe_out = build_moe_ffn(cur,
|
|
12474
|
+
model.layers[il].ffn_gate_inp,
|
|
12475
|
+
model.layers[il].ffn_up_exps,
|
|
12476
|
+
model.layers[il].ffn_gate_exps,
|
|
12477
|
+
model.layers[il].ffn_down_exps,
|
|
12478
|
+
nullptr,
|
|
12479
|
+
n_expert, n_expert_used,
|
|
12480
|
+
LLM_FFN_SILU, true,
|
|
12481
|
+
false, 0.0,
|
|
12482
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
12483
|
+
il);
|
|
12484
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
12485
|
+
|
|
12486
|
+
// For Granite MoE Shared
|
|
12487
|
+
if (hparams.n_ff_shexp > 0) {
|
|
12488
|
+
ggml_tensor * ffn_shexp = build_ffn(cur,
|
|
12489
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
12490
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
12491
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
12492
|
+
NULL,
|
|
12493
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
12494
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
12495
|
+
|
|
12496
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
12497
|
+
cb(cur, "ffn_out", il);
|
|
12498
|
+
} else {
|
|
12499
|
+
cur = moe_out;
|
|
12500
|
+
}
|
|
12501
|
+
}
|
|
12502
|
+
|
|
12503
|
+
// For Granite architectures - scale residual
|
|
12504
|
+
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
12505
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
12506
|
+
cb(cur, "ffn_out", il);
|
|
12507
|
+
|
|
12508
|
+
cur = build_cvec(cur, il);
|
|
12509
|
+
cb(cur, "l_out", il);
|
|
12510
|
+
|
|
12511
|
+
// input for next layer
|
|
12512
|
+
inpL = cur;
|
|
12513
|
+
}
|
|
12514
|
+
|
|
12515
|
+
cur = inpL;
|
|
12516
|
+
|
|
12517
|
+
cur = build_norm(cur,
|
|
12518
|
+
model.output_norm, NULL,
|
|
12519
|
+
LLM_NORM_RMS, -1);
|
|
12520
|
+
|
|
12521
|
+
cb(cur, "result_norm", -1);
|
|
12522
|
+
res->t_embd = cur;
|
|
12523
|
+
|
|
12524
|
+
// lm_head
|
|
12525
|
+
cur = build_lora_mm(model.output, cur);
|
|
12526
|
+
|
|
12527
|
+
// For Granite architectures - scale logits
|
|
12528
|
+
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
|
|
12529
|
+
cb(cur, "result_output", -1);
|
|
12530
|
+
res->t_logits = cur;
|
|
12531
|
+
|
|
12532
|
+
ggml_build_forward_expand(gf, cur);
|
|
12533
|
+
}
|
|
12534
|
+
};
|
|
12535
|
+
|
|
12213
12536
|
// ref: https://github.com/facebookresearch/chameleon
|
|
12214
12537
|
// based on the original build_llama() function, changes:
|
|
12215
12538
|
// * qk-norm
|
|
@@ -12741,7 +13064,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
12741
13064
|
// self-attention
|
|
12742
13065
|
{
|
|
12743
13066
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
12744
|
-
ggml_tensor * rope_factors = model.get_rope_factors(
|
|
13067
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
12745
13068
|
|
|
12746
13069
|
// compute Q and K and RoPE them
|
|
12747
13070
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
@@ -12869,6 +13192,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
12869
13192
|
case LLM_ARCH_JINA_BERT_V2:
|
|
12870
13193
|
case LLM_ARCH_NOMIC_BERT:
|
|
12871
13194
|
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
13195
|
+
case LLM_ARCH_WAVTOKENIZER_DEC:
|
|
12872
13196
|
{
|
|
12873
13197
|
res = nullptr;
|
|
12874
13198
|
} break;
|
|
@@ -12883,7 +13207,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
12883
13207
|
GGML_TYPE_F32,
|
|
12884
13208
|
GGML_TYPE_F32,
|
|
12885
13209
|
cparams.offload_kqv,
|
|
12886
|
-
std::max((uint32_t) 1, cparams.n_seq_max)
|
|
13210
|
+
std::max((uint32_t) 1, cparams.n_seq_max),
|
|
13211
|
+
cparams.n_seq_max);
|
|
12887
13212
|
} break;
|
|
12888
13213
|
default:
|
|
12889
13214
|
{
|
|
@@ -12893,14 +13218,36 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
12893
13218
|
|
|
12894
13219
|
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
|
|
12895
13220
|
|
|
12896
|
-
|
|
12897
|
-
|
|
12898
|
-
|
|
12899
|
-
|
|
12900
|
-
|
|
12901
|
-
|
|
12902
|
-
|
|
12903
|
-
|
|
13221
|
+
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
|
13222
|
+
GGML_ASSERT(hparams.is_swa_any());
|
|
13223
|
+
|
|
13224
|
+
res = new llama_kv_cache_unified_iswa(
|
|
13225
|
+
*this,
|
|
13226
|
+
params.type_k,
|
|
13227
|
+
params.type_v,
|
|
13228
|
+
!cparams.flash_attn,
|
|
13229
|
+
cparams.offload_kqv,
|
|
13230
|
+
params.swa_full,
|
|
13231
|
+
cparams.n_ctx,
|
|
13232
|
+
cparams.n_seq_max,
|
|
13233
|
+
cparams.n_batch,
|
|
13234
|
+
padding);
|
|
13235
|
+
} else {
|
|
13236
|
+
GGML_ASSERT(!hparams.is_swa_any());
|
|
13237
|
+
|
|
13238
|
+
res = new llama_kv_cache_unified(
|
|
13239
|
+
*this,
|
|
13240
|
+
nullptr,
|
|
13241
|
+
params.type_k,
|
|
13242
|
+
params.type_v,
|
|
13243
|
+
!cparams.flash_attn,
|
|
13244
|
+
cparams.offload_kqv,
|
|
13245
|
+
cparams.n_ctx,
|
|
13246
|
+
cparams.n_seq_max,
|
|
13247
|
+
padding,
|
|
13248
|
+
hparams.n_swa,
|
|
13249
|
+
hparams.swa_type);
|
|
13250
|
+
}
|
|
12904
13251
|
}
|
|
12905
13252
|
}
|
|
12906
13253
|
|
|
@@ -12915,13 +13262,13 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
12915
13262
|
|
|
12916
13263
|
switch (arch) {
|
|
12917
13264
|
case LLM_ARCH_LLAMA:
|
|
12918
|
-
case LLM_ARCH_LLAMA4:
|
|
12919
|
-
case LLM_ARCH_MINICPM:
|
|
12920
|
-
case LLM_ARCH_GRANITE:
|
|
12921
|
-
case LLM_ARCH_GRANITE_MOE:
|
|
12922
13265
|
{
|
|
12923
13266
|
llm = std::make_unique<llm_build_llama>(*this, params, gf);
|
|
12924
13267
|
} break;
|
|
13268
|
+
case LLM_ARCH_LLAMA4:
|
|
13269
|
+
{
|
|
13270
|
+
llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
|
|
13271
|
+
} break;
|
|
12925
13272
|
case LLM_ARCH_DECI:
|
|
12926
13273
|
{
|
|
12927
13274
|
llm = std::make_unique<llm_build_deci>(*this, params, gf);
|
|
@@ -12996,7 +13343,11 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
12996
13343
|
case LLM_ARCH_PHI3:
|
|
12997
13344
|
case LLM_ARCH_PHIMOE:
|
|
12998
13345
|
{
|
|
12999
|
-
|
|
13346
|
+
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
|
13347
|
+
llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
|
|
13348
|
+
} else {
|
|
13349
|
+
llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
|
|
13350
|
+
}
|
|
13000
13351
|
} break;
|
|
13001
13352
|
case LLM_ARCH_PLAMO:
|
|
13002
13353
|
{
|
|
@@ -13028,11 +13379,11 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
13028
13379
|
} break;
|
|
13029
13380
|
case LLM_ARCH_GEMMA2:
|
|
13030
13381
|
{
|
|
13031
|
-
llm = std::make_unique<
|
|
13382
|
+
llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
|
|
13032
13383
|
} break;
|
|
13033
13384
|
case LLM_ARCH_GEMMA3:
|
|
13034
13385
|
{
|
|
13035
|
-
llm = std::make_unique<
|
|
13386
|
+
llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
|
|
13036
13387
|
} break;
|
|
13037
13388
|
case LLM_ARCH_STARCODER2:
|
|
13038
13389
|
{
|
|
@@ -13052,7 +13403,7 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
13052
13403
|
} break;
|
|
13053
13404
|
case LLM_ARCH_COHERE2:
|
|
13054
13405
|
{
|
|
13055
|
-
llm = std::make_unique<
|
|
13406
|
+
llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
|
|
13056
13407
|
} break;
|
|
13057
13408
|
case LLM_ARCH_DBRX:
|
|
13058
13409
|
{
|
|
@@ -13149,6 +13500,12 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
13149
13500
|
{
|
|
13150
13501
|
llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
|
|
13151
13502
|
} break;
|
|
13503
|
+
case LLM_ARCH_GRANITE:
|
|
13504
|
+
case LLM_ARCH_GRANITE_MOE:
|
|
13505
|
+
case LLM_ARCH_MINICPM:
|
|
13506
|
+
{
|
|
13507
|
+
llm = std::make_unique<llm_build_granite>(*this, params, gf);
|
|
13508
|
+
} break;
|
|
13152
13509
|
case LLM_ARCH_CHAMELEON:
|
|
13153
13510
|
{
|
|
13154
13511
|
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
|