cui-llama.rn 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +35 -7
- package/android/src/main/CMakeLists.txt +22 -11
- package/android/src/main/java/com/rnllama/LlamaContext.java +42 -6
- package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
- package/android/src/main/jni.cpp +173 -18
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
- package/cpp/LICENSE +21 -0
- package/cpp/chat.cpp +129 -107
- package/cpp/chat.h +2 -0
- package/cpp/common.cpp +58 -78
- package/cpp/common.h +29 -21
- package/cpp/ggml-alloc.c +4 -1
- package/cpp/ggml-backend.cpp +9 -5
- package/cpp/ggml-backend.h +4 -4
- package/cpp/ggml-cpp.h +1 -1
- package/cpp/ggml-cpu/amx/amx.cpp +221 -0
- package/cpp/ggml-cpu/amx/amx.h +8 -0
- package/cpp/ggml-cpu/amx/common.h +91 -0
- package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
- package/cpp/ggml-cpu/amx/mmq.h +10 -0
- package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
- package/cpp/ggml-cpu/common.h +72 -0
- package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -103
- package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +306 -6
- package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +114 -55
- package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +32 -16
- package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +353 -173
- package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
- package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
- package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
- package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
- package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -6
- package/{ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/vec.h +16 -0
- package/cpp/ggml-cpu.h +5 -0
- package/cpp/ggml-impl.h +16 -9
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal-impl.h +36 -11
- package/cpp/ggml-metal.m +810 -176
- package/cpp/ggml-opt.cpp +373 -190
- package/cpp/ggml-opt.h +49 -28
- package/cpp/ggml-quants.c +0 -6
- package/cpp/ggml.c +227 -282
- package/cpp/ggml.h +82 -101
- package/cpp/gguf.cpp +33 -33
- package/cpp/json-schema-to-grammar.cpp +3 -0
- package/cpp/llama-adapter.cpp +6 -0
- package/cpp/llama-arch.cpp +49 -17
- package/cpp/llama-arch.h +9 -0
- package/cpp/llama-batch.cpp +8 -2
- package/cpp/llama-batch.h +2 -1
- package/cpp/llama-chat.cpp +39 -16
- package/cpp/llama-chat.h +4 -2
- package/cpp/llama-context.cpp +440 -611
- package/cpp/llama-context.h +44 -33
- package/cpp/llama-cparams.h +1 -0
- package/cpp/llama-graph.cpp +214 -291
- package/cpp/llama-graph.h +69 -21
- package/cpp/llama-hparams.cpp +17 -1
- package/cpp/llama-hparams.h +39 -5
- package/cpp/llama-kv-cache.cpp +2067 -620
- package/cpp/llama-kv-cache.h +410 -108
- package/cpp/llama-memory.h +12 -1
- package/cpp/llama-model-loader.cpp +24 -15
- package/cpp/llama-model-saver.cpp +281 -0
- package/cpp/llama-model-saver.h +37 -0
- package/cpp/llama-model.cpp +1089 -359
- package/cpp/llama-model.h +19 -3
- package/cpp/llama-sampling.cpp +20 -7
- package/cpp/llama-vocab.cpp +54 -9
- package/cpp/llama-vocab.h +6 -0
- package/cpp/llama.cpp +14 -0
- package/cpp/llama.h +86 -142
- package/cpp/minja/chat-template.hpp +9 -5
- package/cpp/minja/minja.hpp +69 -36
- package/cpp/rn-llama.cpp +602 -190
- package/cpp/rn-llama.h +34 -8
- package/cpp/sampling.cpp +57 -50
- package/cpp/tools/mtmd/clip-impl.h +462 -0
- package/cpp/tools/mtmd/clip.cpp +4024 -0
- package/cpp/tools/mtmd/clip.h +101 -0
- package/cpp/tools/mtmd/miniaudio.h +93468 -0
- package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
- package/cpp/tools/mtmd/mtmd.cpp +942 -0
- package/cpp/tools/mtmd/mtmd.h +362 -0
- package/cpp/tools/mtmd/stb_image.h +7988 -0
- package/ios/CMakeLists.txt +20 -10
- package/ios/RNLlama.h +6 -0
- package/ios/RNLlama.mm +82 -3
- package/ios/RNLlamaContext.h +5 -1
- package/ios/RNLlamaContext.mm +131 -38
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +29 -21
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +82 -101
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +44 -33
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +19 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +86 -142
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +29 -21
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +82 -101
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +44 -33
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +19 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +86 -142
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +33 -7
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/index.js +153 -21
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/index.js +152 -20
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +54 -4
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +72 -6
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +72 -4
- package/src/index.ts +212 -38
- package/cpp/binary-ops.h +0 -16
- package/cpp/ops.h +0 -128
- package/cpp/simd-mappings.h +0 -888
- package/cpp/unary-ops.h +0 -28
- package/cpp/vec.h +0 -802
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
- package/lib/commonjs/chat.js +0 -37
- package/lib/commonjs/chat.js.map +0 -1
- package/lib/module/chat.js +0 -33
- package/lib/module/chat.js.map +0 -1
- package/lib/typescript/chat.d.ts +0 -10
- package/lib/typescript/chat.d.ts.map +0 -1
- package/src/chat.ts +0 -44
- /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
- /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
- /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
- /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
- /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
- /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
- /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
- /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
package/cpp/llama-model.cpp
CHANGED
@@ -40,14 +40,17 @@ const char * llm_type_name(llm_type type) {
|
|
40
40
|
case LLM_TYPE_335M: return "335M";
|
41
41
|
case LLM_TYPE_410M: return "410M";
|
42
42
|
case LLM_TYPE_450M: return "450M";
|
43
|
+
case LLM_TYPE_475M: return "475M";
|
43
44
|
case LLM_TYPE_770M: return "770M";
|
44
45
|
case LLM_TYPE_780M: return "780M";
|
45
46
|
case LLM_TYPE_0_5B: return "0.5B";
|
47
|
+
case LLM_TYPE_0_6B: return "0.6B";
|
46
48
|
case LLM_TYPE_1B: return "1B";
|
47
49
|
case LLM_TYPE_1_3B: return "1.3B";
|
48
50
|
case LLM_TYPE_1_4B: return "1.4B";
|
49
51
|
case LLM_TYPE_1_5B: return "1.5B";
|
50
52
|
case LLM_TYPE_1_6B: return "1.6B";
|
53
|
+
case LLM_TYPE_1_7B: return "1.7B";
|
51
54
|
case LLM_TYPE_1_8B: return "1.8B";
|
52
55
|
case LLM_TYPE_2B: return "2B";
|
53
56
|
case LLM_TYPE_2_8B: return "2.8B";
|
@@ -66,6 +69,7 @@ const char * llm_type_name(llm_type type) {
|
|
66
69
|
case LLM_TYPE_15B: return "15B";
|
67
70
|
case LLM_TYPE_16B: return "16B";
|
68
71
|
case LLM_TYPE_20B: return "20B";
|
72
|
+
case LLM_TYPE_27B: return "27B";
|
69
73
|
case LLM_TYPE_30B: return "30B";
|
70
74
|
case LLM_TYPE_32B: return "32B";
|
71
75
|
case LLM_TYPE_34B: return "34B";
|
@@ -74,7 +78,9 @@ const char * llm_type_name(llm_type type) {
|
|
74
78
|
case LLM_TYPE_65B: return "65B";
|
75
79
|
case LLM_TYPE_70B: return "70B";
|
76
80
|
case LLM_TYPE_236B: return "236B";
|
81
|
+
case LLM_TYPE_290B: return "290B";
|
77
82
|
case LLM_TYPE_314B: return "314B";
|
83
|
+
case LLM_TYPE_405B: return "405B";
|
78
84
|
case LLM_TYPE_671B: return "671B";
|
79
85
|
case LLM_TYPE_SMALL: return "0.1B";
|
80
86
|
case LLM_TYPE_MEDIUM: return "0.4B";
|
@@ -88,10 +94,10 @@ const char * llm_type_name(llm_type type) {
|
|
88
94
|
case LLM_TYPE_16x3_8B: return "16x3.8B";
|
89
95
|
case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
|
90
96
|
case LLM_TYPE_57B_A14B: return "57B.A14B";
|
91
|
-
case LLM_TYPE_27B: return "27B";
|
92
|
-
case LLM_TYPE_290B: return "290B";
|
93
97
|
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
|
94
98
|
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
|
99
|
+
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
100
|
+
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
95
101
|
default: return "?B";
|
96
102
|
}
|
97
103
|
}
|
@@ -111,6 +117,10 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
|
|
111
117
|
{ LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
|
112
118
|
};
|
113
119
|
|
120
|
+
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
|
121
|
+
return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
|
122
|
+
}
|
123
|
+
|
114
124
|
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
|
115
125
|
for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
|
116
126
|
if (kv.second == name) {
|
@@ -293,6 +303,10 @@ static buft_list_t make_cpu_buft_list(const std::vector<lm_ggml_backend_dev_t> &
|
|
293
303
|
// add extra buffer types, only if no GPU device is present
|
294
304
|
// ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
|
295
305
|
auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
|
306
|
+
if (cpu_dev == nullptr) {
|
307
|
+
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
308
|
+
}
|
309
|
+
|
296
310
|
auto * cpu_reg = lm_ggml_backend_dev_backend_reg(cpu_dev);
|
297
311
|
auto lm_ggml_backend_dev_get_extra_bufts_fn = (lm_ggml_backend_dev_get_extra_bufts_t)
|
298
312
|
lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_dev_get_extra_bufts");
|
@@ -449,11 +463,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
449
463
|
LM_GGML_ASSERT(hparams.n_expert_used == 0);
|
450
464
|
}
|
451
465
|
|
452
|
-
// zero-out the array hparams
|
453
466
|
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
454
467
|
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
455
468
|
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
456
469
|
|
470
|
+
std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
|
471
|
+
|
472
|
+
std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
|
473
|
+
|
457
474
|
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
|
458
475
|
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
|
459
476
|
|
@@ -557,9 +574,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
557
574
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
558
575
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
559
576
|
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
|
560
|
-
|
561
|
-
hparams.
|
562
|
-
hparams.n_swa
|
577
|
+
|
578
|
+
hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
|
579
|
+
hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
|
580
|
+
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
|
563
581
|
|
564
582
|
switch (hparams.n_expert) {
|
565
583
|
case 16: type = LLM_TYPE_17B_16E; break;
|
@@ -577,6 +595,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
577
595
|
switch (hparams.n_layer) {
|
578
596
|
case 32: type = LLM_TYPE_7B; break;
|
579
597
|
case 80: type = LLM_TYPE_70B; break;
|
598
|
+
case 162: type = LLM_TYPE_405B; break;
|
580
599
|
default: type = LLM_TYPE_UNKNOWN;
|
581
600
|
}
|
582
601
|
} break;
|
@@ -695,13 +714,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
695
714
|
}
|
696
715
|
} break;
|
697
716
|
case LLM_ARCH_NOMIC_BERT:
|
717
|
+
case LLM_ARCH_NOMIC_BERT_MOE:
|
698
718
|
{
|
699
719
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
700
720
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
701
721
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
722
|
+
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
|
702
723
|
|
703
724
|
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
704
|
-
|
725
|
+
if (arch == LLM_ARCH_NOMIC_BERT) {
|
726
|
+
type = LLM_TYPE_137M;
|
727
|
+
} else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
|
728
|
+
type = LLM_TYPE_475M;
|
729
|
+
}
|
705
730
|
}
|
706
731
|
} break;
|
707
732
|
case LLM_ARCH_BLOOM:
|
@@ -762,6 +787,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
762
787
|
// fall through
|
763
788
|
case LLM_ARCH_QWEN2:
|
764
789
|
{
|
790
|
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
765
791
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
766
792
|
switch (hparams.n_layer) {
|
767
793
|
case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
|
@@ -791,6 +817,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
791
817
|
{
|
792
818
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
793
819
|
switch (hparams.n_layer) {
|
820
|
+
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
|
821
|
+
case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
|
822
|
+
case 40: type = LLM_TYPE_14B; break;
|
823
|
+
case 64: type = LLM_TYPE_32B; break;
|
794
824
|
default: type = LLM_TYPE_UNKNOWN;
|
795
825
|
}
|
796
826
|
} break;
|
@@ -800,6 +830,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
800
830
|
|
801
831
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
802
832
|
switch (hparams.n_layer) {
|
833
|
+
case 48: type = LLM_TYPE_30B_A3B; break;
|
834
|
+
case 94: type = LLM_TYPE_235B_A22B; break;
|
803
835
|
default: type = LLM_TYPE_UNKNOWN;
|
804
836
|
}
|
805
837
|
} break;
|
@@ -824,22 +856,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
824
856
|
default: type = LLM_TYPE_UNKNOWN;
|
825
857
|
}
|
826
858
|
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
|
831
|
-
|
832
|
-
|
833
|
-
//
|
834
|
-
hparams.
|
835
|
-
|
836
|
-
|
837
|
-
|
838
|
-
hparams.n_swa = 131072;
|
839
|
-
}
|
840
|
-
bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
841
|
-
if (!found_swa && hparams.n_swa == 0) {
|
842
|
-
throw std::runtime_error("invalid value for sliding_window");
|
859
|
+
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
860
|
+
|
861
|
+
if (found_swa && hparams.n_swa > 0) {
|
862
|
+
LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
|
863
|
+
__func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
|
864
|
+
|
865
|
+
// TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
|
866
|
+
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
867
|
+
|
868
|
+
hparams.n_swa = 0;
|
869
|
+
hparams.set_swa_pattern(1);
|
843
870
|
}
|
844
871
|
} break;
|
845
872
|
case LLM_ARCH_PHIMOE:
|
@@ -909,8 +936,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
909
936
|
} break;
|
910
937
|
case LLM_ARCH_GEMMA2:
|
911
938
|
{
|
939
|
+
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
912
940
|
hparams.n_swa = 4096; // default value of gemma 2
|
913
|
-
hparams.
|
941
|
+
hparams.set_swa_pattern(2);
|
914
942
|
hparams.attn_soft_cap = true;
|
915
943
|
|
916
944
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
@@ -927,7 +955,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
927
955
|
} break;
|
928
956
|
case LLM_ARCH_GEMMA3:
|
929
957
|
{
|
930
|
-
hparams.
|
958
|
+
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
959
|
+
hparams.set_swa_pattern(6);
|
931
960
|
|
932
961
|
hparams.rope_freq_base_train_swa = 10000.0f;
|
933
962
|
hparams.rope_freq_scale_train_swa = 1.0f;
|
@@ -1011,7 +1040,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
1011
1040
|
} break;
|
1012
1041
|
case LLM_ARCH_COHERE2:
|
1013
1042
|
{
|
1014
|
-
hparams.
|
1043
|
+
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
1044
|
+
hparams.set_swa_pattern(4);
|
1015
1045
|
|
1016
1046
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
1017
1047
|
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
@@ -1156,6 +1186,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
1156
1186
|
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
1157
1187
|
}
|
1158
1188
|
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
1189
|
+
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
|
1190
|
+
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
|
1159
1191
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
1160
1192
|
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
1161
1193
|
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
@@ -1205,6 +1237,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
1205
1237
|
default: type = LLM_TYPE_UNKNOWN;
|
1206
1238
|
}
|
1207
1239
|
} break;
|
1240
|
+
case LLM_ARCH_GLM4:
|
1241
|
+
{
|
1242
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
1243
|
+
switch (hparams.n_layer) {
|
1244
|
+
case 40: type = LLM_TYPE_9B; break;
|
1245
|
+
case 61: type = LLM_TYPE_32B; break;
|
1246
|
+
default: type = LLM_TYPE_UNKNOWN;
|
1247
|
+
}
|
1248
|
+
} break;
|
1208
1249
|
case LLM_ARCH_BITNET:
|
1209
1250
|
{
|
1210
1251
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
@@ -1350,6 +1391,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
1350
1391
|
// Add additional layer/vocab/etc checks here for other model sizes
|
1351
1392
|
default: type = LLM_TYPE_UNKNOWN;
|
1352
1393
|
}
|
1394
|
+
|
1395
|
+
// For Granite MoE Shared
|
1396
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
|
1353
1397
|
} break;
|
1354
1398
|
case LLM_ARCH_CHAMELEON:
|
1355
1399
|
{
|
@@ -1453,6 +1497,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
1453
1497
|
}
|
1454
1498
|
|
1455
1499
|
lm_ggml_backend_dev_t cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
|
1500
|
+
if (cpu_dev == nullptr) {
|
1501
|
+
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
1502
|
+
}
|
1456
1503
|
const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
|
1457
1504
|
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
|
1458
1505
|
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
|
@@ -1620,8 +1667,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
1620
1667
|
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
1621
1668
|
std::regex pattern(overrides->pattern);
|
1622
1669
|
if (std::regex_search(tensor_name, pattern)) {
|
1623
|
-
LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), lm_ggml_backend_buft_name(overrides->buft));
|
1624
1670
|
buft = overrides->buft;
|
1671
|
+
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
|
1672
|
+
tensor_name.c_str(),
|
1673
|
+
lm_ggml_nbytes(t_meta) / 1024 / 1024, lm_ggml_type_name(t_meta->type),
|
1674
|
+
lm_ggml_backend_buft_name(buft));
|
1625
1675
|
break;
|
1626
1676
|
}
|
1627
1677
|
}
|
@@ -1638,6 +1688,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
1638
1688
|
auto * buft_dev = lm_ggml_backend_buft_get_device(buft);
|
1639
1689
|
if (ml.use_mmap && buft_dev && buft == lm_ggml_backend_dev_host_buffer_type(buft_dev)) {
|
1640
1690
|
auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
|
1691
|
+
if (!cpu_dev) {
|
1692
|
+
throw std::runtime_error("no CPU backend found");
|
1693
|
+
}
|
1641
1694
|
buft = lm_ggml_backend_dev_buffer_type(cpu_dev);
|
1642
1695
|
}
|
1643
1696
|
|
@@ -1724,6 +1777,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
1724
1777
|
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
|
1725
1778
|
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
1726
1779
|
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
1780
|
+
|
1781
|
+
// For Granite MoE Shared
|
1782
|
+
if (hparams.n_ff_shexp > 0) {
|
1783
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
1784
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
1785
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
|
1786
|
+
}
|
1727
1787
|
}
|
1728
1788
|
}
|
1729
1789
|
} break;
|
@@ -1819,7 +1879,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
1819
1879
|
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
1820
1880
|
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
1821
1881
|
|
1822
|
-
|
1882
|
+
if (n_ff > 0) {
|
1883
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
1884
|
+
}
|
1823
1885
|
|
1824
1886
|
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
|
1825
1887
|
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
@@ -1829,9 +1891,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
1829
1891
|
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
1830
1892
|
}
|
1831
1893
|
|
1832
|
-
|
1833
|
-
|
1834
|
-
|
1894
|
+
if (n_ff > 0) {
|
1895
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
1896
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
1897
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
1898
|
+
}
|
1835
1899
|
|
1836
1900
|
// optional MLP bias
|
1837
1901
|
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
@@ -2046,6 +2110,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
2046
2110
|
} break;
|
2047
2111
|
case LLM_ARCH_BERT:
|
2048
2112
|
case LLM_ARCH_NOMIC_BERT:
|
2113
|
+
case LLM_ARCH_NOMIC_BERT_MOE:
|
2049
2114
|
{
|
2050
2115
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
2051
2116
|
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
|
@@ -2079,20 +2144,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
2079
2144
|
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
2080
2145
|
}
|
2081
2146
|
|
2147
|
+
if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
2148
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
2149
|
+
}
|
2150
|
+
|
2082
2151
|
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
2083
2152
|
|
2084
2153
|
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
|
2085
2154
|
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
|
2086
2155
|
|
2087
|
-
|
2088
|
-
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
2089
|
-
|
2090
|
-
if (arch == LLM_ARCH_BERT) {
|
2156
|
+
if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
|
2091
2157
|
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
2092
|
-
layer.
|
2093
|
-
layer.
|
2158
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
|
2159
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
2160
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
2094
2161
|
} else {
|
2095
|
-
layer.
|
2162
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
2163
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
2164
|
+
|
2165
|
+
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
2166
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
2167
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
2168
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
2169
|
+
} else {
|
2170
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
2171
|
+
}
|
2096
2172
|
}
|
2097
2173
|
|
2098
2174
|
layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
|
@@ -3196,8 +3272,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
3196
3272
|
{
|
3197
3273
|
const bool is_lite = (hparams.n_layer == 27);
|
3198
3274
|
|
3275
|
+
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
|
3276
|
+
|
3277
|
+
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
|
3278
|
+
const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
|
3279
|
+
const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
|
3280
|
+
|
3199
3281
|
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
3200
|
-
const int64_t n_embd_head_qk_nope =
|
3282
|
+
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
|
3201
3283
|
|
3202
3284
|
const int64_t q_lora_rank = hparams.n_lora_q;
|
3203
3285
|
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
@@ -3223,14 +3305,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
3223
3305
|
|
3224
3306
|
if (!is_lite) {
|
3225
3307
|
layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
|
3226
|
-
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head *
|
3308
|
+
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
|
3227
3309
|
} else {
|
3228
|
-
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd,
|
3310
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
|
3229
3311
|
}
|
3230
3312
|
|
3231
|
-
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank +
|
3232
|
-
|
3233
|
-
|
3313
|
+
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
|
3314
|
+
|
3315
|
+
// note: only old legacy GGUF files will have the unsplit wkv_b tensor in
|
3316
|
+
if (is_mla) {
|
3317
|
+
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
|
3318
|
+
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
|
3319
|
+
} else {
|
3320
|
+
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
|
3321
|
+
}
|
3322
|
+
|
3323
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
|
3234
3324
|
|
3235
3325
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
3236
3326
|
|
@@ -3449,7 +3539,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
3449
3539
|
|
3450
3540
|
// output
|
3451
3541
|
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
3452
|
-
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
3542
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
3543
|
+
// if output is NULL, init from the input tok embed
|
3544
|
+
if (output == NULL) {
|
3545
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
3546
|
+
}
|
3453
3547
|
|
3454
3548
|
for (int i = 0; i < n_layer; ++i) {
|
3455
3549
|
auto & layer = layers[i];
|
@@ -3476,6 +3570,45 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
3476
3570
|
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
3477
3571
|
}
|
3478
3572
|
} break;
|
3573
|
+
case LLM_ARCH_GLM4:
|
3574
|
+
{
|
3575
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
3576
|
+
|
3577
|
+
// output
|
3578
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
3579
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
3580
|
+
// if output is NULL, init from the input tok embed
|
3581
|
+
if (output == NULL) {
|
3582
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
3583
|
+
}
|
3584
|
+
|
3585
|
+
for (int i = 0; i < n_layer; ++i) {
|
3586
|
+
auto & layer = layers[i];
|
3587
|
+
|
3588
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
3589
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
3590
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
3591
|
+
|
3592
|
+
if (layer.wqkv == nullptr) {
|
3593
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
3594
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
3595
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
3596
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
3597
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
3598
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
3599
|
+
}
|
3600
|
+
|
3601
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
3602
|
+
|
3603
|
+
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
3604
|
+
|
3605
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
3606
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
3607
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
|
3608
|
+
|
3609
|
+
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
3610
|
+
}
|
3611
|
+
} break;
|
3479
3612
|
case LLM_ARCH_NEMOTRON:
|
3480
3613
|
{
|
3481
3614
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
@@ -4015,6 +4148,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
4015
4148
|
if (!dev) {
|
4016
4149
|
// FIXME: workaround for CPU backend buft having a NULL device
|
4017
4150
|
dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
|
4151
|
+
if (!dev) {
|
4152
|
+
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
4153
|
+
}
|
4018
4154
|
}
|
4019
4155
|
lm_ggml_backend_dev_props props;
|
4020
4156
|
lm_ggml_backend_dev_get_props(dev, &props);
|
@@ -4144,7 +4280,7 @@ uint64_t llama_model::n_elements() const {
|
|
4144
4280
|
}
|
4145
4281
|
|
4146
4282
|
void llama_model::print_info() const {
|
4147
|
-
const
|
4283
|
+
const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
|
4148
4284
|
|
4149
4285
|
auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
|
4150
4286
|
bool is_var = false;
|
@@ -4187,7 +4323,7 @@ void llama_model::print_info() const {
|
|
4187
4323
|
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
|
4188
4324
|
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
|
4189
4325
|
LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
|
4190
|
-
LLAMA_LOG_INFO("%s:
|
4326
|
+
LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
|
4191
4327
|
LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
|
4192
4328
|
LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
|
4193
4329
|
LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
|
@@ -4205,7 +4341,7 @@ void llama_model::print_info() const {
|
|
4205
4341
|
LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
|
4206
4342
|
LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
|
4207
4343
|
LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
|
4208
|
-
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
4344
|
+
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
|
4209
4345
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
4210
4346
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
4211
4347
|
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
@@ -4242,6 +4378,8 @@ void llama_model::print_info() const {
|
|
4242
4378
|
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
4243
4379
|
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
4244
4380
|
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
|
4381
|
+
LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
|
4382
|
+
LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
|
4245
4383
|
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
4246
4384
|
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
4247
4385
|
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
@@ -4259,10 +4397,13 @@ void llama_model::print_info() const {
|
|
4259
4397
|
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
4260
4398
|
}
|
4261
4399
|
|
4262
|
-
if (arch == LLM_ARCH_MINICPM ||
|
4400
|
+
if (arch == LLM_ARCH_MINICPM ||
|
4401
|
+
arch == LLM_ARCH_GRANITE ||
|
4402
|
+
arch == LLM_ARCH_GRANITE_MOE) {
|
4263
4403
|
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
4264
4404
|
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
4265
4405
|
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
4406
|
+
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
4266
4407
|
}
|
4267
4408
|
|
4268
4409
|
if (arch == LLM_ARCH_BAILINGMOE) {
|
@@ -4350,6 +4491,29 @@ const lm_ggml_tensor * llama_model::get_tensor(const char * name) const {
|
|
4350
4491
|
return it->second;
|
4351
4492
|
}
|
4352
4493
|
|
4494
|
+
float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
|
4495
|
+
return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
|
4496
|
+
}
|
4497
|
+
|
4498
|
+
float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
|
4499
|
+
return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
|
4500
|
+
}
|
4501
|
+
|
4502
|
+
lm_ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
|
4503
|
+
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
|
4504
|
+
|
4505
|
+
// choose long/short freq factors based on the context size
|
4506
|
+
if (layers[il].rope_freqs != nullptr) {
|
4507
|
+
return layers[il].rope_freqs;
|
4508
|
+
}
|
4509
|
+
|
4510
|
+
if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
|
4511
|
+
return layers[il].rope_long;
|
4512
|
+
}
|
4513
|
+
|
4514
|
+
return layers[il].rope_short;
|
4515
|
+
}
|
4516
|
+
|
4353
4517
|
struct llm_build_llama : public llm_graph_context {
|
4354
4518
|
llm_build_llama(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
|
4355
4519
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
@@ -4365,22 +4529,13 @@ struct llm_build_llama : public llm_graph_context {
|
|
4365
4529
|
// inp_pos - contains the positions
|
4366
4530
|
lm_ggml_tensor * inp_pos = build_inp_pos();
|
4367
4531
|
|
4368
|
-
// temperature tuning
|
4369
|
-
lm_ggml_tensor * inp_attn_scale = nullptr;
|
4370
|
-
if (arch == LLM_ARCH_LLAMA4) {
|
4371
|
-
inp_attn_scale = build_inp_attn_scale();
|
4372
|
-
}
|
4373
|
-
|
4374
4532
|
auto * inp_attn = build_attn_inp_kv_unified();
|
4375
4533
|
|
4376
4534
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
4535
|
+
|
4377
4536
|
for (int il = 0; il < n_layer; ++il) {
|
4378
4537
|
lm_ggml_tensor * inpSA = inpL;
|
4379
4538
|
|
4380
|
-
bool use_rope = arch == LLM_ARCH_LLAMA4
|
4381
|
-
? (il + 1) % hparams.n_no_rope_layer_step != 0
|
4382
|
-
: true;
|
4383
|
-
|
4384
4539
|
// norm
|
4385
4540
|
cur = build_norm(inpL,
|
4386
4541
|
model.layers[il].attn_norm, NULL,
|
@@ -4390,7 +4545,7 @@ struct llm_build_llama : public llm_graph_context {
|
|
4390
4545
|
// self-attention
|
4391
4546
|
{
|
4392
4547
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
4393
|
-
lm_ggml_tensor * rope_factors =
|
4548
|
+
lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
4394
4549
|
|
4395
4550
|
// compute Q and K and RoPE them
|
4396
4551
|
lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
@@ -4418,37 +4573,25 @@ struct llm_build_llama : public llm_graph_context {
|
|
4418
4573
|
Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
4419
4574
|
Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
4420
4575
|
|
4421
|
-
|
4422
|
-
|
4423
|
-
|
4424
|
-
|
4425
|
-
|
4426
|
-
);
|
4576
|
+
Qcur = lm_ggml_rope_ext(
|
4577
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
4578
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
4579
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
4580
|
+
);
|
4427
4581
|
|
4428
|
-
|
4429
|
-
|
4430
|
-
|
4431
|
-
|
4432
|
-
|
4433
|
-
} else if (inp_attn_scale) {
|
4434
|
-
Qcur = lm_ggml_mul(ctx0, Qcur, inp_attn_scale);
|
4435
|
-
}
|
4582
|
+
Kcur = lm_ggml_rope_ext(
|
4583
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
4584
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
4585
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
4586
|
+
);
|
4436
4587
|
|
4437
4588
|
cb(Qcur, "Qcur", il);
|
4438
4589
|
cb(Kcur, "Kcur", il);
|
4439
4590
|
cb(Vcur, "Vcur", il);
|
4440
4591
|
|
4441
|
-
if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
|
4442
|
-
// Llama4TextL2Norm
|
4443
|
-
Qcur = lm_ggml_rms_norm(ctx0, Qcur, 1e-6);
|
4444
|
-
Kcur = lm_ggml_rms_norm(ctx0, Kcur, 1e-6);
|
4445
|
-
cb(Qcur, "Qcur_normed", il);
|
4446
|
-
cb(Kcur, "Kcur_normed", il);
|
4447
|
-
}
|
4448
|
-
|
4449
4592
|
cur = build_attn(inp_attn, gf,
|
4450
4593
|
model.layers[il].wo, model.layers[il].bo,
|
4451
|
-
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
|
4594
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
4452
4595
|
cb(cur, "attn_out", il);
|
4453
4596
|
}
|
4454
4597
|
|
@@ -4459,11 +4602,6 @@ struct llm_build_llama : public llm_graph_context {
 inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
 }
 
-// For Granite architecture
-if (hparams.f_residual_scale) {
-cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
-}
-
 lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
 cb(ffn_inp, "ffn_inp", il);
 
@@ -4482,38 +4620,6 @@ struct llm_build_llama : public llm_graph_context {
 NULL,
 LLM_FFN_SILU, LLM_FFN_PAR, il);
 cb(cur, "ffn_out", il);
-
-} else if (arch == LLM_ARCH_LLAMA4) {
-// llama4 MoE
-lm_ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
-model.layers[il].ffn_norm, NULL,
-LLM_NORM_RMS, il);
-cb(cur, "ffn_norm", il);
-
-lm_ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
-model.layers[il].ffn_gate_inp,
-model.layers[il].ffn_up_exps,
-model.layers[il].ffn_gate_exps,
-model.layers[il].ffn_down_exps,
-nullptr,
-n_expert, n_expert_used,
-LLM_FFN_SILU, false,
-false, 0.0,
-LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
-il);
-
-// Shared experts
-lm_ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
-model.layers[il].ffn_up_shexp, NULL, NULL,
-model.layers[il].ffn_gate_shexp, NULL, NULL,
-model.layers[il].ffn_down_shexp, NULL, NULL,
-NULL,
-LLM_FFN_SILU, LLM_FFN_PAR, il);
-cb(shexp_out, "ffn_moe_shexp", il);
-
-cur = lm_ggml_add(ctx0, moe_out, shexp_out);
-cb(cur, "ffn_moe_out_merged", il);
-
 } else {
 // MoE branch
 cur = build_norm(ffn_inp,
@@ -4535,11 +4641,6 @@ struct llm_build_llama : public llm_graph_context {
 cb(cur, "ffn_moe_out", il);
 }
 
-// For Granite architecture
-if (hparams.f_residual_scale) {
-cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
-}
-
 cur = lm_ggml_add(ctx0, cur, ffn_inp);
 cb(cur, "ffn_out", il);
 
@@ -4562,11 +4663,6 @@ struct llm_build_llama : public llm_graph_context {
 // lm_head
 cur = build_lora_mm(model.output, cur);
 
-// For Granite architecture
-if (hparams.f_logit_scale) {
-cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-}
-
 cb(cur, "result_output", -1);
 res->t_logits = cur;
 
@@ -4574,8 +4670,8 @@ struct llm_build_llama : public llm_graph_context {
 }
 };
 
-struct
-
+struct llm_build_llama_iswa : public llm_graph_context {
+llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
 const int64_t n_embd_head = hparams.n_embd_head_v;
 
 LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -4589,33 +4685,29 @@ struct llm_build_deci : public llm_graph_context {
 // inp_pos - contains the positions
 lm_ggml_tensor * inp_pos = build_inp_pos();
 
-
+// temperature tuning
+lm_ggml_tensor * inp_attn_scale = nullptr;
+inp_attn_scale = build_inp_attn_scale();
+
+auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
 const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
 for (int il = 0; il < n_layer; ++il) {
 lm_ggml_tensor * inpSA = inpL;
-const int64_t n_head_kv = hparams.n_head_kv(il);
-const int64_t n_head = hparams.n_head(il);
 
-
-// attention-free layer of Llama-3_1-Nemotron-51B
-cur = inpL;
-} else {
-// norm
-cur = build_norm(inpL,
-model.layers[il].attn_norm, NULL,
-LLM_NORM_RMS, il);
-cb(cur, "attn_norm", il);
-}
+const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
 
-
-
-
-
-
-
+// norm
+cur = build_norm(inpL,
+model.layers[il].attn_norm, NULL,
+LLM_NORM_RMS, il);
+cb(cur, "attn_norm", il);
+
+// self-attention
+{
 // rope freq factors for llama3; may return nullptr for llama2 and other models
-lm_ggml_tensor * rope_factors =
+lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
 // compute Q and K and RoPE them
 lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4643,25 +4735,38 @@ struct llm_build_deci : public llm_graph_context {
 Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-
-
-
-
-
+if (use_rope) {
+Qcur = lm_ggml_rope_ext(
+ctx0, Qcur, inp_pos, rope_factors,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
 
-
-
-
-
-
+Kcur = lm_ggml_rope_ext(
+ctx0, Kcur, inp_pos, rope_factors,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+} else if (inp_attn_scale) {
+Qcur = lm_ggml_mul(ctx0, Qcur, inp_attn_scale);
+}
 
 cb(Qcur, "Qcur", il);
 cb(Kcur, "Kcur", il);
 cb(Vcur, "Vcur", il);
 
+if (use_rope && hparams.use_kq_norm) {
+// Llama4TextL2Norm
+Qcur = lm_ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+Kcur = lm_ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+cb(Qcur, "Qcur_normed", il);
+cb(Kcur, "Kcur_normed", il);
+}
+
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+cb(cur, "attn_out", il);
 }
 
 if (il == n_layer - 1) {
@@ -4671,19 +4776,10 @@ struct llm_build_deci : public llm_graph_context {
 inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
 }
 
-
-
-cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
-}
-
-// modified to support attention-free layer of Llama-3_1-Nemotron-51B
-lm_ggml_tensor * ffn_inp = cur;
-if (n_head > 0) {
-ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
-cb(ffn_inp, "ffn_inp", il);
-}
+lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+cb(ffn_inp, "ffn_inp", il);
 
-// feed-forward network
+// feed-forward network (non-MoE)
 if (model.layers[il].ffn_gate_inp == nullptr) {
 cur = build_norm(ffn_inp,
 model.layers[il].ffn_norm, NULL,
@@ -4697,12 +4793,36 @@ struct llm_build_deci : public llm_graph_context {
 NULL,
 LLM_FFN_SILU, LLM_FFN_PAR, il);
 cb(cur, "ffn_out", il);
-}
-
-
-
-cur
-
+} else {
+lm_ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
+model.layers[il].ffn_norm, NULL,
+LLM_NORM_RMS, il);
+cb(cur, "ffn_norm", il);
+
+lm_ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
+model.layers[il].ffn_gate_inp,
+model.layers[il].ffn_up_exps,
+model.layers[il].ffn_gate_exps,
+model.layers[il].ffn_down_exps,
+nullptr,
+n_expert, n_expert_used,
+LLM_FFN_SILU, false,
+false, 0.0,
+LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+il);
+
+// Shared experts
+lm_ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
+model.layers[il].ffn_up_shexp, NULL, NULL,
+model.layers[il].ffn_gate_shexp, NULL, NULL,
+model.layers[il].ffn_down_shexp, NULL, NULL,
+NULL,
+LLM_FFN_SILU, LLM_FFN_PAR, il);
+cb(shexp_out, "ffn_moe_shexp", il);
+
+cur = lm_ggml_add(ctx0, moe_out, shexp_out);
+cb(cur, "ffn_moe_out_merged", il);
+}
 
 cur = lm_ggml_add(ctx0, cur, ffn_inp);
 cb(cur, "ffn_out", il);
@@ -4726,11 +4846,161 @@ struct llm_build_deci : public llm_graph_context {
 // lm_head
 cur = build_lora_mm(model.output, cur);
 
-
-
-
+cb(cur, "result_output", -1);
+res->t_logits = cur;
+
+lm_ggml_build_forward_expand(gf, cur);
+}
+};
+
+struct llm_build_deci : public llm_graph_context {
+llm_build_deci(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
+const int64_t n_embd_head = hparams.n_embd_head_v;
+
+LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+lm_ggml_tensor * cur;
+lm_ggml_tensor * inpL;
+
+inpL = build_inp_embd(model.tok_embd);
+
+// inp_pos - contains the positions
+lm_ggml_tensor * inp_pos = build_inp_pos();
+
+auto * inp_attn = build_attn_inp_kv_unified();
+
+const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+for (int il = 0; il < n_layer; ++il) {
+lm_ggml_tensor * inpSA = inpL;
+const int64_t n_head_kv = hparams.n_head_kv(il);
+const int64_t n_head = hparams.n_head(il);
+const int64_t n_ff = hparams.n_ff(il);
+
+if (n_head == 0) {
+// attention-free layer of Llama-3_1-Nemotron-51B
+cur = inpL;
+} else {
+// norm
+cur = build_norm(inpL,
+model.layers[il].attn_norm, NULL,
+LLM_NORM_RMS, il);
+cb(cur, "attn_norm", il);
+}
+
+if (n_head > 0 && n_head_kv == 0) {
+// "linear attention" of Llama-3_1-Nemotron-51B
+cur = build_lora_mm(model.layers[il].wo, cur);
+cb(cur, "wo", il);
+} else if (n_head > 0) {
+// self-attention
+// rope freq factors for llama3; may return nullptr for llama2 and other models
+lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+// compute Q and K and RoPE them
+lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+cb(Qcur, "Qcur", il);
+if (model.layers[il].bq) {
+Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+cb(Qcur, "Qcur", il);
+}
+
+lm_ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+cb(Kcur, "Kcur", il);
+if (model.layers[il].bk) {
+Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+cb(Kcur, "Kcur", il);
+}
+
+lm_ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+cb(Vcur, "Vcur", il);
+if (model.layers[il].bv) {
+Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+cb(Vcur, "Vcur", il);
+}
+
+Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+Qcur = lm_ggml_rope_ext(
+ctx0, Qcur, inp_pos, rope_factors,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+
+Kcur = lm_ggml_rope_ext(
+ctx0, Kcur, inp_pos, rope_factors,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+
+cb(Qcur, "Qcur", il);
+cb(Kcur, "Kcur", il);
+cb(Vcur, "Vcur", il);
+
+cur = build_attn(inp_attn, gf,
+model.layers[il].wo, model.layers[il].bo,
+Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+}
+
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+}
+
+// FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
+if (n_ff == 0) {
+continue;
+}
+
+// modified to support attention-free layer of Llama-3_1-Nemotron-51B
+lm_ggml_tensor * ffn_inp = cur;
+if (n_head > 0) {
+ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+cb(ffn_inp, "ffn_inp", il);
+}
+
+// feed-forward network
+if (model.layers[il].ffn_gate_inp == nullptr) {
+cur = build_norm(ffn_inp,
+model.layers[il].ffn_norm, NULL,
+LLM_NORM_RMS, il);
+cb(cur, "ffn_norm", il);
+
+cur = build_ffn(cur,
+model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+NULL,
+LLM_FFN_SILU, LLM_FFN_PAR, il);
+cb(cur, "ffn_out", il);
+}
+
+cur = lm_ggml_add(ctx0, cur, ffn_inp);
+cb(cur, "ffn_out", il);
+
+cur = build_cvec(cur, il);
+cb(cur, "l_out", il);
+
+// input for next layer
+inpL = cur;
 }
 
+cur = inpL;
+
+cur = build_norm(cur,
+model.output_norm, NULL,
+LLM_NORM_RMS, -1);
+
+cb(cur, "result_norm", -1);
+res->t_embd = cur;
+
+// lm_head
+cur = build_lora_mm(model.output, cur);
+
 cb(cur, "result_output", -1);
 res->t_logits = cur;
 
@@ -4803,7 +5073,7 @@ struct llm_build_baichuan : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -4918,7 +5188,7 @@ struct llm_build_xverse : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -5043,7 +5313,7 @@ struct llm_build_falcon : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -5173,7 +5443,7 @@ struct llm_build_grok : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
 }
 
 if (il == n_layer - 1) {
@@ -5324,7 +5594,7 @@ struct llm_build_dbrx : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -5438,7 +5708,7 @@ struct llm_build_starcoder : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -5537,7 +5807,7 @@ struct llm_build_refact : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -5664,6 +5934,11 @@ struct llm_build_bert : public llm_graph_context {
 cur = build_lora_mm(model.layers[il].wqkv, cur);
 cb(cur, "wqkv", il);
 
+if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
+cb(cur, "bqkv", il);
+}
+
 Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
 Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
 Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -5691,7 +5966,7 @@ struct llm_build_bert : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 cb(cur, "kqv_out", il);
 
 if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
@@ -5716,13 +5991,29 @@ struct llm_build_bert : public llm_graph_context {
 cb(ffn_inp, "ffn_inp", il);
 
 // feed-forward network
-if (
+if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
+// MoE branch
+cur = build_moe_ffn(cur,
+model.layers[il].ffn_gate_inp,
+model.layers[il].ffn_up_exps,
+nullptr,
+model.layers[il].ffn_down_exps,
+nullptr,
+hparams.n_expert,
+hparams.n_expert_used,
+LLM_FFN_GELU,
+false, false,
+0.0f,
+LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+cb(cur, "ffn_moe_out", il);
+} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
 cur = build_ffn(cur,
 model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
 NULL, NULL, NULL,
 model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
 NULL,
 LLM_FFN_GELU, LLM_FFN_SEQ, il);
+cb(cur, "ffn_out", il);
 } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
 cur = build_ffn(cur,
 model.layers[il].ffn_up, NULL, NULL,
@@ -5730,6 +6021,7 @@ struct llm_build_bert : public llm_graph_context {
 model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
 NULL,
 LLM_FFN_GELU, LLM_FFN_PAR, il);
+cb(cur, "ffn_out", il);
 } else {
 cur = build_ffn(cur,
 model.layers[il].ffn_up, NULL, NULL,
@@ -5737,8 +6029,8 @@ struct llm_build_bert : public llm_graph_context {
 model.layers[il].ffn_down, NULL, NULL,
 NULL,
 LLM_FFN_SILU, LLM_FFN_PAR, il);
+cb(cur, "ffn_out", il);
 }
-cb(cur, "ffn_out", il);
 
 // attentions bypass the intermediate layer
 cur = lm_ggml_add(ctx0, cur, ffn_inp);
@@ -5808,7 +6100,7 @@ struct llm_build_bloom : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -5949,7 +6241,7 @@ struct llm_build_mpt : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -6095,7 +6387,7 @@ struct llm_build_stablelm : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -6218,7 +6510,7 @@ struct llm_build_qwen : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -6338,7 +6630,7 @@ struct llm_build_qwen2 : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -6459,7 +6751,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -6586,7 +6878,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -6739,7 +7031,7 @@ struct llm_build_qwen3 : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -6860,7 +7152,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -7000,7 +7292,7 @@ struct llm_build_phi2 : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
 }
 
 if (il == n_layer - 1) {
@@ -7052,6 +7344,7 @@ struct llm_build_phi2 : public llm_graph_context {
 }
 };
 
+template<bool iswa>
 struct llm_build_phi3 : public llm_graph_context {
 llm_build_phi3(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
 const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -7067,7 +7360,14 @@ struct llm_build_phi3 : public llm_graph_context {
 // inp_pos - contains the positions
 lm_ggml_tensor * inp_pos = build_inp_pos();
 
-
+using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+inp_attn_type * inp_attn = nullptr;
+
+if constexpr (iswa) {
+inp_attn = build_attn_inp_kv_unified_iswa();
+} else {
+inp_attn = build_attn_inp_kv_unified();
+}
 
 for (int il = 0; il < n_layer; ++il) {
 auto * residual = inpL;
@@ -7075,7 +7375,7 @@ struct llm_build_phi3 : public llm_graph_context {
 // self-attention
 {
 // rope freq factors for 128k context
-lm_ggml_tensor * rope_factors =
+lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
 lm_ggml_tensor* attn_norm_output = build_norm(inpL,
 model.layers[il].attn_norm,
@@ -7129,7 +7429,7 @@ struct llm_build_phi3 : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
 }
 
 if (il == n_layer - 1) {
@@ -7264,7 +7564,7 @@ struct llm_build_plamo : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 lm_ggml_tensor * sa_out = cur;
 
@@ -7371,7 +7671,7 @@ struct llm_build_gpt2 : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -7487,7 +7787,7 @@ struct llm_build_codeshell : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -7616,7 +7916,7 @@ struct llm_build_orion : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -7743,7 +8043,7 @@ struct llm_build_internlm2 : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -7827,7 +8127,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
 for (int il = 0; il < n_layer; ++il) {
 lm_ggml_tensor * inpSA = inpL;
 
-lm_ggml_tensor * rope_factors =
+lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
 // norm
 cur = build_norm(inpL,
@@ -7940,7 +8240,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-q_states, k_states, v_states, nullptr, kq_scale, il);
+q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
 }
 
 if (il == n_layer - 1) {
@@ -8070,7 +8370,7 @@ struct llm_build_gemma : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
 }
 
 if (il == n_layer - 1) {
@@ -8127,8 +8427,8 @@ struct llm_build_gemma : public llm_graph_context {
 }
 };
 
-struct
-
+struct llm_build_gemma2_iswa : public llm_graph_context {
+llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
 const int64_t n_embd_head = hparams.n_embd_head_k;
 
 lm_ggml_tensor * cur;
@@ -8142,7 +8442,7 @@ struct llm_build_gemma2 : public llm_graph_context {
 // inp_pos - contains the positions
 lm_ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
 for (int il = 0; il < n_layer; ++il) {
 // norm
@@ -8192,7 +8492,7 @@ struct llm_build_gemma2 : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
 }
 
 cur = build_norm(cur,
@@ -8264,8 +8564,8 @@ struct llm_build_gemma2 : public llm_graph_context {
 }
 };
 
-struct
-
+struct llm_build_gemma3_iswa : public llm_graph_context {
+llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
 const int64_t n_embd_head = hparams.n_embd_head_k;
 
 lm_ggml_tensor * cur;
@@ -8283,13 +8583,11 @@ struct llm_build_gemma3 : public llm_graph_context {
 lm_ggml_tensor * inp_pos = build_inp_pos();
 
 // TODO: is causal == true correct? might need some changes
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
 for (int il = 0; il < n_layer; ++il) {
-const
-
-const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
-const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+const float freq_base_l = model.get_rope_freq_base (cparams, il);
+const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
 
 // norm
 cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
@@ -8333,7 +8631,7 @@ struct llm_build_gemma3 : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
 }
 
 cur = build_norm(cur,
@@ -8473,7 +8771,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -8594,7 +8892,7 @@ struct llm_build_mamba : public llm_graph_context {
 lm_ggml_tensor * state_mask,
 const llama_ubatch & ubatch,
 int il) const {
-const
+const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
 const auto kv_head = kv_self->head;
 
@@ -8808,7 +9106,7 @@ struct llm_build_command_r : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -8866,8 +9164,8 @@ struct llm_build_command_r : public llm_graph_context {
 }
 };
 
-struct
-
+struct llm_build_cohere2_iswa : public llm_graph_context {
+llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
 const int64_t n_embd_head = hparams.n_embd_head_v;
 
 LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8882,7 +9180,7 @@ struct llm_build_cohere2 : public llm_graph_context {
 // inp_pos - contains the positions
 lm_ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
 for (int il = 0; il < n_layer; ++il) {
 const bool is_swa = hparams.is_swa(il);
@@ -8895,7 +9193,7 @@ struct llm_build_cohere2 : public llm_graph_context {
 // self-attention
 {
 // rope freq factors for 128k context
-lm_ggml_tensor * rope_factors =
+lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
 // compute Q and K and RoPE them
 lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -8943,7 +9241,7 @@ struct llm_build_cohere2 : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -9074,7 +9372,7 @@ struct llm_build_olmo : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, nullptr,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -9194,7 +9492,7 @@ struct llm_build_olmo2 : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 cur = build_norm(cur,
@@ -9327,7 +9625,7 @@ struct llm_build_olmoe : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -9460,7 +9758,7 @@ struct llm_build_openelm : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -9574,7 +9872,7 @@ struct llm_build_gptneox : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -9724,7 +10022,7 @@ struct llm_build_arctic : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -9833,7 +10131,7 @@ struct llm_build_deepseek : public llm_graph_context {
 // self-attention
 {
 // rope freq factors for llama3; may return nullptr for llama2 and other models
-lm_ggml_tensor * rope_factors =
+lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
 // compute Q and K and RoPE them
 lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9879,7 +10177,7 @@ struct llm_build_deepseek : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
 }
 
 if (il == n_layer - 1) {
@@ -9969,15 +10267,22 @@ struct llm_build_deepseek2 : public llm_graph_context {
 llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
 bool is_lite = (hparams.n_layer == 27);
 
+const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+
+// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
+const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+
+const int64_t n_embd_head_qk_rope = hparams.n_rot;
+const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
+
+const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
 // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
 // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
 const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
-const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(
-const float
-
-const uint32_t n_embd_head_qk_rope = hparams.n_rot;
-const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
-const uint32_t kv_lora_rank = hparams.n_lora_kv;
+const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
+const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
 
 lm_ggml_tensor * cur;
 lm_ggml_tensor * inpL;
@@ -10003,16 +10308,14 @@ struct llm_build_deepseek2 : public llm_graph_context {
 {
 lm_ggml_tensor * q = NULL;
 if (!is_lite) {
-// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
 q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
 cb(q, "q", il);
 
 q = build_norm(q,
-model.layers[il].attn_q_a_norm,
+model.layers[il].attn_q_a_norm, nullptr,
 LLM_NORM_RMS, il);
 cb(q, "q", il);
 
-// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
 q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
 cb(q, "q", il);
 } else {
@@ -10020,96 +10323,125 @@ struct llm_build_deepseek2 : public llm_graph_context {
 cb(q, "q", il);
 }
 
-// split into {
-lm_ggml_tensor * q_nope = lm_ggml_view_3d(ctx0, q,
-
-lm_ggml_row_size(q->type,
+// split into {n_embd_head_qk_nope, n_head, n_tokens}
+lm_ggml_tensor * q_nope = lm_ggml_view_3d(ctx0, q,
+n_embd_head_qk_nope, n_head, n_tokens,
+lm_ggml_row_size(q->type, n_embd_head_k),
+lm_ggml_row_size(q->type, n_embd_head_k) * n_head,
 0);
 cb(q_nope, "q_nope", il);
 
-// and {
-lm_ggml_tensor * q_pe = lm_ggml_view_3d(ctx0, q,
-
-lm_ggml_row_size(q->type,
+// and {n_embd_head_qk_rope, n_head, n_tokens}
+lm_ggml_tensor * q_pe = lm_ggml_view_3d(ctx0, q,
+n_embd_head_qk_rope, n_head, n_tokens,
+lm_ggml_row_size(q->type, n_embd_head_k),
+lm_ggml_row_size(q->type, n_embd_head_k) * n_head,
 lm_ggml_row_size(q->type, n_embd_head_qk_nope));
 cb(q_pe, "q_pe", il);
 
-
-
-cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+lm_ggml_tensor * kv_cmpr_pe = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+cb(kv_cmpr_pe, "kv_cmpr_pe", il);
 
 // split into {kv_lora_rank, n_tokens}
-lm_ggml_tensor *
-
+lm_ggml_tensor * kv_cmpr = lm_ggml_view_2d(ctx0, kv_cmpr_pe,
+kv_lora_rank, n_tokens,
+lm_ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
 0);
-cb(
+cb(kv_cmpr, "kv_cmpr", il);
+
+// and {n_embd_head_qk_rope, 1, n_tokens}
+lm_ggml_tensor * k_pe = lm_ggml_view_3d(ctx0, kv_cmpr_pe,
+n_embd_head_qk_rope, 1, n_tokens,
+lm_ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+lm_ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+lm_ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
+cb(k_pe, "k_pe", il);
 
-
-
-
-
-
+q_pe = lm_ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(q_pe, "q_pe", il);
+
+k_pe = lm_ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
 cb(k_pe, "k_pe", il);
 
-
-
-kv_compressed = build_norm(kv_compressed,
-model.layers[il].attn_kv_a_norm, NULL,
+kv_cmpr = build_norm(kv_cmpr,
+model.layers[il].attn_kv_a_norm, nullptr,
 LLM_NORM_RMS, il);
-cb(
+cb(kv_cmpr, "kv_cmpr", il);
 
-
-
-
+if (is_mla) {
+// {n_embd_head_qk_nope, n_tokens, n_head}
+q_nope = lm_ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
+cb(q_nope, "q_nope_perm", il);
 
-
-
-
-lm_ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
-0);
-cb(k_nope, "k_nope", il);
+// {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
+lm_ggml_tensor * q_nope_absorbed = lm_ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
+cb(q_nope_absorbed, "q_nope_absorbed", il);
 
-
-
-
-lm_ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
-lm_ggml_row_size(kv->type, (n_embd_head_qk_nope)));
-cb(v_states, "v_states", il);
+// {kv_lora_rank, n_head, n_tokens}
+q_nope_absorbed = lm_ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
+cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
 
-
-
+// {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
+// note: rope must go first for in-place context shifting in build_rope_shift()
+lm_ggml_tensor * Qcur = lm_ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
+cb(Qcur, "Qcur", il);
 
-
-
-0);
-cb(v_states, "v_states", il);
+kv_cmpr = lm_ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
+cb(kv_cmpr, "kv_cmpr_reshape", il);
 
-
-
-
-n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-ext_factor, attn_factor_scaled, beta_fast, beta_slow
-);
-cb(q_pe, "q_pe", il);
+// {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
+lm_ggml_tensor * Kcur = lm_ggml_concat(ctx0, k_pe, kv_cmpr, 0);
+cb(Kcur, "Kcur", il);
 
-
-
-
-ctx0, k_pe, inp_pos, nullptr,
-n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-ext_factor, attn_factor_scaled, beta_fast, beta_slow
-);
-cb(k_pe, "k_pe", il);
+// {kv_lora_rank, 1, n_tokens}
+lm_ggml_tensor * Vcur = kv_cmpr;
+cb(Vcur, "Vcur", il);
 
-
-
+// note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
+cur = build_attn(inp_attn, gf,
+model.layers[il].wo, NULL,
+Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
+} else {
+lm_ggml_tensor * kv = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
+cb(kv, "kv", il);
+
+// split into {n_embd_head_qk_nope, n_head, n_tokens}
+lm_ggml_tensor * k_nope = lm_ggml_view_3d(ctx0, kv,
+n_embd_head_qk_nope, n_head, n_tokens,
+lm_ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+lm_ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
+0);
+cb(k_nope, "k_nope_view", il);
 
-
-
+// and {n_embd_head_v, n_head, n_tokens}
+lm_ggml_tensor * Vcur = lm_ggml_view_3d(ctx0, kv,
+n_embd_head_v, n_head, n_tokens,
+lm_ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+lm_ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
+lm_ggml_row_size(kv->type, n_embd_head_qk_nope));
+cb(Vcur, "Vcur_view", il);
 
-
-
-
+Vcur = lm_ggml_cont(ctx0, Vcur);
+cb(Vcur, "Vcur_cont", il);
+
+// note: rope must go first for in-place context shifting in build_rope_shift()
+lm_ggml_tensor * Qcur = lm_ggml_concat(ctx0, q_pe, q_nope, 0);
+cb(Qcur, "Qcur", il);
+
+lm_ggml_tensor * Kcur = lm_ggml_concat(ctx0, lm_ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
+cb(Kcur, "Kcur", il);
+
+// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
+cur = build_attn(inp_attn, gf,
+model.layers[il].wo, NULL,
+Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+}
 }
 
 if (il == n_layer - 1) {
@@ -10275,7 +10607,7 @@ struct llm_build_bitnet : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 NULL, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 
 cur = build_norm(cur,
 model.layers[il].attn_sub_norm, NULL,
@@ -10398,7 +10730,7 @@ struct llm_build_t5_enc : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo_enc, nullptr,
-Qcur, Kcur, Vcur, kq_b, 1.0f, il);
+Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -10504,7 +10836,7 @@ struct llm_build_t5_dec : public llm_graph_context {
 
 cur = build_attn(inp_attn_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, kq_b, 1.0f, il);
+Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -10536,7 +10868,7 @@ struct llm_build_t5_dec : public llm_graph_context {
 
 cur = build_attn(inp_attn_cross, gf,
 model.layers[il].wo_cross, nullptr,
-Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
 cb(cur, "kqv_out", il);
 
 //lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -10669,7 +11001,7 @@ struct llm_build_jais : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, 1.0f/float(n_embd_head), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
 }
 
 if (il == n_layer - 1) {
@@ -10801,7 +11133,7 @@ struct llm_build_chatglm : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -10854,6 +11186,157 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
10854
11186
|
}
|
10855
11187
|
};
|
10856
11188
|
|
11189
|
+
struct llm_build_glm4 : public llm_graph_context {
|
11190
|
+
llm_build_glm4(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
|
11191
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
11192
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
11193
|
+
|
11194
|
+
LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
11195
|
+
|
11196
|
+
lm_ggml_tensor * cur;
|
11197
|
+
lm_ggml_tensor * inpL;
|
11198
|
+
|
11199
|
+
inpL = build_inp_embd(model.tok_embd);
|
11200
|
+
|
11201
|
+
// inp_pos - contains the positions
|
11202
|
+
lm_ggml_tensor * inp_pos = build_inp_pos();
|
11203
|
+
|
11204
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
11205
|
+
|
11206
|
+
for (int il = 0; il < n_layer; ++il) {
|
11207
|
+
lm_ggml_tensor * inpSA = inpL;
|
11208
|
+
|
11209
|
+
// Pre-attention norm
|
11210
|
+
cur = build_norm(inpL,
|
11211
|
+
model.layers[il].attn_norm,
|
11212
|
+
NULL,
|
11213
|
+
LLM_NORM_RMS, il);
|
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                lm_ggml_tensor * Qcur = nullptr;
+                lm_ggml_tensor * Kcur = nullptr;
+                lm_ggml_tensor * Vcur = nullptr;
+
+                if (model.layers[il].wqkv == nullptr) {
+                    Qcur = build_lora_mm(model.layers[il].wq, cur);
+                    if (model.layers[il].bq) {
+                        Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    }
+                    Kcur = build_lora_mm(model.layers[il].wk, cur);
+                    if (model.layers[il].bk) {
+                        Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    }
+                    Vcur = build_lora_mm(model.layers[il].wv, cur);
+                    if (model.layers[il].bv) {
+                        Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    }
+                } else {
+                    cur = build_lora_mm(model.layers[il].wqkv, cur);
+                    cb(cur, "wqkv", il);
+                    if (model.layers[il].bqkv) {
+                        cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
+                        cb(cur, "bqkv", il);
+                    }
+                    Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                    Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                    Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                }
+
+                Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = lm_ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = lm_ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // Post-attention norm (new!)
+            cur = build_norm(cur,
+                    model.layers[il].attn_post_norm,
+                    NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "post_attn_norm", il);
+
+            // Add the input (residual connection after post-attention norm)
+            lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                // Pre-MLP norm
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm,
+                        NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                // MLP
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        NULL, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+                cb(cur, "ffn_out", il);
+
+                // Post-MLP norm
+                cur = build_norm(cur,
+                        model.layers[il].ffn_post_norm,
+                        NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "post_mlp_norm", il);
+            }
+
+            // Add residual connection after post-MLP norm
+            inpL = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(inpL, "l_out", il);
+        }
+
+        // Final norm
+        cur = build_norm(inpL,
+                model.output_norm,
+                NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // Output projection
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        lm_ggml_build_forward_expand(gf, cur);
+    }
+};
+
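The builder added above (apparently the one instantiated by the new `LLM_ARCH_GLM4` case later in this diff) wraps each sub-layer in a pair of RMSNorms: a pre-norm before the sub-layer and a post-norm applied before the residual add. As a rough sketch of the dataflow implied by the code above (notation ours, not from the package):

```latex
% sandwich-norm layer layout built above; x is the layer input
h = x + \mathrm{RMSNorm}_{\mathrm{post\_attn}}\!\big(\mathrm{Attn}(\mathrm{RMSNorm}_{\mathrm{attn}}(x))\big)
y = h + \mathrm{RMSNorm}_{\mathrm{post\_mlp}}\!\big(\mathrm{SwiGLU}(\mathrm{RMSNorm}_{\mathrm{ffn}}(h))\big)
```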
 struct llm_build_nemotron : public llm_graph_context {
     llm_build_nemotron(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -10927,7 +11410,7 @@ struct llm_build_nemotron : public llm_graph_context {
 
                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1) {
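This hunk, like several below (exaone, chameleon, plm, bailingmoe), only inserts one extra `nullptr` argument into the `build_attn` call: the helper gained an additional optional tensor parameter in this release. Based on upstream llama.cpp of the same vintage, the new slot is the optional MLA value-projection tensor (`v_mla`); the parameter-name comments in the annotated call shape below are our assumption, not package code:

```cpp
// annotated shape of the updated build_attn call; the /*name=*/ comments are
// an assumption taken from upstream llama.cpp, not from this package
cur = build_attn(inp_attn, gf,
        model.layers[il].wo, model.layers[il].bo,   // output projection (+ optional bias)
        Qcur, Kcur, Vcur,
        /*kq_b  =*/ nullptr,                        // optional attention bias
        /*v_mla =*/ nullptr,                        // new in 1.7.0: optional MLA value projection
        1.0f/sqrtf(float(n_embd_head)), il);        // kq scale, layer index
```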
@@ -11012,7 +11495,7 @@ struct llm_build_exaone : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                lm_ggml_tensor * rope_factors =
+                lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
                 // compute Q and K and RoPE them
                 lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
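The deleted right-hand side is truncated in this diff, but the `create_memory` hunk further down removes the old per-model `get_rope_factors` lambda, which makes the intent of this change clear: the RoPE frequency-factor lookup now lives in a `llama_model::get_rope_factors(cparams, il)` helper. A hedged sketch of what that helper most likely does, reconstructed only from the removed lambda visible later in this diff:

```cpp
// sketch only; mirrors the removed lambda shown in the create_memory hunk below,
// not the package's actual helper implementation
lm_ggml_tensor * rope_factors_sketch(const llama_model & model, uint32_t n_ctx_per_seq, int il) {
    const auto & layer = model.layers[il];

    // per-layer frequency factors (e.g. llama3) win when present
    if (layer.rope_freqs != nullptr) {
        return layer.rope_freqs;
    }

    // otherwise the removed code chose long/short factors based on the context
    // size; only the rope_long branch survives in this diff, so the exact
    // condition is not reproduced here
    (void) n_ctx_per_seq;
    return layer.rope_long;
}
```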
@@ -11058,7 +11541,7 @@ struct llm_build_exaone : public llm_graph_context {
 
                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1) {
@@ -11157,7 +11640,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             lm_ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const {
-        const
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
         const auto n_tokens = ubatch.n_tokens;
         const auto n_seqs = ubatch.n_seqs;
@@ -11553,7 +12036,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
             lm_ggml_tensor *& first_layer_value,
             const llama_ubatch & ubatch,
             int il) const {
-        const
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
         const auto n_tokens = ubatch.n_tokens;
         const auto n_seqs = ubatch.n_seqs;
@@ -11862,6 +12345,194 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
     }
 };
 
+
+struct llm_build_granite : public llm_graph_context {
+    llm_build_granite(
+        const llama_model & model,
+        const llm_graph_params & params,
+        lm_ggml_cgraph * gf,
+        const bool use_rope = true)
+        : llm_graph_context(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        lm_ggml_tensor * cur;
+        lm_ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - built only if rope enabled
+        lm_ggml_tensor * inp_pos = nullptr;
+        if (use_rope) {
+            inp_pos = build_inp_pos();
+        }
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            lm_ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and (optionally) RoPE them
+                lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                lm_ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                lm_ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+                    Qcur = lm_ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                    Kcur = lm_ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architectures - scale residual
+            cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                lm_ggml_tensor * moe_out = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // For Granite MoE Shared
+                if (hparams.n_ff_shexp > 0) {
+                    lm_ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp, NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = lm_ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                } else {
+                    cur = moe_out;
+                }
+            }
+
+            // For Granite architectures - scale residual
+            cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        // For Granite architectures - scale logits
+        cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        lm_ggml_build_forward_expand(gf, cur);
+    }
+};
+
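The Granite builder added above differs from the plain llama graph mainly through three scalar hyperparameters. Written out as equations (our notation; the symbols map onto `hparams.f_residual_scale`, `hparams.f_attention_scale` and `hparams.f_logit_scale` used in the code above):

```latex
% per-layer residual scaling and final logit scaling in llm_build_granite
h = x + \alpha_{\mathrm{res}} \cdot \mathrm{Attn}(\mathrm{RMSNorm}(x))
y = h + \alpha_{\mathrm{res}} \cdot \mathrm{FFN}(\mathrm{RMSNorm}(h))
\mathrm{logits} = \frac{W_{\mathrm{out}}\, y_{\mathrm{final}}}{\alpha_{\mathrm{logit}}},
\qquad
\mathrm{kq\_scale} =
\begin{cases}
  1/\sqrt{d_{\mathrm{head}}} & \text{if } f_{\mathrm{attention\_scale}} = 0 \\
  f_{\mathrm{attention\_scale}} & \text{otherwise}
\end{cases}
```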
 // ref: https://github.com/facebookresearch/chameleon
 // based on the original build_llama() function, changes:
 // * qk-norm
@@ -11960,7 +12631,7 @@ struct llm_build_chameleon : public llm_graph_context {
 
                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, nullptr,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 
                 if (hparams.swin_norm) {
                     cur = build_norm(cur,
@@ -12316,7 +12987,7 @@ struct llm_build_plm : public llm_graph_context {
 
                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, NULL,
-                        q_states, k_states, v_states, nullptr, kq_scale, il);
+                        q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
             }
 
             if (il == n_layer - 1) {
@@ -12393,7 +13064,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                lm_ggml_tensor * rope_factors =
+                lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
                 // compute Q and K and RoPE them
                 lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12439,7 +13110,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
 
                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
             }
 
             if (il == n_layer - 1) {
@@ -12513,36 +13184,70 @@ struct llm_build_bailingmoe : public llm_graph_context {
     }
 };
 
-llama_memory_i * llama_model::create_memory() const {
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;
 
     switch (arch) {
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            {
+                res = nullptr;
+            } break;
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_RWKV6:
         case LLM_ARCH_RWKV6QWEN2:
         case LLM_ARCH_RWKV7:
         case LLM_ARCH_ARWKV7:
             {
-                res = new
-
-
+                res = new llama_kv_cache_recurrent(
+                        *this,
+                        LM_GGML_TYPE_F32,
+                        LM_GGML_TYPE_F32,
+                        cparams.offload_kqv,
+                        std::max((uint32_t) 1, cparams.n_seq_max),
+                        cparams.n_seq_max);
             } break;
         default:
             {
-
-                /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
-                    // choose long/short freq factors based on the context size
-                    if (layers[il].rope_freqs != nullptr) {
-                        return layers[il].rope_freqs;
-                    }
+                const auto padding = llama_kv_cache_unified::get_padding(cparams);
 
-
-                    return layers[il].rope_long;
-                }
+                cparams.n_ctx = LM_GGML_PAD(cparams.n_ctx, padding);
 
-
-
-
+                LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
+
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    LM_GGML_ASSERT(hparams.is_swa_any());
+
+                    res = new llama_kv_cache_unified_iswa(
+                            *this,
+                            params.type_k,
+                            params.type_v,
+                            !cparams.flash_attn,
+                            cparams.offload_kqv,
+                            params.swa_full,
+                            cparams.n_ctx,
+                            cparams.n_seq_max,
+                            cparams.n_batch,
+                            padding);
+                } else {
+                    LM_GGML_ASSERT(!hparams.is_swa_any());
+
+                    res = new llama_kv_cache_unified(
+                            *this,
+                            nullptr,
+                            params.type_k,
+                            params.type_v,
+                            !cparams.flash_attn,
+                            cparams.offload_kqv,
+                            cparams.n_ctx,
+                            cparams.n_seq_max,
+                            padding,
+                            hparams.n_swa,
+                            hparams.swa_type);
+                }
             } break;
     }
 }
 
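The rewritten `create_memory` now dispatches between three cache implementations instead of always building a unified KV cache: encoder-only and decoder-free architectures get no cache at all, recurrent architectures get `llama_kv_cache_recurrent`, and the rest get either the interleaved sliding-window cache or the unified cache depending on `hparams.swa_type`. A self-contained toy sketch of that decision (only the three cache kinds and the SWA condition come from the diff; the helper predicates are hypothetical):

```cpp
#include <cstdint>

// Toy model of the dispatch in the new create_memory() above; the predicates
// encoder_only / recurrent_arch / has_swa stand in for the architecture checks
// in the real switch statement and are not package APIs.
enum class cache_kind { none, recurrent, unified_iswa, unified };

cache_kind choose_cache(bool encoder_only, bool recurrent_arch, bool has_swa) {
    if (encoder_only)   return cache_kind::none;         // BERT-style, WavTokenizer
    if (recurrent_arch) return cache_kind::recurrent;    // Mamba, RWKV6/7, ARWKV7
    if (has_swa)        return cache_kind::unified_iswa; // hparams.swa_type != LLAMA_SWA_TYPE_NONE
    return cache_kind::unified;
}
```

Note also that for the non-recurrent path the context length is first rounded up to the cache padding (`cparams.n_ctx = LM_GGML_PAD(cparams.n_ctx, padding)`) before either cache is constructed.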
@@ -12557,13 +13262,14 @@ llm_graph_result_ptr llama_model::build_graph(
 
     switch (arch) {
         case LLM_ARCH_LLAMA:
-        case LLM_ARCH_LLAMA4:
         case LLM_ARCH_MINICPM:
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
+        case LLM_ARCH_LLAMA4:
+            {
+                llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
+            } break;
         case LLM_ARCH_DECI:
             {
                 llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -12591,6 +13297,7 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
            {
                 llm = std::make_unique<llm_build_bert>(*this, params, gf);
             } break;
@@ -12637,7 +13344,11 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
             {
-
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
+                } else {
+                    llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
+                }
             } break;
         case LLM_ARCH_PLAMO:
             {
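The PHI3/PHIMOE case now selects between two template instantiations of the builder instead of branching inside it, so the sliding-window vs. regular attention wiring is fixed at the point of construction. A minimal stand-alone illustration of that pattern (names are ours, not from the package):

```cpp
#include <memory>

// illustration of the <true>/<false> dispatch used for PHI3 above; the bool
// template parameter fixes the attention-input wiring at compile time
template <bool iswa>
struct phi3_like_builder {
    void build_attention_inputs() {
        if constexpr (iswa) {
            // sliding-window (SWA) KV-cache inputs would be wired up here
        } else {
            // regular unified KV-cache inputs would be wired up here
        }
    }
};

// mirrors the diff: a runtime hparams check decides which instantiation is built
std::unique_ptr<phi3_like_builder<true>> make_swa_builder() {
    return std::make_unique<phi3_like_builder<true>>();
}
```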
@@ -12669,11 +13380,11 @@ llm_graph_result_ptr llama_model::build_graph(
             } break;
         case LLM_ARCH_GEMMA2:
             {
-                llm = std::make_unique<
+                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
             } break;
         case LLM_ARCH_GEMMA3:
             {
-                llm = std::make_unique<
+                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
             } break;
         case LLM_ARCH_STARCODER2:
             {
@@ -12693,7 +13404,7 @@ llm_graph_result_ptr llama_model::build_graph(
             } break;
         case LLM_ARCH_COHERE2:
             {
-                llm = std::make_unique<
+                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
             } break;
         case LLM_ARCH_DBRX:
             {
@@ -12735,6 +13446,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
             } break;
+        case LLM_ARCH_GLM4:
+            {
+                llm = std::make_unique<llm_build_glm4>(*this, params, gf);
+            } break;
         case LLM_ARCH_BITNET:
             {
                 llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
@@ -12786,6 +13501,11 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
             } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                llm = std::make_unique<llm_build_granite>(*this, params, gf);
+            } break;
         case LLM_ARCH_CHAMELEON:
             {
                 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
@@ -12919,8 +13639,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DECI:
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
-        case LLM_ARCH_PLAMO:
-        case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_XVERSE:
@@ -12932,6 +13650,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_CHAMELEON:
@@ -12944,6 +13663,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DBRX:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_STABLELM:
         case LLM_ARCH_BITNET:
         case LLM_ARCH_QWEN:
@@ -12956,6 +13676,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
+        case LLM_ARCH_PLAMO:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
         case LLM_ARCH_GEMMA3:
@@ -12963,6 +13684,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
         case LLM_ARCH_CODESHELL:
+        case LLM_ARCH_ORION:
         case LLM_ARCH_NEMOTRON:
         case LLM_ARCH_EXAONE:
         case LLM_ARCH_MINICPM3:
@@ -13035,6 +13757,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
     const auto & it = model->lm_gguf_kv.find(key);
     if (it == model->lm_gguf_kv.end()) {
+        // one-off fix for very popular models (so we are not flooded with issues)
+        // do not extend this list unless absolutely necessary
+        // Mistral-Small-2503 does not have built-in chat template
+        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
+        if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
+            return "mistral-v7-tekken";
+        }
+
         return nullptr;
     }
 