@novastera-oss/llamarn 0.3.1 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +86 -3
- package/RNLlamaCpp.podspec +1 -1
- package/android/CMakeLists.txt +11 -3
- package/android/generated/jni/react/renderer/components/RNLlamaCppSpec/RNLlamaCppSpecJSI.h +49 -4
- package/android/src/main/cpp/include/llama.h +53 -114
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -10
- package/cpp/PureCppImpl.cpp +71 -4
- package/cpp/SystemUtils.cpp +3 -7
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/CODEOWNERS +1 -1
- package/cpp/llama.cpp/Makefile +6 -1605
- package/cpp/llama.cpp/README.md +5 -1
- package/cpp/llama.cpp/common/arg.cpp +230 -51
- package/cpp/llama.cpp/common/chat-parser.cpp +9 -1
- package/cpp/llama.cpp/common/chat.cpp +539 -8
- package/cpp/llama.cpp/common/chat.h +8 -1
- package/cpp/llama.cpp/common/common.cpp +60 -15
- package/cpp/llama.cpp/common/common.h +64 -15
- package/cpp/llama.cpp/common/speculative.cpp +135 -54
- package/cpp/llama.cpp/common/speculative.h +8 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1216 -109
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +19 -6
- package/cpp/llama.cpp/convert_lora_to_gguf.py +1 -1
- package/cpp/llama.cpp/flake.nix +0 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -3
- package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +71 -70
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +90 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +13 -1
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +113 -17
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +701 -585
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +13 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +274 -91
- package/cpp/llama.cpp/ggml/src/ggml-common.h +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +371 -298
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +33 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +26 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +428 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +458 -46
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
- package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +122 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +9 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cu +58 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +275 -170
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +103 -65
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +33 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +2 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +3 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +83 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +116 -57
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +45 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +56 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +61 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +70 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +70 -21
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +162 -50
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +208 -97
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +46 -35
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +56 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +95 -51
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +427 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +204 -57
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +252 -168
- package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
- package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +192 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cu +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +1 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cu +34 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +153 -71
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +6 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +21 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +75 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -25
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +31 -20
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +342 -131
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +464 -134
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1108 -176
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +66 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +343 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +343 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +346 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +41 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +97 -41
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +110 -16
- package/cpp/llama.cpp/ggml/src/ggml-quants.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +22 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -212
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +213 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +117 -238
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quantize.hpp +133 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1666 -633
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +107 -43
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +16 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +44 -8
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +44 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +26 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -17
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +37 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +109 -55
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +71 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -11
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +55 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +75 -20
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +807 -412
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +72 -22
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +1794 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +846 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +204 -50
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +187 -2
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +11 -2
- package/cpp/llama.cpp/gguf-py/gguf/quants.py +53 -4
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +67 -63
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +7 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +120 -16
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +284 -1
- package/cpp/llama.cpp/gguf-py/tests/test_quants.py +14 -5
- package/cpp/llama.cpp/include/llama.h +53 -114
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +171 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -1
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +59 -0
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +331 -0
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +105 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -6
- package/cpp/llama.cpp/requirements/requirements-pydantic.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-adapter.cpp +68 -4
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +192 -2
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +2 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +47 -6
- package/cpp/llama.cpp/src/llama-chat.h +3 -0
- package/cpp/llama.cpp/src/llama-context.cpp +61 -252
- package/cpp/llama.cpp/src/llama-context.h +10 -15
- package/cpp/llama.cpp/src/llama-cparams.h +0 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +180 -85
- package/cpp/llama.cpp/src/llama-graph.h +90 -51
- package/cpp/llama.cpp/src/llama-hparams.cpp +34 -3
- package/cpp/llama.cpp/src/llama-hparams.h +21 -6
- package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +79 -56
- package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +30 -28
- package/cpp/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +240 -632
- package/cpp/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +39 -74
- package/cpp/llama.cpp/src/llama-kv-cells.h +21 -21
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +41 -35
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +26 -29
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +13 -9
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +10 -14
- package/cpp/llama.cpp/src/llama-memory.h +13 -10
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model-loader.h +3 -2
- package/cpp/llama.cpp/src/llama-model.cpp +1959 -419
- package/cpp/llama.cpp/src/llama-model.h +28 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +40 -4
- package/cpp/llama.cpp/src/llama-vocab.cpp +51 -2
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +16 -7
- package/cpp/llama.cpp/vendor/minja/minja.hpp +47 -12
- package/cpp/rn-completion.cpp +3 -27
- package/ios/generated/RNLlamaCppSpec/RNLlamaCppSpec.h +30 -0
- package/ios/generated/RNLlamaCppSpecJSI.h +49 -4
- package/ios/include/chat.h +8 -1
- package/ios/include/common/minja/chat-template.hpp +16 -7
- package/ios/include/common/minja/minja.hpp +47 -12
- package/ios/include/common.h +64 -15
- package/ios/include/llama.h +53 -114
- package/ios/include/speculative.h +8 -1
- package/ios/libs/llama.xcframework/Info.plist +18 -18
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5557 -5267
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5520 -5238
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4242 -4016
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5556 -5267
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5553 -5303
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5515 -5274
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4238 -4044
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/lib/module/NativeRNLlamaCpp.js.map +1 -1
- package/lib/typescript/src/NativeRNLlamaCpp.d.ts +5 -0
- package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -1
- package/package.json +1 -2
- package/src/NativeRNLlamaCpp.ts +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -56
|
@@ -6,8 +6,8 @@
|
|
|
6
6
|
#include "llama-cparams.h"
|
|
7
7
|
#include "llama-model-loader.h"
|
|
8
8
|
|
|
9
|
-
#include "llama-kv-cache-unified.h"
|
|
10
|
-
#include "llama-kv-cache-unified-iswa.h"
|
|
9
|
+
#include "llama-kv-cache.h"
|
|
10
|
+
#include "llama-kv-cache-iswa.h"
|
|
11
11
|
#include "llama-memory-hybrid.h"
|
|
12
12
|
#include "llama-memory-recurrent.h"
|
|
13
13
|
|
|
@@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) {
|
|
|
47
47
|
case LLM_TYPE_410M: return "410M";
|
|
48
48
|
case LLM_TYPE_450M: return "450M";
|
|
49
49
|
case LLM_TYPE_475M: return "475M";
|
|
50
|
+
case LLM_TYPE_558M: return "558M";
|
|
50
51
|
case LLM_TYPE_700M: return "700M";
|
|
51
52
|
case LLM_TYPE_770M: return "770M";
|
|
52
53
|
case LLM_TYPE_780M: return "780M";
|
|
@@ -83,9 +84,11 @@ const char * llm_type_name(llm_type type) {
|
|
|
83
84
|
case LLM_TYPE_32B: return "32B";
|
|
84
85
|
case LLM_TYPE_34B: return "34B";
|
|
85
86
|
case LLM_TYPE_35B: return "35B";
|
|
87
|
+
case LLM_TYPE_36B: return "36B";
|
|
86
88
|
case LLM_TYPE_40B: return "40B";
|
|
87
89
|
case LLM_TYPE_65B: return "65B";
|
|
88
90
|
case LLM_TYPE_70B: return "70B";
|
|
91
|
+
case LLM_TYPE_120B: return "120B";
|
|
89
92
|
case LLM_TYPE_142B: return "142B";
|
|
90
93
|
case LLM_TYPE_236B: return "236B";
|
|
91
94
|
case LLM_TYPE_290B: return "290B";
|
|
@@ -109,8 +112,10 @@ const char * llm_type_name(llm_type type) {
|
|
|
109
112
|
case LLM_TYPE_A13B: return "A13B";
|
|
110
113
|
case LLM_TYPE_21B_A3B: return "21B.A3B";
|
|
111
114
|
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
|
115
|
+
case LLM_TYPE_106B_A12B: return "106B.A12B";
|
|
112
116
|
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
|
113
117
|
case LLM_TYPE_300B_A47B: return "300B.A47B";
|
|
118
|
+
case LLM_TYPE_355B_A32B: return "355B.A32B";
|
|
114
119
|
case LLM_TYPE_E2B: return "E2B";
|
|
115
120
|
case LLM_TYPE_E4B: return "E4B";
|
|
116
121
|
default: return "?B";
|
|
@@ -190,6 +195,13 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
|
|
|
190
195
|
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
|
191
196
|
op_tensor = ggml_add(ctx, a, w);
|
|
192
197
|
} break;
|
|
198
|
+
case GGML_OP_ADD_ID:
|
|
199
|
+
{
|
|
200
|
+
int n_expert_used = hparams.n_expert_used;
|
|
201
|
+
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
|
|
202
|
+
ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
|
|
203
|
+
op_tensor = ggml_add_id(ctx, a, w, c);
|
|
204
|
+
} break;
|
|
193
205
|
case GGML_OP_MUL:
|
|
194
206
|
{
|
|
195
207
|
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
|
@@ -258,6 +270,10 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
|
|
|
258
270
|
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
|
|
259
271
|
op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
|
|
260
272
|
} break;
|
|
273
|
+
case GGML_OP_SCALE:
|
|
274
|
+
{
|
|
275
|
+
op_tensor = ggml_scale(ctx, w, 1.0f);
|
|
276
|
+
} break;
|
|
261
277
|
default:
|
|
262
278
|
GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
|
|
263
279
|
}
|
|
@@ -290,7 +306,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
|
|
|
290
306
|
}
|
|
291
307
|
|
|
292
308
|
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
|
|
293
|
-
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
|
|
309
|
+
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
|
|
294
310
|
buft_list_t buft_list;
|
|
295
311
|
|
|
296
312
|
// add ACCEL buffer types
|
|
@@ -319,21 +335,22 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
|
|
|
319
335
|
}
|
|
320
336
|
}
|
|
321
337
|
|
|
322
|
-
// add extra buffer types
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
338
|
+
// add extra buffer types
|
|
339
|
+
if (use_extra_bufts) {
|
|
340
|
+
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
341
|
+
if (cpu_dev == nullptr) {
|
|
342
|
+
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
|
343
|
+
}
|
|
328
344
|
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
345
|
+
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
|
346
|
+
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
|
|
347
|
+
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
|
|
348
|
+
if (ggml_backend_dev_get_extra_bufts_fn) {
|
|
349
|
+
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
|
|
350
|
+
while (extra_bufts && *extra_bufts) {
|
|
351
|
+
buft_list.emplace_back(cpu_dev, *extra_bufts);
|
|
352
|
+
++extra_bufts;
|
|
353
|
+
}
|
|
337
354
|
}
|
|
338
355
|
}
|
|
339
356
|
|
|
@@ -756,6 +773,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
756
773
|
default: type = LLM_TYPE_UNKNOWN;
|
|
757
774
|
}
|
|
758
775
|
} break;
|
|
776
|
+
case LLM_ARCH_JINA_BERT_V3:
|
|
777
|
+
{
|
|
778
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
779
|
+
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
780
|
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
|
781
|
+
|
|
782
|
+
switch (hparams.n_layer) {
|
|
783
|
+
case 24:
|
|
784
|
+
type = LLM_TYPE_558M; break;
|
|
785
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
786
|
+
}
|
|
787
|
+
} break;
|
|
759
788
|
case LLM_ARCH_NOMIC_BERT:
|
|
760
789
|
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
761
790
|
{
|
|
@@ -869,6 +898,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
869
898
|
hparams.causal_attn = false;
|
|
870
899
|
}
|
|
871
900
|
break;
|
|
901
|
+
case LLM_ARCH_LLADA:
|
|
902
|
+
{
|
|
903
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
904
|
+
// LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
|
|
905
|
+
switch (hparams.n_layer) {
|
|
906
|
+
case 32:
|
|
907
|
+
type = LLM_TYPE_8B;
|
|
908
|
+
break;
|
|
909
|
+
default:
|
|
910
|
+
type = LLM_TYPE_UNKNOWN;
|
|
911
|
+
}
|
|
912
|
+
// Set non-causal attention for diffusion models
|
|
913
|
+
hparams.causal_attn = false;
|
|
914
|
+
}
|
|
915
|
+
break;
|
|
872
916
|
case LLM_ARCH_QWEN2MOE:
|
|
873
917
|
{
|
|
874
918
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
|
@@ -883,6 +927,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
883
927
|
} break;
|
|
884
928
|
case LLM_ARCH_QWEN3:
|
|
885
929
|
{
|
|
930
|
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
|
886
931
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
887
932
|
switch (hparams.n_layer) {
|
|
888
933
|
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
|
|
@@ -1065,6 +1110,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1065
1110
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1066
1111
|
|
|
1067
1112
|
switch (hparams.n_layer) {
|
|
1113
|
+
case 18: type = LLM_TYPE_537M; break;
|
|
1068
1114
|
case 26: type = LLM_TYPE_1B; break;
|
|
1069
1115
|
case 34: type = LLM_TYPE_4B; break;
|
|
1070
1116
|
case 48: type = LLM_TYPE_12B; break;
|
|
@@ -1082,6 +1128,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1082
1128
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
1083
1129
|
hparams.set_swa_pattern(5);
|
|
1084
1130
|
|
|
1131
|
+
hparams.n_layer_kv_from_start = 20;
|
|
1085
1132
|
hparams.rope_freq_base_train_swa = 10000.0f;
|
|
1086
1133
|
hparams.rope_freq_scale_train_swa = 1.0f;
|
|
1087
1134
|
hparams.f_attention_scale = 1.0f;
|
|
@@ -1256,6 +1303,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1256
1303
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1257
1304
|
}
|
|
1258
1305
|
} break;
|
|
1306
|
+
case LLM_ARCH_SEED_OSS:
|
|
1307
|
+
{
|
|
1308
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1309
|
+
switch (hparams.n_layer) {
|
|
1310
|
+
case 64: type = LLM_TYPE_36B; break;
|
|
1311
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1312
|
+
}
|
|
1313
|
+
} break;
|
|
1259
1314
|
case LLM_ARCH_OLMOE:
|
|
1260
1315
|
{
|
|
1261
1316
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -1417,6 +1472,37 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1417
1472
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1418
1473
|
}
|
|
1419
1474
|
} break;
|
|
1475
|
+
case LLM_ARCH_GLM4_MOE:
|
|
1476
|
+
{
|
|
1477
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1478
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1479
|
+
|
|
1480
|
+
// MoE parameters
|
|
1481
|
+
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
|
|
1482
|
+
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
|
|
1483
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
1484
|
+
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
|
1485
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
1486
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
|
1487
|
+
|
|
1488
|
+
// Expert gating function (GLM-4.5 uses sigmoid)
|
|
1489
|
+
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
|
1490
|
+
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
|
|
1491
|
+
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
|
|
1492
|
+
}
|
|
1493
|
+
|
|
1494
|
+
// NextN/MTP parameters
|
|
1495
|
+
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
|
1496
|
+
|
|
1497
|
+
// TODO: when MTP is implemented, this should probably be updated if needed
|
|
1498
|
+
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
|
|
1499
|
+
|
|
1500
|
+
switch (hparams.n_layer) {
|
|
1501
|
+
case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
|
|
1502
|
+
case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
|
|
1503
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1504
|
+
}
|
|
1505
|
+
} break;
|
|
1420
1506
|
case LLM_ARCH_BITNET:
|
|
1421
1507
|
{
|
|
1422
1508
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -1484,6 +1570,27 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1484
1570
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1485
1571
|
}
|
|
1486
1572
|
} break;
|
|
1573
|
+
case LLM_ARCH_NEMOTRON_H:
|
|
1574
|
+
{
|
|
1575
|
+
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
|
1576
|
+
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
|
1577
|
+
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
|
1578
|
+
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
|
1579
|
+
ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
|
|
1580
|
+
|
|
1581
|
+
// A layer is recurrent IFF the n_head_kv value is set to 0 and
|
|
1582
|
+
// the n_ff value is set to 0
|
|
1583
|
+
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
|
1584
|
+
hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
|
|
1585
|
+
}
|
|
1586
|
+
|
|
1587
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1588
|
+
|
|
1589
|
+
switch (hparams.n_layer) {
|
|
1590
|
+
case 56: type = LLM_TYPE_9B; break;
|
|
1591
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1592
|
+
}
|
|
1593
|
+
} break;
|
|
1487
1594
|
case LLM_ARCH_EXAONE:
|
|
1488
1595
|
{
|
|
1489
1596
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -1744,6 +1851,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1744
1851
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1745
1852
|
}
|
|
1746
1853
|
} break;
|
|
1854
|
+
case LLM_ARCH_HUNYUAN_DENSE:
|
|
1855
|
+
{
|
|
1856
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1857
|
+
|
|
1858
|
+
switch (hparams.n_embd) {
|
|
1859
|
+
case 1024: type = LLM_TYPE_0_5B; break;
|
|
1860
|
+
case 2048: type = LLM_TYPE_1_8B; break;
|
|
1861
|
+
case 3072: type = LLM_TYPE_4B; break;
|
|
1862
|
+
case 4096: type = LLM_TYPE_7B; break;
|
|
1863
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1864
|
+
}
|
|
1865
|
+
} break;
|
|
1747
1866
|
case LLM_ARCH_SMOLLM3:
|
|
1748
1867
|
{
|
|
1749
1868
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -1754,6 +1873,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1754
1873
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1755
1874
|
}
|
|
1756
1875
|
} break;
|
|
1876
|
+
case LLM_ARCH_OPENAI_MOE:
|
|
1877
|
+
{
|
|
1878
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1879
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1880
|
+
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
|
1881
|
+
|
|
1882
|
+
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
1883
|
+
hparams.set_swa_pattern(2);
|
|
1884
|
+
|
|
1885
|
+
switch (hparams.n_layer) {
|
|
1886
|
+
case 24: type = LLM_TYPE_20B; break;
|
|
1887
|
+
case 36: type = LLM_TYPE_120B; break;
|
|
1888
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1889
|
+
}
|
|
1890
|
+
} break;
|
|
1757
1891
|
case LLM_ARCH_LFM2:
|
|
1758
1892
|
{
|
|
1759
1893
|
ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
|
|
@@ -1768,6 +1902,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1768
1902
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1769
1903
|
}
|
|
1770
1904
|
} break;
|
|
1905
|
+
case LLM_ARCH_SMALLTHINKER:
|
|
1906
|
+
{
|
|
1907
|
+
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
|
1908
|
+
|
|
1909
|
+
if (found_swa && hparams.n_swa > 0) {
|
|
1910
|
+
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
1911
|
+
hparams.n_swa = 4096;
|
|
1912
|
+
hparams.set_swa_pattern(4, true);
|
|
1913
|
+
} else {
|
|
1914
|
+
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
|
1915
|
+
hparams.n_no_rope_layer_step = hparams.n_layer;
|
|
1916
|
+
}
|
|
1917
|
+
|
|
1918
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
|
1919
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1920
|
+
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
|
1921
|
+
|
|
1922
|
+
switch (hparams.n_layer) {
|
|
1923
|
+
case 32: type = LLM_TYPE_4B; break;
|
|
1924
|
+
case 52: type = LLM_TYPE_20B; break;
|
|
1925
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1926
|
+
}
|
|
1927
|
+
} break;
|
|
1771
1928
|
default: throw std::runtime_error("unsupported model architecture");
|
|
1772
1929
|
}
|
|
1773
1930
|
|
|
@@ -1801,7 +1958,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1801
1958
|
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
|
|
1802
1959
|
|
|
1803
1960
|
// build a list of buffer types for the CPU and GPU devices
|
|
1804
|
-
pimpl->cpu_buft_list = make_cpu_buft_list(devices);
|
|
1961
|
+
pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
|
|
1805
1962
|
for (auto * dev : devices) {
|
|
1806
1963
|
buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
|
|
1807
1964
|
// add CPU buffer types as a fallback
|
|
@@ -1897,6 +2054,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1897
2054
|
|
|
1898
2055
|
const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
|
|
1899
2056
|
const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
|
|
2057
|
+
const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
|
|
1900
2058
|
|
|
1901
2059
|
// create tensors for the weights
|
|
1902
2060
|
{
|
|
@@ -1952,7 +2110,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1952
2110
|
}
|
|
1953
2111
|
|
|
1954
2112
|
// skip unused tensors
|
|
1955
|
-
if (info.op == GGML_OP_NONE) {
|
|
2113
|
+
if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
|
|
1956
2114
|
const size_t nbytes = ggml_nbytes(t_meta);
|
|
1957
2115
|
LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
|
|
1958
2116
|
|
|
@@ -1962,11 +2120,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1962
2120
|
return nullptr;
|
|
1963
2121
|
}
|
|
1964
2122
|
|
|
1965
|
-
// tensors with "bias" suffix are always used with GGML_OP_ADD
|
|
2123
|
+
// tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
|
|
1966
2124
|
ggml_op op;
|
|
1967
2125
|
bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
|
|
1968
2126
|
if (bias) {
|
|
1969
|
-
op
|
|
2127
|
+
if (info.op == GGML_OP_MUL_MAT_ID) {
|
|
2128
|
+
op = GGML_OP_ADD_ID;
|
|
2129
|
+
} else {
|
|
2130
|
+
op = GGML_OP_ADD;
|
|
2131
|
+
}
|
|
1970
2132
|
} else {
|
|
1971
2133
|
op = info.op;
|
|
1972
2134
|
}
|
|
@@ -2006,7 +2168,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2006
2168
|
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
|
2007
2169
|
std::regex pattern(overrides->pattern);
|
|
2008
2170
|
if (std::regex_search(tensor_name, pattern)) {
|
|
2009
|
-
|
|
2171
|
+
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
|
|
2172
|
+
// when overriding to a CPU buffer, consider the extra buffer types
|
|
2173
|
+
buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
|
|
2174
|
+
} else {
|
|
2175
|
+
buft = overrides->buft;
|
|
2176
|
+
}
|
|
2177
|
+
|
|
2010
2178
|
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
|
|
2011
2179
|
tensor_name.c_str(),
|
|
2012
2180
|
ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
|
|
@@ -2126,6 +2294,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2126
2294
|
}
|
|
2127
2295
|
}
|
|
2128
2296
|
} break;
|
|
2297
|
+
case LLM_ARCH_LLADA:
|
|
2298
|
+
{
|
|
2299
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
|
|
2300
|
+
|
|
2301
|
+
// output
|
|
2302
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
|
|
2303
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
|
2304
|
+
|
|
2305
|
+
// if output is NULL, init from the input tok embed
|
|
2306
|
+
if (output == NULL) {
|
|
2307
|
+
output =
|
|
2308
|
+
create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
|
|
2309
|
+
}
|
|
2310
|
+
|
|
2311
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2312
|
+
auto & layer = layers[i];
|
|
2313
|
+
|
|
2314
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
|
|
2315
|
+
|
|
2316
|
+
// Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
|
|
2317
|
+
layer.wq =
|
|
2318
|
+
create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
|
|
2319
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
|
|
2320
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
|
|
2321
|
+
// No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
|
|
2322
|
+
layer.wo =
|
|
2323
|
+
create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
|
|
2324
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
|
2325
|
+
|
|
2326
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
|
|
2327
|
+
|
|
2328
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
|
|
2329
|
+
TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
2330
|
+
|
|
2331
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
|
|
2332
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
|
|
2333
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
|
|
2334
|
+
|
|
2335
|
+
// optional MLP bias
|
|
2336
|
+
layer.ffn_gate_b =
|
|
2337
|
+
create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
|
|
2338
|
+
layer.ffn_down_b =
|
|
2339
|
+
create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
|
2340
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
|
|
2341
|
+
}
|
|
2342
|
+
}
|
|
2343
|
+
break;
|
|
2129
2344
|
case LLM_ARCH_LLAMA4:
|
|
2130
2345
|
{
|
|
2131
2346
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -2450,6 +2665,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2450
2665
|
case LLM_ARCH_BERT:
|
|
2451
2666
|
case LLM_ARCH_NOMIC_BERT:
|
|
2452
2667
|
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
2668
|
+
case LLM_ARCH_JINA_BERT_V3:
|
|
2453
2669
|
{
|
|
2454
2670
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2455
2671
|
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
|
|
@@ -2485,24 +2701,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2485
2701
|
}
|
|
2486
2702
|
|
|
2487
2703
|
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2704
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2488
2705
|
|
|
2489
2706
|
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
|
|
2490
2707
|
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
|
|
2491
2708
|
|
|
2492
2709
|
if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
|
|
2493
|
-
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
2494
2710
|
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
|
|
2495
2711
|
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
|
2496
2712
|
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
2497
2713
|
} else {
|
|
2498
|
-
layer.ffn_up
|
|
2499
|
-
layer.
|
|
2500
|
-
|
|
2501
|
-
|
|
2502
|
-
|
|
2503
|
-
|
|
2504
|
-
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
2505
|
-
} else {
|
|
2714
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2715
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
|
2716
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
2717
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2718
|
+
|
|
2719
|
+
if (arch == LLM_ARCH_NOMIC_BERT) {
|
|
2506
2720
|
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2507
2721
|
}
|
|
2508
2722
|
}
|
|
@@ -3799,6 +4013,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3799
4013
|
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
3800
4014
|
}
|
|
3801
4015
|
} break;
|
|
4016
|
+
case LLM_ARCH_SEED_OSS:
|
|
4017
|
+
{
|
|
4018
|
+
const uint32_t head_dim = hparams.n_embd_head_k;
|
|
4019
|
+
const int64_t n_qo_dim = n_head * head_dim;
|
|
4020
|
+
const int64_t n_kv_dim = n_head_kv * head_dim;
|
|
4021
|
+
|
|
4022
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4023
|
+
|
|
4024
|
+
// output
|
|
4025
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4026
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4027
|
+
// if output is NULL, init from the input tok embed
|
|
4028
|
+
if (output == NULL) {
|
|
4029
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
4030
|
+
}
|
|
4031
|
+
|
|
4032
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4033
|
+
auto & layer = layers[i];
|
|
4034
|
+
|
|
4035
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
|
|
4036
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
|
|
4037
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
|
|
4038
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
|
|
4039
|
+
|
|
4040
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
|
|
4041
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
|
|
4042
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
|
|
4043
|
+
|
|
4044
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4045
|
+
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
4046
|
+
|
|
4047
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
4048
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4049
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4050
|
+
}
|
|
4051
|
+
} break;
|
|
4052
|
+
|
|
3802
4053
|
case LLM_ARCH_OLMOE:
|
|
3803
4054
|
{
|
|
3804
4055
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -4322,6 +4573,105 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
4322
4573
|
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
4323
4574
|
}
|
|
4324
4575
|
} break;
|
|
4576
|
+
case LLM_ARCH_GLM4_MOE:
|
|
4577
|
+
{
|
|
4578
|
+
const int64_t n_expert = hparams.n_expert;
|
|
4579
|
+
const int64_t n_expert_used = hparams.n_expert_used;
|
|
4580
|
+
const int64_t n_expert_shared = hparams.n_expert_shared;
|
|
4581
|
+
|
|
4582
|
+
GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
|
|
4583
|
+
GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
|
|
4584
|
+
|
|
4585
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
|
|
4586
|
+
|
|
4587
|
+
// output
|
|
4588
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
|
|
4589
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
|
4590
|
+
// if output is NULL, init from the input tok embed
|
|
4591
|
+
if (output == NULL) {
|
|
4592
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
|
|
4593
|
+
}
|
|
4594
|
+
|
|
4595
|
+
// Load ALL tensors including NextN layer to satisfy total tensor count
|
|
4596
|
+
// but only PROCESS up to last layer (skipping final NextN layer) in forward pass
|
|
4597
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4598
|
+
int flags = 0;
|
|
4599
|
+
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
|
4600
|
+
// skip all tensors in the NextN layers
|
|
4601
|
+
flags |= TENSOR_SKIP;
|
|
4602
|
+
}
|
|
4603
|
+
|
|
4604
|
+
auto & layer = layers[i];
|
|
4605
|
+
|
|
4606
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
|
|
4607
|
+
|
|
4608
|
+
// GLM-style attention with bias terms
|
|
4609
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
|
|
4610
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
|
|
4611
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
|
|
4612
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
|
|
4613
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
|
|
4614
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
|
|
4615
|
+
|
|
4616
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
|
|
4617
|
+
|
|
4618
|
+
// K/Q norm tensors (optional for GLM-4.5 355B variant)
|
|
4619
|
+
layer.attn_q_norm = create_tensor(
|
|
4620
|
+
tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
|
|
4621
|
+
layer.attn_k_norm = create_tensor(
|
|
4622
|
+
tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
|
|
4623
|
+
|
|
4624
|
+
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
|
|
4625
|
+
|
|
4626
|
+
// Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
|
|
4627
|
+
// GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
|
|
4628
|
+
const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
|
|
4629
|
+
|
|
4630
|
+
if (use_moe) {
|
|
4631
|
+
// MoE layers
|
|
4632
|
+
layer.ffn_gate_inp =
|
|
4633
|
+
create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
|
|
4634
|
+
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
|
|
4635
|
+
|
|
4636
|
+
// MoE branch
|
|
4637
|
+
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
|
4638
|
+
|
|
4639
|
+
layer.ffn_gate_exps = create_tensor(
|
|
4640
|
+
tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
|
|
4641
|
+
layer.ffn_down_exps = create_tensor(
|
|
4642
|
+
tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
|
|
4643
|
+
layer.ffn_up_exps = create_tensor(
|
|
4644
|
+
tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
|
|
4645
|
+
|
|
4646
|
+
// Shared expert
|
|
4647
|
+
if (n_expert_shared > 0) {
|
|
4648
|
+
const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
|
|
4649
|
+
layer.ffn_gate_shexp = create_tensor(
|
|
4650
|
+
tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
|
|
4651
|
+
layer.ffn_down_shexp = create_tensor(
|
|
4652
|
+
tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
|
|
4653
|
+
layer.ffn_up_shexp = create_tensor(
|
|
4654
|
+
tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
|
|
4655
|
+
}
|
|
4656
|
+
} else {
|
|
4657
|
+
// Dense layers (first k layers) - GLM uses separate gate/up projections
|
|
4658
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
|
|
4659
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
|
|
4660
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
|
|
4661
|
+
}
|
|
4662
|
+
|
|
4663
|
+
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
|
|
4664
|
+
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
|
4665
|
+
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
|
|
4666
|
+
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
|
|
4667
|
+
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
|
|
4668
|
+
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
|
|
4669
|
+
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
|
|
4670
|
+
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
|
|
4671
|
+
}
|
|
4672
|
+
}
|
|
4673
|
+
}
|
|
4674
|
+
break;
|
|
4325
4675
|
case LLM_ARCH_NEMOTRON:
|
|
4326
4676
|
{
|
|
4327
4677
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -4359,48 +4709,117 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
4359
4709
|
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
|
4360
4710
|
}
|
|
4361
4711
|
} break;
|
|
4362
|
-
case
|
|
4712
|
+
case LLM_ARCH_NEMOTRON_H:
|
|
4363
4713
|
{
|
|
4714
|
+
// mamba2 Mixer SSM params
|
|
4715
|
+
// NOTE: int64_t for tensor dimensions
|
|
4716
|
+
const int64_t d_conv = hparams.ssm_d_conv;
|
|
4717
|
+
const int64_t d_inner = hparams.ssm_d_inner;
|
|
4718
|
+
const int64_t d_state = hparams.ssm_d_state;
|
|
4719
|
+
const int64_t n_ssm_head = hparams.ssm_dt_rank;
|
|
4720
|
+
const int64_t n_group = hparams.ssm_n_group;
|
|
4721
|
+
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
|
|
4722
|
+
|
|
4723
|
+
// embeddings
|
|
4364
4724
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4365
4725
|
|
|
4366
4726
|
// output
|
|
4367
|
-
|
|
4368
|
-
|
|
4369
|
-
|
|
4370
|
-
|
|
4371
|
-
|
|
4372
|
-
|
|
4727
|
+
{
|
|
4728
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4729
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4730
|
+
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
|
4731
|
+
if (output == NULL) {
|
|
4732
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
4733
|
+
}
|
|
4373
4734
|
}
|
|
4374
4735
|
|
|
4375
4736
|
for (int i = 0; i < n_layer; ++i) {
|
|
4376
4737
|
auto & layer = layers[i];
|
|
4377
4738
|
|
|
4378
|
-
|
|
4739
|
+
// all blocks use the attn norm
|
|
4740
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4379
4741
|
|
|
4380
|
-
|
|
4381
|
-
|
|
4382
|
-
|
|
4383
|
-
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
4742
|
+
if (hparams.is_recurrent(i)) {
|
|
4743
|
+
// ssm layers
|
|
4744
|
+
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
|
|
4384
4745
|
|
|
4385
|
-
|
|
4386
|
-
|
|
4387
|
-
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
4388
|
-
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4389
|
-
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4390
|
-
}
|
|
4391
|
-
} break;
|
|
4392
|
-
case LLM_ARCH_EXAONE4:
|
|
4393
|
-
{
|
|
4394
|
-
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4746
|
+
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
|
|
4747
|
+
layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
|
|
4395
4748
|
|
|
4396
|
-
|
|
4397
|
-
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4398
|
-
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4749
|
+
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
|
|
4399
4750
|
|
|
4400
|
-
|
|
4401
|
-
|
|
4402
|
-
|
|
4403
|
-
|
|
4751
|
+
// no "weight" suffix for these
|
|
4752
|
+
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
|
|
4753
|
+
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
|
|
4754
|
+
|
|
4755
|
+
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
|
|
4756
|
+
|
|
4757
|
+
// out_proj
|
|
4758
|
+
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
|
|
4759
|
+
} else if (hparams.n_ff(i) == 0) {
|
|
4760
|
+
// attention layers (with optional bias)
|
|
4761
|
+
const int64_t n_head_i = hparams.n_head(i);
|
|
4762
|
+
const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
|
|
4763
|
+
const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
|
|
4764
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
|
|
4765
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
|
|
4766
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
|
|
4767
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
|
|
4768
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
4769
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
|
|
4770
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
|
|
4771
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
4772
|
+
} else {
|
|
4773
|
+
// mlp layers
|
|
4774
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
|
|
4775
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
|
|
4776
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
4777
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
|
|
4778
|
+
}
|
|
4779
|
+
}
|
|
4780
|
+
} break;
|
|
4781
|
+
case LLM_ARCH_EXAONE:
|
|
4782
|
+
{
|
|
4783
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4784
|
+
|
|
4785
|
+
// output
|
|
4786
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4787
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4788
|
+
|
|
4789
|
+
// if output is NULL, init from the input tok embed
|
|
4790
|
+
if (output == NULL) {
|
|
4791
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
4792
|
+
}
|
|
4793
|
+
|
|
4794
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4795
|
+
auto & layer = layers[i];
|
|
4796
|
+
|
|
4797
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4798
|
+
|
|
4799
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4800
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
4801
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
4802
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
4803
|
+
|
|
4804
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4805
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
4806
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
4807
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4808
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4809
|
+
}
|
|
4810
|
+
} break;
|
|
4811
|
+
case LLM_ARCH_EXAONE4:
|
|
4812
|
+
{
|
|
4813
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4814
|
+
|
|
4815
|
+
// output
|
|
4816
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4817
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4818
|
+
|
|
4819
|
+
// if output is NULL, init from the input tok embed
|
|
4820
|
+
if (output == NULL) {
|
|
4821
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
4822
|
+
}
|
|
4404
4823
|
|
|
4405
4824
|
for (int i = 0; i < n_layer; ++i) {
|
|
4406
4825
|
auto & layer = layers[i];
|
|
@@ -5103,6 +5522,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
5103
5522
|
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
|
|
5104
5523
|
}
|
|
5105
5524
|
} break;
|
|
5525
|
+
case LLM_ARCH_HUNYUAN_DENSE:
|
|
5526
|
+
{
|
|
5527
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
5528
|
+
|
|
5529
|
+
// output
|
|
5530
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
5531
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
5532
|
+
// if output is NULL, init from the input tok embed
|
|
5533
|
+
if (output == NULL) {
|
|
5534
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
5535
|
+
}
|
|
5536
|
+
|
|
5537
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
5538
|
+
auto & layer = layers[i];
|
|
5539
|
+
|
|
5540
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
5541
|
+
|
|
5542
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
5543
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
5544
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
5545
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
5546
|
+
|
|
5547
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
5548
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
5549
|
+
|
|
5550
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
5551
|
+
|
|
5552
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
5553
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
5554
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
5555
|
+
|
|
5556
|
+
}
|
|
5557
|
+
} break;
|
|
5106
5558
|
case LLM_ARCH_SMOLLM3:
|
|
5107
5559
|
{
|
|
5108
5560
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -5132,10 +5584,55 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
5132
5584
|
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
5133
5585
|
}
|
|
5134
5586
|
} break;
|
|
5135
|
-
case
|
|
5587
|
+
case LLM_ARCH_OPENAI_MOE:
|
|
5136
5588
|
{
|
|
5589
|
+
const int64_t n_ff_exp = hparams.n_ff_exp;
|
|
5590
|
+
|
|
5137
5591
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
5592
|
+
|
|
5593
|
+
// output
|
|
5594
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
5595
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
5596
|
+
|
|
5597
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
5598
|
+
auto & layer = layers[i];
|
|
5599
|
+
|
|
5600
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
5601
|
+
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
5602
|
+
|
|
5603
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
|
|
5604
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
|
|
5605
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
|
|
5606
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
|
|
5607
|
+
|
|
5608
|
+
layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
|
|
5609
|
+
|
|
5610
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
|
|
5611
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
5612
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
|
5613
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
5614
|
+
|
|
5615
|
+
// bias
|
|
5616
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
|
|
5617
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
|
|
5618
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
|
|
5619
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
5620
|
+
|
|
5621
|
+
layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
|
|
5622
|
+
layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
|
|
5623
|
+
layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
|
|
5624
|
+
layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
|
|
5625
|
+
}
|
|
5626
|
+
} break;
|
|
5627
|
+
case LLM_ARCH_LFM2:
|
|
5628
|
+
{
|
|
5629
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
5138
5630
|
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
|
|
5631
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
5632
|
+
|
|
5633
|
+
if (output == NULL) {
|
|
5634
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
5635
|
+
}
|
|
5139
5636
|
|
|
5140
5637
|
for (int i = 0; i < n_layer; ++i) {
|
|
5141
5638
|
auto & layer = layers[i];
|
|
@@ -5165,6 +5662,42 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
5165
5662
|
}
|
|
5166
5663
|
}
|
|
5167
5664
|
} break;
|
|
5665
|
+
case LLM_ARCH_SMALLTHINKER:
|
|
5666
|
+
{
|
|
5667
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
|
|
5668
|
+
|
|
5669
|
+
// output
|
|
5670
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
|
|
5671
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
5672
|
+
|
|
5673
|
+
// if output is NULL, init from the input tok embed
|
|
5674
|
+
if (output == NULL) {
|
|
5675
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
5676
|
+
}
|
|
5677
|
+
|
|
5678
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
5679
|
+
auto & layer = layers[i];
|
|
5680
|
+
|
|
5681
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
|
|
5682
|
+
|
|
5683
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
|
|
5684
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
|
|
5685
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
|
|
5686
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
|
|
5687
|
+
|
|
5688
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
|
|
5689
|
+
|
|
5690
|
+
GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
|
|
5691
|
+
GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
|
|
5692
|
+
|
|
5693
|
+
// MoE branch
|
|
5694
|
+
const int64_t n_ff_exp = hparams.n_ff_exp;
|
|
5695
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
|
|
5696
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
|
|
5697
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
|
|
5698
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
|
|
5699
|
+
}
|
|
5700
|
+
} break;
|
|
5168
5701
|
default:
|
|
5169
5702
|
throw std::runtime_error("unknown architecture");
|
|
5170
5703
|
}
|
|
@@ -5419,7 +5952,8 @@ void llama_model::print_info() const {
|
|
|
5419
5952
|
arch == LLM_ARCH_JAMBA ||
|
|
5420
5953
|
arch == LLM_ARCH_FALCON_H1 ||
|
|
5421
5954
|
arch == LLM_ARCH_PLAMO2 ||
|
|
5422
|
-
arch == LLM_ARCH_GRANITE_HYBRID
|
|
5955
|
+
arch == LLM_ARCH_GRANITE_HYBRID ||
|
|
5956
|
+
arch == LLM_ARCH_NEMOTRON_H) {
|
|
5423
5957
|
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
|
5424
5958
|
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
|
5425
5959
|
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
|
|
@@ -5468,7 +6002,7 @@ void llama_model::print_info() const {
|
|
|
5468
6002
|
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
|
5469
6003
|
}
|
|
5470
6004
|
|
|
5471
|
-
if (arch == LLM_ARCH_QWEN3MOE) {
|
|
6005
|
+
if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE) {
|
|
5472
6006
|
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
5473
6007
|
}
|
|
5474
6008
|
|
|
@@ -5490,6 +6024,11 @@ void llama_model::print_info() const {
|
|
|
5490
6024
|
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
|
5491
6025
|
}
|
|
5492
6026
|
|
|
6027
|
+
if (arch == LLM_ARCH_SMALLTHINKER) {
|
|
6028
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
6029
|
+
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
|
6030
|
+
}
|
|
6031
|
+
|
|
5493
6032
|
vocab.print_info();
|
|
5494
6033
|
}
|
|
5495
6034
|
|
|
@@ -5605,7 +6144,7 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
5605
6144
|
// inp_pos - contains the positions
|
|
5606
6145
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
5607
6146
|
|
|
5608
|
-
auto * inp_attn =
|
|
6147
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
5609
6148
|
|
|
5610
6149
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
5611
6150
|
|
|
@@ -5669,7 +6208,7 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
5669
6208
|
|
|
5670
6209
|
cur = build_attn(inp_attn,
|
|
5671
6210
|
model.layers[il].wo, model.layers[il].bo,
|
|
5672
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
6211
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
5673
6212
|
cb(cur, "attn_out", il);
|
|
5674
6213
|
}
|
|
5675
6214
|
|
|
@@ -5765,7 +6304,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
|
|
|
5765
6304
|
ggml_tensor * inp_attn_scale = nullptr;
|
|
5766
6305
|
inp_attn_scale = build_inp_attn_scale();
|
|
5767
6306
|
|
|
5768
|
-
auto * inp_attn =
|
|
6307
|
+
auto * inp_attn = build_attn_inp_kv_iswa();
|
|
5769
6308
|
|
|
5770
6309
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
5771
6310
|
|
|
@@ -5843,7 +6382,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
|
|
|
5843
6382
|
|
|
5844
6383
|
cur = build_attn(inp_attn,
|
|
5845
6384
|
model.layers[il].wo, model.layers[il].bo,
|
|
5846
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
6385
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
5847
6386
|
cb(cur, "attn_out", il);
|
|
5848
6387
|
}
|
|
5849
6388
|
|
|
@@ -5944,7 +6483,7 @@ struct llm_build_deci : public llm_graph_context {
|
|
|
5944
6483
|
// inp_pos - contains the positions
|
|
5945
6484
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
5946
6485
|
|
|
5947
|
-
auto * inp_attn =
|
|
6486
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
5948
6487
|
|
|
5949
6488
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
5950
6489
|
|
|
@@ -6020,7 +6559,7 @@ struct llm_build_deci : public llm_graph_context {
|
|
|
6020
6559
|
|
|
6021
6560
|
cur = build_attn(inp_attn,
|
|
6022
6561
|
model.layers[il].wo, model.layers[il].bo,
|
|
6023
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
6562
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
6024
6563
|
}
|
|
6025
6564
|
|
|
6026
6565
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -6100,7 +6639,7 @@ struct llm_build_baichuan : public llm_graph_context {
|
|
|
6100
6639
|
// inp_pos - contains the positions
|
|
6101
6640
|
ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
|
|
6102
6641
|
|
|
6103
|
-
auto * inp_attn =
|
|
6642
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
6104
6643
|
|
|
6105
6644
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6106
6645
|
|
|
@@ -6152,7 +6691,7 @@ struct llm_build_baichuan : public llm_graph_context {
|
|
|
6152
6691
|
|
|
6153
6692
|
cur = build_attn(inp_attn,
|
|
6154
6693
|
model.layers[il].wo, NULL,
|
|
6155
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6694
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6156
6695
|
}
|
|
6157
6696
|
|
|
6158
6697
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -6222,7 +6761,7 @@ struct llm_build_xverse : public llm_graph_context {
|
|
|
6222
6761
|
// inp_pos - contains the positions
|
|
6223
6762
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
6224
6763
|
|
|
6225
|
-
auto * inp_attn =
|
|
6764
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
6226
6765
|
|
|
6227
6766
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6228
6767
|
|
|
@@ -6267,7 +6806,7 @@ struct llm_build_xverse : public llm_graph_context {
|
|
|
6267
6806
|
|
|
6268
6807
|
cur = build_attn(inp_attn,
|
|
6269
6808
|
model.layers[il].wo, NULL,
|
|
6270
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6809
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6271
6810
|
}
|
|
6272
6811
|
|
|
6273
6812
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -6336,7 +6875,7 @@ struct llm_build_falcon : public llm_graph_context {
|
|
|
6336
6875
|
// inp_pos - contains the positions
|
|
6337
6876
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
6338
6877
|
|
|
6339
|
-
auto * inp_attn =
|
|
6878
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
6340
6879
|
|
|
6341
6880
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6342
6881
|
|
|
@@ -6367,9 +6906,9 @@ struct llm_build_falcon : public llm_graph_context {
|
|
|
6367
6906
|
|
|
6368
6907
|
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
6369
6908
|
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
6370
|
-
ggml_tensor * Vcur =
|
|
6909
|
+
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
6371
6910
|
|
|
6372
|
-
Vcur =
|
|
6911
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6373
6912
|
|
|
6374
6913
|
// using mode = 2 for neox mode
|
|
6375
6914
|
Qcur = ggml_rope_ext(
|
|
@@ -6390,7 +6929,7 @@ struct llm_build_falcon : public llm_graph_context {
|
|
|
6390
6929
|
|
|
6391
6930
|
cur = build_attn(inp_attn,
|
|
6392
6931
|
model.layers[il].wo, NULL,
|
|
6393
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6932
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6394
6933
|
}
|
|
6395
6934
|
|
|
6396
6935
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -6460,7 +6999,7 @@ struct llm_build_grok : public llm_graph_context {
|
|
|
6460
6999
|
// inp_pos - contains the positions
|
|
6461
7000
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
6462
7001
|
|
|
6463
|
-
auto * inp_attn =
|
|
7002
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
6464
7003
|
|
|
6465
7004
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6466
7005
|
|
|
@@ -6520,7 +7059,7 @@ struct llm_build_grok : public llm_graph_context {
|
|
|
6520
7059
|
|
|
6521
7060
|
cur = build_attn(inp_attn,
|
|
6522
7061
|
model.layers[il].wo, model.layers[il].bo,
|
|
6523
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
7062
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
|
6524
7063
|
}
|
|
6525
7064
|
|
|
6526
7065
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -6620,7 +7159,7 @@ struct llm_build_dbrx : public llm_graph_context {
|
|
|
6620
7159
|
// inp_pos - contains the positions
|
|
6621
7160
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
6622
7161
|
|
|
6623
|
-
auto * inp_attn =
|
|
7162
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
6624
7163
|
|
|
6625
7164
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6626
7165
|
|
|
@@ -6647,9 +7186,9 @@ struct llm_build_dbrx : public llm_graph_context {
|
|
|
6647
7186
|
|
|
6648
7187
|
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
6649
7188
|
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
6650
|
-
Vcur =
|
|
7189
|
+
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
6651
7190
|
|
|
6652
|
-
Vcur =
|
|
7191
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6653
7192
|
|
|
6654
7193
|
Qcur = ggml_rope_ext(
|
|
6655
7194
|
ctx0, Qcur, inp_pos, nullptr,
|
|
@@ -6669,7 +7208,7 @@ struct llm_build_dbrx : public llm_graph_context {
|
|
|
6669
7208
|
|
|
6670
7209
|
cur = build_attn(inp_attn,
|
|
6671
7210
|
model.layers[il].wo, NULL,
|
|
6672
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7211
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6673
7212
|
}
|
|
6674
7213
|
|
|
6675
7214
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -6744,7 +7283,7 @@ struct llm_build_starcoder : public llm_graph_context {
|
|
|
6744
7283
|
// inp_pos - contains the positions
|
|
6745
7284
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
6746
7285
|
|
|
6747
|
-
auto * inp_attn =
|
|
7286
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
6748
7287
|
|
|
6749
7288
|
ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
|
6750
7289
|
cb(pos, "pos_embd", -1);
|
|
@@ -6769,13 +7308,13 @@ struct llm_build_starcoder : public llm_graph_context {
|
|
|
6769
7308
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
6770
7309
|
cb(cur, "bqkv", il);
|
|
6771
7310
|
|
|
6772
|
-
ggml_tensor * Qcur =
|
|
6773
|
-
ggml_tensor * Kcur =
|
|
6774
|
-
ggml_tensor * Vcur =
|
|
7311
|
+
ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7312
|
+
ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
7313
|
+
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
6775
7314
|
|
|
6776
|
-
Qcur =
|
|
6777
|
-
Kcur =
|
|
6778
|
-
Vcur =
|
|
7315
|
+
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7316
|
+
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7317
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6779
7318
|
|
|
6780
7319
|
cb(Qcur, "Qcur", il);
|
|
6781
7320
|
cb(Kcur, "Kcur", il);
|
|
@@ -6783,7 +7322,7 @@ struct llm_build_starcoder : public llm_graph_context {
|
|
|
6783
7322
|
|
|
6784
7323
|
cur = build_attn(inp_attn,
|
|
6785
7324
|
model.layers[il].wo, model.layers[il].bo,
|
|
6786
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7325
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6787
7326
|
}
|
|
6788
7327
|
|
|
6789
7328
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -6849,7 +7388,7 @@ struct llm_build_refact : public llm_graph_context {
|
|
|
6849
7388
|
|
|
6850
7389
|
inpL = build_inp_embd(model.tok_embd);
|
|
6851
7390
|
|
|
6852
|
-
auto * inp_attn =
|
|
7391
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
6853
7392
|
|
|
6854
7393
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6855
7394
|
|
|
@@ -6882,7 +7421,7 @@ struct llm_build_refact : public llm_graph_context {
|
|
|
6882
7421
|
|
|
6883
7422
|
cur = build_attn(inp_attn,
|
|
6884
7423
|
model.layers[il].wo, NULL,
|
|
6885
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7424
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6886
7425
|
}
|
|
6887
7426
|
|
|
6888
7427
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -6991,13 +7530,15 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
6991
7530
|
cb(cur, "bqkv", il);
|
|
6992
7531
|
}
|
|
6993
7532
|
|
|
6994
|
-
Qcur =
|
|
6995
|
-
Kcur =
|
|
6996
|
-
Vcur =
|
|
7533
|
+
Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7534
|
+
Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
7535
|
+
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
7536
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6997
7537
|
} else {
|
|
6998
7538
|
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
|
|
6999
7539
|
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
|
|
7000
7540
|
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
|
|
7541
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7001
7542
|
}
|
|
7002
7543
|
|
|
7003
7544
|
if (model.layers[il].attn_q_norm) {
|
|
@@ -7005,6 +7546,10 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
7005
7546
|
model.layers[il].attn_q_norm,
|
|
7006
7547
|
model.layers[il].attn_q_norm_b,
|
|
7007
7548
|
LLM_NORM, il);
|
|
7549
|
+
|
|
7550
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7551
|
+
} else {
|
|
7552
|
+
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7008
7553
|
}
|
|
7009
7554
|
|
|
7010
7555
|
if (model.layers[il].attn_k_norm) {
|
|
@@ -7012,14 +7557,14 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
7012
7557
|
model.layers[il].attn_k_norm,
|
|
7013
7558
|
model.layers[il].attn_k_norm_b,
|
|
7014
7559
|
LLM_NORM, il);
|
|
7015
|
-
}
|
|
7016
7560
|
|
|
7017
|
-
|
|
7018
|
-
|
|
7019
|
-
|
|
7561
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7562
|
+
} else {
|
|
7563
|
+
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7564
|
+
}
|
|
7020
7565
|
|
|
7021
7566
|
// RoPE
|
|
7022
|
-
if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
7567
|
+
if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
|
|
7023
7568
|
Qcur = ggml_rope_ext(
|
|
7024
7569
|
ctx0, Qcur, inp_pos, nullptr,
|
|
7025
7570
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
@@ -7039,7 +7584,7 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
7039
7584
|
|
|
7040
7585
|
cur = build_attn(inp_attn,
|
|
7041
7586
|
model.layers[il].wo, model.layers[il].bo,
|
|
7042
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7587
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7043
7588
|
cb(cur, "kqv_out", il);
|
|
7044
7589
|
}
|
|
7045
7590
|
|
|
@@ -7078,7 +7623,7 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
7078
7623
|
0.0f,
|
|
7079
7624
|
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
|
|
7080
7625
|
cb(cur, "ffn_moe_out", il);
|
|
7081
|
-
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
7626
|
+
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
|
|
7082
7627
|
cur = build_ffn(cur,
|
|
7083
7628
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
7084
7629
|
NULL, NULL, NULL,
|
|
@@ -7161,9 +7706,9 @@ struct llm_build_neo_bert : public llm_graph_context {
|
|
|
7161
7706
|
|
|
7162
7707
|
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7163
7708
|
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
7164
|
-
Vcur =
|
|
7709
|
+
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
7165
7710
|
|
|
7166
|
-
Vcur =
|
|
7711
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7167
7712
|
|
|
7168
7713
|
// RoPE
|
|
7169
7714
|
Qcur = ggml_rope_ext(
|
|
@@ -7184,7 +7729,7 @@ struct llm_build_neo_bert : public llm_graph_context {
|
|
|
7184
7729
|
|
|
7185
7730
|
cur = build_attn(inp_attn,
|
|
7186
7731
|
model.layers[il].wo, nullptr,
|
|
7187
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7732
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7188
7733
|
cb(cur, "kqv_out", il);
|
|
7189
7734
|
}
|
|
7190
7735
|
|
|
@@ -7245,7 +7790,7 @@ struct llm_build_bloom : public llm_graph_context {
|
|
|
7245
7790
|
|
|
7246
7791
|
inpL = build_inp_embd(model.tok_embd);
|
|
7247
7792
|
|
|
7248
|
-
auto * inp_attn =
|
|
7793
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
7249
7794
|
|
|
7250
7795
|
inpL = build_norm(inpL,
|
|
7251
7796
|
model.tok_norm,
|
|
@@ -7270,13 +7815,13 @@ struct llm_build_bloom : public llm_graph_context {
|
|
|
7270
7815
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
7271
7816
|
cb(cur, "bqkv", il);
|
|
7272
7817
|
|
|
7273
|
-
ggml_tensor * Qcur =
|
|
7274
|
-
ggml_tensor * Kcur =
|
|
7275
|
-
ggml_tensor * Vcur =
|
|
7818
|
+
ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7819
|
+
ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
7820
|
+
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
7276
7821
|
|
|
7277
|
-
Qcur =
|
|
7278
|
-
Kcur =
|
|
7279
|
-
Vcur =
|
|
7822
|
+
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7823
|
+
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7824
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7280
7825
|
|
|
7281
7826
|
cb(Qcur, "Qcur", il);
|
|
7282
7827
|
cb(Kcur, "Kcur", il);
|
|
@@ -7284,7 +7829,7 @@ struct llm_build_bloom : public llm_graph_context {
|
|
|
7284
7829
|
|
|
7285
7830
|
cur = build_attn(inp_attn,
|
|
7286
7831
|
model.layers[il].wo, model.layers[il].bo,
|
|
7287
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7832
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7288
7833
|
}
|
|
7289
7834
|
|
|
7290
7835
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -7352,7 +7897,7 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
7352
7897
|
|
|
7353
7898
|
inpL = build_inp_embd(model.tok_embd);
|
|
7354
7899
|
|
|
7355
|
-
auto * inp_attn =
|
|
7900
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
7356
7901
|
|
|
7357
7902
|
if (model.pos_embd) {
|
|
7358
7903
|
// inp_pos - contains the positions
|
|
@@ -7394,7 +7939,7 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
7394
7939
|
|
|
7395
7940
|
ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7396
7941
|
ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
7397
|
-
ggml_tensor * Vcur =
|
|
7942
|
+
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
7398
7943
|
|
|
7399
7944
|
cb(Qcur, "Qcur", il);
|
|
7400
7945
|
cb(Kcur, "Kcur", il);
|
|
@@ -7413,17 +7958,18 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
7413
7958
|
model.layers[il].attn_k_norm_b,
|
|
7414
7959
|
LLM_NORM, il);
|
|
7415
7960
|
cb(Kcur, "Kcur", il);
|
|
7961
|
+
|
|
7962
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7963
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7416
7964
|
} else {
|
|
7417
|
-
Qcur =
|
|
7965
|
+
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7418
7966
|
cb(Qcur, "Qcur", il);
|
|
7419
7967
|
|
|
7420
|
-
Kcur =
|
|
7968
|
+
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7421
7969
|
cb(Kcur, "Kcur", il);
|
|
7422
7970
|
}
|
|
7423
7971
|
|
|
7424
|
-
|
|
7425
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7426
|
-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7972
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7427
7973
|
|
|
7428
7974
|
cb(Qcur, "Qcur", il);
|
|
7429
7975
|
cb(Kcur, "Kcur", il);
|
|
@@ -7431,7 +7977,7 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
7431
7977
|
|
|
7432
7978
|
cur = build_attn(inp_attn,
|
|
7433
7979
|
model.layers[il].wo, model.layers[il].bo,
|
|
7434
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7980
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7435
7981
|
}
|
|
7436
7982
|
|
|
7437
7983
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -7501,7 +8047,7 @@ struct llm_build_stablelm : public llm_graph_context {
|
|
|
7501
8047
|
// inp_pos - contains the positions
|
|
7502
8048
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
7503
8049
|
|
|
7504
|
-
auto * inp_attn =
|
|
8050
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
7505
8051
|
|
|
7506
8052
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7507
8053
|
|
|
@@ -7577,7 +8123,7 @@ struct llm_build_stablelm : public llm_graph_context {
|
|
|
7577
8123
|
|
|
7578
8124
|
cur = build_attn(inp_attn,
|
|
7579
8125
|
model.layers[il].wo, NULL,
|
|
7580
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8126
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7581
8127
|
}
|
|
7582
8128
|
|
|
7583
8129
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -7653,7 +8199,7 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
7653
8199
|
// inp_pos - contains the positions
|
|
7654
8200
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
7655
8201
|
|
|
7656
|
-
auto * inp_attn =
|
|
8202
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
7657
8203
|
|
|
7658
8204
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7659
8205
|
|
|
@@ -7675,9 +8221,9 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
7675
8221
|
|
|
7676
8222
|
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7677
8223
|
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
7678
|
-
ggml_tensor * Vcur =
|
|
8224
|
+
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd));
|
|
7679
8225
|
|
|
7680
|
-
Vcur =
|
|
8226
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7681
8227
|
|
|
7682
8228
|
// using mode = 2 for neox mode
|
|
7683
8229
|
Qcur = ggml_rope_ext(
|
|
@@ -7698,7 +8244,7 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
7698
8244
|
|
|
7699
8245
|
cur = build_attn(inp_attn,
|
|
7700
8246
|
model.layers[il].wo, NULL,
|
|
7701
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8247
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7702
8248
|
}
|
|
7703
8249
|
|
|
7704
8250
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -7768,7 +8314,7 @@ struct llm_build_qwen2 : public llm_graph_context {
|
|
|
7768
8314
|
// inp_pos - contains the positions
|
|
7769
8315
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
7770
8316
|
|
|
7771
|
-
auto * inp_attn =
|
|
8317
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
7772
8318
|
|
|
7773
8319
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7774
8320
|
|
|
@@ -7818,7 +8364,7 @@ struct llm_build_qwen2 : public llm_graph_context {
|
|
|
7818
8364
|
|
|
7819
8365
|
cur = build_attn(inp_attn,
|
|
7820
8366
|
model.layers[il].wo, model.layers[il].bo,
|
|
7821
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8367
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7822
8368
|
}
|
|
7823
8369
|
|
|
7824
8370
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -7932,8 +8478,9 @@ struct llm_build_dream : public llm_graph_context {
|
|
|
7932
8478
|
cb(Kcur, "Kcur", il);
|
|
7933
8479
|
cb(Vcur, "Vcur", il);
|
|
7934
8480
|
|
|
7935
|
-
cur = build_attn(inp_attn,
|
|
7936
|
-
|
|
8481
|
+
cur = build_attn(inp_attn,
|
|
8482
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
8483
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
|
7937
8484
|
}
|
|
7938
8485
|
|
|
7939
8486
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -7978,8 +8525,10 @@ struct llm_build_dream : public llm_graph_context {
|
|
|
7978
8525
|
}
|
|
7979
8526
|
};
|
|
7980
8527
|
|
|
7981
|
-
struct
|
|
7982
|
-
|
|
8528
|
+
struct llm_build_llada : public llm_graph_context {
|
|
8529
|
+
llm_build_llada(const llama_model & model, const llm_graph_params & params) :
|
|
8530
|
+
llm_graph_context(params) {
|
|
8531
|
+
// LLaDA is similar to LLaMA but uses non-causal attention for diffusion
|
|
7983
8532
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7984
8533
|
|
|
7985
8534
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -7993,10 +8542,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
7993
8542
|
// inp_pos - contains the positions
|
|
7994
8543
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
7995
8544
|
|
|
7996
|
-
|
|
7997
|
-
|
|
7998
|
-
int sections[4];
|
|
7999
|
-
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
|
8545
|
+
// Non-causal attention for diffusion
|
|
8546
|
+
auto * inp_attn = build_attn_inp_no_cache();
|
|
8000
8547
|
|
|
8001
8548
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8002
8549
|
|
|
@@ -8004,53 +8551,41 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
8004
8551
|
ggml_tensor * inpSA = inpL;
|
|
8005
8552
|
|
|
8006
8553
|
// norm
|
|
8007
|
-
cur = build_norm(inpL,
|
|
8008
|
-
model.layers[il].attn_norm, NULL,
|
|
8009
|
-
LLM_NORM_RMS, il);
|
|
8554
|
+
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
8010
8555
|
cb(cur, "attn_norm", il);
|
|
8011
8556
|
|
|
8012
8557
|
// self-attention
|
|
8013
8558
|
{
|
|
8014
|
-
// compute Q
|
|
8559
|
+
// compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
|
|
8015
8560
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
8016
|
-
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
8017
|
-
cb(Qcur, "Qcur", il);
|
|
8018
|
-
|
|
8019
8561
|
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
8020
|
-
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
8021
|
-
cb(Kcur, "Kcur", il);
|
|
8022
|
-
|
|
8023
8562
|
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
8024
|
-
|
|
8563
|
+
|
|
8564
|
+
cb(Qcur, "Qcur", il);
|
|
8565
|
+
cb(Kcur, "Kcur", il);
|
|
8025
8566
|
cb(Vcur, "Vcur", il);
|
|
8026
8567
|
|
|
8027
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,
|
|
8568
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8028
8569
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
8029
8570
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8030
8571
|
|
|
8031
|
-
Qcur =
|
|
8032
|
-
|
|
8033
|
-
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8034
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8035
|
-
);
|
|
8572
|
+
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8573
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
8036
8574
|
|
|
8037
|
-
Kcur =
|
|
8038
|
-
|
|
8039
|
-
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8040
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8041
|
-
);
|
|
8575
|
+
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8576
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
8042
8577
|
|
|
8043
8578
|
cb(Qcur, "Qcur", il);
|
|
8044
8579
|
cb(Kcur, "Kcur", il);
|
|
8045
8580
|
cb(Vcur, "Vcur", il);
|
|
8046
8581
|
|
|
8047
8582
|
cur = build_attn(inp_attn,
|
|
8048
|
-
model.layers[il].wo,
|
|
8049
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8583
|
+
model.layers[il].wo, NULL,
|
|
8584
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
|
8050
8585
|
}
|
|
8051
8586
|
|
|
8052
8587
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
8053
|
-
cur = ggml_get_rows(ctx0,
|
|
8588
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8054
8589
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
8055
8590
|
}
|
|
8056
8591
|
|
|
@@ -8058,17 +8593,11 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
8058
8593
|
cb(ffn_inp, "ffn_inp", il);
|
|
8059
8594
|
|
|
8060
8595
|
// feed-forward network
|
|
8061
|
-
cur = build_norm(ffn_inp,
|
|
8062
|
-
model.layers[il].ffn_norm, NULL,
|
|
8063
|
-
LLM_NORM_RMS, il);
|
|
8596
|
+
cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
|
8064
8597
|
cb(cur, "ffn_norm", il);
|
|
8065
8598
|
|
|
8066
|
-
cur = build_ffn(cur,
|
|
8067
|
-
|
|
8068
|
-
model.layers[il].ffn_gate, NULL, NULL,
|
|
8069
|
-
model.layers[il].ffn_down, NULL, NULL,
|
|
8070
|
-
NULL,
|
|
8071
|
-
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
8599
|
+
cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
|
|
8600
|
+
model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
8072
8601
|
cb(cur, "ffn_out", il);
|
|
8073
8602
|
|
|
8074
8603
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
@@ -8082,9 +8611,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
8082
8611
|
|
|
8083
8612
|
cur = inpL;
|
|
8084
8613
|
|
|
8085
|
-
cur = build_norm(cur,
|
|
8086
|
-
model.output_norm, NULL,
|
|
8087
|
-
LLM_NORM_RMS, -1);
|
|
8614
|
+
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
|
8088
8615
|
|
|
8089
8616
|
cb(cur, "result_norm", -1);
|
|
8090
8617
|
res->t_embd = cur;
|
|
@@ -8099,8 +8626,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
8099
8626
|
}
|
|
8100
8627
|
};
|
|
8101
8628
|
|
|
8102
|
-
struct
|
|
8103
|
-
|
|
8629
|
+
struct llm_build_qwen2vl : public llm_graph_context {
|
|
8630
|
+
llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
8104
8631
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8105
8632
|
|
|
8106
8633
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -8114,7 +8641,10 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
8114
8641
|
// inp_pos - contains the positions
|
|
8115
8642
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
8116
8643
|
|
|
8117
|
-
auto * inp_attn =
|
|
8644
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
8645
|
+
|
|
8646
|
+
int sections[4];
|
|
8647
|
+
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
|
8118
8648
|
|
|
8119
8649
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8120
8650
|
|
|
@@ -8127,13 +8657,131 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
8127
8657
|
LLM_NORM_RMS, il);
|
|
8128
8658
|
cb(cur, "attn_norm", il);
|
|
8129
8659
|
|
|
8130
|
-
//
|
|
8660
|
+
// self-attention
|
|
8131
8661
|
{
|
|
8132
8662
|
// compute Q and K and RoPE them
|
|
8133
8663
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
8664
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
8134
8665
|
cb(Qcur, "Qcur", il);
|
|
8135
|
-
|
|
8136
|
-
|
|
8666
|
+
|
|
8667
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
8668
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
8669
|
+
cb(Kcur, "Kcur", il);
|
|
8670
|
+
|
|
8671
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
8672
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
8673
|
+
cb(Vcur, "Vcur", il);
|
|
8674
|
+
|
|
8675
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8676
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
8677
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8678
|
+
|
|
8679
|
+
Qcur = ggml_rope_multi(
|
|
8680
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
8681
|
+
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8682
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8683
|
+
);
|
|
8684
|
+
|
|
8685
|
+
Kcur = ggml_rope_multi(
|
|
8686
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
8687
|
+
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8688
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8689
|
+
);
|
|
8690
|
+
|
|
8691
|
+
cb(Qcur, "Qcur", il);
|
|
8692
|
+
cb(Kcur, "Kcur", il);
|
|
8693
|
+
cb(Vcur, "Vcur", il);
|
|
8694
|
+
|
|
8695
|
+
cur = build_attn(inp_attn,
|
|
8696
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
8697
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8698
|
+
}
|
|
8699
|
+
|
|
8700
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
8701
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
8702
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
8703
|
+
}
|
|
8704
|
+
|
|
8705
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
8706
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
8707
|
+
|
|
8708
|
+
// feed-forward network
|
|
8709
|
+
cur = build_norm(ffn_inp,
|
|
8710
|
+
model.layers[il].ffn_norm, NULL,
|
|
8711
|
+
LLM_NORM_RMS, il);
|
|
8712
|
+
cb(cur, "ffn_norm", il);
|
|
8713
|
+
|
|
8714
|
+
cur = build_ffn(cur,
|
|
8715
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
8716
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
8717
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
8718
|
+
NULL,
|
|
8719
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
8720
|
+
cb(cur, "ffn_out", il);
|
|
8721
|
+
|
|
8722
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
8723
|
+
|
|
8724
|
+
cur = build_cvec(cur, il);
|
|
8725
|
+
cb(cur, "l_out", il);
|
|
8726
|
+
|
|
8727
|
+
// input for next layer
|
|
8728
|
+
inpL = cur;
|
|
8729
|
+
}
|
|
8730
|
+
|
|
8731
|
+
cur = inpL;
|
|
8732
|
+
|
|
8733
|
+
cur = build_norm(cur,
|
|
8734
|
+
model.output_norm, NULL,
|
|
8735
|
+
LLM_NORM_RMS, -1);
|
|
8736
|
+
|
|
8737
|
+
cb(cur, "result_norm", -1);
|
|
8738
|
+
res->t_embd = cur;
|
|
8739
|
+
|
|
8740
|
+
// lm_head
|
|
8741
|
+
cur = build_lora_mm(model.output, cur);
|
|
8742
|
+
|
|
8743
|
+
cb(cur, "result_output", -1);
|
|
8744
|
+
res->t_logits = cur;
|
|
8745
|
+
|
|
8746
|
+
ggml_build_forward_expand(gf, cur);
|
|
8747
|
+
}
|
|
8748
|
+
};
|
|
8749
|
+
|
|
8750
|
+
struct llm_build_qwen2moe : public llm_graph_context {
|
|
8751
|
+
llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
8752
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8753
|
+
|
|
8754
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
8755
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
8756
|
+
|
|
8757
|
+
ggml_tensor * cur;
|
|
8758
|
+
ggml_tensor * inpL;
|
|
8759
|
+
|
|
8760
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
8761
|
+
|
|
8762
|
+
// inp_pos - contains the positions
|
|
8763
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
8764
|
+
|
|
8765
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
8766
|
+
|
|
8767
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8768
|
+
|
|
8769
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
8770
|
+
ggml_tensor * inpSA = inpL;
|
|
8771
|
+
|
|
8772
|
+
// norm
|
|
8773
|
+
cur = build_norm(inpL,
|
|
8774
|
+
model.layers[il].attn_norm, NULL,
|
|
8775
|
+
LLM_NORM_RMS, il);
|
|
8776
|
+
cb(cur, "attn_norm", il);
|
|
8777
|
+
|
|
8778
|
+
// self_attention
|
|
8779
|
+
{
|
|
8780
|
+
// compute Q and K and RoPE them
|
|
8781
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
8782
|
+
cb(Qcur, "Qcur", il);
|
|
8783
|
+
if (model.layers[il].bq) {
|
|
8784
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
8137
8785
|
cb(Qcur, "Qcur", il);
|
|
8138
8786
|
}
|
|
8139
8787
|
|
|
@@ -8173,7 +8821,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
8173
8821
|
|
|
8174
8822
|
cur = build_attn(inp_attn,
|
|
8175
8823
|
model.layers[il].wo, model.layers[il].bo,
|
|
8176
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8824
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8177
8825
|
}
|
|
8178
8826
|
|
|
8179
8827
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -8273,7 +8921,7 @@ struct llm_build_qwen3 : public llm_graph_context {
|
|
|
8273
8921
|
// inp_pos - contains the positions
|
|
8274
8922
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
8275
8923
|
|
|
8276
|
-
auto * inp_attn =
|
|
8924
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
8277
8925
|
|
|
8278
8926
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8279
8927
|
|
|
@@ -8326,7 +8974,7 @@ struct llm_build_qwen3 : public llm_graph_context {
|
|
|
8326
8974
|
|
|
8327
8975
|
cur = build_attn(inp_attn,
|
|
8328
8976
|
model.layers[il].wo, model.layers[il].bo,
|
|
8329
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8977
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8330
8978
|
}
|
|
8331
8979
|
|
|
8332
8980
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -8394,7 +9042,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
|
|
|
8394
9042
|
// inp_pos - contains the positions
|
|
8395
9043
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
8396
9044
|
|
|
8397
|
-
auto * inp_attn =
|
|
9045
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
8398
9046
|
|
|
8399
9047
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8400
9048
|
|
|
@@ -8447,7 +9095,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
|
|
|
8447
9095
|
|
|
8448
9096
|
cur = build_attn(inp_attn,
|
|
8449
9097
|
model.layers[il].wo, model.layers[il].bo,
|
|
8450
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9098
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8451
9099
|
}
|
|
8452
9100
|
|
|
8453
9101
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -8524,7 +9172,7 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
8524
9172
|
// inp_pos - contains the positions
|
|
8525
9173
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
8526
9174
|
|
|
8527
|
-
auto * inp_attn =
|
|
9175
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
8528
9176
|
|
|
8529
9177
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8530
9178
|
|
|
@@ -8550,21 +9198,21 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
8550
9198
|
|
|
8551
9199
|
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
8552
9200
|
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
8553
|
-
Vcur =
|
|
9201
|
+
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
9202
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8554
9203
|
} else {
|
|
8555
9204
|
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
|
8556
9205
|
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
|
8557
9206
|
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
|
8558
9207
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8559
9208
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9209
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8560
9210
|
}
|
|
8561
9211
|
|
|
8562
9212
|
cb(Qcur, "Qcur", il);
|
|
8563
9213
|
cb(Kcur, "Kcur", il);
|
|
8564
9214
|
cb(Vcur, "Vcur", il);
|
|
8565
9215
|
|
|
8566
|
-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8567
|
-
|
|
8568
9216
|
Qcur = ggml_rope_ext(
|
|
8569
9217
|
ctx0, Qcur, inp_pos, nullptr,
|
|
8570
9218
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
@@ -8587,7 +9235,7 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
8587
9235
|
|
|
8588
9236
|
cur = build_attn(inp_attn,
|
|
8589
9237
|
model.layers[il].wo, model.layers[il].bo,
|
|
8590
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
9238
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
|
8591
9239
|
}
|
|
8592
9240
|
|
|
8593
9241
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -8653,13 +9301,13 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
8653
9301
|
// inp_pos - contains the positions
|
|
8654
9302
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
8655
9303
|
|
|
8656
|
-
using inp_attn_type = std::conditional_t<iswa,
|
|
9304
|
+
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
|
|
8657
9305
|
inp_attn_type * inp_attn = nullptr;
|
|
8658
9306
|
|
|
8659
9307
|
if constexpr (iswa) {
|
|
8660
|
-
inp_attn =
|
|
9308
|
+
inp_attn = build_attn_inp_kv_iswa();
|
|
8661
9309
|
} else {
|
|
8662
|
-
inp_attn =
|
|
9310
|
+
inp_attn = build_attn_inp_kv();
|
|
8663
9311
|
}
|
|
8664
9312
|
|
|
8665
9313
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
@@ -8688,21 +9336,21 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
8688
9336
|
|
|
8689
9337
|
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
|
|
8690
9338
|
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
|
|
8691
|
-
Vcur =
|
|
9339
|
+
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
|
|
9340
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8692
9341
|
} else {
|
|
8693
9342
|
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
|
8694
9343
|
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
|
8695
9344
|
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
|
8696
9345
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8697
9346
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9347
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8698
9348
|
}
|
|
8699
9349
|
|
|
8700
9350
|
cb(Qcur, "Qcur", il);
|
|
8701
9351
|
cb(Kcur, "Kcur", il);
|
|
8702
9352
|
cb(Vcur, "Vcur", il);
|
|
8703
9353
|
|
|
8704
|
-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8705
|
-
|
|
8706
9354
|
Qcur = ggml_rope_ext(
|
|
8707
9355
|
ctx0, Qcur, inp_pos, rope_factors,
|
|
8708
9356
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
@@ -8724,7 +9372,7 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
8724
9372
|
|
|
8725
9373
|
cur = build_attn(inp_attn,
|
|
8726
9374
|
model.layers[il].wo, model.layers[il].bo,
|
|
8727
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
9375
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
|
8728
9376
|
}
|
|
8729
9377
|
|
|
8730
9378
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -8811,7 +9459,7 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
8811
9459
|
// inp_pos - contains the positions
|
|
8812
9460
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
8813
9461
|
|
|
8814
|
-
auto * inp_attn =
|
|
9462
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
8815
9463
|
|
|
8816
9464
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
8817
9465
|
|
|
@@ -8858,7 +9506,7 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
8858
9506
|
|
|
8859
9507
|
cur = build_attn(inp_attn,
|
|
8860
9508
|
model.layers[il].wo, NULL,
|
|
8861
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9509
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8862
9510
|
}
|
|
8863
9511
|
|
|
8864
9512
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -8927,7 +9575,7 @@ struct llm_build_gpt2 : public llm_graph_context {
|
|
|
8927
9575
|
// inp_pos - contains the positions
|
|
8928
9576
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
8929
9577
|
|
|
8930
|
-
auto * inp_attn =
|
|
9578
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
8931
9579
|
|
|
8932
9580
|
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
|
8933
9581
|
cb(pos, "pos_embd", -1);
|
|
@@ -8952,21 +9600,21 @@ struct llm_build_gpt2 : public llm_graph_context {
|
|
|
8952
9600
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
8953
9601
|
cb(cur, "bqkv", il);
|
|
8954
9602
|
|
|
8955
|
-
ggml_tensor * Qcur =
|
|
8956
|
-
ggml_tensor * Kcur =
|
|
8957
|
-
ggml_tensor * Vcur =
|
|
9603
|
+
ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
9604
|
+
ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
9605
|
+
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
8958
9606
|
|
|
8959
9607
|
cb(Qcur, "Qcur", il);
|
|
8960
9608
|
cb(Kcur, "Kcur", il);
|
|
8961
9609
|
cb(Vcur, "Vcur", il);
|
|
8962
9610
|
|
|
8963
|
-
Qcur =
|
|
8964
|
-
Kcur =
|
|
8965
|
-
Vcur =
|
|
9611
|
+
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
9612
|
+
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9613
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8966
9614
|
|
|
8967
9615
|
cur = build_attn(inp_attn,
|
|
8968
9616
|
model.layers[il].wo, model.layers[il].bo,
|
|
8969
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9617
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8970
9618
|
}
|
|
8971
9619
|
|
|
8972
9620
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -9037,7 +9685,7 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
9037
9685
|
// inp_pos - contains the positions
|
|
9038
9686
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
9039
9687
|
|
|
9040
|
-
auto * inp_attn =
|
|
9688
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
9041
9689
|
|
|
9042
9690
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9043
9691
|
|
|
@@ -9058,9 +9706,9 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
9058
9706
|
|
|
9059
9707
|
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
9060
9708
|
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
9061
|
-
ggml_tensor * Vcur =
|
|
9709
|
+
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
9062
9710
|
|
|
9063
|
-
Vcur =
|
|
9711
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
9064
9712
|
|
|
9065
9713
|
Qcur = ggml_rope_ext(
|
|
9066
9714
|
ctx0, Qcur, inp_pos, nullptr,
|
|
@@ -9080,7 +9728,7 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
9080
9728
|
|
|
9081
9729
|
cur = build_attn(inp_attn,
|
|
9082
9730
|
model.layers[il].wo, model.layers[il].bo,
|
|
9083
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9731
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9084
9732
|
}
|
|
9085
9733
|
|
|
9086
9734
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -9150,7 +9798,7 @@ struct llm_build_orion : public llm_graph_context {
|
|
|
9150
9798
|
// inp_pos - contains the positions
|
|
9151
9799
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
9152
9800
|
|
|
9153
|
-
auto * inp_attn =
|
|
9801
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
9154
9802
|
|
|
9155
9803
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9156
9804
|
|
|
@@ -9209,7 +9857,7 @@ struct llm_build_orion : public llm_graph_context {
|
|
|
9209
9857
|
|
|
9210
9858
|
cur = build_attn(inp_attn,
|
|
9211
9859
|
model.layers[il].wo, NULL,
|
|
9212
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9860
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9213
9861
|
}
|
|
9214
9862
|
|
|
9215
9863
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -9277,7 +9925,7 @@ struct llm_build_internlm2 : public llm_graph_context {
|
|
|
9277
9925
|
// inp_pos - contains the positions
|
|
9278
9926
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
9279
9927
|
|
|
9280
|
-
auto * inp_attn =
|
|
9928
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
9281
9929
|
|
|
9282
9930
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9283
9931
|
|
|
@@ -9336,7 +9984,7 @@ struct llm_build_internlm2 : public llm_graph_context {
|
|
|
9336
9984
|
|
|
9337
9985
|
cur = build_attn(inp_attn,
|
|
9338
9986
|
model.layers[il].wo, model.layers[il].bo,
|
|
9339
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9987
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9340
9988
|
}
|
|
9341
9989
|
|
|
9342
9990
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -9413,7 +10061,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
9413
10061
|
// inp_pos - contains the positions
|
|
9414
10062
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
9415
10063
|
|
|
9416
|
-
auto * inp_attn =
|
|
10064
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
9417
10065
|
|
|
9418
10066
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9419
10067
|
|
|
@@ -9524,7 +10172,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
9524
10172
|
|
|
9525
10173
|
cur = build_attn(inp_attn,
|
|
9526
10174
|
model.layers[il].wo, NULL,
|
|
9527
|
-
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
|
10175
|
+
q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
|
|
9528
10176
|
}
|
|
9529
10177
|
|
|
9530
10178
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -9608,7 +10256,7 @@ struct llm_build_gemma : public llm_graph_context {
|
|
|
9608
10256
|
// inp_pos - contains the positions
|
|
9609
10257
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
9610
10258
|
|
|
9611
|
-
auto * inp_attn =
|
|
10259
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
9612
10260
|
|
|
9613
10261
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9614
10262
|
|
|
@@ -9654,7 +10302,7 @@ struct llm_build_gemma : public llm_graph_context {
|
|
|
9654
10302
|
|
|
9655
10303
|
cur = build_attn(inp_attn,
|
|
9656
10304
|
model.layers[il].wo, NULL,
|
|
9657
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
10305
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
|
9658
10306
|
}
|
|
9659
10307
|
|
|
9660
10308
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -9724,7 +10372,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
|
|
9724
10372
|
// inp_pos - contains the positions
|
|
9725
10373
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
9726
10374
|
|
|
9727
|
-
auto * inp_attn =
|
|
10375
|
+
auto * inp_attn = build_attn_inp_kv_iswa();
|
|
9728
10376
|
|
|
9729
10377
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9730
10378
|
|
|
@@ -9769,7 +10417,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
|
|
9769
10417
|
|
|
9770
10418
|
cur = build_attn(inp_attn,
|
|
9771
10419
|
model.layers[il].wo, NULL,
|
|
9772
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
10420
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
|
9773
10421
|
}
|
|
9774
10422
|
|
|
9775
10423
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -9858,7 +10506,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
|
9858
10506
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
9859
10507
|
|
|
9860
10508
|
// TODO: is causal == true correct? might need some changes
|
|
9861
|
-
auto * inp_attn =
|
|
10509
|
+
auto * inp_attn = build_attn_inp_kv_iswa();
|
|
9862
10510
|
|
|
9863
10511
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9864
10512
|
|
|
@@ -9911,7 +10559,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
|
9911
10559
|
|
|
9912
10560
|
cur = build_attn(inp_attn,
|
|
9913
10561
|
model.layers[il].wo, NULL,
|
|
9914
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
10562
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
|
9915
10563
|
}
|
|
9916
10564
|
|
|
9917
10565
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -9983,7 +10631,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
9983
10631
|
const int64_t n_embd_altup;
|
|
9984
10632
|
const int64_t n_altup;
|
|
9985
10633
|
const int i_altup_act;
|
|
9986
|
-
const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
|
|
9987
10634
|
const int n_layer_sparsity = 10; // number of layers using activation sparsity
|
|
9988
10635
|
const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
|
|
9989
10636
|
|
|
@@ -10009,7 +10656,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
10009
10656
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
10010
10657
|
|
|
10011
10658
|
// TODO: is causal == true correct? might need some changes
|
|
10012
|
-
auto * inp_attn =
|
|
10659
|
+
auto * inp_attn = build_attn_inp_kv_iswa();
|
|
10013
10660
|
|
|
10014
10661
|
// inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
|
|
10015
10662
|
ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
|
|
@@ -10033,8 +10680,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
10033
10680
|
|
|
10034
10681
|
for (int il = 0; il < n_layer; ++il) {
|
|
10035
10682
|
// this block is made to be closely resemble Gemma3p5DecoderLayer on python code
|
|
10036
|
-
const bool has_kv = (il < n_layer_kv);
|
|
10037
|
-
|
|
10038
10683
|
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
|
10039
10684
|
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
|
10040
10685
|
|
|
@@ -10054,7 +10699,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
10054
10699
|
ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
|
|
10055
10700
|
|
|
10056
10701
|
// self-attention
|
|
10057
|
-
if (has_kv) {
|
|
10702
|
+
if (hparams.has_kv(il)) {
|
|
10058
10703
|
// compute Q and K and RoPE them
|
|
10059
10704
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
10060
10705
|
cb(Qcur, "Qcur", il);
|
|
@@ -10092,9 +10737,9 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
10092
10737
|
|
|
10093
10738
|
cur = build_attn(inp_attn,
|
|
10094
10739
|
model.layers[il].wo, NULL,
|
|
10095
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
|
|
10740
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
|
|
10096
10741
|
} else {
|
|
10097
|
-
//
|
|
10742
|
+
// reuse KV cache of earlier layers
|
|
10098
10743
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
10099
10744
|
cb(Qcur, "Qcur", il);
|
|
10100
10745
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
@@ -10110,7 +10755,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
10110
10755
|
|
|
10111
10756
|
cur = build_attn(inp_attn,
|
|
10112
10757
|
model.layers[il].wo, NULL,
|
|
10113
|
-
Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
|
|
10758
|
+
Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
|
|
10114
10759
|
}
|
|
10115
10760
|
|
|
10116
10761
|
cur = build_norm(cur,
|
|
@@ -10388,8 +11033,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
10388
11033
|
ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
|
|
10389
11034
|
all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
|
|
10390
11035
|
cb(all_coefs, "all_coefs", il);
|
|
10391
|
-
all_coefs =
|
|
10392
|
-
all_coefs =
|
|
11036
|
+
all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup]
|
|
11037
|
+
all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
|
|
10393
11038
|
|
|
10394
11039
|
innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
|
|
10395
11040
|
ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
|
|
@@ -10416,7 +11061,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
|
|
|
10416
11061
|
// inp_pos - contains the positions
|
|
10417
11062
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
10418
11063
|
|
|
10419
|
-
auto * inp_attn =
|
|
11064
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
10420
11065
|
|
|
10421
11066
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10422
11067
|
|
|
@@ -10475,7 +11120,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
|
|
|
10475
11120
|
|
|
10476
11121
|
cur = build_attn(inp_attn,
|
|
10477
11122
|
model.layers[il].wo, model.layers[il].bo,
|
|
10478
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11123
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10479
11124
|
}
|
|
10480
11125
|
|
|
10481
11126
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -10902,7 +11547,9 @@ struct llm_build_jamba : public llm_graph_context_mamba {
|
|
|
10902
11547
|
cb(Vcur, "Vcur", il);
|
|
10903
11548
|
|
|
10904
11549
|
// No RoPE :)
|
|
10905
|
-
cur = build_attn(inp_hybrid->get_attn(),
|
|
11550
|
+
cur = build_attn(inp_hybrid->get_attn(),
|
|
11551
|
+
model.layers[il].wo, NULL,
|
|
11552
|
+
Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10906
11553
|
}
|
|
10907
11554
|
|
|
10908
11555
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -10985,7 +11632,7 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
10985
11632
|
// inp_pos - contains the positions
|
|
10986
11633
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
10987
11634
|
|
|
10988
|
-
auto * inp_attn =
|
|
11635
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
10989
11636
|
|
|
10990
11637
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10991
11638
|
|
|
@@ -11060,7 +11707,7 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
11060
11707
|
|
|
11061
11708
|
cur = build_attn(inp_attn,
|
|
11062
11709
|
model.layers[il].wo, model.layers[il].bo,
|
|
11063
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11710
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11064
11711
|
}
|
|
11065
11712
|
|
|
11066
11713
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -11132,7 +11779,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
|
|
|
11132
11779
|
// inp_pos - contains the positions
|
|
11133
11780
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
11134
11781
|
|
|
11135
|
-
auto * inp_attn =
|
|
11782
|
+
auto * inp_attn = build_attn_inp_kv_iswa();
|
|
11136
11783
|
|
|
11137
11784
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11138
11785
|
|
|
@@ -11195,7 +11842,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
|
|
|
11195
11842
|
|
|
11196
11843
|
cur = build_attn(inp_attn,
|
|
11197
11844
|
model.layers[il].wo, model.layers[il].bo,
|
|
11198
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11845
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11199
11846
|
}
|
|
11200
11847
|
|
|
11201
11848
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -11267,7 +11914,7 @@ struct llm_build_olmo : public llm_graph_context {
|
|
|
11267
11914
|
// inp_pos - contains the positions
|
|
11268
11915
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
11269
11916
|
|
|
11270
|
-
auto * inp_attn =
|
|
11917
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
11271
11918
|
|
|
11272
11919
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11273
11920
|
|
|
@@ -11326,7 +11973,7 @@ struct llm_build_olmo : public llm_graph_context {
|
|
|
11326
11973
|
|
|
11327
11974
|
cur = build_attn(inp_attn,
|
|
11328
11975
|
model.layers[il].wo, nullptr,
|
|
11329
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11976
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11330
11977
|
}
|
|
11331
11978
|
|
|
11332
11979
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -11395,7 +12042,7 @@ struct llm_build_olmo2 : public llm_graph_context {
|
|
|
11395
12042
|
// inp_pos - contains the positions
|
|
11396
12043
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
11397
12044
|
|
|
11398
|
-
auto * inp_attn =
|
|
12045
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
11399
12046
|
|
|
11400
12047
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11401
12048
|
|
|
@@ -11446,7 +12093,7 @@ struct llm_build_olmo2 : public llm_graph_context {
|
|
|
11446
12093
|
|
|
11447
12094
|
cur = build_attn(inp_attn,
|
|
11448
12095
|
model.layers[il].wo, NULL,
|
|
11449
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
12096
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11450
12097
|
}
|
|
11451
12098
|
|
|
11452
12099
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -11524,7 +12171,7 @@ struct llm_build_olmoe : public llm_graph_context {
|
|
|
11524
12171
|
// inp_pos - contains the positions
|
|
11525
12172
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
11526
12173
|
|
|
11527
|
-
auto * inp_attn =
|
|
12174
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
11528
12175
|
|
|
11529
12176
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11530
12177
|
|
|
@@ -11579,7 +12226,7 @@ struct llm_build_olmoe : public llm_graph_context {
|
|
|
11579
12226
|
|
|
11580
12227
|
cur = build_attn(inp_attn,
|
|
11581
12228
|
model.layers[il].wo, NULL,
|
|
11582
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
12229
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11583
12230
|
}
|
|
11584
12231
|
|
|
11585
12232
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -11650,7 +12297,7 @@ struct llm_build_openelm : public llm_graph_context {
|
|
|
11650
12297
|
// inp_pos - contains the positions
|
|
11651
12298
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
11652
12299
|
|
|
11653
|
-
auto * inp_attn =
|
|
12300
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
11654
12301
|
|
|
11655
12302
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11656
12303
|
|
|
@@ -11712,7 +12359,7 @@ struct llm_build_openelm : public llm_graph_context {
|
|
|
11712
12359
|
|
|
11713
12360
|
cur = build_attn(inp_attn,
|
|
11714
12361
|
model.layers[il].wo, NULL,
|
|
11715
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
12362
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11716
12363
|
}
|
|
11717
12364
|
|
|
11718
12365
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -11781,7 +12428,7 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
11781
12428
|
// inp_pos - contains the positions
|
|
11782
12429
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
11783
12430
|
|
|
11784
|
-
auto * inp_attn =
|
|
12431
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
11785
12432
|
|
|
11786
12433
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11787
12434
|
|
|
@@ -11802,9 +12449,9 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
11802
12449
|
|
|
11803
12450
|
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
11804
12451
|
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
11805
|
-
ggml_tensor * Vcur =
|
|
12452
|
+
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
11806
12453
|
|
|
11807
|
-
Vcur =
|
|
12454
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
11808
12455
|
|
|
11809
12456
|
Qcur = ggml_rope_ext(
|
|
11810
12457
|
ctx0, Qcur, inp_pos, nullptr,
|
|
@@ -11824,7 +12471,7 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
11824
12471
|
|
|
11825
12472
|
cur = build_attn(inp_attn,
|
|
11826
12473
|
model.layers[il].wo, model.layers[il].bo,
|
|
11827
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
12474
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11828
12475
|
}
|
|
11829
12476
|
|
|
11830
12477
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -11927,7 +12574,7 @@ struct llm_build_arctic : public llm_graph_context {
|
|
|
11927
12574
|
// inp_pos - contains the positions
|
|
11928
12575
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
11929
12576
|
|
|
11930
|
-
auto * inp_attn =
|
|
12577
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
11931
12578
|
|
|
11932
12579
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11933
12580
|
|
|
@@ -11974,7 +12621,7 @@ struct llm_build_arctic : public llm_graph_context {
|
|
|
11974
12621
|
|
|
11975
12622
|
cur = build_attn(inp_attn,
|
|
11976
12623
|
model.layers[il].wo, NULL,
|
|
11977
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
12624
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11978
12625
|
}
|
|
11979
12626
|
|
|
11980
12627
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -12065,7 +12712,7 @@ struct llm_build_deepseek : public llm_graph_context {
|
|
|
12065
12712
|
// inp_pos - contains the positions
|
|
12066
12713
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
12067
12714
|
|
|
12068
|
-
auto * inp_attn =
|
|
12715
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
12069
12716
|
|
|
12070
12717
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
12071
12718
|
|
|
@@ -12129,7 +12776,7 @@ struct llm_build_deepseek : public llm_graph_context {
|
|
|
12129
12776
|
|
|
12130
12777
|
cur = build_attn(inp_attn,
|
|
12131
12778
|
model.layers[il].wo, model.layers[il].bo,
|
|
12132
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
12779
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
12133
12780
|
}
|
|
12134
12781
|
|
|
12135
12782
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -12242,7 +12889,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
12242
12889
|
// inp_pos - contains the positions
|
|
12243
12890
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
12244
12891
|
|
|
12245
|
-
auto * inp_attn =
|
|
12892
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
12246
12893
|
|
|
12247
12894
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12248
12895
|
|
|
@@ -12357,7 +13004,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
12357
13004
|
// note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
|
|
12358
13005
|
cur = build_attn(inp_attn,
|
|
12359
13006
|
model.layers[il].wo, NULL,
|
|
12360
|
-
Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
|
|
13007
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
|
|
12361
13008
|
} else {
|
|
12362
13009
|
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
|
|
12363
13010
|
cb(kv, "kv", il);
|
|
@@ -12391,7 +13038,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
12391
13038
|
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
|
|
12392
13039
|
cur = build_attn(inp_attn,
|
|
12393
13040
|
model.layers[il].wo, NULL,
|
|
12394
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
13041
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
12395
13042
|
}
|
|
12396
13043
|
}
|
|
12397
13044
|
|
|
@@ -12489,7 +13136,7 @@ struct llm_build_bitnet : public llm_graph_context {
|
|
|
12489
13136
|
// inp_pos - contains the positions
|
|
12490
13137
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
12491
13138
|
|
|
12492
|
-
auto * inp_attn =
|
|
13139
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
12493
13140
|
|
|
12494
13141
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12495
13142
|
|
|
@@ -12558,7 +13205,7 @@ struct llm_build_bitnet : public llm_graph_context {
|
|
|
12558
13205
|
|
|
12559
13206
|
cur = build_attn(inp_attn,
|
|
12560
13207
|
NULL, NULL,
|
|
12561
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13208
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
12562
13209
|
|
|
12563
13210
|
cur = build_norm(cur,
|
|
12564
13211
|
model.layers[il].attn_sub_norm, NULL,
|
|
@@ -12681,7 +13328,7 @@ struct llm_build_t5_enc : public llm_graph_context {
|
|
|
12681
13328
|
|
|
12682
13329
|
cur = build_attn(inp_attn,
|
|
12683
13330
|
model.layers[il].wo_enc, nullptr,
|
|
12684
|
-
Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
|
|
13331
|
+
Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
|
|
12685
13332
|
cb(cur, "kqv_out", il);
|
|
12686
13333
|
}
|
|
12687
13334
|
|
|
@@ -12753,7 +13400,7 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
12753
13400
|
|
|
12754
13401
|
const int64_t n_outputs_enc = embd_enc->ne[1];
|
|
12755
13402
|
|
|
12756
|
-
auto * inp_attn_self =
|
|
13403
|
+
auto * inp_attn_self = build_attn_inp_kv();
|
|
12757
13404
|
auto * inp_attn_cross = build_attn_inp_cross();
|
|
12758
13405
|
|
|
12759
13406
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
@@ -12787,7 +13434,7 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
12787
13434
|
|
|
12788
13435
|
cur = build_attn(inp_attn_self,
|
|
12789
13436
|
model.layers[il].wo, model.layers[il].bo,
|
|
12790
|
-
Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
|
|
13437
|
+
Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
|
|
12791
13438
|
cb(cur, "kqv_out", il);
|
|
12792
13439
|
}
|
|
12793
13440
|
|
|
@@ -12819,7 +13466,7 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
12819
13466
|
|
|
12820
13467
|
cur = build_attn(inp_attn_cross,
|
|
12821
13468
|
model.layers[il].wo_cross, nullptr,
|
|
12822
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
13469
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
|
12823
13470
|
cb(cur, "kqv_out", il);
|
|
12824
13471
|
|
|
12825
13472
|
//ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
|
@@ -12918,7 +13565,7 @@ struct llm_build_jais : public llm_graph_context {
|
|
|
12918
13565
|
|
|
12919
13566
|
inpL = build_inp_embd(model.tok_embd);
|
|
12920
13567
|
|
|
12921
|
-
auto * inp_attn =
|
|
13568
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
12922
13569
|
|
|
12923
13570
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12924
13571
|
|
|
@@ -12937,21 +13584,21 @@ struct llm_build_jais : public llm_graph_context {
|
|
|
12937
13584
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
12938
13585
|
cb(cur, "bqkv", il);
|
|
12939
13586
|
|
|
12940
|
-
ggml_tensor * Qcur =
|
|
12941
|
-
ggml_tensor * Kcur =
|
|
12942
|
-
ggml_tensor * Vcur =
|
|
13587
|
+
ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd));
|
|
13588
|
+
ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd));
|
|
13589
|
+
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
|
|
12943
13590
|
|
|
12944
13591
|
cb(Qcur, "Qcur", il);
|
|
12945
13592
|
cb(Kcur, "Kcur", il);
|
|
12946
13593
|
cb(Vcur, "Vcur", il);
|
|
12947
13594
|
|
|
12948
|
-
Qcur =
|
|
12949
|
-
Kcur =
|
|
12950
|
-
Vcur =
|
|
13595
|
+
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
13596
|
+
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
13597
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
12951
13598
|
|
|
12952
13599
|
cur = build_attn(inp_attn,
|
|
12953
13600
|
model.layers[il].wo, model.layers[il].bo,
|
|
12954
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
|
|
13601
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
|
|
12955
13602
|
}
|
|
12956
13603
|
|
|
12957
13604
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -13016,7 +13663,7 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
13016
13663
|
// inp_pos - contains the positions
|
|
13017
13664
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
13018
13665
|
|
|
13019
|
-
auto * inp_attn =
|
|
13666
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
13020
13667
|
|
|
13021
13668
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13022
13669
|
|
|
@@ -13050,6 +13697,7 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
13050
13697
|
}
|
|
13051
13698
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
13052
13699
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
13700
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13053
13701
|
} else {
|
|
13054
13702
|
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
13055
13703
|
cb(cur, "wqkv", il);
|
|
@@ -13059,11 +13707,10 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
13059
13707
|
}
|
|
13060
13708
|
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
13061
13709
|
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
13062
|
-
Vcur =
|
|
13710
|
+
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
13711
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13063
13712
|
}
|
|
13064
13713
|
|
|
13065
|
-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13066
|
-
|
|
13067
13714
|
//printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
|
|
13068
13715
|
Qcur = ggml_rope_ext(
|
|
13069
13716
|
ctx0, Qcur, inp_pos, nullptr,
|
|
@@ -13083,7 +13730,7 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
13083
13730
|
|
|
13084
13731
|
cur = build_attn(inp_attn,
|
|
13085
13732
|
model.layers[il].wo, NULL,
|
|
13086
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13733
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13087
13734
|
}
|
|
13088
13735
|
|
|
13089
13736
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -13149,7 +13796,7 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
13149
13796
|
// inp_pos - contains the positions
|
|
13150
13797
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
13151
13798
|
|
|
13152
|
-
auto * inp_attn =
|
|
13799
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
13153
13800
|
|
|
13154
13801
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13155
13802
|
|
|
@@ -13184,6 +13831,7 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
13184
13831
|
}
|
|
13185
13832
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
13186
13833
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
13834
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13187
13835
|
} else {
|
|
13188
13836
|
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
13189
13837
|
cb(cur, "wqkv", il);
|
|
@@ -13193,11 +13841,10 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
13193
13841
|
}
|
|
13194
13842
|
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
13195
13843
|
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
13196
|
-
Vcur =
|
|
13844
|
+
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
13845
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13197
13846
|
}
|
|
13198
13847
|
|
|
13199
|
-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13200
|
-
|
|
13201
13848
|
Qcur = ggml_rope_ext(
|
|
13202
13849
|
ctx0, Qcur, inp_pos, nullptr,
|
|
13203
13850
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
@@ -13216,7 +13863,7 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
13216
13863
|
|
|
13217
13864
|
cur = build_attn(inp_attn,
|
|
13218
13865
|
model.layers[il].wo, NULL,
|
|
13219
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13866
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13220
13867
|
}
|
|
13221
13868
|
|
|
13222
13869
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -13285,12 +13932,11 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
13285
13932
|
}
|
|
13286
13933
|
};
|
|
13287
13934
|
|
|
13288
|
-
struct
|
|
13289
|
-
|
|
13935
|
+
struct llm_build_glm4_moe : public llm_graph_context {
|
|
13936
|
+
llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
13290
13937
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
13291
13938
|
|
|
13292
13939
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
13293
|
-
//GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
13294
13940
|
|
|
13295
13941
|
ggml_tensor * cur;
|
|
13296
13942
|
ggml_tensor * inpL;
|
|
@@ -13300,48 +13946,54 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
13300
13946
|
// inp_pos - contains the positions
|
|
13301
13947
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
13302
13948
|
|
|
13303
|
-
auto * inp_attn =
|
|
13949
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
13304
13950
|
|
|
13305
13951
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13306
13952
|
|
|
13307
|
-
|
|
13953
|
+
// Only process up to last layer (skip final NextN layer)
|
|
13954
|
+
// Final layer tensors are loaded but not processed in forward pass
|
|
13955
|
+
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
|
|
13956
|
+
for (int il = 0; il < n_transformer_layers; ++il) {
|
|
13308
13957
|
ggml_tensor * inpSA = inpL;
|
|
13309
13958
|
|
|
13310
|
-
// norm
|
|
13311
|
-
cur = build_norm(inpL,
|
|
13312
|
-
model.layers[il].attn_norm,
|
|
13313
|
-
model.layers[il].attn_norm_b,
|
|
13314
|
-
LLM_NORM, il);
|
|
13959
|
+
// Pre-attention norm
|
|
13960
|
+
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
13315
13961
|
cb(cur, "attn_norm", il);
|
|
13316
13962
|
|
|
13317
13963
|
// self-attention
|
|
13318
13964
|
{
|
|
13319
|
-
// compute Q and K and RoPE them
|
|
13320
13965
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
13321
|
-
cb(Qcur, "Qcur", il);
|
|
13322
13966
|
if (model.layers[il].bq) {
|
|
13323
13967
|
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
13324
|
-
cb(Qcur, "Qcur", il);
|
|
13325
13968
|
}
|
|
13969
|
+
cb(Qcur, "Qcur", il);
|
|
13326
13970
|
|
|
13327
13971
|
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
13328
|
-
cb(Kcur, "Kcur", il);
|
|
13329
13972
|
if (model.layers[il].bk) {
|
|
13330
13973
|
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
13331
|
-
cb(Kcur, "Kcur", il);
|
|
13332
13974
|
}
|
|
13975
|
+
cb(Kcur, "Kcur", il);
|
|
13333
13976
|
|
|
13334
13977
|
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
13335
|
-
cb(Vcur, "Vcur", il);
|
|
13336
13978
|
if (model.layers[il].bv) {
|
|
13337
13979
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
13338
|
-
cb(Vcur, "Vcur", il);
|
|
13339
13980
|
}
|
|
13981
|
+
cb(Vcur, "Vcur", il);
|
|
13340
13982
|
|
|
13341
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,
|
|
13983
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
13342
13984
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
13343
13985
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13344
13986
|
|
|
13987
|
+
// Apply Q/K norm if available (GLM-4.5 355B variant)
|
|
13988
|
+
if (model.layers[il].attn_q_norm) {
|
|
13989
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
13990
|
+
cb(Qcur, "Qcur_normed", il);
|
|
13991
|
+
}
|
|
13992
|
+
if (model.layers[il].attn_k_norm) {
|
|
13993
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
13994
|
+
cb(Kcur, "Kcur_normed", il);
|
|
13995
|
+
}
|
|
13996
|
+
|
|
13345
13997
|
Qcur = ggml_rope_ext(
|
|
13346
13998
|
ctx0, Qcur, inp_pos, nullptr,
|
|
13347
13999
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
@@ -13359,34 +14011,62 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
13359
14011
|
cb(Vcur, "Vcur", il);
|
|
13360
14012
|
|
|
13361
14013
|
cur = build_attn(inp_attn,
|
|
13362
|
-
model.layers[il].wo,
|
|
13363
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14014
|
+
model.layers[il].wo, NULL,
|
|
14015
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13364
14016
|
}
|
|
13365
14017
|
|
|
13366
|
-
if (il ==
|
|
13367
|
-
cur = ggml_get_rows(ctx0,
|
|
14018
|
+
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
|
14019
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13368
14020
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13369
14021
|
}
|
|
13370
14022
|
|
|
13371
14023
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
13372
14024
|
cb(ffn_inp, "ffn_inp", il);
|
|
13373
14025
|
|
|
13374
|
-
//
|
|
13375
|
-
cur = build_norm(ffn_inp,
|
|
13376
|
-
|
|
13377
|
-
model.layers[il].ffn_norm_b,
|
|
13378
|
-
LLM_NORM, il);
|
|
13379
|
-
cb(cur, "ffn_norm", il);
|
|
14026
|
+
// Post-attention norm
|
|
14027
|
+
cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
|
|
14028
|
+
cb(cur, "post_attn_norm", il);
|
|
13380
14029
|
|
|
13381
|
-
|
|
13382
|
-
|
|
13383
|
-
|
|
13384
|
-
|
|
13385
|
-
|
|
13386
|
-
|
|
14030
|
+
// Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
|
|
14031
|
+
if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
|
|
14032
|
+
// Dense FFN layer
|
|
14033
|
+
cur = build_ffn(cur,
|
|
14034
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
14035
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
14036
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
14037
|
+
NULL,
|
|
14038
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
14039
|
+
cb(cur, "ffn_out", il);
|
|
14040
|
+
} else {
|
|
14041
|
+
// Process routed experts using existing MoE infrastructure
|
|
14042
|
+
ggml_tensor * routed_out = build_moe_ffn(cur,
|
|
14043
|
+
model.layers[il].ffn_gate_inp,
|
|
14044
|
+
model.layers[il].ffn_up_exps,
|
|
14045
|
+
model.layers[il].ffn_gate_exps,
|
|
14046
|
+
model.layers[il].ffn_down_exps,
|
|
14047
|
+
model.layers[il].ffn_exp_probs_b,
|
|
14048
|
+
n_expert, n_expert_used,
|
|
14049
|
+
LLM_FFN_SILU, hparams.expert_weights_norm,
|
|
14050
|
+
true, hparams.expert_weights_scale,
|
|
14051
|
+
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
|
14052
|
+
il);
|
|
14053
|
+
cb(routed_out, "ffn_moe_out", il);
|
|
14054
|
+
|
|
14055
|
+
// Process shared expert on original input
|
|
14056
|
+
ggml_tensor * shared_out = build_ffn(cur,
|
|
14057
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
14058
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
14059
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
14060
|
+
NULL,
|
|
14061
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
14062
|
+
cb(shared_out, "ffn_shexp_out", il);
|
|
14063
|
+
|
|
14064
|
+
// Final output: routed_output + shared_output
|
|
14065
|
+
cur = ggml_add(ctx0, routed_out, shared_out);
|
|
14066
|
+
cb(cur, "ffn_out", il);
|
|
14067
|
+
}
|
|
13387
14068
|
|
|
13388
14069
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
13389
|
-
cb(cur, "ffn_out", il);
|
|
13390
14070
|
|
|
13391
14071
|
cur = build_cvec(cur, il);
|
|
13392
14072
|
cb(cur, "l_out", il);
|
|
@@ -13396,10 +14076,7 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
13396
14076
|
}
|
|
13397
14077
|
|
|
13398
14078
|
cur = inpL;
|
|
13399
|
-
|
|
13400
|
-
cur = build_norm(cur,
|
|
13401
|
-
model.output_norm, model.output_norm_b,
|
|
13402
|
-
LLM_NORM, -1);
|
|
14079
|
+
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
|
13403
14080
|
|
|
13404
14081
|
cb(cur, "result_norm", -1);
|
|
13405
14082
|
res->t_embd = cur;
|
|
@@ -13414,12 +14091,12 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
13414
14091
|
}
|
|
13415
14092
|
};
|
|
13416
14093
|
|
|
13417
|
-
struct
|
|
13418
|
-
|
|
14094
|
+
struct llm_build_nemotron : public llm_graph_context {
|
|
14095
|
+
llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
13419
14096
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
13420
14097
|
|
|
13421
14098
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
13422
|
-
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
14099
|
+
//GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
13423
14100
|
|
|
13424
14101
|
ggml_tensor * cur;
|
|
13425
14102
|
ggml_tensor * inpL;
|
|
@@ -13429,7 +14106,7 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
13429
14106
|
// inp_pos - contains the positions
|
|
13430
14107
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
13431
14108
|
|
|
13432
|
-
auto * inp_attn =
|
|
14109
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
13433
14110
|
|
|
13434
14111
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13435
14112
|
|
|
@@ -13438,15 +14115,13 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
13438
14115
|
|
|
13439
14116
|
// norm
|
|
13440
14117
|
cur = build_norm(inpL,
|
|
13441
|
-
model.layers[il].attn_norm,
|
|
13442
|
-
|
|
14118
|
+
model.layers[il].attn_norm,
|
|
14119
|
+
model.layers[il].attn_norm_b,
|
|
14120
|
+
LLM_NORM, il);
|
|
13443
14121
|
cb(cur, "attn_norm", il);
|
|
13444
14122
|
|
|
13445
14123
|
// self-attention
|
|
13446
14124
|
{
|
|
13447
|
-
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
13448
|
-
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
13449
|
-
|
|
13450
14125
|
// compute Q and K and RoPE them
|
|
13451
14126
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
13452
14127
|
cb(Qcur, "Qcur", il);
|
|
@@ -13474,13 +14149,13 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
13474
14149
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13475
14150
|
|
|
13476
14151
|
Qcur = ggml_rope_ext(
|
|
13477
|
-
ctx0, Qcur, inp_pos,
|
|
14152
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
13478
14153
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13479
14154
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13480
14155
|
);
|
|
13481
14156
|
|
|
13482
14157
|
Kcur = ggml_rope_ext(
|
|
13483
|
-
ctx0, Kcur, inp_pos,
|
|
14158
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
13484
14159
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13485
14160
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13486
14161
|
);
|
|
@@ -13491,7 +14166,7 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
13491
14166
|
|
|
13492
14167
|
cur = build_attn(inp_attn,
|
|
13493
14168
|
model.layers[il].wo, model.layers[il].bo,
|
|
13494
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14169
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13495
14170
|
}
|
|
13496
14171
|
|
|
13497
14172
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -13504,17 +14179,17 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
13504
14179
|
|
|
13505
14180
|
// feed-forward network
|
|
13506
14181
|
cur = build_norm(ffn_inp,
|
|
13507
|
-
model.layers[il].ffn_norm,
|
|
13508
|
-
|
|
14182
|
+
model.layers[il].ffn_norm,
|
|
14183
|
+
model.layers[il].ffn_norm_b,
|
|
14184
|
+
LLM_NORM, il);
|
|
13509
14185
|
cb(cur, "ffn_norm", il);
|
|
13510
14186
|
|
|
13511
14187
|
cur = build_ffn(cur,
|
|
13512
|
-
model.layers[il].ffn_up,
|
|
13513
|
-
|
|
13514
|
-
model.layers[il].ffn_down,
|
|
14188
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
14189
|
+
NULL, NULL, NULL,
|
|
14190
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
13515
14191
|
NULL,
|
|
13516
|
-
|
|
13517
|
-
cb(cur, "ffn_out", il);
|
|
14192
|
+
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
|
|
13518
14193
|
|
|
13519
14194
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
13520
14195
|
cb(cur, "ffn_out", il);
|
|
@@ -13529,8 +14204,8 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
13529
14204
|
cur = inpL;
|
|
13530
14205
|
|
|
13531
14206
|
cur = build_norm(cur,
|
|
13532
|
-
model.output_norm,
|
|
13533
|
-
|
|
14207
|
+
model.output_norm, model.output_norm_b,
|
|
14208
|
+
LLM_NORM, -1);
|
|
13534
14209
|
|
|
13535
14210
|
cb(cur, "result_norm", -1);
|
|
13536
14211
|
res->t_embd = cur;
|
|
@@ -13545,10 +14220,273 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
13545
14220
|
}
|
|
13546
14221
|
};
|
|
13547
14222
|
|
|
13548
|
-
|
|
13549
|
-
|
|
13550
|
-
|
|
13551
|
-
|
|
14223
|
+
struct llm_build_nemotron_h : public llm_graph_context_mamba {
|
|
14224
|
+
llm_build_nemotron_h(
|
|
14225
|
+
const llama_model & model,
|
|
14226
|
+
const llm_graph_params & params) :
|
|
14227
|
+
llm_graph_context_mamba(params) {
|
|
14228
|
+
|
|
14229
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
14230
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
14231
|
+
|
|
14232
|
+
ggml_tensor * cur;
|
|
14233
|
+
ggml_tensor * inpL;
|
|
14234
|
+
|
|
14235
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
14236
|
+
|
|
14237
|
+
auto * inp = build_inp_mem_hybrid();
|
|
14238
|
+
|
|
14239
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14240
|
+
|
|
14241
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
14242
|
+
struct ggml_tensor * inpSA = inpL;
|
|
14243
|
+
|
|
14244
|
+
// norm
|
|
14245
|
+
cur = build_norm(inpL,
|
|
14246
|
+
model.layers[il].attn_norm, NULL,
|
|
14247
|
+
LLM_NORM_RMS, il);
|
|
14248
|
+
cb(cur, "attn_norm", il);
|
|
14249
|
+
|
|
14250
|
+
if (hparams.is_recurrent(il)) {
|
|
14251
|
+
// ssm layer //
|
|
14252
|
+
cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
|
|
14253
|
+
} else if (hparams.n_ff(il) == 0) {
|
|
14254
|
+
// attention layer //
|
|
14255
|
+
cur = build_attention_layer(cur, inp->get_attn(), model, n_embd_head, il);
|
|
14256
|
+
} else {
|
|
14257
|
+
cur = build_ffn_layer(cur, model, il);
|
|
14258
|
+
}
|
|
14259
|
+
|
|
14260
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
14261
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
14262
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
14263
|
+
}
|
|
14264
|
+
|
|
14265
|
+
// add residual
|
|
14266
|
+
cur = ggml_add(ctx0, cur, inpSA);
|
|
14267
|
+
cb(cur, "block_out", il);
|
|
14268
|
+
|
|
14269
|
+
// input for next layer
|
|
14270
|
+
inpL = cur;
|
|
14271
|
+
}
|
|
14272
|
+
|
|
14273
|
+
cur = inpL;
|
|
14274
|
+
|
|
14275
|
+
cur = build_norm(cur,
|
|
14276
|
+
model.output_norm, NULL,
|
|
14277
|
+
LLM_NORM_RMS, -1);
|
|
14278
|
+
|
|
14279
|
+
cb(cur, "result_norm", -1);
|
|
14280
|
+
res->t_embd = cur;
|
|
14281
|
+
|
|
14282
|
+
// lm_head
|
|
14283
|
+
cur = build_lora_mm(model.output, cur);
|
|
14284
|
+
cb(cur, "result_output", -1);
|
|
14285
|
+
res->t_logits = cur;
|
|
14286
|
+
|
|
14287
|
+
ggml_build_forward_expand(gf, cur);
|
|
14288
|
+
}
|
|
14289
|
+
|
|
14290
|
+
ggml_tensor * build_attention_layer(
|
|
14291
|
+
ggml_tensor * cur,
|
|
14292
|
+
llm_graph_input_attn_kv * inp_attn,
|
|
14293
|
+
const llama_model & model,
|
|
14294
|
+
const int64_t n_embd_head,
|
|
14295
|
+
const int il) {
|
|
14296
|
+
|
|
14297
|
+
// compute Q and K and (optionally) RoPE them
|
|
14298
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
14299
|
+
cb(Qcur, "Qcur", il);
|
|
14300
|
+
if (model.layers[il].bq) {
|
|
14301
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
14302
|
+
cb(Qcur, "Qcur", il);
|
|
14303
|
+
}
|
|
14304
|
+
|
|
14305
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
14306
|
+
cb(Kcur, "Kcur", il);
|
|
14307
|
+
if (model.layers[il].bk) {
|
|
14308
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
14309
|
+
cb(Kcur, "Kcur", il);
|
|
14310
|
+
}
|
|
14311
|
+
|
|
14312
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
14313
|
+
cb(Vcur, "Vcur", il);
|
|
14314
|
+
if (model.layers[il].bv) {
|
|
14315
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
14316
|
+
cb(Vcur, "Vcur", il);
|
|
14317
|
+
}
|
|
14318
|
+
|
|
14319
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
|
|
14320
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
|
|
14321
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
|
|
14322
|
+
|
|
14323
|
+
cb(Qcur, "Qcur", il);
|
|
14324
|
+
cb(Kcur, "Kcur", il);
|
|
14325
|
+
cb(Vcur, "Vcur", il);
|
|
14326
|
+
|
|
14327
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
14328
|
+
cur = build_attn(inp_attn,
|
|
14329
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
14330
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
14331
|
+
cb(cur, "attn_out", il);
|
|
14332
|
+
return cur;
|
|
14333
|
+
}
|
|
14334
|
+
|
|
14335
|
+
ggml_tensor * build_ffn_layer(
|
|
14336
|
+
ggml_tensor * cur,
|
|
14337
|
+
const llama_model & model,
|
|
14338
|
+
const int il) {
|
|
14339
|
+
|
|
14340
|
+
cur = build_ffn(cur,
|
|
14341
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
14342
|
+
NULL, NULL, NULL,
|
|
14343
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
14344
|
+
NULL,
|
|
14345
|
+
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
|
14346
|
+
cb(cur, "ffn_out", il);
|
|
14347
|
+
|
|
14348
|
+
cur = build_cvec(cur, il);
|
|
14349
|
+
cb(cur, "l_out", il);
|
|
14350
|
+
|
|
14351
|
+
return cur;
|
|
14352
|
+
}
|
|
14353
|
+
};
|
|
14354
|
+
|
|
14355
|
+
struct llm_build_exaone : public llm_graph_context {
|
|
14356
|
+
llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
14357
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
14358
|
+
|
|
14359
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
14360
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
14361
|
+
|
|
14362
|
+
ggml_tensor * cur;
|
|
14363
|
+
ggml_tensor * inpL;
|
|
14364
|
+
|
|
14365
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
14366
|
+
|
|
14367
|
+
// inp_pos - contains the positions
|
|
14368
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
14369
|
+
|
|
14370
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
14371
|
+
|
|
14372
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14373
|
+
|
|
14374
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
14375
|
+
ggml_tensor * inpSA = inpL;
|
|
14376
|
+
|
|
14377
|
+
// norm
|
|
14378
|
+
cur = build_norm(inpL,
|
|
14379
|
+
model.layers[il].attn_norm, NULL,
|
|
14380
|
+
LLM_NORM_RMS, il);
|
|
14381
|
+
cb(cur, "attn_norm", il);
|
|
14382
|
+
|
|
14383
|
+
// self-attention
|
|
14384
|
+
{
|
|
14385
|
+
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
14386
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
14387
|
+
|
|
14388
|
+
// compute Q and K and RoPE them
|
|
14389
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
14390
|
+
cb(Qcur, "Qcur", il);
|
|
14391
|
+
if (model.layers[il].bq) {
|
|
14392
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
14393
|
+
cb(Qcur, "Qcur", il);
|
|
14394
|
+
}
|
|
14395
|
+
|
|
14396
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
14397
|
+
cb(Kcur, "Kcur", il);
|
|
14398
|
+
if (model.layers[il].bk) {
|
|
14399
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
14400
|
+
cb(Kcur, "Kcur", il);
|
|
14401
|
+
}
|
|
14402
|
+
|
|
14403
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
14404
|
+
cb(Vcur, "Vcur", il);
|
|
14405
|
+
if (model.layers[il].bv) {
|
|
14406
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
14407
|
+
cb(Vcur, "Vcur", il);
|
|
14408
|
+
}
|
|
14409
|
+
|
|
14410
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
14411
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
14412
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
14413
|
+
|
|
14414
|
+
Qcur = ggml_rope_ext(
|
|
14415
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
14416
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14417
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14418
|
+
);
|
|
14419
|
+
|
|
14420
|
+
Kcur = ggml_rope_ext(
|
|
14421
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
14422
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14423
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14424
|
+
);
|
|
14425
|
+
|
|
14426
|
+
cb(Qcur, "Qcur", il);
|
|
14427
|
+
cb(Kcur, "Kcur", il);
|
|
14428
|
+
cb(Vcur, "Vcur", il);
|
|
14429
|
+
|
|
14430
|
+
cur = build_attn(inp_attn,
|
|
14431
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
14432
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14433
|
+
}
|
|
14434
|
+
|
|
14435
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
14436
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
14437
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
14438
|
+
}
|
|
14439
|
+
|
|
14440
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
14441
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
14442
|
+
|
|
14443
|
+
// feed-forward network
|
|
14444
|
+
cur = build_norm(ffn_inp,
|
|
14445
|
+
model.layers[il].ffn_norm, NULL,
|
|
14446
|
+
LLM_NORM_RMS, il);
|
|
14447
|
+
cb(cur, "ffn_norm", il);
|
|
14448
|
+
|
|
14449
|
+
cur = build_ffn(cur,
|
|
14450
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
14451
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
14452
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
14453
|
+
NULL,
|
|
14454
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
14455
|
+
cb(cur, "ffn_out", il);
|
|
14456
|
+
|
|
14457
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
14458
|
+
cb(cur, "ffn_out", il);
|
|
14459
|
+
|
|
14460
|
+
cur = build_cvec(cur, il);
|
|
14461
|
+
cb(cur, "l_out", il);
|
|
14462
|
+
|
|
14463
|
+
// input for next layer
|
|
14464
|
+
inpL = cur;
|
|
14465
|
+
}
|
|
14466
|
+
|
|
14467
|
+
cur = inpL;
|
|
14468
|
+
|
|
14469
|
+
cur = build_norm(cur,
|
|
14470
|
+
model.output_norm, NULL,
|
|
14471
|
+
LLM_NORM_RMS, -1);
|
|
14472
|
+
|
|
14473
|
+
cb(cur, "result_norm", -1);
|
|
14474
|
+
res->t_embd = cur;
|
|
14475
|
+
|
|
14476
|
+
// lm_head
|
|
14477
|
+
cur = build_lora_mm(model.output, cur);
|
|
14478
|
+
|
|
14479
|
+
cb(cur, "result_output", -1);
|
|
14480
|
+
res->t_logits = cur;
|
|
14481
|
+
|
|
14482
|
+
ggml_build_forward_expand(gf, cur);
|
|
14483
|
+
}
|
|
14484
|
+
};
|
|
14485
|
+
|
|
14486
|
+
template <bool iswa>
|
|
14487
|
+
struct llm_build_exaone4 : public llm_graph_context {
|
|
14488
|
+
llm_build_exaone4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
14489
|
+
const int64_t n_embd_head = hparams.n_embd_head_k;
|
|
13552
14490
|
|
|
13553
14491
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
|
|
13554
14492
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
@@ -13561,13 +14499,13 @@ struct llm_build_exaone4 : public llm_graph_context {
|
|
|
13561
14499
|
// inp_pos - contains the positions
|
|
13562
14500
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
13563
14501
|
|
|
13564
|
-
using inp_attn_type = std::conditional_t<iswa,
|
|
14502
|
+
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
|
|
13565
14503
|
inp_attn_type * inp_attn = nullptr;
|
|
13566
14504
|
|
|
13567
14505
|
if constexpr (iswa) {
|
|
13568
|
-
inp_attn =
|
|
14506
|
+
inp_attn = build_attn_inp_kv_iswa();
|
|
13569
14507
|
} else {
|
|
13570
|
-
inp_attn =
|
|
14508
|
+
inp_attn = build_attn_inp_kv();
|
|
13571
14509
|
}
|
|
13572
14510
|
|
|
13573
14511
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
@@ -13622,7 +14560,7 @@ struct llm_build_exaone4 : public llm_graph_context {
|
|
|
13622
14560
|
|
|
13623
14561
|
cur = build_attn(inp_attn,
|
|
13624
14562
|
model.layers[il].wo, NULL,
|
|
13625
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14563
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13626
14564
|
cb(cur, "attn_out", il);
|
|
13627
14565
|
}
|
|
13628
14566
|
|
|
@@ -14450,7 +15388,7 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
14450
15388
|
inp_pos = build_inp_pos();
|
|
14451
15389
|
}
|
|
14452
15390
|
|
|
14453
|
-
auto * inp_attn =
|
|
15391
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
14454
15392
|
|
|
14455
15393
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14456
15394
|
|
|
@@ -14501,12 +15439,12 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
14501
15439
|
}
|
|
14502
15440
|
|
|
14503
15441
|
ggml_tensor * build_attention_layer(
|
|
14504
|
-
ggml_tensor
|
|
14505
|
-
ggml_tensor
|
|
14506
|
-
|
|
14507
|
-
const llama_model
|
|
14508
|
-
const int64_t
|
|
14509
|
-
const int
|
|
15442
|
+
ggml_tensor * cur,
|
|
15443
|
+
ggml_tensor * inp_pos,
|
|
15444
|
+
llm_graph_input_attn_kv * inp_attn,
|
|
15445
|
+
const llama_model & model,
|
|
15446
|
+
const int64_t n_embd_head,
|
|
15447
|
+
const int il) {
|
|
14510
15448
|
|
|
14511
15449
|
// compute Q and K and (optionally) RoPE them
|
|
14512
15450
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
@@ -14557,7 +15495,7 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
14557
15495
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
14558
15496
|
cur = build_attn(inp_attn,
|
|
14559
15497
|
model.layers[il].wo, model.layers[il].bo,
|
|
14560
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
15498
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
14561
15499
|
cb(cur, "attn_out", il);
|
|
14562
15500
|
return cur;
|
|
14563
15501
|
}
|
|
@@ -14720,12 +15658,12 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
|
|
|
14720
15658
|
}
|
|
14721
15659
|
|
|
14722
15660
|
ggml_tensor * build_attention_layer(
|
|
14723
|
-
ggml_tensor
|
|
14724
|
-
ggml_tensor
|
|
14725
|
-
|
|
14726
|
-
const llama_model
|
|
14727
|
-
const int64_t
|
|
14728
|
-
const int
|
|
15661
|
+
ggml_tensor * cur,
|
|
15662
|
+
ggml_tensor * inp_pos,
|
|
15663
|
+
llm_graph_input_attn_kv * inp_attn,
|
|
15664
|
+
const llama_model & model,
|
|
15665
|
+
const int64_t n_embd_head,
|
|
15666
|
+
const int il) {
|
|
14729
15667
|
|
|
14730
15668
|
// compute Q and K and (optionally) RoPE them
|
|
14731
15669
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
@@ -14776,7 +15714,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
|
|
|
14776
15714
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
14777
15715
|
cur = build_attn(inp_attn,
|
|
14778
15716
|
model.layers[il].wo, model.layers[il].bo,
|
|
14779
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
15717
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
14780
15718
|
cb(cur, "attn_out", il);
|
|
14781
15719
|
return cur;
|
|
14782
15720
|
}
|
|
@@ -14882,7 +15820,7 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
14882
15820
|
// inp_pos - contains the positions
|
|
14883
15821
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
14884
15822
|
|
|
14885
|
-
auto * inp_attn =
|
|
15823
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
14886
15824
|
|
|
14887
15825
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14888
15826
|
|
|
@@ -14961,7 +15899,7 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
14961
15899
|
|
|
14962
15900
|
cur = build_attn(inp_attn,
|
|
14963
15901
|
model.layers[il].wo, nullptr,
|
|
14964
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
15902
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14965
15903
|
}
|
|
14966
15904
|
|
|
14967
15905
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -15213,7 +16151,7 @@ struct llm_build_plm : public llm_graph_context {
|
|
|
15213
16151
|
// inp_pos - contains the positions
|
|
15214
16152
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
15215
16153
|
|
|
15216
|
-
auto * inp_attn =
|
|
16154
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
15217
16155
|
|
|
15218
16156
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
15219
16157
|
|
|
@@ -15317,7 +16255,7 @@ struct llm_build_plm : public llm_graph_context {
|
|
|
15317
16255
|
|
|
15318
16256
|
cur = build_attn(inp_attn,
|
|
15319
16257
|
model.layers[il].wo, NULL,
|
|
15320
|
-
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
|
16258
|
+
q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
|
|
15321
16259
|
}
|
|
15322
16260
|
|
|
15323
16261
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -15378,7 +16316,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
15378
16316
|
// inp_pos - contains the positions
|
|
15379
16317
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
15380
16318
|
|
|
15381
|
-
auto * inp_attn =
|
|
16319
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
15382
16320
|
|
|
15383
16321
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
15384
16322
|
|
|
@@ -15440,7 +16378,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
15440
16378
|
|
|
15441
16379
|
cur = build_attn(inp_attn,
|
|
15442
16380
|
model.layers[il].wo, model.layers[il].bo,
|
|
15443
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
|
16381
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
|
15444
16382
|
}
|
|
15445
16383
|
|
|
15446
16384
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -15527,7 +16465,7 @@ struct llm_build_dots1 : public llm_graph_context {
|
|
|
15527
16465
|
// inp_pos - contains the positions
|
|
15528
16466
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
15529
16467
|
|
|
15530
|
-
auto * inp_attn =
|
|
16468
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
15531
16469
|
|
|
15532
16470
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
15533
16471
|
|
|
@@ -15580,7 +16518,7 @@ struct llm_build_dots1 : public llm_graph_context {
|
|
|
15580
16518
|
|
|
15581
16519
|
cur = build_attn(inp_attn,
|
|
15582
16520
|
model.layers[il].wo, model.layers[il].bo,
|
|
15583
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
16521
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
15584
16522
|
}
|
|
15585
16523
|
|
|
15586
16524
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -15677,7 +16615,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
|
|
|
15677
16615
|
// inp_pos - contains the positions
|
|
15678
16616
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
15679
16617
|
|
|
15680
|
-
auto * inp_attn =
|
|
16618
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
15681
16619
|
|
|
15682
16620
|
for (int il = 0; il < n_layer; ++il) {
|
|
15683
16621
|
ggml_tensor * inpSA = inpL;
|
|
@@ -15735,7 +16673,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
|
|
|
15735
16673
|
|
|
15736
16674
|
cur = build_attn(inp_attn,
|
|
15737
16675
|
model.layers[il].wo, NULL,
|
|
15738
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
16676
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
15739
16677
|
}
|
|
15740
16678
|
|
|
15741
16679
|
if (il == n_layer - 1) {
|
|
@@ -15807,7 +16745,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
|
|
|
15807
16745
|
// inp_pos - contains the positions
|
|
15808
16746
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
15809
16747
|
|
|
15810
|
-
auto * inp_attn =
|
|
16748
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
15811
16749
|
|
|
15812
16750
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
15813
16751
|
|
|
@@ -15868,7 +16806,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
|
|
|
15868
16806
|
|
|
15869
16807
|
cur = build_attn(inp_attn,
|
|
15870
16808
|
model.layers[il].wo, NULL,
|
|
15871
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
16809
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
15872
16810
|
cb(cur, "attn_out", il);
|
|
15873
16811
|
}
|
|
15874
16812
|
|
|
@@ -16021,7 +16959,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba {
|
|
|
16021
16959
|
|
|
16022
16960
|
ggml_tensor * attn_out = build_attn(inp->get_attn(),
|
|
16023
16961
|
model.layers[il].wo, NULL,
|
|
16024
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
16962
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
16025
16963
|
cb(attn_out, "attn_out", il);
|
|
16026
16964
|
|
|
16027
16965
|
cur = build_norm(inpL,
|
|
@@ -16181,7 +17119,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
|
|
|
16181
17119
|
|
|
16182
17120
|
private:
|
|
16183
17121
|
ggml_tensor * build_plamo2_attn_layer(
|
|
16184
|
-
|
|
17122
|
+
llm_graph_input_attn_kv * inp,
|
|
16185
17123
|
ggml_tensor * inp_pos,
|
|
16186
17124
|
ggml_tensor * cur,
|
|
16187
17125
|
const llama_model & model,
|
|
@@ -16205,13 +17143,13 @@ private:
|
|
|
16205
17143
|
|
|
16206
17144
|
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
|
|
16207
17145
|
ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
|
|
16208
|
-
ggml_tensor * Vcur =
|
|
17146
|
+
ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv));
|
|
16209
17147
|
|
|
16210
17148
|
cb(Qcur, "Qcur", il);
|
|
16211
17149
|
cb(Kcur, "Kcur", il);
|
|
16212
17150
|
cb(Vcur, "Vcur", il);
|
|
16213
17151
|
|
|
16214
|
-
Vcur =
|
|
17152
|
+
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
|
|
16215
17153
|
|
|
16216
17154
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
16217
17155
|
cb(Qcur, "Qcur_normed", il);
|
|
@@ -16231,7 +17169,9 @@ private:
|
|
|
16231
17169
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
16232
17170
|
);
|
|
16233
17171
|
|
|
16234
|
-
cur = build_attn(inp,
|
|
17172
|
+
cur = build_attn(inp,
|
|
17173
|
+
model.layers[il].wo, NULL,
|
|
17174
|
+
Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
|
|
16235
17175
|
}
|
|
16236
17176
|
|
|
16237
17177
|
cb(cur, "attn_out", il);
|
|
@@ -16278,15 +17218,13 @@ private:
|
|
|
16278
17218
|
cb(zx, "mamba_in_proj", il);
|
|
16279
17219
|
// {8192, 5, 1, 1} -> {8192, 1, 5, 1}
|
|
16280
17220
|
zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
|
|
16281
|
-
zx =
|
|
16282
|
-
zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
|
|
17221
|
+
zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
|
|
16283
17222
|
cb(zx, "mamba_in_proj_out", il);
|
|
16284
17223
|
|
|
16285
17224
|
// split into z and x
|
|
16286
17225
|
// => {head_dim * n_heads, n_seq_tokens, n_seqs}
|
|
16287
17226
|
ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
|
|
16288
|
-
x =
|
|
16289
|
-
x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
|
|
17227
|
+
x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
|
|
16290
17228
|
// x = ggml_permute(ctx0, x, 0, 2, 1, 3);
|
|
16291
17229
|
cb(x, "mamba_x_split", il);
|
|
16292
17230
|
|
|
@@ -16416,7 +17354,7 @@ struct llm_build_arcee : public llm_graph_context {
|
|
|
16416
17354
|
// inp_pos - contains the positions
|
|
16417
17355
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
16418
17356
|
|
|
16419
|
-
auto * inp_attn =
|
|
17357
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
16420
17358
|
|
|
16421
17359
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
16422
17360
|
|
|
@@ -16480,7 +17418,7 @@ struct llm_build_arcee : public llm_graph_context {
|
|
|
16480
17418
|
|
|
16481
17419
|
cur = build_attn(inp_attn,
|
|
16482
17420
|
model.layers[il].wo, model.layers[il].bo,
|
|
16483
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
17421
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
16484
17422
|
cb(cur, "attn_out", il);
|
|
16485
17423
|
}
|
|
16486
17424
|
|
|
@@ -16551,7 +17489,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
|
|
|
16551
17489
|
// inp_pos - contains the positions
|
|
16552
17490
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
16553
17491
|
|
|
16554
|
-
auto * inp_attn =
|
|
17492
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
16555
17493
|
|
|
16556
17494
|
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
|
|
16557
17495
|
|
|
@@ -16625,7 +17563,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
|
|
|
16625
17563
|
|
|
16626
17564
|
cur = build_attn(inp_attn,
|
|
16627
17565
|
model.layers[il].wo, model.layers[il].bo,
|
|
16628
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
17566
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
16629
17567
|
cb(cur, "attn_out", il);
|
|
16630
17568
|
}
|
|
16631
17569
|
|
|
@@ -16697,8 +17635,8 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
|
|
|
16697
17635
|
}
|
|
16698
17636
|
};
|
|
16699
17637
|
|
|
16700
|
-
struct
|
|
16701
|
-
|
|
17638
|
+
struct llm_build_hunyuan_dense : public llm_graph_context {
|
|
17639
|
+
llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
16702
17640
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
16703
17641
|
|
|
16704
17642
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -16712,25 +17650,25 @@ struct llm_build_smollm3 : public llm_graph_context {
|
|
|
16712
17650
|
// inp_pos - contains the positions
|
|
16713
17651
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
16714
17652
|
|
|
16715
|
-
auto * inp_attn =
|
|
17653
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
16716
17654
|
|
|
16717
|
-
const float kq_scale =
|
|
17655
|
+
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
|
|
16718
17656
|
|
|
16719
17657
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
16720
17658
|
|
|
16721
17659
|
for (int il = 0; il < n_layer; ++il) {
|
|
16722
17660
|
ggml_tensor * inpSA = inpL;
|
|
16723
17661
|
|
|
16724
|
-
const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
|
|
16725
|
-
|
|
16726
17662
|
// norm
|
|
16727
17663
|
cur = build_norm(inpL,
|
|
16728
17664
|
model.layers[il].attn_norm, NULL,
|
|
16729
17665
|
LLM_NORM_RMS, il);
|
|
16730
17666
|
cb(cur, "attn_norm", il);
|
|
16731
|
-
|
|
16732
17667
|
// self-attention
|
|
16733
17668
|
{
|
|
17669
|
+
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
17670
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
17671
|
+
|
|
16734
17672
|
// compute Q and K and RoPE them
|
|
16735
17673
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
16736
17674
|
cb(Qcur, "Qcur", il);
|
|
@@ -16757,7 +17695,145 @@ struct llm_build_smollm3 : public llm_graph_context {
|
|
|
16757
17695
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
16758
17696
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
16759
17697
|
|
|
16760
|
-
|
|
17698
|
+
Qcur = ggml_rope_ext(
|
|
17699
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
17700
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
17701
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
17702
|
+
);
|
|
17703
|
+
|
|
17704
|
+
cb(Qcur, "Qcur", il);
|
|
17705
|
+
cb(Kcur, "Kcur", il);
|
|
17706
|
+
cb(Vcur, "Vcur", il);
|
|
17707
|
+
|
|
17708
|
+
Kcur = ggml_rope_ext(
|
|
17709
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
17710
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
17711
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
17712
|
+
);
|
|
17713
|
+
|
|
17714
|
+
Kcur = build_norm(Kcur,
|
|
17715
|
+
model.layers[il].attn_k_norm, nullptr,
|
|
17716
|
+
LLM_NORM_RMS, il);
|
|
17717
|
+
cb(Kcur, "Kcur_norm", il);
|
|
17718
|
+
|
|
17719
|
+
Qcur = build_norm(Qcur,
|
|
17720
|
+
model.layers[il].attn_q_norm, nullptr,
|
|
17721
|
+
LLM_NORM_RMS, il);
|
|
17722
|
+
cb(Qcur, "Qcur_norm", il);
|
|
17723
|
+
|
|
17724
|
+
cur = build_attn(inp_attn,
|
|
17725
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
17726
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
17727
|
+
cb(cur, "attn_out", il);
|
|
17728
|
+
}
|
|
17729
|
+
|
|
17730
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
17731
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
17732
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
17733
|
+
}
|
|
17734
|
+
|
|
17735
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
17736
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
17737
|
+
|
|
17738
|
+
cur = build_norm(ffn_inp,
|
|
17739
|
+
model.layers[il].ffn_norm, NULL,
|
|
17740
|
+
LLM_NORM_RMS, il);
|
|
17741
|
+
cb(cur, "ffn_norm", il);
|
|
17742
|
+
// feed-forward network (non-MoE)
|
|
17743
|
+
ggml_tensor * cur_mlp = build_ffn(cur,
|
|
17744
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
17745
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
17746
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
17747
|
+
NULL,
|
|
17748
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
17749
|
+
cb(cur_mlp, "ffn_out", il);
|
|
17750
|
+
|
|
17751
|
+
cur = ggml_add(ctx0, cur_mlp, ffn_inp);
|
|
17752
|
+
|
|
17753
|
+
cur = build_cvec(cur, il);
|
|
17754
|
+
cb(cur, "l_out", il);
|
|
17755
|
+
|
|
17756
|
+
// input for next layer
|
|
17757
|
+
inpL = cur;
|
|
17758
|
+
}
|
|
17759
|
+
cur = inpL;
|
|
17760
|
+
|
|
17761
|
+
cur = build_norm(cur,
|
|
17762
|
+
model.output_norm, NULL,
|
|
17763
|
+
LLM_NORM_RMS, -1);
|
|
17764
|
+
|
|
17765
|
+
cb(cur, "result_norm", -1);
|
|
17766
|
+
res->t_embd = cur;
|
|
17767
|
+
// lm_head
|
|
17768
|
+
cur = build_lora_mm(model.output, cur);
|
|
17769
|
+
cb(cur, "result_output", -1);
|
|
17770
|
+
res->t_logits = cur;
|
|
17771
|
+
|
|
17772
|
+
ggml_build_forward_expand(gf, cur);
|
|
17773
|
+
}
|
|
17774
|
+
};
|
|
17775
|
+
|
|
17776
|
+
struct llm_build_smollm3 : public llm_graph_context {
|
|
17777
|
+
llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
17778
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
17779
|
+
|
|
17780
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
17781
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
17782
|
+
|
|
17783
|
+
ggml_tensor * cur;
|
|
17784
|
+
ggml_tensor * inpL;
|
|
17785
|
+
|
|
17786
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
17787
|
+
|
|
17788
|
+
// inp_pos - contains the positions
|
|
17789
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
17790
|
+
|
|
17791
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
17792
|
+
|
|
17793
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
17794
|
+
|
|
17795
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
17796
|
+
|
|
17797
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
17798
|
+
ggml_tensor * inpSA = inpL;
|
|
17799
|
+
|
|
17800
|
+
const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
|
|
17801
|
+
|
|
17802
|
+
// norm
|
|
17803
|
+
cur = build_norm(inpL,
|
|
17804
|
+
model.layers[il].attn_norm, NULL,
|
|
17805
|
+
LLM_NORM_RMS, il);
|
|
17806
|
+
cb(cur, "attn_norm", il);
|
|
17807
|
+
|
|
17808
|
+
// self-attention
|
|
17809
|
+
{
|
|
17810
|
+
// compute Q and K and RoPE them
|
|
17811
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
17812
|
+
cb(Qcur, "Qcur", il);
|
|
17813
|
+
if (model.layers[il].bq) {
|
|
17814
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
17815
|
+
cb(Qcur, "Qcur", il);
|
|
17816
|
+
}
|
|
17817
|
+
|
|
17818
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
17819
|
+
cb(Kcur, "Kcur", il);
|
|
17820
|
+
if (model.layers[il].bk) {
|
|
17821
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
17822
|
+
cb(Kcur, "Kcur", il);
|
|
17823
|
+
}
|
|
17824
|
+
|
|
17825
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
17826
|
+
cb(Vcur, "Vcur", il);
|
|
17827
|
+
if (model.layers[il].bv) {
|
|
17828
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
17829
|
+
cb(Vcur, "Vcur", il);
|
|
17830
|
+
}
|
|
17831
|
+
|
|
17832
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
17833
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
17834
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
17835
|
+
|
|
17836
|
+
if (use_rope) {
|
|
16761
17837
|
Qcur = ggml_rope_ext(
|
|
16762
17838
|
ctx0, Qcur, inp_pos, nullptr,
|
|
16763
17839
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
@@ -16777,7 +17853,7 @@ struct llm_build_smollm3 : public llm_graph_context {
|
|
|
16777
17853
|
|
|
16778
17854
|
cur = build_attn(inp_attn,
|
|
16779
17855
|
model.layers[il].wo, model.layers[il].bo,
|
|
16780
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
17856
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
16781
17857
|
cb(cur, "attn_out", il);
|
|
16782
17858
|
}
|
|
16783
17859
|
|
|
@@ -16834,6 +17910,136 @@ struct llm_build_smollm3 : public llm_graph_context {
|
|
|
16834
17910
|
}
|
|
16835
17911
|
};
|
|
16836
17912
|
|
|
17913
|
+
struct llm_build_openai_moe_iswa : public llm_graph_context {
|
|
17914
|
+
llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
17915
|
+
ggml_tensor * cur;
|
|
17916
|
+
ggml_tensor * inpL;
|
|
17917
|
+
|
|
17918
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
17919
|
+
|
|
17920
|
+
// inp_pos - contains the positions
|
|
17921
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
17922
|
+
|
|
17923
|
+
auto * inp_attn = build_attn_inp_kv_iswa();
|
|
17924
|
+
|
|
17925
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
17926
|
+
ggml_tensor * inpSA = inpL;
|
|
17927
|
+
|
|
17928
|
+
// norm
|
|
17929
|
+
cur = build_norm(inpL,
|
|
17930
|
+
model.layers[il].attn_norm, nullptr,
|
|
17931
|
+
LLM_NORM_RMS, il);
|
|
17932
|
+
cb(cur, "attn_norm", il);
|
|
17933
|
+
|
|
17934
|
+
// self-attention
|
|
17935
|
+
{
|
|
17936
|
+
// compute Q and K and RoPE them
|
|
17937
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
17938
|
+
cb(Qcur, "Qcur", il);
|
|
17939
|
+
if (model.layers[il].bq) {
|
|
17940
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
17941
|
+
cb(Qcur, "Qcur", il);
|
|
17942
|
+
}
|
|
17943
|
+
|
|
17944
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
17945
|
+
cb(Kcur, "Kcur", il);
|
|
17946
|
+
if (model.layers[il].bk) {
|
|
17947
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
17948
|
+
cb(Kcur, "Kcur", il);
|
|
17949
|
+
}
|
|
17950
|
+
|
|
17951
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
17952
|
+
cb(Vcur, "Vcur", il);
|
|
17953
|
+
if (model.layers[il].bv) {
|
|
17954
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
17955
|
+
cb(Vcur, "Vcur", il);
|
|
17956
|
+
}
|
|
17957
|
+
|
|
17958
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
|
|
17959
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
|
|
17960
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
|
|
17961
|
+
|
|
17962
|
+
Qcur = ggml_rope_ext(
|
|
17963
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
17964
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
17965
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
17966
|
+
);
|
|
17967
|
+
|
|
17968
|
+
Kcur = ggml_rope_ext(
|
|
17969
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
17970
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
17971
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
17972
|
+
);
|
|
17973
|
+
|
|
17974
|
+
cb(Qcur, "Qcur", il);
|
|
17975
|
+
cb(Kcur, "Kcur", il);
|
|
17976
|
+
cb(Vcur, "Vcur", il);
|
|
17977
|
+
|
|
17978
|
+
cur = build_attn(inp_attn,
|
|
17979
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
17980
|
+
Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
|
17981
|
+
|
|
17982
|
+
cb(cur, "attn_out", il);
|
|
17983
|
+
}
|
|
17984
|
+
|
|
17985
|
+
if (il == n_layer - 1) {
|
|
17986
|
+
// skip computing output for unused tokens
|
|
17987
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
17988
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
17989
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
17990
|
+
}
|
|
17991
|
+
|
|
17992
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
17993
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
17994
|
+
|
|
17995
|
+
cur = ffn_inp;
|
|
17996
|
+
cur = build_norm(cur,
|
|
17997
|
+
model.layers[il].attn_post_norm, nullptr,
|
|
17998
|
+
LLM_NORM_RMS, il);
|
|
17999
|
+
cb(cur, "attn_post_norm", il);
|
|
18000
|
+
|
|
18001
|
+
// MoE branch
|
|
18002
|
+
cur = build_moe_ffn(cur,
|
|
18003
|
+
model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b,
|
|
18004
|
+
model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b,
|
|
18005
|
+
model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
|
|
18006
|
+
model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
|
|
18007
|
+
nullptr,
|
|
18008
|
+
n_expert, n_expert_used,
|
|
18009
|
+
LLM_FFN_SWIGLU_OAI_MOE, false,
|
|
18010
|
+
false, 0.0,
|
|
18011
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
|
|
18012
|
+
il);
|
|
18013
|
+
cb(cur, "ffn_moe_out", il);
|
|
18014
|
+
|
|
18015
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
18016
|
+
|
|
18017
|
+
cur = build_cvec(cur, il);
|
|
18018
|
+
cb(cur, "l_out", il);
|
|
18019
|
+
|
|
18020
|
+
// input for next layer
|
|
18021
|
+
inpL = cur;
|
|
18022
|
+
}
|
|
18023
|
+
|
|
18024
|
+
cur = inpL;
|
|
18025
|
+
|
|
18026
|
+
cur = build_norm(cur,
|
|
18027
|
+
model.output_norm, NULL,
|
|
18028
|
+
LLM_NORM_RMS, -1);
|
|
18029
|
+
|
|
18030
|
+
cb(cur, "result_norm", -1);
|
|
18031
|
+
res->t_embd = cur;
|
|
18032
|
+
|
|
18033
|
+
// lm_head
|
|
18034
|
+
cur = build_lora_mm(model.output, cur);
|
|
18035
|
+
|
|
18036
|
+
cb(cur, "result_output", -1);
|
|
18037
|
+
res->t_logits = cur;
|
|
18038
|
+
|
|
18039
|
+
ggml_build_forward_expand(gf, cur);
|
|
18040
|
+
}
|
|
18041
|
+
};
|
|
18042
|
+
|
|
16837
18043
|
struct llm_build_lfm2 : public llm_graph_context {
|
|
16838
18044
|
const llama_model & model;
|
|
16839
18045
|
|
|
@@ -16868,8 +18074,7 @@ struct llm_build_lfm2 : public llm_graph_context {
|
|
|
16868
18074
|
cb(cur, "model.embedding_norm", -1);
|
|
16869
18075
|
res->t_embd = cur;
|
|
16870
18076
|
|
|
16871
|
-
|
|
16872
|
-
cur = build_lora_mm(model.tok_embd, cur);
|
|
18077
|
+
cur = build_lora_mm(model.output, cur);
|
|
16873
18078
|
cb(cur, "lm_head", -1);
|
|
16874
18079
|
|
|
16875
18080
|
res->t_logits = cur;
|
|
@@ -16896,10 +18101,10 @@ struct llm_build_lfm2 : public llm_graph_context {
|
|
|
16896
18101
|
return cur;
|
|
16897
18102
|
}
|
|
16898
18103
|
|
|
16899
|
-
ggml_tensor * build_attn_block(ggml_tensor
|
|
16900
|
-
ggml_tensor
|
|
16901
|
-
|
|
16902
|
-
int
|
|
18104
|
+
ggml_tensor * build_attn_block(ggml_tensor * cur,
|
|
18105
|
+
ggml_tensor * inp_pos,
|
|
18106
|
+
llm_graph_input_attn_kv * inp_attn,
|
|
18107
|
+
int il) const {
|
|
16903
18108
|
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
|
|
16904
18109
|
auto const n_embd_head = hparams.n_embd_head_v;
|
|
16905
18110
|
auto const n_head_kv = hparams.n_head_kv(il);
|
|
@@ -16934,7 +18139,7 @@ struct llm_build_lfm2 : public llm_graph_context {
|
|
|
16934
18139
|
);
|
|
16935
18140
|
|
|
16936
18141
|
cur = build_attn(inp_attn, model.layers[il].wo, NULL,
|
|
16937
|
-
q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
18142
|
+
q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
16938
18143
|
|
|
16939
18144
|
cb(cur, "model.layers.{}.self_attn.out_proj", il);
|
|
16940
18145
|
|
|
@@ -17011,6 +18216,258 @@ struct llm_build_lfm2 : public llm_graph_context {
|
|
|
17011
18216
|
}
|
|
17012
18217
|
};
|
|
17013
18218
|
|
|
18219
|
+
struct llm_build_seed_oss : public llm_graph_context {
|
|
18220
|
+
llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
18221
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
18222
|
+
|
|
18223
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
18224
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
18225
|
+
|
|
18226
|
+
ggml_tensor * cur;
|
|
18227
|
+
ggml_tensor * inpL;
|
|
18228
|
+
|
|
18229
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
18230
|
+
|
|
18231
|
+
// inp_pos - contains the positions
|
|
18232
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
18233
|
+
|
|
18234
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
18235
|
+
|
|
18236
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
18237
|
+
|
|
18238
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
18239
|
+
|
|
18240
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
18241
|
+
ggml_tensor * inpSA = inpL;
|
|
18242
|
+
|
|
18243
|
+
// norm
|
|
18244
|
+
cur = build_norm(inpL,
|
|
18245
|
+
model.layers[il].attn_norm, NULL,
|
|
18246
|
+
LLM_NORM_RMS, il);
|
|
18247
|
+
cb(cur, "attn_norm", il);
|
|
18248
|
+
|
|
18249
|
+
// self-attention
|
|
18250
|
+
{
|
|
18251
|
+
// compute Q and K and RoPE them
|
|
18252
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
18253
|
+
cb(Qcur, "Qcur", il);
|
|
18254
|
+
if (model.layers[il].bq) {
|
|
18255
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
18256
|
+
cb(Qcur, "Qcur", il);
|
|
18257
|
+
}
|
|
18258
|
+
|
|
18259
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
18260
|
+
cb(Kcur, "Kcur", il);
|
|
18261
|
+
if (model.layers[il].bk) {
|
|
18262
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
18263
|
+
cb(Kcur, "Kcur", il);
|
|
18264
|
+
}
|
|
18265
|
+
|
|
18266
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
18267
|
+
cb(Vcur, "Vcur", il);
|
|
18268
|
+
if (model.layers[il].bv) {
|
|
18269
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
18270
|
+
cb(Vcur, "Vcur", il);
|
|
18271
|
+
}
|
|
18272
|
+
|
|
18273
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
18274
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
18275
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
18276
|
+
|
|
18277
|
+
Qcur = ggml_rope_ext(
|
|
18278
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
18279
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
18280
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
18281
|
+
);
|
|
18282
|
+
|
|
18283
|
+
Kcur = ggml_rope_ext(
|
|
18284
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
18285
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
18286
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
18287
|
+
);
|
|
18288
|
+
|
|
18289
|
+
cb(Qcur, "Qcur", il);
|
|
18290
|
+
cb(Kcur, "Kcur", il);
|
|
18291
|
+
cb(Vcur, "Vcur", il);
|
|
18292
|
+
|
|
18293
|
+
cur = build_attn(inp_attn,
|
|
18294
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
18295
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
|
18296
|
+
cb(cur, "attn_out", il);
|
|
18297
|
+
}
|
|
18298
|
+
|
|
18299
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
18300
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
18301
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
18302
|
+
}
|
|
18303
|
+
|
|
18304
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
18305
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
18306
|
+
|
|
18307
|
+
// feed-forward network
|
|
18308
|
+
cur = build_norm(ffn_inp,
|
|
18309
|
+
model.layers[il].attn_post_norm, NULL,
|
|
18310
|
+
LLM_NORM_RMS, il);
|
|
18311
|
+
cb(cur, "attn_post_norm", il);
|
|
18312
|
+
|
|
18313
|
+
cur = build_ffn(cur,
|
|
18314
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
18315
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
18316
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
18317
|
+
NULL,
|
|
18318
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
18319
|
+
cb(cur, "ffn_out", il);
|
|
18320
|
+
|
|
18321
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
18322
|
+
cb(cur, "ffn_out", il);
|
|
18323
|
+
|
|
18324
|
+
cur = build_cvec(cur, il);
|
|
18325
|
+
cb(cur, "l_out", il);
|
|
18326
|
+
|
|
18327
|
+
// input for next layer
|
|
18328
|
+
inpL = cur;
|
|
18329
|
+
}
|
|
18330
|
+
|
|
18331
|
+
cur = inpL;
|
|
18332
|
+
|
|
18333
|
+
cur = build_norm(cur,
|
|
18334
|
+
model.output_norm, NULL,
|
|
18335
|
+
LLM_NORM_RMS, -1);
|
|
18336
|
+
|
|
18337
|
+
cb(cur, "result_norm", -1);
|
|
18338
|
+
res->t_embd = cur;
|
|
18339
|
+
|
|
18340
|
+
// lm_head
|
|
18341
|
+
cur = build_lora_mm(model.output, cur);
|
|
18342
|
+
|
|
18343
|
+
cb(cur, "result_output", -1);
|
|
18344
|
+
res->t_logits = cur;
|
|
18345
|
+
|
|
18346
|
+
ggml_build_forward_expand(gf, cur);
|
|
18347
|
+
}
|
|
18348
|
+
};
|
|
18349
|
+
|
|
18350
|
+
template <bool iswa>
|
|
18351
|
+
struct llm_build_smallthinker : public llm_graph_context{
|
|
18352
|
+
llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
|
|
18353
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
18354
|
+
|
|
18355
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
18356
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
18357
|
+
|
|
18358
|
+
ggml_tensor * cur;
|
|
18359
|
+
ggml_tensor * inpL;
|
|
18360
|
+
|
|
18361
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
18362
|
+
|
|
18363
|
+
// inp_pos - contains the positions
|
|
18364
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
18365
|
+
|
|
18366
|
+
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
|
|
18367
|
+
inp_attn_type * inp_attn = nullptr;
|
|
18368
|
+
|
|
18369
|
+
if constexpr (iswa) {
|
|
18370
|
+
inp_attn = build_attn_inp_kv_iswa();
|
|
18371
|
+
} else {
|
|
18372
|
+
inp_attn = build_attn_inp_kv();
|
|
18373
|
+
}
|
|
18374
|
+
|
|
18375
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
18376
|
+
|
|
18377
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
18378
|
+
ggml_tensor * inpSA = inpL;
|
|
18379
|
+
ggml_tensor * probs = nullptr;
|
|
18380
|
+
|
|
18381
|
+
probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
|
|
18382
|
+
cb(probs, "ffn_moe_logits", il);
|
|
18383
|
+
|
|
18384
|
+
// norm
|
|
18385
|
+
cur = build_norm(inpL,model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
18386
|
+
cb(cur, "attn_norm", il);
|
|
18387
|
+
|
|
18388
|
+
// self_attention
|
|
18389
|
+
{
|
|
18390
|
+
// compute Q and K and RoPE them
|
|
18391
|
+
struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
18392
|
+
cb(Qcur, "Qcur", il);
|
|
18393
|
+
|
|
18394
|
+
struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
18395
|
+
cb(Kcur, "Kcur", il);
|
|
18396
|
+
|
|
18397
|
+
struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
18398
|
+
cb(Vcur, "Vcur", il);
|
|
18399
|
+
|
|
18400
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
18401
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
18402
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
18403
|
+
|
|
18404
|
+
if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
|
|
18405
|
+
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
18406
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
18407
|
+
|
|
18408
|
+
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
18409
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
18410
|
+
}
|
|
18411
|
+
|
|
18412
|
+
cb(Qcur, "Qcur", il);
|
|
18413
|
+
cb(Kcur, "Kcur", il);
|
|
18414
|
+
|
|
18415
|
+
cur = build_attn(inp_attn,
|
|
18416
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
18417
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
|
18418
|
+
}
|
|
18419
|
+
|
|
18420
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
18421
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
18422
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
18423
|
+
probs = ggml_get_rows(ctx0, probs, inp_out_ids);
|
|
18424
|
+
}
|
|
18425
|
+
|
|
18426
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
18427
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
18428
|
+
|
|
18429
|
+
// MoE branch
|
|
18430
|
+
cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
|
18431
|
+
cb(cur, "ffn_norm", il);
|
|
18432
|
+
|
|
18433
|
+
ggml_tensor * ffn_out =
|
|
18434
|
+
build_moe_ffn(cur,
|
|
18435
|
+
nullptr,
|
|
18436
|
+
model.layers[il].ffn_up_exps,
|
|
18437
|
+
model.layers[il].ffn_gate_exps,
|
|
18438
|
+
model.layers[il].ffn_down_exps,
|
|
18439
|
+
nullptr,
|
|
18440
|
+
n_expert, n_expert_used,
|
|
18441
|
+
LLM_FFN_RELU, true,
|
|
18442
|
+
false, 0.0,
|
|
18443
|
+
static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
|
|
18444
|
+
il, probs);
|
|
18445
|
+
|
|
18446
|
+
cb(ffn_out, "ffn_out", il);
|
|
18447
|
+
cur = ffn_out;
|
|
18448
|
+
|
|
18449
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
18450
|
+
cur = build_cvec(cur, il);
|
|
18451
|
+
cb(cur, "l_out", il);
|
|
18452
|
+
|
|
18453
|
+
// input for next layer
|
|
18454
|
+
inpL = cur;
|
|
18455
|
+
}
|
|
18456
|
+
|
|
18457
|
+
cur = inpL;
|
|
18458
|
+
|
|
18459
|
+
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
|
18460
|
+
cb(cur, "result_norm", -1);
|
|
18461
|
+
|
|
18462
|
+
// lm_head
|
|
18463
|
+
cur = build_lora_mm(model.output, cur);
|
|
18464
|
+
cb(cur, "result_output", -1);
|
|
18465
|
+
res->t_logits = cur;
|
|
18466
|
+
|
|
18467
|
+
ggml_build_forward_expand(gf, cur);
|
|
18468
|
+
}
|
|
18469
|
+
};
|
|
18470
|
+
|
|
17014
18471
|
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
|
|
17015
18472
|
llama_memory_i * res;
|
|
17016
18473
|
|
|
@@ -17019,11 +18476,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
17019
18476
|
// switch statement
|
|
17020
18477
|
case LLM_ARCH_BERT:
|
|
17021
18478
|
case LLM_ARCH_JINA_BERT_V2:
|
|
18479
|
+
case LLM_ARCH_JINA_BERT_V3:
|
|
17022
18480
|
case LLM_ARCH_NOMIC_BERT:
|
|
17023
18481
|
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
17024
18482
|
case LLM_ARCH_NEO_BERT:
|
|
17025
18483
|
case LLM_ARCH_WAVTOKENIZER_DEC:
|
|
17026
18484
|
case LLM_ARCH_DREAM:
|
|
18485
|
+
case LLM_ARCH_LLADA:
|
|
17027
18486
|
{
|
|
17028
18487
|
res = nullptr;
|
|
17029
18488
|
} break;
|
|
@@ -17034,14 +18493,31 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
17034
18493
|
if (llm_arch_is_recurrent(arch)) {
|
|
17035
18494
|
res = new llama_memory_recurrent(
|
|
17036
18495
|
*this,
|
|
17037
|
-
nullptr,
|
|
17038
18496
|
GGML_TYPE_F32,
|
|
17039
18497
|
GGML_TYPE_F32,
|
|
17040
18498
|
cparams.offload_kqv,
|
|
17041
18499
|
std::max((uint32_t) 1, cparams.n_seq_max),
|
|
17042
|
-
cparams.n_seq_max
|
|
18500
|
+
cparams.n_seq_max,
|
|
18501
|
+
nullptr);
|
|
17043
18502
|
} else if (llm_arch_is_hybrid(arch)) {
|
|
17044
|
-
|
|
18503
|
+
|
|
18504
|
+
// The main difference between hybrid architectures is the
|
|
18505
|
+
// layer filters, so pick the right one here
|
|
18506
|
+
llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
|
|
18507
|
+
llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
|
|
18508
|
+
if (arch == LLM_ARCH_FALCON_H1) {
|
|
18509
|
+
filter_attn = [&](int32_t) { return true; };
|
|
18510
|
+
filter_recr = [&](int32_t) { return true; };
|
|
18511
|
+
} else if (arch == LLM_ARCH_NEMOTRON_H) {
|
|
18512
|
+
filter_attn = [&](int32_t il) {
|
|
18513
|
+
return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
|
|
18514
|
+
};
|
|
18515
|
+
filter_recr = [&](int32_t il) {
|
|
18516
|
+
return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
|
|
18517
|
+
};
|
|
18518
|
+
}
|
|
18519
|
+
|
|
18520
|
+
const auto padding = llama_kv_cache::get_padding(cparams);
|
|
17045
18521
|
|
|
17046
18522
|
cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
|
|
17047
18523
|
|
|
@@ -17059,10 +18535,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
17059
18535
|
/* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
|
|
17060
18536
|
/* n_seq_max */ cparams.n_seq_max,
|
|
17061
18537
|
/* offload */ cparams.offload_kqv,
|
|
17062
|
-
/*
|
|
17063
|
-
/*
|
|
18538
|
+
/* unified */ cparams.kv_unified,
|
|
18539
|
+
/* filter_attn */ std::move(filter_attn),
|
|
18540
|
+
/* filter_recr */ std::move(filter_recr));
|
|
17064
18541
|
} else {
|
|
17065
|
-
const auto padding =
|
|
18542
|
+
const auto padding = llama_kv_cache::get_padding(cparams);
|
|
17066
18543
|
|
|
17067
18544
|
uint32_t n_ctx_per_stream = cparams.n_ctx;
|
|
17068
18545
|
|
|
@@ -17079,10 +18556,22 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
17079
18556
|
|
|
17080
18557
|
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
|
|
17081
18558
|
|
|
18559
|
+
llama_memory_i::layer_reuse_cb reuse = nullptr;
|
|
18560
|
+
|
|
18561
|
+
if (arch == LLM_ARCH_GEMMA3N) {
|
|
18562
|
+
reuse = [&](int32_t il) {
|
|
18563
|
+
if (il >= (int32_t) hparams.n_layer_kv_from_start) {
|
|
18564
|
+
return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
|
|
18565
|
+
}
|
|
18566
|
+
|
|
18567
|
+
return -1;
|
|
18568
|
+
};
|
|
18569
|
+
}
|
|
18570
|
+
|
|
17082
18571
|
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
|
17083
18572
|
GGML_ASSERT(hparams.is_swa_any());
|
|
17084
18573
|
|
|
17085
|
-
res = new
|
|
18574
|
+
res = new llama_kv_cache_iswa(
|
|
17086
18575
|
*this,
|
|
17087
18576
|
params.type_k,
|
|
17088
18577
|
params.type_v,
|
|
@@ -17093,13 +18582,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
17093
18582
|
n_ctx_per_stream,
|
|
17094
18583
|
cparams.n_seq_max,
|
|
17095
18584
|
cparams.n_ubatch,
|
|
17096
|
-
padding
|
|
18585
|
+
padding,
|
|
18586
|
+
nullptr,
|
|
18587
|
+
reuse);
|
|
17097
18588
|
} else {
|
|
17098
18589
|
GGML_ASSERT(!hparams.is_swa_any());
|
|
17099
18590
|
|
|
17100
|
-
res = new
|
|
18591
|
+
res = new llama_kv_cache(
|
|
17101
18592
|
*this,
|
|
17102
|
-
nullptr,
|
|
17103
18593
|
params.type_k,
|
|
17104
18594
|
params.type_v,
|
|
17105
18595
|
!cparams.flash_attn,
|
|
@@ -17109,7 +18599,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
17109
18599
|
cparams.n_seq_max,
|
|
17110
18600
|
padding,
|
|
17111
18601
|
hparams.n_swa,
|
|
17112
|
-
hparams.swa_type
|
|
18602
|
+
hparams.swa_type,
|
|
18603
|
+
nullptr,
|
|
18604
|
+
nullptr);
|
|
17113
18605
|
}
|
|
17114
18606
|
}
|
|
17115
18607
|
}
|
|
@@ -17156,6 +18648,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
|
17156
18648
|
} break;
|
|
17157
18649
|
case LLM_ARCH_BERT:
|
|
17158
18650
|
case LLM_ARCH_JINA_BERT_V2:
|
|
18651
|
+
case LLM_ARCH_JINA_BERT_V3:
|
|
17159
18652
|
case LLM_ARCH_NOMIC_BERT:
|
|
17160
18653
|
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
17161
18654
|
{
|
|
@@ -17190,6 +18683,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
|
17190
18683
|
llm = std::make_unique<llm_build_dream>(*this, params);
|
|
17191
18684
|
}
|
|
17192
18685
|
break;
|
|
18686
|
+
case LLM_ARCH_LLADA:
|
|
18687
|
+
{
|
|
18688
|
+
llm = std::make_unique<llm_build_llada>(*this, params);
|
|
18689
|
+
}
|
|
18690
|
+
break;
|
|
17193
18691
|
case LLM_ARCH_QWEN2VL:
|
|
17194
18692
|
{
|
|
17195
18693
|
llm = std::make_unique<llm_build_qwen2vl>(*this, params);
|
|
@@ -17332,6 +18830,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
|
17332
18830
|
{
|
|
17333
18831
|
llm = std::make_unique<llm_build_glm4>(*this, params);
|
|
17334
18832
|
} break;
|
|
18833
|
+
case LLM_ARCH_GLM4_MOE:
|
|
18834
|
+
{
|
|
18835
|
+
llm = std::make_unique<llm_build_glm4_moe>(*this, params);
|
|
18836
|
+
} break;
|
|
17335
18837
|
case LLM_ARCH_BITNET:
|
|
17336
18838
|
{
|
|
17337
18839
|
llm = std::make_unique<llm_build_bitnet>(*this, params);
|
|
@@ -17363,6 +18865,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
|
17363
18865
|
{
|
|
17364
18866
|
llm = std::make_unique<llm_build_nemotron>(*this, params);
|
|
17365
18867
|
} break;
|
|
18868
|
+
case LLM_ARCH_NEMOTRON_H:
|
|
18869
|
+
{
|
|
18870
|
+
llm = std::make_unique<llm_build_nemotron_h>(*this, params);
|
|
18871
|
+
} break;
|
|
17366
18872
|
case LLM_ARCH_EXAONE:
|
|
17367
18873
|
{
|
|
17368
18874
|
llm = std::make_unique<llm_build_exaone>(*this, params);
|
|
@@ -17417,6 +18923,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
|
17417
18923
|
{
|
|
17418
18924
|
llm = std::make_unique<llm_build_bailingmoe>(*this, params);
|
|
17419
18925
|
} break;
|
|
18926
|
+
case LLM_ARCH_SEED_OSS:
|
|
18927
|
+
{
|
|
18928
|
+
llm = std::make_unique<llm_build_seed_oss>(*this, params);
|
|
18929
|
+
} break;
|
|
17420
18930
|
case LLM_ARCH_DOTS1:
|
|
17421
18931
|
{
|
|
17422
18932
|
llm = std::make_unique<llm_build_dots1>(*this, params);
|
|
@@ -17437,10 +18947,18 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
|
17437
18947
|
{
|
|
17438
18948
|
llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
|
|
17439
18949
|
} break;
|
|
18950
|
+
case LLM_ARCH_HUNYUAN_DENSE:
|
|
18951
|
+
{
|
|
18952
|
+
llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
|
|
18953
|
+
} break;
|
|
17440
18954
|
case LLM_ARCH_SMOLLM3:
|
|
17441
18955
|
{
|
|
17442
18956
|
llm = std::make_unique<llm_build_smollm3>(*this, params);
|
|
17443
18957
|
} break;
|
|
18958
|
+
case LLM_ARCH_OPENAI_MOE:
|
|
18959
|
+
{
|
|
18960
|
+
llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
|
|
18961
|
+
} break;
|
|
17444
18962
|
case LLM_ARCH_FALCON_H1:
|
|
17445
18963
|
{
|
|
17446
18964
|
llm = std::make_unique<llm_build_falcon_h1>(*this, params);
|
|
@@ -17449,6 +18967,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
|
17449
18967
|
{
|
|
17450
18968
|
llm = std::make_unique<llm_build_lfm2>(*this, params);
|
|
17451
18969
|
} break;
|
|
18970
|
+
case LLM_ARCH_SMALLTHINKER:
|
|
18971
|
+
{
|
|
18972
|
+
if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
|
|
18973
|
+
llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
|
|
18974
|
+
} else {
|
|
18975
|
+
llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
|
|
18976
|
+
}
|
|
18977
|
+
} break;
|
|
17452
18978
|
default:
|
|
17453
18979
|
GGML_ABORT("fatal error");
|
|
17454
18980
|
}
|
|
@@ -17459,6 +18985,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
|
17459
18985
|
return llm->res->get_gf();
|
|
17460
18986
|
}
|
|
17461
18987
|
|
|
18988
|
+
|
|
17462
18989
|
//
|
|
17463
18990
|
// interface implementation
|
|
17464
18991
|
//
|
|
@@ -17478,6 +19005,7 @@ llama_model_params llama_model_default_params() {
|
|
|
17478
19005
|
/*.use_mmap =*/ true,
|
|
17479
19006
|
/*.use_mlock =*/ false,
|
|
17480
19007
|
/*.check_tensors =*/ false,
|
|
19008
|
+
/*.use_extra_bufts =*/ true,
|
|
17481
19009
|
};
|
|
17482
19010
|
|
|
17483
19011
|
#ifdef GGML_USE_METAL
|
|
@@ -17576,10 +19104,12 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
17576
19104
|
case LLM_ARCH_RWKV7:
|
|
17577
19105
|
case LLM_ARCH_ARWKV7:
|
|
17578
19106
|
case LLM_ARCH_WAVTOKENIZER_DEC:
|
|
19107
|
+
case LLM_ARCH_NEMOTRON_H:
|
|
17579
19108
|
return LLAMA_ROPE_TYPE_NONE;
|
|
17580
19109
|
|
|
17581
19110
|
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
|
17582
19111
|
case LLM_ARCH_LLAMA:
|
|
19112
|
+
case LLM_ARCH_LLADA:
|
|
17583
19113
|
case LLM_ARCH_LLAMA4:
|
|
17584
19114
|
case LLM_ARCH_DECI:
|
|
17585
19115
|
case LLM_ARCH_BAICHUAN:
|
|
@@ -17614,6 +19144,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
17614
19144
|
case LLM_ARCH_GROK:
|
|
17615
19145
|
case LLM_ARCH_DBRX:
|
|
17616
19146
|
case LLM_ARCH_BERT:
|
|
19147
|
+
case LLM_ARCH_JINA_BERT_V3:
|
|
17617
19148
|
case LLM_ARCH_NOMIC_BERT:
|
|
17618
19149
|
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
17619
19150
|
case LLM_ARCH_STABLELM:
|
|
@@ -17646,7 +19177,12 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
17646
19177
|
case LLM_ARCH_MINICPM3:
|
|
17647
19178
|
case LLM_ARCH_DOTS1:
|
|
17648
19179
|
case LLM_ARCH_HUNYUAN_MOE:
|
|
19180
|
+
case LLM_ARCH_OPENAI_MOE:
|
|
19181
|
+
case LLM_ARCH_HUNYUAN_DENSE:
|
|
17649
19182
|
case LLM_ARCH_LFM2:
|
|
19183
|
+
case LLM_ARCH_SMALLTHINKER:
|
|
19184
|
+
case LLM_ARCH_GLM4_MOE:
|
|
19185
|
+
case LLM_ARCH_SEED_OSS:
|
|
17650
19186
|
return LLAMA_ROPE_TYPE_NEOX;
|
|
17651
19187
|
|
|
17652
19188
|
case LLM_ARCH_QWEN2VL:
|
|
@@ -17757,6 +19293,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
|
|
|
17757
19293
|
return llm_arch_is_recurrent(model->arch);
|
|
17758
19294
|
}
|
|
17759
19295
|
|
|
19296
|
+
bool llama_model_is_diffusion(const llama_model * model) {
|
|
19297
|
+
return llm_arch_is_diffusion(model->arch);
|
|
19298
|
+
}
|
|
19299
|
+
|
|
17760
19300
|
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
|
|
17761
19301
|
return model->tensors_by_name;
|
|
17762
19302
|
}
|